@fugood/llama.node 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.0.2",
+  "version": "1.0.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -70,19 +70,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.0.2",
-    "@fugood/node-llama-linux-x64-vulkan": "1.0.2",
-    "@fugood/node-llama-linux-x64-cuda": "1.0.2",
-    "@fugood/node-llama-linux-arm64": "1.0.2",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.0.2",
-    "@fugood/node-llama-linux-arm64-cuda": "1.0.2",
-    "@fugood/node-llama-win32-x64": "1.0.2",
-    "@fugood/node-llama-win32-x64-vulkan": "1.0.2",
-    "@fugood/node-llama-win32-x64-cuda": "1.0.2",
-    "@fugood/node-llama-win32-arm64": "1.0.2",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.0.2",
-    "@fugood/node-llama-darwin-x64": "1.0.2",
-    "@fugood/node-llama-darwin-arm64": "1.0.2"
+    "@fugood/node-llama-linux-x64": "1.0.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.0.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.0.4",
+    "@fugood/node-llama-linux-arm64": "1.0.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.0.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.0.4",
+    "@fugood/node-llama-win32-x64": "1.0.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.0.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.0.4",
+    "@fugood/node-llama-win32-arm64": "1.0.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.0.4",
+    "@fugood/node-llama-darwin-x64": "1.0.4",
+    "@fugood/node-llama-darwin-arm64": "1.0.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -120,7 +120,6 @@ endfunction()
 
 llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
 llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
-llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
 llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
 llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
 llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
package/src/llama.cpp/common/CMakeLists.txt CHANGED
@@ -86,8 +86,7 @@ if (LLAMA_CURL)
     endif()
     target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
     include_directories(${CURL_INCLUDE_DIRS})
-    find_library(CURL_LIBRARY curl REQUIRED)
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARIES})
 endif ()
 
 if (LLAMA_LLGUIDANCE)
@@ -112,13 +111,13 @@ if (LLAMA_LLGUIDANCE)
 
     ExternalProject_Add(llguidance_ext
         GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.20 (+ fix to build on GCC 15):
-        GIT_TAG b5b8b64dba11c4e4ee6b1d1450d3a3ae279891e8
+        # v1.0.1:
+        GIT_TAG d795912fedc7d393de740177ea9ea761e7905774
         PREFIX ${CMAKE_BINARY_DIR}/llguidance
         SOURCE_DIR ${LLGUIDANCE_SRC}
         BUILD_IN_SOURCE TRUE
         CONFIGURE_COMMAND ""
-        BUILD_COMMAND cargo build --release
+        BUILD_COMMAND cargo build --release --package llguidance
         INSTALL_COMMAND ""
         BUILD_BYPRODUCTS ${LLGUIDANCE_PATH}/${LLGUIDANCE_LIB_NAME} ${LLGUIDANCE_PATH}/llguidance.h
         UPDATE_COMMAND ""
package/src/llama.cpp/common/arg.cpp CHANGED
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.swa_full = true;
        }
    ).set_env("LLAMA_ARG_SWA_FULL"));
+    add_opt(common_arg(
+        {"--kv-unified", "-kvu"},
+        string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
+            "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
+        [](common_params & params) {
+            params.kv_unified = true;
+        }
+    ).set_env("LLAMA_ARG_KV_SPLIT"));
    add_opt(common_arg(
        {"--no-context-shift"},
        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -2734,6 +2742,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.public_path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH"));
+    add_opt(common_arg(
+        {"--api-prefix"}, "PREFIX",
+        string_format("prefix path the server serves from, without the trailing slash (default: %s)", params.api_prefix.c_str()),
+        [](common_params & params, const std::string & value) {
+            params.api_prefix = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
    add_opt(common_arg(
        {"--no-webui"},
        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
@@ -3416,5 +3431,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}));
 
+    // diffusion parameters
+    add_opt(common_arg(
+        { "--diffusion-steps" }, "N",
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-algorithm" }, "N",
+        string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
+            params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-alg-temp" }, "F",
+        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-visual" },
+        string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
+            params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+
    return ctx_arg;
 }
package/src/llama.cpp/common/common.cpp CHANGED
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
     return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
 }
+
+bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
+    bool has_suffix = string_ends_with(str, suffix);
+    if (has_suffix) {
+        str = str.substr(0, str.size() - suffix.size());
+    }
+    return has_suffix;
+}
+
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
     if (!str.empty() && !stop.empty()) {
         const char text_last_char = str.back();
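For reference, a minimal usage sketch of the new helper (not part of the diff); the include path and the literal suffix are assumptions for illustration:

    #include <string>
    #include "common.h" // assumed location of the common string helpers

    int main() {
        std::string text = "final answer<|im_end|>";
        // Strips the suffix in place and reports whether it was present.
        if (string_remove_suffix(text, "<|im_end|>")) {
            // text is now "final answer"
        }
        return 0;
    }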
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
-    if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-            if (llama_vocab_is_eog(vocab, i)) {
-                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-                params.sampling.logit_bias.push_back({i, -INFINITY});
-            }
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
         }
     }
 
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
     if (params.sampling.penalty_last_n == -1) {
         LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
         params.sampling.penalty_last_n = llama_n_ctx(lctx);
@@ -1158,6 +1173,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
+    cparams.kv_unified = params.kv_unified;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
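Note (not part of the diff): the new flag flows through to llama_context_params, so it can also be enabled programmatically. A hedged sketch, assuming the usual context-creation flow:

    #include "llama.h"

    // Sketch: request the single unified KV buffer when creating a context,
    // the programmatic equivalent of passing --kv-unified / -kvu.
    static struct llama_context_params make_ctx_params(void) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.kv_unified = true;
        return cparams;
    }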
package/src/llama.cpp/common/common.h CHANGED
@@ -81,6 +81,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
+    LLAMA_EXAMPLE_DIFFUSION,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -177,7 +178,8 @@ struct common_params_sampling {
     std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
     std::set<llama_token> preserved_tokens;
 
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
 
     // print the parameters into a string
     std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
     bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
 };
 
+struct common_params_diffusion {
+    int32_t steps = 64; // number of diffusion steps
+    float eps = 1e-3f; // epsilon for timesteps
+    int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
+    float alg_temp = 0.0f; // algorithm temperature
+    bool visual_mode = false; // show progressive diffusion on screen
+};
+
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -269,6 +279,7 @@ struct common_params {
     struct common_params_sampling sampling;
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
+    struct common_params_diffusion diffusion;
 
     struct common_params_model model;
 
@@ -331,6 +342,7 @@ struct common_params {
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
+    bool kv_unified = false; // enable unified KV cache
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
@@ -371,6 +383,7 @@ struct common_params {
 
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
+    std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
@@ -522,6 +535,7 @@ static bool string_starts_with(const std::string & str,
 
 // While we wait for C++20's std::string::ends_with...
 bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
+bool string_remove_suffix(std::string & str, const std::string_view & suffix);
 size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
package/src/llama.cpp/ggml/CMakeLists.txt CHANGED
@@ -181,7 +181,8 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
 option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
-option(GGML_KOMPUTE "ggml: use Kompute" OFF)
+option(GGML_WEBGPU "ggml: use WebGPU" OFF)
+option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
 option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -266,12 +267,12 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cann.h
     include/ggml-cpp.h
     include/ggml-cuda.h
-    include/ggml-kompute.h
     include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h
     include/ggml-vulkan.h
+    include/ggml-webgpu.h
     include/gguf.h)
 
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -360,6 +361,13 @@ write_basic_package_version_file(
     VERSION ${GGML_INSTALL_VERSION}
     COMPATIBILITY SameMajorVersion)
 
+target_compile_definitions(ggml-base PRIVATE
+    GGML_VERSION="${GGML_INSTALL_VERSION}"
+    GGML_COMMIT="${GGML_BUILD_COMMIT}"
+)
+message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
+message(STATUS "ggml commit: ${GGML_BUILD_COMMIT}")
+
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
package/src/llama.cpp/ggml/include/ggml-webgpu.h ADDED
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_WEBGPU_NAME "WebGPU"
+
+// Needed for examples in ggml
+GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
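For orientation, a hedged sketch of how the entry points above might be used, assuming the library was built with GGML_WEBGPU=ON; buffer allocation and graph work are elided:

    #include "ggml-webgpu.h"
    #include "ggml-backend.h"

    int main(void) {
        // Bring up the WebGPU backend; NULL means no usable device was found.
        ggml_backend_t backend = ggml_backend_webgpu_init();
        if (backend == NULL) {
            return 1;
        }
        // ... allocate buffers and run graphs on `backend` ...
        ggml_backend_free(backend);
        return 0;
    }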
package/src/llama.cpp/ggml/include/ggml.h CHANGED
@@ -314,6 +314,13 @@
 extern "C" {
 #endif
 
+    // Function type used in fatal error callbacks
+    typedef void (*ggml_abort_callback_t)(const char * error_message);
+
+    // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout)
+    // Returns the old callback for chaining
+    GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback);
+
     GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
     GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
 
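A minimal sketch of the new hook (not part of the diff); the handler name and the logging are illustrative:

    #include <stdio.h>
    #include "ggml.h"

    // Hypothetical handler: log the message before ggml aborts.
    static void on_ggml_fatal(const char * error_message) {
        fprintf(stderr, "ggml fatal: %s\n", error_message);
    }

    int main(void) {
        ggml_abort_callback_t prev = ggml_set_abort_callback(on_ggml_fatal);
        // ... run ggml work ...
        ggml_set_abort_callback(prev); // restore the previous behavior
        return 0;
    }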
@@ -482,12 +489,13 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_1D,
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
+        GGML_OP_CONV_2D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
         GGML_OP_POOL_2D_BACK,
-        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_UPSCALE,
         GGML_OP_PAD,
         GGML_OP_PAD_REFLECT_1D,
         GGML_OP_ROLL,
@@ -549,6 +557,8 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_GEGLU_ERF,
+        GGML_GLU_OP_GEGLU_QUICK,
 
         GGML_GLU_OP_COUNT,
     };
@@ -638,6 +648,9 @@ extern "C" {
 
     // misc
 
+    GGML_API const char * ggml_version(void);
+    GGML_API const char * ggml_commit(void);
+
     GGML_API void ggml_time_init(void); // call this once at the beginning of the program
     GGML_API int64_t ggml_time_ms(void);
     GGML_API int64_t ggml_time_us(void);
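These accessors expose the GGML_VERSION / GGML_COMMIT defines added in ggml/CMakeLists.txt above; a trivial usage sketch:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        printf("ggml version: %s\n", ggml_version());
        printf("ggml commit:  %s\n", ggml_commit());
        return 0;
    }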
@@ -1136,6 +1149,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_geglu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_erf_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_swapped(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     // A: n columns, r rows,
     // B: n columns, r rows,
     GGML_API struct ggml_tensor * ggml_glu_split(
@@ -1159,6 +1188,16 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_geglu_erf_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
+    GGML_API struct ggml_tensor * ggml_geglu_quick_split(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
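A hedged sketch of wiring the new gated activations into a graph (not part of the diff); the assumption that the fused form gates one half of its input with the other, mirroring the existing GLU ops, is not spelled out here, and which argument acts as the gate in the split form is left open:

    #include "ggml.h"

    // Fused form: one tensor carries both halves of the GEGLU-erf activation.
    static struct ggml_tensor * ffn_geglu_erf(struct ggml_context * ctx, struct ggml_tensor * up) {
        return ggml_geglu_erf(ctx, up);
    }

    // Split form: the two halves come from separate tensors of equal shape.
    static struct ggml_tensor * ffn_geglu_quick(struct ggml_context * ctx,
                                                struct ggml_tensor * gate,
                                                struct ggml_tensor * up) {
        return ggml_geglu_quick_split(ctx, gate, up);
    }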
@@ -1258,6 +1297,19 @@ extern "C" {
             struct ggml_tensor * a,
             float s);
 
+    // x = s * a + b
+    GGML_API struct ggml_tensor * ggml_scale_bias(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float s,
+            float b);
+
+    GGML_API struct ggml_tensor * ggml_scale_bias_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            float s,
+            float b);
+
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
             struct ggml_context * ctx,
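Per the `x = s * a + b` comment, a one-line usage sketch (the constants are illustrative):

    #include "ggml.h"

    // Fused per-element scale-and-shift: y = 0.125f * a + 1.0f
    static struct ggml_tensor * scale_shift(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_scale_bias(ctx, a, 0.125f, 1.0f);
    }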
@@ -1502,8 +1554,14 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // a      [ne0, ne01, ne02, ne03]
+    // mask   [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional
+    //
+    // broadcast:
+    //   ne02 % ne12 == 0
+    //   ne03 % ne13 == 0
+    //
     // fused soft_max(a*scale + mask*(ALiBi slope))
-    // mask is optional
     // max_bias = 0.0f for no ALiBi
     GGML_API struct ggml_tensor * ggml_soft_max_ext(
             struct ggml_context * ctx,
@@ -1813,6 +1871,17 @@ extern "C" {
             struct ggml_tensor * b,
             int stride);
 
+    GGML_API struct ggml_tensor * ggml_conv_2d_direct(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,   // convolution kernel [KW, KH, IC, OC]
+            struct ggml_tensor * b,   // input data [W, H, C, N]
+            int s0,  // stride dimension 0
+            int s1,  // stride dimension 1
+            int p0,  // padding dimension 0
+            int p1,  // padding dimension 1
+            int d0,  // dilation dimension 0
+            int d1); // dilation dimension 1
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
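A sketch of calling the new direct conv-2d entry point with the kernel/input layouts documented above; the 3x3 "same" configuration is illustrative:

    #include "ggml.h"

    // kernel: [KW, KH, IC, OC], input: [W, H, C, N]
    // stride 1, padding 1, dilation 1 -> same spatial size for a 3x3 kernel
    static struct ggml_tensor * conv3x3_same(struct ggml_context * ctx,
                                             struct ggml_tensor * kernel,
                                             struct ggml_tensor * input) {
        return ggml_conv_2d_direct(ctx, kernel, input, 1, 1, 1, 1, 1, 1);
    }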
@@ -1855,6 +1924,12 @@ extern "C" {
     enum ggml_scale_mode {
         GGML_SCALE_MODE_NEAREST = 0,
         GGML_SCALE_MODE_BILINEAR = 1,
+
+        GGML_SCALE_MODE_COUNT
+    };
+
+    enum ggml_scale_flag {
+        GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8)
     };
 
     // interpolate
@@ -1867,14 +1942,26 @@ extern "C" {
 
     // interpolate
     // interpolate scale to specified dimensions
-    GGML_API struct ggml_tensor * ggml_upscale_ext(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             int ne0,
             int ne1,
             int ne2,
             int ne3,
-            enum ggml_scale_mode mode);
+            enum ggml_scale_mode mode),
+        "use ggml_interpolate instead");
+
+    // Up- or downsamples the input to the specified size.
+    // 2D scale modes (eg. bilinear) are applied to the first two dimensions.
+    GGML_API struct ggml_tensor * ggml_interpolate(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3,
+            uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...]
 
     // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
     GGML_API struct ggml_tensor * ggml_pad(
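A sketch of the replacement for the now-deprecated ggml_upscale_ext; the 224x224 target size and the align-corners flag are illustrative:

    #include "ggml.h"

    // Bilinear resize of the first two dims, with the new align-corners flag OR'ed into the mode.
    static struct ggml_tensor * resize_224(struct ggml_context * ctx, struct ggml_tensor * a) {
        return ggml_interpolate(ctx, a, 224, 224, a->ne[2], a->ne[3],
                                GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS);
    }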
@@ -1937,11 +2024,17 @@ extern "C" {
 
 #define GGML_KQ_MASK_PAD 64
 
-    // q:    [n_embd_k, n_batch,     n_head,    1]
-    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
-    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
+    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
+    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
+    //
+    // broadcast:
+    //   n_head % n_head_kv == 0
+    //   n_head % ne32 == 0
+    //   ne3 % ne33 == 0
+    //
     GGML_API struct ggml_tensor * ggml_flash_attn_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * q,
@@ -1980,7 +2073,8 @@ extern "C" {
             struct ggml_tensor * dt,
             struct ggml_tensor * A,
             struct ggml_tensor * B,
-            struct ggml_tensor * C);
+            struct ggml_tensor * C,
+            struct ggml_tensor * ids);
 
     // partition into non-overlapping windows with padding if needed
     // example:
package/src/llama.cpp/ggml/src/CMakeLists.txt CHANGED
@@ -365,12 +365,12 @@ ggml_add_backend(BLAS)
 ggml_add_backend(CANN)
 ggml_add_backend(CUDA)
 ggml_add_backend(HIP)
-ggml_add_backend(Kompute)
 ggml_add_backend(METAL)
 ggml_add_backend(MUSA)
 ggml_add_backend(RPC)
 ggml_add_backend(SYCL)
 ggml_add_backend(Vulkan)
+ggml_add_backend(WebGPU)
 ggml_add_backend(OpenCL)
 
 foreach (target ggml-base ggml)
package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt CHANGED
@@ -5,7 +5,7 @@ function(ggml_add_cpu_backend_features cpu_name arch)
     # build, using set_source_files_properties() to set the arch flags is not possible
     set(GGML_CPU_FEATS_NAME ${cpu_name}-feats)
     add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/arch/${arch}/cpu-feats.cpp)
-    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
+    target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . ../include)
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARGN})
     target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
     set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -589,4 +589,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
     if (EMSCRIPTEN)
         set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
     endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
+        # The compiler automatically enables "-ffast-math" which can cause NaNs in tests due to "-fassociative-math"
+        target_compile_options(${GGML_CPU_NAME} PRIVATE "-fno-associative-math")
+    endif()
 endfunction()
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c CHANGED
@@ -1193,7 +1193,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
     }
 }
 
-static void ggml_compute_forward_mul_mat(
+void ggml_compute_forward_mul_mat(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
 
@@ -1866,6 +1866,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_im2col_back_f32(params, tensor);
             } break;
+        case GGML_OP_CONV_2D:
+            {
+                ggml_compute_forward_conv_2d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
@@ -2168,6 +2172,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 case GGML_GLU_OP_REGLU:
                 case GGML_GLU_OP_GEGLU:
                 case GGML_GLU_OP_SWIGLU:
+                case GGML_GLU_OP_GEGLU_ERF:
+                case GGML_GLU_OP_GEGLU_QUICK:
                     {
                         n_tasks = n_threads;
                     } break;
@@ -2228,6 +2234,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
            } break;
        case GGML_OP_IM2COL:
        case GGML_OP_IM2COL_BACK:
+       case GGML_OP_CONV_2D:
        case GGML_OP_CONV_2D_DW:
        case GGML_OP_CONV_TRANSPOSE_1D:
        case GGML_OP_CONV_TRANSPOSE_2D:
@@ -2746,6 +2753,10 @@ struct ggml_cplan ggml_graph_plan(
                    GGML_ABORT("fatal error");
                }
            } break;
+        case GGML_OP_CONV_2D:
+            {
+                cur = GGML_IM2COL_WORK_SIZE;
+            } break;
        case GGML_OP_CONV_TRANSPOSE_2D:
            {
                const int64_t ne00 = node->src[0]->ne[0]; // W