@fugood/llama.node 1.3.0-rc.1 → 1.3.0-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/binding.js CHANGED
@@ -15,13 +15,23 @@ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (
  }) : function(o, v) {
  o["default"] = v;
  });
- var __importStar = (this && this.__importStar) || function (mod) {
- if (mod && mod.__esModule) return mod;
- var result = {};
- if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
- __setModuleDefault(result, mod);
- return result;
- };
+ var __importStar = (this && this.__importStar) || (function () {
+ var ownKeys = function(o) {
+ ownKeys = Object.getOwnPropertyNames || function (o) {
+ var ar = [];
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+ return ar;
+ };
+ return ownKeys(o);
+ };
+ return function (mod) {
+ if (mod && mod.__esModule) return mod;
+ var result = {};
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+ __setModuleDefault(result, mod);
+ return result;
+ };
+ })();
  var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
  function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
  return new (P || (P = Promise))(function (resolve, reject) {
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.3.0-rc.1",
+ "version": "1.3.0-rc.2",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.3.0-rc.1",
- "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.1",
- "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.1",
- "@fugood/node-llama-linux-arm64": "1.3.0-rc.1",
- "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.1",
- "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.1",
- "@fugood/node-llama-win32-x64": "1.3.0-rc.1",
- "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.1",
- "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.1",
- "@fugood/node-llama-win32-arm64": "1.3.0-rc.1",
- "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.1",
- "@fugood/node-llama-darwin-x64": "1.3.0-rc.1",
- "@fugood/node-llama-darwin-arm64": "1.3.0-rc.1"
+ "@fugood/node-llama-linux-x64": "1.3.0-rc.2",
+ "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.2",
+ "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.2",
+ "@fugood/node-llama-linux-arm64": "1.3.0-rc.2",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.2",
+ "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.2",
+ "@fugood/node-llama-win32-x64": "1.3.0-rc.2",
+ "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.2",
+ "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.2",
+ "@fugood/node-llama-win32-arm64": "1.3.0-rc.2",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.2",
+ "@fugood/node-llama-darwin-x64": "1.3.0-rc.2",
+ "@fugood/node-llama-darwin-arm64": "1.3.0-rc.2"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
  return result;
  }

- static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
- auto has_min = min_value != std::numeric_limits<int>::min();
- auto has_max = max_value != std::numeric_limits<int>::max();
+ static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+ auto has_min = min_value != std::numeric_limits<int64_t>::min();
+ auto has_max = max_value != std::numeric_limits<int64_t>::max();

  auto digit_range = [&](char from, char to) {
  out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  if (has_min) {
  if (min_value < 0) {
  out << "\"-\" (";
- _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+ _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
  out << ") | [0] | [1-9] ";
  more_digits(0, decimals_left - 1);
  } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  }
  digit_range(c, c);
  out << " (";
- _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+ _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
  out << ")";
  if (c < '9') {
  out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
  _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
  } else {
  out << "\"-\" (";
- _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+ _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
  out << ")";
  }
  return;
@@ -925,17 +925,17 @@ public:
  int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
  return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
  } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
- int min_value = std::numeric_limits<int>::min();
- int max_value = std::numeric_limits<int>::max();
+ int64_t min_value = std::numeric_limits<int64_t>::min();
+ int64_t max_value = std::numeric_limits<int64_t>::max();
  if (schema.contains("minimum")) {
- min_value = schema["minimum"].get<int>();
+ min_value = schema["minimum"].get<int64_t>();
  } else if (schema.contains("exclusiveMinimum")) {
- min_value = schema["exclusiveMinimum"].get<int>() + 1;
+ min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
  }
  if (schema.contains("maximum")) {
- max_value = schema["maximum"].get<int>();
+ max_value = schema["maximum"].get<int64_t>();
  } else if (schema.contains("exclusiveMaximum")) {
- max_value = schema["exclusiveMaximum"].get<int>() - 1;
+ max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
  }
  std::stringstream out;
  out << "(";
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
  GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);

  GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
- size_t n_threads, size_t n_devices,
- ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
+ size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);

  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
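
This is a breaking signature change for the RPC server entry point: the `free_mem`/`total_mem` parameters are gone, and callers now pass only the device list. A hypothetical call-site sketch under that assumption (header names, the device-enumeration helpers, and all values here are placeholders from ggml's public backend API, not code from this package):

```cpp
#include <vector>
#include "ggml-backend.h"
#include "ggml-rpc.h"

int main() {
    // Collect all registered backend devices to expose over RPC.
    std::vector<ggml_backend_dev_t> devices;
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        devices.push_back(ggml_backend_dev_get(i));
    }
    // free_mem/total_mem are no longer passed with the new signature.
    ggml_backend_rpc_start_server("0.0.0.0:50052", /* cache_dir */ nullptr,
                                  /* n_threads */ 4, devices.size(), devices.data());
    return 0;
}
```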
@@ -485,8 +485,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
  int32_t start = ith * task_per_thread;
  int32_t end = std::min((ith + 1) * task_per_thread, task_count);
  for (int32_t compute_idx = start; compute_idx < end; compute_idx++) {
- int32_t gemm_idx = compute_idx / block_size_m;
- int32_t m_idx = compute_idx % block_size_m * block_size_m;
+ int32_t gemm_idx = compute_idx / per_gemm_block_count_m;
+ int32_t block_idx_in_gemm = compute_idx % per_gemm_block_count_m;
+ int32_t m_idx = block_idx_in_gemm * block_size_m;
  const qnbitgemm_spacemit_ime_args & data = qnbitgemm_args[gemm_idx];
  int32_t rows_tobe_handled = (gemm_m - m_idx) > block_size_m ? block_size_m : (gemm_m - m_idx);

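
The fix decomposes the flat task index by the number of row blocks per GEMM rather than by the block size, which only coincided when the two happened to be equal. A standalone sketch of the corrected index math (variable names mirror the diff; the sizes are made up for illustration):

```cpp
#include <cstdint>
#include <iostream>

int main() {
    // Illustrative sizes: each GEMM has 4 row blocks of 8 rows each.
    const int32_t per_gemm_block_count_m = 4;
    const int32_t block_size_m           = 8;

    for (int32_t compute_idx = 0; compute_idx < 8; compute_idx++) {
        // The old code divided/modded by block_size_m, which is only correct
        // when block_size_m == per_gemm_block_count_m.
        int32_t gemm_idx          = compute_idx / per_gemm_block_count_m;
        int32_t block_idx_in_gemm = compute_idx % per_gemm_block_count_m;
        int32_t m_idx             = block_idx_in_gemm * block_size_m;
        std::cout << "task " << compute_idx << " -> gemm " << gemm_idx
                  << ", row offset " << m_idx << "\n";
    }
}
```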
@@ -421,11 +421,8 @@ struct llama_model::impl {
  llama_mlocks mlock_bufs;
  llama_mlocks mlock_mmaps;

- // contexts where the model tensors metadata is stored
- std::vector<ggml_context_ptr> ctxs;
-
- // the model memory buffers for the tensor data
- std::vector<ggml_backend_buffer_ptr> bufs;
+ // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
+ std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

  buft_list_t cpu_buft_list;
  std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2182,7 +2179,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  max_n_tensors += n_layer*2; // duplicated rope freq tensors
  const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

- std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+ struct ggml_backend_buft_comparator {
+ bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+ return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+ }
+ };
+ std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
  auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  auto it = ctx_map.find(buft);
  if (it == ctx_map.end()) {
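
The custom comparator makes map iteration order depend on the buffer-type name rather than on raw pointer values, so the order in which contexts and buffers are created is reproducible across runs. A minimal, self-contained sketch of the same pattern (the types and names here are simplified stand-ins, not the ggml API):

```cpp
#include <iostream>
#include <map>
#include <string>

// Stand-in for a backend buffer type that is normally identified by pointer.
struct buft { std::string name; };

// Order keys by their name so that map iteration is deterministic,
// instead of relying on pointer ordering, which varies between runs.
struct buft_name_less {
    bool operator()(const buft * lhs, const buft * rhs) const {
        return lhs->name < rhs->name;
    }
};

int main() {
    buft cpu{"CPU"}, gpu{"CUDA0"};
    std::map<const buft *, int, buft_name_less> ctx_map;
    ctx_map[&gpu] = 1;
    ctx_map[&cpu] = 2;
    for (const auto & [key, value] : ctx_map) {
        std::cout << key->name << " -> " << value << "\n"; // CPU first, then CUDA0
    }
}
```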
@@ -2197,12 +2201,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  throw std::runtime_error(format("failed to create ggml context"));
  }

- ctx_map[buft] = ctx;
- pimpl->ctxs.emplace_back(ctx);
+ ctx_map.emplace(buft, ctx);

  return ctx;
  }
- return it->second;
+ return it->second.get();
  };

  const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
@@ -6037,16 +6040,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  pimpl->mappings.reserve(ml.mappings.size());

  // create the backend buffers
- std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
- ctx_bufs.reserve(ctx_map.size());
+ std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+ ctx_buf_maps.reserve(ctx_map.size());

  // Ensure we have enough capacity for the maximum backend buffer we will potentially create
  const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
- pimpl->bufs.reserve(n_max_backend_buffer);
+ pimpl->ctxs_bufs.reserve(n_max_backend_buffer);

- for (auto & it : ctx_map) {
- ggml_backend_buffer_type_t buft = it.first;
- ggml_context * ctx = it.second;
+ for (auto & [buft, ctx_ptr] : ctx_map) {
+ ggml_context * ctx = ctx_ptr.get();

  // skip contexts without tensors
  if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6070,6 +6072,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);

+ ggml_backend_buffer_t buf = nullptr;
  if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6082,20 +6085,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  continue;
  }
  const size_t max_size = ggml_get_max_tensor_size(ctx);
- ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+ buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  if (buf == nullptr) {
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  }
- pimpl->bufs.emplace_back(buf);
  buf_map.emplace(idx, buf);
  }
  }
  else {
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  if (buf == nullptr) {
  throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  }
- pimpl->bufs.emplace_back(buf);
  if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  pimpl->mlock_bufs.emplace_back(new llama_mlock);
  auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6106,10 +6107,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  buf_map.emplace(idx, buf);
  }
  }
-
- if (pimpl->bufs.empty()) {
- throw std::runtime_error("failed to allocate buffer");
- }
+ pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);

  for (auto & buf : buf_map) {
  // indicate that this buffer contains weights
@@ -6117,7 +6115,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  }

- ctx_bufs.emplace_back(ctx, buf_map);
+ ctx_buf_maps.emplace_back(ctx, buf_map);
  }

  if (llama_supports_gpu_offload()) {
@@ -6135,22 +6133,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  }

  // print memory requirements per buffer type
- for (auto & buf : pimpl->bufs) {
+ for (auto & [_, buf] : pimpl->ctxs_bufs) {
  LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  }

  // populate tensors_by_name
- for (auto & ctx : pimpl->ctxs) {
+ for (auto & [ctx, _] : pimpl->ctxs_bufs) {
  for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  }
  }

  // load tensor data
- for (auto & it : ctx_bufs) {
- ggml_context * ctx = it.first;
- auto & bufs = it.second;
- if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+ for (auto & [ctx, buf_map] : ctx_buf_maps) {
+ if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  return false;
  }
  }
@@ -6190,8 +6186,8 @@ size_t llama_model::n_devices() const {

  std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
  std::map<ggml_backend_buffer_type_t, size_t> ret;
- for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
- ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+ for (const auto & [_, buf] : pimpl->ctxs_bufs) {
+ ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
  }
  return ret;
  }
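
The recurring theme of these hunks is that each metadata context and its backend buffer now live together in one `ctxs_bufs` vector of pairs, which the rewritten loops read back with C++17 structured bindings. A minimal sketch of that access pattern under simplified types (these are stand-ins, not the llama.cpp ones):

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

int main() {
    // Stand-ins for the (ggml_context_ptr, ggml_backend_buffer_ptr) pairs.
    std::vector<std::pair<std::unique_ptr<std::string>, std::unique_ptr<size_t>>> ctxs_bufs;
    ctxs_bufs.emplace_back(std::make_unique<std::string>("cpu ctx"), std::make_unique<size_t>(512));
    ctxs_bufs.emplace_back(std::make_unique<std::string>("gpu ctx"), std::make_unique<size_t>(4096));

    // Iterate both halves of each pair with structured bindings, as the
    // rewritten loops do with [ctx, buf] / [_, buf]; keeping them paired
    // means a context can never outlive or lose track of its buffer.
    for (const auto & [ctx, buf] : ctxs_bufs) {
        std::cout << *ctx << " -> " << *buf << " MiB\n";
    }
}
```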