@fugood/llama.node 1.3.0-rc.1 → 1.3.0-rc.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -37,11 +37,9 @@ import { loadModel } from '@fugood/llama.node'
 // Initial a Llama context with the model (may take a while)
 const context = await loadModel({
   model: 'path/to/gguf/model',
-  use_mlock: true,
   n_ctx: 2048,
-  n_gpu_layers: 1, // > 0: enable GPU
-  // embedding: true, // use embedding
-  // lib_variant: 'opencl', // Change backend
+  n_gpu_layers: 99, // > 0: enable GPU
+  // lib_variant: 'vulkan', // Change backend
 })
 
 // Do completion
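For reference, the README snippet above stops just before the completion call. A minimal continuation, assuming a plain text prompt (the message content and result handling are illustrative, not part of the README):

// Continuing the README example; the exact fields of the returned
// LlamaCompletionResult come from lib/binding.ts and are not shown in this diff.
const result = await context.completion({
  messages: [{ role: 'user', content: 'Hello!' }],
})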
package/lib/binding.js CHANGED
@@ -15,13 +15,23 @@ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (
 }) : function(o, v) {
     o["default"] = v;
 });
-var __importStar = (this && this.__importStar) || function (mod) {
-    if (mod && mod.__esModule) return mod;
-    var result = {};
-    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
-    __setModuleDefault(result, mod);
-    return result;
-};
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
 var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
     return new (P || (P = Promise))(function (resolve, reject) {
@@ -32,7 +42,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.loadModule = void 0;
+exports.isLibVariantAvailable = exports.loadModule = void 0;
 const getPlatformPackageName = (variant) => {
     const platform = process.platform;
     const arch = process.arch;
@@ -62,3 +72,21 @@ const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
     return (yield Promise.resolve().then(() => __importStar(require('../build/Release/index.node'))));
 });
 exports.loadModule = loadModule;
+const isLibVariantAvailable = (variant) => __awaiter(void 0, void 0, void 0, function* () {
+    if (variant && variant !== 'default') {
+        const module = yield loadPlatformPackage(getPlatformPackageName(variant));
+        return module != null;
+    }
+    const defaultModule = yield loadPlatformPackage(getPlatformPackageName());
+    if (defaultModule)
+        return true;
+    try {
+        // @ts-ignore
+        yield Promise.resolve().then(() => __importStar(require('../build/Release/index.node')));
+        return true;
+    }
+    catch (error) {
+        return false;
+    }
+});
+exports.isLibVariantAvailable = isLibVariantAvailable;
package/lib/binding.ts CHANGED
@@ -587,3 +587,21 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
   // @ts-ignore
   return (await import('../build/Release/index.node')) as Module
 }
+
+export const isLibVariantAvailable = async (variant?: LibVariant): Promise<boolean> => {
+  if (variant && variant !== 'default') {
+    const module = await loadPlatformPackage(getPlatformPackageName(variant))
+    return module != null
+  }
+
+  const defaultModule = await loadPlatformPackage(getPlatformPackageName())
+  if (defaultModule) return true
+
+  try {
+    // @ts-ignore
+    await import('../build/Release/index.node')
+    return true
+  } catch (error) {
+    return false
+  }
+}
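The new isLibVariantAvailable export lets a caller probe whether a backend variant's prebuilt native package (or the local build) can actually be loaded before choosing it. Since lib/index re-exports everything from './binding', it should be importable from the package root. A rough usage sketch with an illustrative fallback order (the variant names are assumed to mirror the optionalDependencies suffixes):

import { loadModel, isLibVariantAvailable } from '@fugood/llama.node'

// Prefer a GPU backend whose native package is installed on this platform,
// otherwise fall back to the default (CPU) build.
const pickVariant = async () => {
  if (await isLibVariantAvailable('cuda')) return 'cuda'
  if (await isLibVariantAvailable('vulkan')) return 'vulkan'
  return 'default'
}

const context = await loadModel({
  model: 'path/to/gguf/model',
  n_ctx: 2048,
  n_gpu_layers: 99,
  lib_variant: await pickVariant(),
})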
package/lib/index.js CHANGED
@@ -23,14 +23,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.getBackendDevicesInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
+exports.BuildInfo = exports.getBackendDevicesInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
 const parallel_1 = require("./parallel");
 Object.defineProperty(exports, "LlamaParallelAPI", { enumerable: true, get: function () { return parallel_1.LlamaParallelAPI; } });
+const utils_1 = require("./utils");
 __exportStar(require("./binding"), exports);
-exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
 const logListeners = [];
 const logCallback = (level, text) => {
@@ -83,60 +83,9 @@ class LlamaContextWrapper {
     isLlamaChatSupported() {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
-    _formatMediaChat(messages) {
-        if (!messages)
-            return {
-                messages,
-                has_media: false,
-            };
-        const mediaPaths = [];
-        return {
-            messages: messages.map((msg) => {
-                if (Array.isArray(msg.content)) {
-                    const content = msg.content.map((part) => {
-                        var _a;
-                        // Handle multimodal content
-                        if (part.type === 'image_url') {
-                            let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || '';
-                            mediaPaths.push(path);
-                            return {
-                                type: 'text',
-                                text: exports.MTMD_DEFAULT_MEDIA_MARKER,
-                            };
-                        }
-                        else if (part.type === 'input_audio') {
-                            const { input_audio: audio } = part;
-                            if (!audio)
-                                throw new Error('input_audio is required');
-                            const { format } = audio;
-                            if (format != 'wav' && format != 'mp3') {
-                                throw new Error(`Unsupported audio format: ${format}`);
-                            }
-                            if (audio.url) {
-                                const path = audio.url.replace(/file:\/\//, '');
-                                mediaPaths.push(path);
-                            }
-                            else if (audio.data) {
-                                mediaPaths.push(audio.data);
-                            }
-                            return {
-                                type: 'text',
-                                text: exports.MTMD_DEFAULT_MEDIA_MARKER,
-                            };
-                        }
-                        return part;
-                    });
-                    return Object.assign(Object.assign({}, msg), { content });
-                }
-                return msg;
-            }),
-            has_media: mediaPaths.length > 0,
-            media_paths: mediaPaths,
-        };
-    }
     getFormattedChat(messages, template, params) {
         var _a;
-        const { messages: chat, has_media, media_paths, } = this._formatMediaChat(messages);
+        const { messages: chat, has_media, media_paths, } = (0, utils_1.formatMediaChat)(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
         let tmpl;
         if (template)
@@ -170,7 +119,7 @@ class LlamaContextWrapper {
             media_paths }, jinjaResult);
     }
     completion(options, callback) {
-        const { messages, media_paths = options.media_paths } = this._formatMediaChat(options.messages);
+        const { messages, media_paths = options.media_paths } = (0, utils_1.formatMediaChat)(options.messages);
         return this.ctx.completion(Object.assign(Object.assign({}, options), { messages, media_paths: options.media_paths || media_paths }), callback || (() => { }));
     }
     stopCompletion() {
package/lib/index.ts CHANGED
@@ -19,12 +19,11 @@ import type {
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 import { LlamaParallelAPI } from './parallel'
+import { formatMediaChat } from './utils'
 
 export * from './binding'
 export { LlamaParallelAPI }
 
-export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
-
 export interface LlamaModelOptionsExtended extends LlamaModelOptions {
   lib_variant?: LibVariant
 }
@@ -104,63 +103,6 @@ class LlamaContextWrapper {
     return !!this.ctx.getModelInfo().chatTemplates.llamaChat
   }
 
-  _formatMediaChat(messages: ChatMessage[] | undefined): {
-    messages: ChatMessage[] | undefined
-    has_media: boolean
-    media_paths?: string[]
-  } {
-    if (!messages)
-      return {
-        messages,
-        has_media: false,
-      }
-    const mediaPaths: string[] = []
-    return {
-      messages: messages.map((msg) => {
-        if (Array.isArray(msg.content)) {
-          const content = msg.content.map((part) => {
-            // Handle multimodal content
-            if (part.type === 'image_url') {
-              let path = part.image_url?.url || ''
-              mediaPaths.push(path)
-              return {
-                type: 'text',
-                text: MTMD_DEFAULT_MEDIA_MARKER,
-              }
-            } else if (part.type === 'input_audio') {
-              const { input_audio: audio } = part
-              if (!audio) throw new Error('input_audio is required')
-
-              const { format } = audio
-              if (format != 'wav' && format != 'mp3') {
-                throw new Error(`Unsupported audio format: ${format}`)
-              }
-              if (audio.url) {
-                const path = audio.url.replace(/file:\/\//, '')
-                mediaPaths.push(path)
-              } else if (audio.data) {
-                mediaPaths.push(audio.data)
-              }
-              return {
-                type: 'text',
-                text: MTMD_DEFAULT_MEDIA_MARKER,
-              }
-            }
-            return part
-          })
-
-          return {
-            ...msg,
-            content,
-          }
-        }
-        return msg
-      }),
-      has_media: mediaPaths.length > 0,
-      media_paths: mediaPaths,
-    }
-  }
-
   getFormattedChat(
     messages: ChatMessage[],
     template?: string,
@@ -180,7 +122,7 @@ class LlamaContextWrapper {
       messages: chat,
      has_media,
      media_paths,
-    } = this._formatMediaChat(messages)
+    } = formatMediaChat(messages)
 
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
@@ -228,7 +170,7 @@ class LlamaContextWrapper {
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult> {
     const { messages, media_paths = options.media_paths } =
-      this._formatMediaChat(options.messages)
+      formatMediaChat(options.messages)
     return this.ctx.completion(
       {
         ...options,
package/lib/parallel.js CHANGED
@@ -10,6 +10,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.LlamaParallelAPI = void 0;
+const utils_1 = require("./utils");
 class LlamaParallelAPI {
     constructor(context) {
         this.enabled = false;
@@ -87,8 +88,9 @@ class LlamaParallelAPI {
                     }
                 }
                 : undefined;
+            const { messages, media_paths = options.media_paths } = (0, utils_1.formatMediaChat)(options.messages);
             // Queue the completion immediately (this is synchronous!)
-            const { requestId } = this.context.queueCompletion(options, tokenCallback ||
+            const { requestId } = this.context.queueCompletion(Object.assign(Object.assign({}, options), { messages, media_paths: media_paths }), tokenCallback ||
                 ((error, result) => {
                     if (error) {
                         const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
package/lib/parallel.ts CHANGED
@@ -5,6 +5,7 @@ import type {
   LlamaCompletionToken,
   RerankParams,
 } from './binding'
+import { formatMediaChat } from './utils'
 
 export class LlamaParallelAPI {
   private context: LlamaContext
@@ -109,9 +110,16 @@ export class LlamaParallelAPI {
          }
        : undefined

+    const { messages, media_paths = options.media_paths } = formatMediaChat(
+      options.messages,
+    )
     // Queue the completion immediately (this is synchronous!)
     const { requestId } = this.context.queueCompletion(
-      options,
+      {
+        ...options,
+        messages,
+        media_paths: media_paths,
+      },
       tokenCallback ||
         ((error, result) => {
           if (error) {
package/lib/utils.js ADDED
@@ -0,0 +1,56 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.formatMediaChat = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
+const formatMediaChat = (messages) => {
+    if (!messages)
+        return {
+            messages,
+            has_media: false,
+        };
+    const mediaPaths = [];
+    return {
+        messages: messages.map((msg) => {
+            if (Array.isArray(msg.content)) {
+                const content = msg.content.map((part) => {
+                    var _a;
+                    // Handle multimodal content
+                    if (part.type === 'image_url') {
+                        let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || '';
+                        mediaPaths.push(path);
+                        return {
+                            type: 'text',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
+                        };
+                    }
+                    else if (part.type === 'input_audio') {
+                        const { input_audio: audio } = part;
+                        if (!audio)
+                            throw new Error('input_audio is required');
+                        const { format } = audio;
+                        if (format != 'wav' && format != 'mp3') {
+                            throw new Error(`Unsupported audio format: ${format}`);
+                        }
+                        if (audio.url) {
+                            const path = audio.url.replace(/file:\/\//, '');
+                            mediaPaths.push(path);
+                        }
+                        else if (audio.data) {
+                            mediaPaths.push(audio.data);
+                        }
+                        return {
+                            type: 'text',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
+                        };
+                    }
+                    return part;
+                });
+                return Object.assign(Object.assign({}, msg), { content });
+            }
+            return msg;
+        }),
+        has_media: mediaPaths.length > 0,
+        media_paths: mediaPaths,
+    };
+};
+exports.formatMediaChat = formatMediaChat;
package/lib/utils.ts ADDED
@@ -0,0 +1,63 @@
+
+import type {
+  ChatMessage,
+} from './binding'
+
+export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
+
+export const formatMediaChat = (messages: ChatMessage[] | undefined): {
+  messages: ChatMessage[] | undefined
+  has_media: boolean
+  media_paths?: string[]
+} => {
+  if (!messages)
+    return {
+      messages,
+      has_media: false,
+    }
+  const mediaPaths: string[] = []
+  return {
+    messages: messages.map((msg) => {
+      if (Array.isArray(msg.content)) {
+        const content = msg.content.map((part) => {
+          // Handle multimodal content
+          if (part.type === 'image_url') {
+            let path = part.image_url?.url || ''
+            mediaPaths.push(path)
+            return {
+              type: 'text',
+              text: MTMD_DEFAULT_MEDIA_MARKER,
+            }
+          } else if (part.type === 'input_audio') {
+            const { input_audio: audio } = part
+            if (!audio) throw new Error('input_audio is required')
+
+            const { format } = audio
+            if (format != 'wav' && format != 'mp3') {
+              throw new Error(`Unsupported audio format: ${format}`)
+            }
+            if (audio.url) {
+              const path = audio.url.replace(/file:\/\//, '')
+              mediaPaths.push(path)
+            } else if (audio.data) {
+              mediaPaths.push(audio.data)
+            }
+            return {
+              type: 'text',
+              text: MTMD_DEFAULT_MEDIA_MARKER,
+            }
+          }
+          return part
+        })
+
+        return {
+          ...msg,
+          content,
+        }
+      }
+      return msg
+    }),
+    has_media: mediaPaths.length > 0,
+    media_paths: mediaPaths,
+  }
+}
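The new utils module centralizes the multimodal message rewriting that was previously a private _formatMediaChat method on LlamaContextWrapper, so the wrapper and the parallel API can share it. A small sketch of what it returns for a message carrying an image part (the ChatMessage shape is trimmed to the fields the function actually reads):

import { formatMediaChat, MTMD_DEFAULT_MEDIA_MARKER } from './utils'

const { messages, has_media, media_paths } = formatMediaChat([
  {
    role: 'user',
    content: [
      { type: 'text', text: 'Describe this image' },
      { type: 'image_url', image_url: { url: '/tmp/cat.png' } },
    ],
  },
])
// has_media === true and media_paths === ['/tmp/cat.png']; the image part is
// rewritten to a text part whose text is MTMD_DEFAULT_MEDIA_MARKER
// ('<__media__>'), presumably marking where the native side splices the media.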
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.0-rc.1",
+  "version": "1.3.0-rc.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-arm64": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-x64": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-arm64": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-darwin-x64": "1.3.0-rc.1",
-    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.1"
+    "@fugood/node-llama-linux-x64": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-arm64": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-x64": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-arm64": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-darwin-x64": "1.3.0-rc.4",
+    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();
 
     auto digit_range = [&](char from, char to) {
         out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
     if (has_min) {
         if (min_value < 0) {
            out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
            out << ") | [0] | [1-9] ";
            more_digits(0, decimals_left - 1);
        } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            }
            digit_range(c, c);
            out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
            out << ")";
            if (c < '9') {
                out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
            _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
        } else {
            out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
            out << ")";
        }
        return;
@@ -925,17 +925,17 @@ public:
            int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
            return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
        } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
            if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
            } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
            }
            if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
            } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
            }
            std::stringstream out;
            out << "(";
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
 
 GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
-                                                    size_t n_threads, size_t n_devices,
-                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
+                                                    size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
@@ -485,8 +485,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
     int32_t start = ith * task_per_thread;
     int32_t end = std::min((ith + 1) * task_per_thread, task_count);
     for (int32_t compute_idx = start; compute_idx < end; compute_idx++) {
-        int32_t gemm_idx = compute_idx / block_size_m;
-        int32_t m_idx = compute_idx % block_size_m * block_size_m;
+        int32_t gemm_idx = compute_idx / per_gemm_block_count_m;
+        int32_t block_idx_in_gemm = compute_idx % per_gemm_block_count_m;
+        int32_t m_idx = block_idx_in_gemm * block_size_m;
         const qnbitgemm_spacemit_ime_args & data = qnbitgemm_args[gemm_idx];
         int32_t rows_tobe_handled = (gemm_m - m_idx) > block_size_m ? block_size_m : (gemm_m - m_idx);
 
@@ -421,11 +421,8 @@ struct llama_model::impl {
     llama_mlocks mlock_bufs;
     llama_mlocks mlock_mmaps;
 
-    // contexts where the model tensors metadata is stored
-    std::vector<ggml_context_ptr> ctxs;
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2182,7 +2179,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     max_n_tensors += n_layer*2; // duplicated rope freq tensors
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -2197,12 +2201,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                throw std::runtime_error(format("failed to create ggml context"));
            }
 
-            ctx_map[buft] = ctx;
-            pimpl->ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);
 
            return ctx;
        }
-        return it->second;
+        return it->second.get();
    };
 
    const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
@@ -6037,16 +6040,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    pimpl->mappings.reserve(ml.mappings.size());
 
    // create the backend buffers
-    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
-    ctx_bufs.reserve(ctx_map.size());
+    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+    ctx_buf_maps.reserve(ctx_map.size());
 
    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
-    pimpl->bufs.reserve(n_max_backend_buffer);
+    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
 
-    for (auto & it : ctx_map) {
-        ggml_backend_buffer_type_t buft = it.first;
-        ggml_context * ctx = it.second;
+    for (auto & [buft, ctx_ptr] : ctx_map) {
+        ggml_context * ctx = ctx_ptr.get();
 
        // skip contexts without tensors
        if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6070,6 +6072,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
        bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
+        ggml_backend_buffer_t buf = nullptr;
        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6082,20 +6085,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    continue;
                }
                const size_t max_size = ggml_get_max_tensor_size(ctx);
-                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                if (buf == nullptr) {
                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                }
-                pimpl->bufs.emplace_back(buf);
                buf_map.emplace(idx, buf);
            }
        }
        else {
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (buf == nullptr) {
                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
            }
-            pimpl->bufs.emplace_back(buf);
            if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                pimpl->mlock_bufs.emplace_back(new llama_mlock);
                auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6106,10 +6107,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                buf_map.emplace(idx, buf);
            }
        }
-
-        if (pimpl->bufs.empty()) {
-            throw std::runtime_error("failed to allocate buffer");
-        }
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
 
        for (auto & buf : buf_map) {
            // indicate that this buffer contains weights
@@ -6117,7 +6115,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }
 
-        ctx_bufs.emplace_back(ctx, buf_map);
+        ctx_buf_maps.emplace_back(ctx, buf_map);
    }
 
    if (llama_supports_gpu_offload()) {
@@ -6135,22 +6133,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    }
 
    // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
+    for (auto & [_, buf] : pimpl->ctxs_bufs) {
        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
    }
 
    // populate tensors_by_name
-    for (auto & ctx : pimpl->ctxs) {
+    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
        for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
            tensors_by_name.emplace_back(ggml_get_name(cur), cur);
        }
    }
 
    // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+    for (auto & [ctx, buf_map] : ctx_buf_maps) {
+        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
            return false;
        }
    }
@@ -6190,8 +6186,8 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
-        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
     }
     return ret;
 }