@fugood/llama.node 1.3.0-rc.1 → 1.3.0-rc.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -4
- package/lib/binding.js +36 -8
- package/lib/binding.ts +18 -0
- package/lib/index.js +4 -55
- package/lib/index.ts +3 -61
- package/lib/parallel.js +3 -1
- package/lib/parallel.ts +9 -1
- package/lib/utils.js +56 -0
- package/lib/utils.ts +63 -0
- package/package.json +14 -14
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +12 -12
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- package/src/llama.cpp/src/llama-model.cpp +28 -32
package/README.md
CHANGED
@@ -37,11 +37,9 @@ import { loadModel } from '@fugood/llama.node'
 // Initial a Llama context with the model (may take a while)
 const context = await loadModel({
   model: 'path/to/gguf/model',
-  use_mlock: true,
   n_ctx: 2048,
-  n_gpu_layers: …
-  // …
-  // lib_variant: 'opencl', // Change backend
+  n_gpu_layers: 99, // > 0: enable GPU
+  // lib_variant: 'vulkan', // Change backend
 })

 // Do completion

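For context, a minimal sketch of the updated quick-start flow. The model path is a placeholder, and the completion call is assumed from the `completion(options, callback)` wrapper surface shown later in this diff, not copied from the README:

```ts
import { loadModel } from '@fugood/llama.node'

const main = async () => {
  const context = await loadModel({
    model: 'path/to/gguf/model', // placeholder path
    n_ctx: 2048,
    n_gpu_layers: 99, // > 0: enable GPU
    // lib_variant: 'vulkan', // Change backend
  })

  // Assumed shape: completion() accepts chat-style messages and resolves
  // with the generated result.
  const result = await context.completion({
    messages: [{ role: 'user', content: 'Hello!' }],
  })
  console.log(result)
}

main()
```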
package/lib/binding.js
CHANGED
@@ -15,13 +15,23 @@ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (
 }) : function(o, v) {
     o["default"] = v;
 });
-var __importStar = (this && this.__importStar) || function (…
-…
-…
-…
-…
-…
-};
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
 var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
     return new (P || (P = Promise))(function (resolve, reject) {
@@ -32,7 +42,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.loadModule = void 0;
+exports.isLibVariantAvailable = exports.loadModule = void 0;
 const getPlatformPackageName = (variant) => {
     const platform = process.platform;
     const arch = process.arch;
@@ -62,3 +72,21 @@ const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
     return (yield Promise.resolve().then(() => __importStar(require('../build/Release/index.node'))));
 });
 exports.loadModule = loadModule;
+const isLibVariantAvailable = (variant) => __awaiter(void 0, void 0, void 0, function* () {
+    if (variant && variant !== 'default') {
+        const module = yield loadPlatformPackage(getPlatformPackageName(variant));
+        return module != null;
+    }
+    const defaultModule = yield loadPlatformPackage(getPlatformPackageName());
+    if (defaultModule)
+        return true;
+    try {
+        // @ts-ignore
+        yield Promise.resolve().then(() => __importStar(require('../build/Release/index.node')));
+        return true;
+    }
+    catch (error) {
+        return false;
+    }
+});
+exports.isLibVariantAvailable = isLibVariantAvailable;

package/lib/binding.ts
CHANGED
@@ -587,3 +587,21 @@ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
   // @ts-ignore
   return (await import('../build/Release/index.node')) as Module
 }
+
+export const isLibVariantAvailable = async (variant?: LibVariant): Promise<boolean> => {
+  if (variant && variant !== 'default') {
+    const module = await loadPlatformPackage(getPlatformPackageName(variant))
+    return module != null
+  }
+
+  const defaultModule = await loadPlatformPackage(getPlatformPackageName())
+  if (defaultModule) return true
+
+  try {
+    // @ts-ignore
+    await import('../build/Release/index.node')
+    return true
+  } catch (error) {
+    return false
+  }
+}

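A rough sketch of how the new export could be used to probe for an accelerated backend before loading a model. It assumes `isLibVariantAvailable` and `loadModel` are both reachable from the package root, and the model path and the 'vulkan' choice are illustrative:

```ts
import { loadModel, isLibVariantAvailable } from '@fugood/llama.node'

const pickVariant = async () => {
  // Resolves to true when the matching platform package (or the local build
  // for the default variant) can be loaded on this machine.
  if (await isLibVariantAvailable('vulkan')) return 'vulkan'
  return 'default'
}

const main = async () => {
  const lib_variant = await pickVariant()
  const context = await loadModel({
    model: 'path/to/gguf/model', // placeholder path
    n_ctx: 2048,
    lib_variant,
  })
  console.log('loaded with lib_variant =', lib_variant)
}

main()
```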
package/lib/index.js
CHANGED
@@ -23,14 +23,14 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.getBackendDevicesInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.…
+exports.BuildInfo = exports.getBackendDevicesInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
 const parallel_1 = require("./parallel");
 Object.defineProperty(exports, "LlamaParallelAPI", { enumerable: true, get: function () { return parallel_1.LlamaParallelAPI; } });
+const utils_1 = require("./utils");
 __exportStar(require("./binding"), exports);
-exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
 const logListeners = [];
 const logCallback = (level, text) => {
@@ -83,60 +83,9 @@ class LlamaContextWrapper {
     isLlamaChatSupported() {
         return !!this.ctx.getModelInfo().chatTemplates.llamaChat;
     }
-    _formatMediaChat(messages) {
-        if (!messages)
-            return {
-                messages,
-                has_media: false,
-            };
-        const mediaPaths = [];
-        return {
-            messages: messages.map((msg) => {
-                if (Array.isArray(msg.content)) {
-                    const content = msg.content.map((part) => {
-                        var _a;
-                        // Handle multimodal content
-                        if (part.type === 'image_url') {
-                            let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || '';
-                            mediaPaths.push(path);
-                            return {
-                                type: 'text',
-                                text: exports.MTMD_DEFAULT_MEDIA_MARKER,
-                            };
-                        }
-                        else if (part.type === 'input_audio') {
-                            const { input_audio: audio } = part;
-                            if (!audio)
-                                throw new Error('input_audio is required');
-                            const { format } = audio;
-                            if (format != 'wav' && format != 'mp3') {
-                                throw new Error(`Unsupported audio format: ${format}`);
-                            }
-                            if (audio.url) {
-                                const path = audio.url.replace(/file:\/\//, '');
-                                mediaPaths.push(path);
-                            }
-                            else if (audio.data) {
-                                mediaPaths.push(audio.data);
-                            }
-                            return {
-                                type: 'text',
-                                text: exports.MTMD_DEFAULT_MEDIA_MARKER,
-                            };
-                        }
-                        return part;
-                    });
-                    return Object.assign(Object.assign({}, msg), { content });
-                }
-                return msg;
-            }),
-            has_media: mediaPaths.length > 0,
-            media_paths: mediaPaths,
-        };
-    }
     getFormattedChat(messages, template, params) {
         var _a;
-        const { messages: chat, has_media, media_paths, } = …
+        const { messages: chat, has_media, media_paths, } = (0, utils_1.formatMediaChat)(messages);
         const useJinja = this.isJinjaSupported() && (params === null || params === void 0 ? void 0 : params.jinja);
         let tmpl;
         if (template)
@@ -170,7 +119,7 @@ class LlamaContextWrapper {
         media_paths }, jinjaResult);
     }
     completion(options, callback) {
-        const { messages, media_paths = options.media_paths } = …
+        const { messages, media_paths = options.media_paths } = (0, utils_1.formatMediaChat)(options.messages);
         return this.ctx.completion(Object.assign(Object.assign({}, options), { messages, media_paths: options.media_paths || media_paths }), callback || (() => { }));
     }
     stopCompletion() {

package/lib/index.ts
CHANGED
@@ -19,12 +19,11 @@ import type {
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 import { LlamaParallelAPI } from './parallel'
+import { formatMediaChat } from './utils'

 export * from './binding'
 export { LlamaParallelAPI }

-export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
-
 export interface LlamaModelOptionsExtended extends LlamaModelOptions {
   lib_variant?: LibVariant
 }
@@ -104,63 +103,6 @@ class LlamaContextWrapper {
     return !!this.ctx.getModelInfo().chatTemplates.llamaChat
   }

-  _formatMediaChat(messages: ChatMessage[] | undefined): {
-    messages: ChatMessage[] | undefined
-    has_media: boolean
-    media_paths?: string[]
-  } {
-    if (!messages)
-      return {
-        messages,
-        has_media: false,
-      }
-    const mediaPaths: string[] = []
-    return {
-      messages: messages.map((msg) => {
-        if (Array.isArray(msg.content)) {
-          const content = msg.content.map((part) => {
-            // Handle multimodal content
-            if (part.type === 'image_url') {
-              let path = part.image_url?.url || ''
-              mediaPaths.push(path)
-              return {
-                type: 'text',
-                text: MTMD_DEFAULT_MEDIA_MARKER,
-              }
-            } else if (part.type === 'input_audio') {
-              const { input_audio: audio } = part
-              if (!audio) throw new Error('input_audio is required')
-
-              const { format } = audio
-              if (format != 'wav' && format != 'mp3') {
-                throw new Error(`Unsupported audio format: ${format}`)
-              }
-              if (audio.url) {
-                const path = audio.url.replace(/file:\/\//, '')
-                mediaPaths.push(path)
-              } else if (audio.data) {
-                mediaPaths.push(audio.data)
-              }
-              return {
-                type: 'text',
-                text: MTMD_DEFAULT_MEDIA_MARKER,
-              }
-            }
-            return part
-          })
-
-          return {
-            ...msg,
-            content,
-          }
-        }
-        return msg
-      }),
-      has_media: mediaPaths.length > 0,
-      media_paths: mediaPaths,
-    }
-  }
-
   getFormattedChat(
     messages: ChatMessage[],
     template?: string,
@@ -180,7 +122,7 @@ class LlamaContextWrapper {
       messages: chat,
       has_media,
       media_paths,
-    } = …
+    } = formatMediaChat(messages)

     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
@@ -228,7 +170,7 @@ class LlamaContextWrapper {
     callback?: (token: LlamaCompletionToken) => void,
   ): Promise<LlamaCompletionResult> {
     const { messages, media_paths = options.media_paths } =
-      …
+      formatMediaChat(options.messages)
     return this.ctx.completion(
       {
         ...options,

package/lib/parallel.js
CHANGED
@@ -10,6 +10,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 };
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.LlamaParallelAPI = void 0;
+const utils_1 = require("./utils");
 class LlamaParallelAPI {
     constructor(context) {
         this.enabled = false;
@@ -87,8 +88,9 @@ class LlamaParallelAPI {
                 }
             }
             : undefined;
+        const { messages, media_paths = options.media_paths } = (0, utils_1.formatMediaChat)(options.messages);
         // Queue the completion immediately (this is synchronous!)
-        const { requestId } = this.context.queueCompletion(options, tokenCallback ||
+        const { requestId } = this.context.queueCompletion(Object.assign(Object.assign({}, options), { messages, media_paths: media_paths }), tokenCallback ||
         ((error, result) => {
             if (error) {
                 const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);

package/lib/parallel.ts
CHANGED
@@ -5,6 +5,7 @@ import type {
   LlamaCompletionToken,
   RerankParams,
 } from './binding'
+import { formatMediaChat } from './utils'

 export class LlamaParallelAPI {
   private context: LlamaContext
@@ -109,9 +110,16 @@ export class LlamaParallelAPI {
         }
       : undefined

+    const { messages, media_paths = options.media_paths } = formatMediaChat(
+      options.messages,
+    )
     // Queue the completion immediately (this is synchronous!)
     const { requestId } = this.context.queueCompletion(
-      …
+      {
+        ...options,
+        messages,
+        media_paths: media_paths,
+      },
      tokenCallback ||
        ((error, result) => {
          if (error) {

package/lib/utils.js
ADDED
@@ -0,0 +1,56 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.formatMediaChat = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
+const formatMediaChat = (messages) => {
+    if (!messages)
+        return {
+            messages,
+            has_media: false,
+        };
+    const mediaPaths = [];
+    return {
+        messages: messages.map((msg) => {
+            if (Array.isArray(msg.content)) {
+                const content = msg.content.map((part) => {
+                    var _a;
+                    // Handle multimodal content
+                    if (part.type === 'image_url') {
+                        let path = ((_a = part.image_url) === null || _a === void 0 ? void 0 : _a.url) || '';
+                        mediaPaths.push(path);
+                        return {
+                            type: 'text',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
+                        };
+                    }
+                    else if (part.type === 'input_audio') {
+                        const { input_audio: audio } = part;
+                        if (!audio)
+                            throw new Error('input_audio is required');
+                        const { format } = audio;
+                        if (format != 'wav' && format != 'mp3') {
+                            throw new Error(`Unsupported audio format: ${format}`);
+                        }
+                        if (audio.url) {
+                            const path = audio.url.replace(/file:\/\//, '');
+                            mediaPaths.push(path);
+                        }
+                        else if (audio.data) {
+                            mediaPaths.push(audio.data);
+                        }
+                        return {
+                            type: 'text',
+                            text: exports.MTMD_DEFAULT_MEDIA_MARKER,
+                        };
+                    }
+                    return part;
+                });
+                return Object.assign(Object.assign({}, msg), { content });
+            }
+            return msg;
+        }),
+        has_media: mediaPaths.length > 0,
+        media_paths: mediaPaths,
+    };
+};
+exports.formatMediaChat = formatMediaChat;

package/lib/utils.ts
ADDED
@@ -0,0 +1,63 @@
+
+import type {
+  ChatMessage,
+} from './binding'
+
+export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
+
+export const formatMediaChat = (messages: ChatMessage[] | undefined): {
+  messages: ChatMessage[] | undefined
+  has_media: boolean
+  media_paths?: string[]
+} => {
+  if (!messages)
+    return {
+      messages,
+      has_media: false,
+    }
+  const mediaPaths: string[] = []
+  return {
+    messages: messages.map((msg) => {
+      if (Array.isArray(msg.content)) {
+        const content = msg.content.map((part) => {
+          // Handle multimodal content
+          if (part.type === 'image_url') {
+            let path = part.image_url?.url || ''
+            mediaPaths.push(path)
+            return {
+              type: 'text',
+              text: MTMD_DEFAULT_MEDIA_MARKER,
+            }
+          } else if (part.type === 'input_audio') {
+            const { input_audio: audio } = part
+            if (!audio) throw new Error('input_audio is required')
+
+            const { format } = audio
+            if (format != 'wav' && format != 'mp3') {
+              throw new Error(`Unsupported audio format: ${format}`)
+            }
+            if (audio.url) {
+              const path = audio.url.replace(/file:\/\//, '')
+              mediaPaths.push(path)
+            } else if (audio.data) {
+              mediaPaths.push(audio.data)
+            }
+            return {
+              type: 'text',
+              text: MTMD_DEFAULT_MEDIA_MARKER,
+            }
+          }
+          return part
+        })
+
+        return {
+          ...msg,
+          content,
+        }
+      }
+      return msg
+    }),
+    has_media: mediaPaths.length > 0,
+    media_paths: mediaPaths,
+  }
+}

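For reference, a small sketch of what the new helper does to a multimodal message. The file path is hypothetical, and the relative import assumes code living next to the package's lib/ sources, since `formatMediaChat` is not re-exported from the package root in this diff:

```ts
import { formatMediaChat } from './utils'

const { messages, has_media, media_paths } = formatMediaChat([
  {
    role: 'user',
    content: [
      { type: 'text', text: 'Describe this image.' },
      { type: 'image_url', image_url: { url: 'path/to/photo.png' } }, // hypothetical path
    ],
  },
])

// The image part is rewritten to a text part containing the '<__media__>'
// marker, and its path is collected separately, so the native side receives
// plain text plus a media_paths list.
console.log(has_media) // true
console.log(media_paths) // ['path/to/photo.png']
console.log(messages?.[0]) // content[1] is now { type: 'text', text: '<__media__>' }
```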
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.0-rc.1",
+  "version": "1.3.0-rc.4",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-arm64": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-x64": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-arm64": "1.3.0-rc.1",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.1",
-    "@fugood/node-llama-darwin-x64": "1.3.0-rc.1",
-    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.1"
+    "@fugood/node-llama-linux-x64": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-arm64": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-x64": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-arm64": "1.3.0-rc.4",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.4",
+    "@fugood/node-llama-darwin-x64": "1.3.0-rc.4",
+    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.4"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",

package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }

-static void _build_min_max_int(…
-    auto has_min = min_value != std::numeric_limits<…
-    auto has_max = max_value != std::numeric_limits<…
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();

     auto digit_range = [&](char from, char to) {
         out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
     if (has_min) {
         if (min_value < 0) {
             out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<…
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
             out << ") | [0] | [1-9] ";
             more_digits(0, decimals_left - 1);
         } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             }
             digit_range(c, c);
             out << " (";
-            _build_min_max_int(std::…
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
             out << ")";
             if (c < '9') {
                 out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
         } else {
             out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<…
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
             out << ")";
         }
         return;
@@ -925,17 +925,17 @@ public:
             int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
             return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
         } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            …
-            …
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
             if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<…
+                min_value = schema["minimum"].get<int64_t>();
             } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<…
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
             }
             if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<…
+                max_value = schema["maximum"].get<int64_t>();
             } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<…
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
             }
             std::stringstream out;
             out << "(";

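These hunks widen the grammar builder's JSON-schema integer bounds from `int` to `int64_t`. As a rough illustration (hypothetical input, not taken from this diff), a schema whose bound does not fit in 32 bits can now be represented without overflow:

```ts
// Illustrative schema only: with 32-bit bounds a maximum like 2^32 could not
// be represented; with the int64_t widening above it can.
const timestampSchema = {
  type: 'integer',
  minimum: 0,
  maximum: 4294967296, // 2^32
}
```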
package/src/llama.cpp/ggml/include/ggml-rpc.h
CHANGED
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);

 GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
-                                                    size_t n_threads, size_t n_devices,
-                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
+                                                    size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);

package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp
CHANGED
@@ -485,8 +485,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
         int32_t start = ith * task_per_thread;
         int32_t end = std::min((ith + 1) * task_per_thread, task_count);
         for (int32_t compute_idx = start; compute_idx < end; compute_idx++) {
-            int32_t gemm_idx = compute_idx / …
-            int32_t …
+            int32_t gemm_idx = compute_idx / per_gemm_block_count_m;
+            int32_t block_idx_in_gemm = compute_idx % per_gemm_block_count_m;
+            int32_t m_idx = block_idx_in_gemm * block_size_m;
             const qnbitgemm_spacemit_ime_args & data = qnbitgemm_args[gemm_idx];
             int32_t rows_tobe_handled = (gemm_m - m_idx) > block_size_m ? block_size_m : (gemm_m - m_idx);

package/src/llama.cpp/src/llama-model.cpp
CHANGED
@@ -421,11 +421,8 @@ struct llama_model::impl {
     llama_mlocks mlock_bufs;
     llama_mlocks mlock_mmaps;

-    // contexts where the model tensors metadata is stored
-    std::vector<ggml_context_ptr…
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;

     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2182,7 +2179,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     max_n_tensors += n_layer*2; // duplicated rope freq tensors
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;

-    …
+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -2197,12 +2201,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error(format("failed to create ggml context"));
         }

-            ctx_map…
-            pimpl->ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);

             return ctx;
         }
-        return it->second;
+        return it->second.get();
     };

     const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
@@ -6037,16 +6040,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     pimpl->mappings.reserve(ml.mappings.size());

     // create the backend buffers
-    std::vector<std::pair<ggml_context *, llama_buf_map>>…
-    …
+    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+    ctx_buf_maps.reserve(ctx_map.size());

     // Ensure we have enough capacity for the maximum backend buffer we will potentially create
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
-    pimpl->…
+    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);

-    for (auto &…
-    …
-        ggml_context * ctx = it.second;
+    for (auto & [buft, ctx_ptr] : ctx_map) {
+        ggml_context * ctx = ctx_ptr.get();

         // skip contexts without tensors
         if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6070,6 +6072,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);

+        ggml_backend_buffer_t buf = nullptr;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6082,20 +6085,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     continue;
                 }
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                …
+                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
-                pimpl->bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
         }
         else {
-            …
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
-            pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6106,10 +6107,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 buf_map.emplace(idx, buf);
             }
         }
-
-        if (pimpl->bufs.empty()) {
-            throw std::runtime_error("failed to allocate buffer");
-        }
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);

         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
@@ -6117,7 +6115,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }

-        …
+        ctx_buf_maps.emplace_back(ctx, buf_map);
     }

     if (llama_supports_gpu_offload()) {
@@ -6135,22 +6133,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }

     // print memory requirements per buffer type
-    for (auto & buf : pimpl->…
+    for (auto & [_, buf] : pimpl->ctxs_bufs) {
         LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }

     // populate tensors_by_name
-    for (auto & ctx : pimpl->…
+    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
         for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
             tensors_by_name.emplace_back(ggml_get_name(cur), cur);
         }
     }

     // load tensor data
-    for (auto &…
-    …
-        auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+    for (auto & [ctx, buf_map] : ctx_buf_maps) {
+        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
             return false;
         }
     }
@@ -6190,8 +6186,8 @@ size_t llama_model::n_devices() const {

 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const…
-        ret[ggml_backend_buffer_get_type(…
+    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
     }
     return ret;
 }