@fugood/llama.node 1.4.4 → 1.4.6
- package/lib/binding.ts +49 -0
- package/lib/index.js +13 -0
- package/lib/index.ts +13 -0
- package/package.json +15 -15
- package/src/LlamaContext.cpp +69 -0
- package/src/LlamaContext.h +3 -0
package/lib/binding.ts
CHANGED
@@ -309,6 +309,45 @@ export type BackendDeviceInfo = {
   metadata?: Record<string, any>
 }
 
+export type BenchResult = {
+  /** Maximum KV cache size */
+  nKvMax: number
+  /** Batch size */
+  nBatch: number
+  /** Micro-batch size */
+  nUBatch: number
+  /** Flash attention type (0=disabled, 1=enabled, 2=auto) */
+  flashAttn: number
+  /** Whether prompt processing is shared */
+  isPpShared: boolean
+  /** Number of GPU layers */
+  nGpuLayers: number
+  /** Number of threads */
+  nThreads: number
+  /** Number of threads for batch processing */
+  nThreadsBatch: number
+  /** Prompt processing tokens count */
+  pp: number
+  /** Text generation tokens count */
+  tg: number
+  /** Parallel level */
+  pl: number
+  /** KV cache used */
+  nKv: number
+  /** Time for prompt processing (ms) */
+  tPp: number
+  /** Speed of prompt processing (tokens/sec) */
+  speedPp: number
+  /** Time for text generation (ms) */
+  tTg: number
+  /** Speed of text generation (tokens/sec) */
+  speedTg: number
+  /** Total time (ms) */
+  t: number
+  /** Overall speed (tokens/sec) */
+  speed: number
+}
+
 export type ModelInfo = {
   desc: string
   nEmbd: number
@@ -573,6 +612,16 @@ export interface LlamaContext {
    */
   clearCache(clearData?: boolean): void
 
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results
+   */
+  bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
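For context, a minimal usage sketch of the new bench() API (the model path, the loadModel option shape, and the parameter values are illustrative assumptions, not taken from this diff):

// Hedged sketch: exercising the new bench() method added in 1.4.6.
// The model path and the { model: ... } option name are assumptions.
import { loadModel } from '@fugood/llama.node'

const ctx = await loadModel({ model: './model.gguf' })

// pp=512 prompt tokens, tg=128 generated tokens, pl=1 sequence, nr=3 repetitions
const { speedPp, speedTg, t } = await ctx.bench(512, 128, 1, 3)
console.log(`pp ${speedPp.toFixed(1)} t/s, tg ${speedTg.toFixed(1)} t/s, total ${t.toFixed(0)} ms`)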
package/lib/index.js
CHANGED
@@ -204,6 +204,19 @@ class LlamaContextWrapper {
   clearCache(clearData) {
     this.ctx.clearCache(clearData);
   }
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results including timing and speed metrics
+   */
+  bench(pp, tg, pl, nr) {
+    return __awaiter(this, void 0, void 0, function* () {
+      return this.ctx.bench(pp, tg, pl, nr);
+    });
+  }
 }
 const loadModel = (options, onProgress) => __awaiter(void 0, void 0, void 0, function* () {
   var _a, _b;
package/lib/index.ts
CHANGED
@@ -16,6 +16,7 @@ import type {
   JinjaFormattedChatResult,
   Tool,
   GGUFModelInfo,
+  BenchResult,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
 import { LlamaParallelAPI } from './parallel'
@@ -309,6 +310,18 @@ class LlamaContextWrapper {
   clearCache(clearData?: boolean): void {
     this.ctx.clearCache(clearData)
   }
+
+  /**
+   * Run a benchmark to measure model performance
+   * @param pp Number of tokens to process for prompt processing benchmark
+   * @param tg Number of tokens to generate for text generation benchmark
+   * @param pl Parallel level (number of sequences)
+   * @param nr Number of repetitions
+   * @returns Benchmark results including timing and speed metrics
+   */
+  async bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult> {
+    return this.ctx.bench(pp, tg, pl, nr)
+  }
 }
 
 export const loadModel = async (
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.4.4",
+  "version": "1.4.6",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,20 +72,20 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-darwin-arm64": "1.4.4",
-    "@fugood/node-llama-darwin-x64": "1.4.4",
-    "@fugood/node-llama-linux-arm64": "1.4.4",
-    "@fugood/node-llama-linux-arm64-cuda": "1.4.4",
-    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.4",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.4.4",
-    "@fugood/node-llama-linux-x64": "1.4.4",
-    "@fugood/node-llama-linux-x64-cuda": "1.4.4",
-    "@fugood/node-llama-linux-x64-vulkan": "1.4.4",
-    "@fugood/node-llama-win32-arm64": "1.4.4",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.4.4",
-    "@fugood/node-llama-win32-x64": "1.4.4",
-    "@fugood/node-llama-win32-x64-cuda": "1.4.4",
-    "@fugood/node-llama-win32-x64-vulkan": "1.4.4"
+    "@fugood/node-llama-darwin-arm64": "1.4.6",
+    "@fugood/node-llama-darwin-x64": "1.4.6",
+    "@fugood/node-llama-linux-arm64": "1.4.6",
+    "@fugood/node-llama-linux-arm64-cuda": "1.4.6",
+    "@fugood/node-llama-linux-arm64-snapdragon": "1.4.6",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.4.6",
+    "@fugood/node-llama-linux-x64": "1.4.6",
+    "@fugood/node-llama-linux-x64-cuda": "1.4.6",
+    "@fugood/node-llama-linux-x64-vulkan": "1.4.6",
+    "@fugood/node-llama-win32-arm64": "1.4.6",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.4.6",
+    "@fugood/node-llama-win32-x64": "1.4.6",
+    "@fugood/node-llama-win32-x64-cuda": "1.4.6",
+    "@fugood/node-llama-win32-x64-vulkan": "1.4.6"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp
CHANGED
@@ -203,6 +203,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
           static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::ClearCache>(
           "clearCache",
+          static_cast<napi_property_attributes>(napi_enumerable)),
+      InstanceMethod<&LlamaContext::Bench>(
+          "bench",
           static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -1529,3 +1532,69 @@ void LlamaContext::ClearCache(const Napi::CallbackInfo &info) {
 
   _rn_ctx->clearCache(clear_data);
 }
+
+// bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>
+Napi::Value LlamaContext::Bench(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+
+  if (info.Length() < 4) {
+    Napi::TypeError::New(env, "Expected 4 arguments: pp, tg, pl, nr")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx) {
+    Napi::TypeError::New(env, "Context is disposed").ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  if (!_rn_ctx->completion) {
+    Napi::TypeError::New(env, "Completion context not initialized")
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  int pp = info[0].ToNumber().Int32Value();
+  int tg = info[1].ToNumber().Int32Value();
+  int pl = info[2].ToNumber().Int32Value();
+  int nr = info[3].ToNumber().Int32Value();
+
+  std::string result;
+  try {
+    result = _rn_ctx->completion->bench(pp, tg, pl, nr);
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+
+  // Parse the JSON result and return as object
+  try {
+    auto parsed = json::parse(result);
+    Napi::Object benchResult = Napi::Object::New(env);
+
+    benchResult.Set("nKvMax", Napi::Number::New(env, parsed["n_kv_max"].get<int>()));
+    benchResult.Set("nBatch", Napi::Number::New(env, parsed["n_batch"].get<int>()));
+    benchResult.Set("nUBatch", Napi::Number::New(env, parsed["n_ubatch"].get<int>()));
+    benchResult.Set("flashAttn", Napi::Number::New(env, parsed["flash_attn"].get<int>()));
+    benchResult.Set("isPpShared", Napi::Boolean::New(env, parsed["is_pp_shared"].get<int>() != 0));
+    benchResult.Set("nGpuLayers", Napi::Number::New(env, parsed["n_gpu_layers"].get<int>()));
+    benchResult.Set("nThreads", Napi::Number::New(env, parsed["n_threads"].get<int>()));
+    benchResult.Set("nThreadsBatch", Napi::Number::New(env, parsed["n_threads_batch"].get<int>()));
+    benchResult.Set("pp", Napi::Number::New(env, parsed["pp"].get<int>()));
+    benchResult.Set("tg", Napi::Number::New(env, parsed["tg"].get<int>()));
+    benchResult.Set("pl", Napi::Number::New(env, parsed["pl"].get<int>()));
+    benchResult.Set("nKv", Napi::Number::New(env, parsed["n_kv"].get<int>()));
+    benchResult.Set("tPp", Napi::Number::New(env, parsed["t_pp"].get<double>()));
+    benchResult.Set("speedPp", Napi::Number::New(env, parsed["speed_pp"].get<double>()));
+    benchResult.Set("tTg", Napi::Number::New(env, parsed["t_tg"].get<double>()));
+    benchResult.Set("speedTg", Napi::Number::New(env, parsed["speed_tg"].get<double>()));
+    benchResult.Set("t", Napi::Number::New(env, parsed["t"].get<double>()));
+    benchResult.Set("speed", Napi::Number::New(env, parsed["speed"].get<double>()));
+
+    return benchResult;
+  } catch (const std::exception &e) {
+    Napi::Error::New(env, std::string("Failed to parse benchmark result: ") + e.what())
+        .ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+}
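The native Bench method above parses the snake_case JSON produced by the underlying bench call and re-keys it into the camelCase BenchResult shape. A TypeScript sketch of that same mapping (key names are taken from the parsing code above; the helper itself is an illustration, not part of the package):

// Sketch: the snake_case -> camelCase conversion LlamaContext::Bench performs.
type RawBench = Record<string, number>

const toBenchResult = (raw: RawBench) => ({
  nKvMax: raw.n_kv_max,
  nBatch: raw.n_batch,
  nUBatch: raw.n_ubatch,
  flashAttn: raw.flash_attn,
  isPpShared: raw.is_pp_shared !== 0, // native code coerces the int to a boolean
  nGpuLayers: raw.n_gpu_layers,
  nThreads: raw.n_threads,
  nThreadsBatch: raw.n_threads_batch,
  pp: raw.pp,
  tg: raw.tg,
  pl: raw.pl,
  nKv: raw.n_kv,
  tPp: raw.t_pp,
  speedPp: raw.speed_pp,
  tTg: raw.t_tg,
  speedTg: raw.speed_tg,
  t: raw.t,
  speed: raw.speed,
})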
package/src/LlamaContext.h
CHANGED