@fugood/llama.node 1.2.5 → 1.3.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -0
- package/lib/binding.ts +96 -1
- package/lib/index.js +4 -2
- package/lib/index.ts +4 -1
- package/lib/parallel.js +214 -0
- package/lib/parallel.ts +273 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +34 -1
- package/src/LlamaContext.h +16 -0
- package/src/common.hpp +4 -3
- package/src/llama.cpp/common/arg.cpp +1 -1
- package/src/llama.cpp/ggml/include/ggml.h +44 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +16 -3
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +32 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +1 -1
- package/src/llama.cpp/src/llama-arch.cpp +5 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +74 -43
- package/src/llama.cpp/src/llama-graph.h +7 -3
- package/src/llama.cpp/src/llama-model.cpp +8 -7
- package/src/llama.cpp/src/llama-quant.cpp +7 -1
- package/src/llama.cpp/src/llama.cpp +4 -0
package/CMakeLists.txt
CHANGED
package/lib/binding.ts
CHANGED
@@ -25,6 +25,12 @@ export type LlamaModelOptions = {
   n_ctx?: number
   n_batch?: number
   n_ubatch?: number
+  /**
+   * Number of parallel sequences to support (sets n_seq_max).
+   * This determines the maximum number of parallel slots that can be used.
+   * Default: 8
+   */
+  n_parallel?: number
   n_threads?: number
   n_gpu_layers?: number
   flash_attn_type?: 'auto' | 'on' | 'off'
@@ -157,6 +163,36 @@ export type LlamaCompletionOptions = {
   n_probs?: number
 }
 
+/**
+ * Parameters for parallel completion requests (queueCompletion).
+ * Extends LlamaCompletionOptions with parallel-mode specific options.
+ */
+export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
+  /**
+   * File path to load session state from before processing.
+   * This allows you to resume from a previously saved completion state.
+   * Use with `save_state_path` to enable conversation continuity across requests.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  load_state_path?: string
+
+  /**
+   * File path to save session state to after completion.
+   * The session state will be saved to this file path when the completion finishes.
+   * You can then pass this path to `load_state_path` in a subsequent request to resume.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  save_state_path?: string
+
+  /**
+   * Number of tokens to save when saving session state.
+   * If not specified or <= 0, all tokens will be saved.
+   * Use this to limit the size of saved session files.
+   * Example: `512` to save only the last 512 tokens
+   */
+  save_state_size?: number
+}
+
 export type TokenProbability = {
   tok_str: string
   prob: number
@@ -271,7 +307,7 @@ export type JinjaFormattedChatResult = {
   prompt: string
   chat_format: number
   grammar: string
-
+  grammar_lazy: boolean
   grammar_triggers: Array<{
     type: number
     value: string
@@ -404,6 +440,65 @@ export interface LlamaContext {
    */
   decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
 
+  // Parallel decoding methods
+
+  /**
+   * Enable parallel decoding mode
+   * @param params Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  enableParallelMode(params: { n_parallel?: number, n_batch?: number }): boolean
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disableParallelMode(): void
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options with parallel-specific state management
+   * @param callback Optional token callback
+   * @returns Object with requestId
+   */
+  queueCompletion(
+    options: LlamaParallelCompletionOptions,
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueEmbedding(
+    text: string,
+    params?: { embd_normalize?: number },
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueRerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Cancel a queued request
+   * @param requestId Request ID to cancel
+   */
+  cancelRequest(requestId: number): void
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
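Taken together, the additions to `LlamaContext` form the low-level surface of the new parallel mode: enable it once, then queue completions (or embeddings/reranks) and correlate results by `requestId`. A minimal sketch of that flow, assuming `ctx` is a live context obtained elsewhere (e.g. from `loadModel`); the prompt text and session path are purely illustrative:

```ts
import type { LlamaContext, LlamaParallelCompletionOptions } from '@fugood/llama.node'

// Assumed: a live context obtained elsewhere; not constructed here.
declare const ctx: LlamaContext

if (ctx.enableParallelMode({ n_parallel: 4, n_batch: 512 })) {
  const options: LlamaParallelCompletionOptions = {
    prompt: 'Hello',
    save_state_path: '/tmp/session.bin', // hypothetical path; resume later via load_state_path
  }
  const { requestId } = ctx.queueCompletion(options, (error, result) => {
    if (error) console.error(error)
    // A payload carrying text/content is the final result for this request.
    else if (result?.text !== undefined) console.log('final:', result.text)
  })
  // A queued request can be aborted before it finishes:
  // ctx.cancelRequest(requestId)
}
```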
package/lib/index.js
CHANGED
@@ -23,10 +23,12 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
+const parallel_1 = require("./parallel");
+Object.defineProperty(exports, "LlamaParallelAPI", { enumerable: true, get: function () { return parallel_1.LlamaParallelAPI; } });
 __exportStar(require("./binding"), exports);
 exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
@@ -66,6 +68,7 @@ const getJsonSchema = (responseFormat) => {
 class LlamaContextWrapper {
     constructor(nativeCtx) {
         this.ctx = nativeCtx;
+        this.parallel = new parallel_1.LlamaParallelAPI(nativeCtx);
     }
     getSystemInfo() {
         return this.ctx.getSystemInfo();
@@ -138,7 +141,6 @@ class LlamaContextWrapper {
         let tmpl;
         if (template)
             tmpl = template; // Force replace if provided
-        const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
         const result = this.ctx.getFormattedChat(chat, tmpl, {
             jinja: useJinja,
             response_format: params === null || params === void 0 ? void 0 : params.response_format,
package/lib/index.ts
CHANGED
@@ -18,8 +18,10 @@ import type {
   GGUFModelInfo,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
+import { LlamaParallelAPI } from './parallel'
 
 export * from './binding'
+export { LlamaParallelAPI }
 
 export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
 
@@ -78,9 +80,11 @@ export type FormattedChatResult = {
 
 class LlamaContextWrapper {
   ctx: LlamaContext
+  parallel: LlamaParallelAPI
 
   constructor(nativeCtx: LlamaContext) {
     this.ctx = nativeCtx
+    this.parallel = new LlamaParallelAPI(nativeCtx)
   }
 
   getSystemInfo(): string {
@@ -181,7 +185,6 @@ class LlamaContextWrapper {
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
     if (template) tmpl = template // Force replace if provided
-    const jsonSchema = getJsonSchema(params?.response_format)
 
     const result = this.ctx.getFormattedChat(chat!, tmpl, {
       jinja: useJinja,
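With these wrapper changes, every context created through the package carries a ready-made `parallel` facade, so callers don't need to construct `LlamaParallelAPI` by hand. A short sketch, assuming `loadModel` resolves to the wrapper shown above and accepts `LlamaModelOptions`; the model path is hypothetical:

```ts
import { loadModel } from '@fugood/llama.node'

// Model path is hypothetical; n_parallel is the new LlamaModelOptions field.
const llama = await loadModel({ model: './model.gguf', n_parallel: 4 })
await llama.parallel.enable({ n_parallel: 4, n_batch: 512 })
console.log(llama.parallel.isEnabled()) // true if the native call succeeded
llama.parallel.disable()
```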
package/lib/parallel.js
ADDED
@@ -0,0 +1,214 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.LlamaParallelAPI = void 0;
+class LlamaParallelAPI {
+    constructor(context) {
+        this.enabled = false;
+        this.pendingRequests = new Map();
+        this.context = context;
+    }
+    /**
+     * Enable parallel decoding mode
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    enable(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const defaultConfig = { n_parallel: 2, n_batch: 512 };
+            const result = this.context.enableParallelMode(Object.assign(Object.assign({}, defaultConfig), config));
+            this.enabled = result;
+            return result;
+        });
+    }
+    /**
+     * Disable parallel decoding mode
+     */
+    disable() {
+        this.context.disableParallelMode();
+        this.enabled = false;
+    }
+    /**
+     * Configure parallel decoding mode (enables if not already enabled)
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    configure(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            return this.enable(config);
+        });
+    }
+    /**
+     * Queue a completion request for parallel processing
+     * @param options Completion options
+     * @param onToken Optional callback for each token
+     * @returns Object with requestId, promise for result, and stop function
+     */
+    completion(options, onToken) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            const tokenCallback = onToken
+                ? (error, result) => {
+                    if (error) {
+                        console.error('Token callback error:', error);
+                        // Handle completion error
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                        return;
+                    }
+                    // Check if this is a token callback or final result
+                    if (result) {
+                        if (result.token !== undefined) {
+                            // This is a token callback
+                            onToken(result.requestId, result);
+                        }
+                        else if (result.text !== undefined ||
+                            result.content !== undefined) {
+                            // This is the final result
+                            const pendingReq = this.pendingRequests.get(result.requestId);
+                            if (pendingReq) {
+                                pendingReq.resolve(result);
+                                this.pendingRequests.delete(result.requestId);
+                            }
+                        }
+                    }
+                }
+                : undefined;
+            // Queue the completion immediately (this is synchronous!)
+            const { requestId } = this.context.queueCompletion(options, tokenCallback ||
+                ((error, result) => {
+                    if (error) {
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                    }
+                    else if (result &&
+                        (result.text !== undefined || result.content !== undefined)) {
+                        // Final result for non-streaming
+                        const pendingReq = this.pendingRequests.get(result.requestId);
+                        if (pendingReq) {
+                            pendingReq.resolve(result);
+                            this.pendingRequests.delete(result.requestId);
+                        }
+                    }
+                }));
+            // Create promise for final result
+            const promise = new Promise((resolveResult, rejectResult) => {
+                this.pendingRequests.set(requestId, {
+                    resolve: resolveResult,
+                    reject: rejectResult,
+                });
+            });
+            // Create stop function
+            const stop = () => {
+                this.context.cancelRequest(requestId);
+                const pendingReq = this.pendingRequests.get(requestId);
+                if (pendingReq) {
+                    pendingReq.reject(new Error('Request cancelled'));
+                    this.pendingRequests.delete(requestId);
+                }
+            };
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+                stop,
+            };
+        });
+    }
+    /**
+     * Queue an embedding request for parallel processing
+     * @param text Text to embed
+     * @param params Optional embedding parameters
+     * @returns Object with requestId and promise for result
+     */
+    embedding(text, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the embedding immediately (this is synchronous!)
+            const { requestId } = this.context.queueEmbedding(text, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    resolveResult(result);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Queue a rerank request for parallel processing
+     * @param query Query text
+     * @param documents Documents to rank
+     * @param params Optional rerank parameters
+     * @returns Object with requestId and promise for results
+     */
+    rerank(query, documents, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the rerank immediately (this is synchronous!)
+            const { requestId } = this.context.queueRerank(query, documents, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    // Add document text to results and sort by score
+                    const enrichedResults = result.results
+                        .map((r) => (Object.assign(Object.assign({}, r), { document: documents[r.index] })))
+                        .sort((a, b) => b.score - a.score);
+                    resolveResult(enrichedResults);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Check if parallel mode is enabled
+     */
+    isEnabled() {
+        return this.enabled;
+    }
+}
+exports.LlamaParallelAPI = LlamaParallelAPI;
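The class above wires each queued request through `pendingRequests`: the native callback fires repeatedly with token payloads and once with a final `text`/`content` payload, and `completion()` hands back the request id, a promise for that final payload, and a `stop()` canceller. A streaming sketch of driving it (`api` is assumed to be a `LlamaParallelAPI` on which `enable()` already succeeded; the prompt is illustrative):

```ts
// Streamed completion with a timeout-based cancel.
const { requestId, promise, stop } = await api.completion(
  { prompt: 'Write a haiku about batching.' },
  (id, data) => process.stdout.write(data.token ?? ''), // per-token stream
)
console.log('queued request', requestId)

// Abort if still running after 5s; the promise then rejects with 'Request cancelled'.
const timer = setTimeout(stop, 5000)
try {
  const result = await promise
  console.log('\nfinal:', result.text ?? result.content)
} finally {
  clearTimeout(timer)
}
```

One design point visible in the source: `queueCompletion` is invoked before the promise is registered in `pendingRequests`, so the implementation relies on the native side delivering the final result asynchronously.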
package/lib/parallel.ts
ADDED
@@ -0,0 +1,273 @@
+// Parallel decoding API implementation for llama.node
+import type {
+  LlamaContext,
+  LlamaCompletionOptions,
+  LlamaCompletionToken,
+  RerankParams,
+} from './binding'
+
+export class LlamaParallelAPI {
+  private context: LlamaContext
+  private enabled: boolean = false
+  private pendingRequests = new Map<
+    number,
+    {
+      resolve: (value: any) => void
+      reject: (reason?: any) => void
+    }
+  >()
+
+  constructor(context: LlamaContext) {
+    this.context = context
+  }
+
+  /**
+   * Enable parallel decoding mode
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async enable(config?: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    const defaultConfig = { n_parallel: 2, n_batch: 512 }
+    const result = this.context.enableParallelMode({
+      ...defaultConfig,
+      ...config,
+    })
+    this.enabled = result
+    return result
+  }
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disable(): void {
+    this.context.disableParallelMode()
+    this.enabled = false
+  }
+
+  /**
+   * Configure parallel decoding mode (enables if not already enabled)
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async configure(config: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    return this.enable(config)
+  }
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options
+   * @param onToken Optional callback for each token
+   * @returns Object with requestId, promise for result, and stop function
+   */
+  async completion(
+    options: LlamaCompletionOptions,
+    onToken?: (requestId: number, data: LlamaCompletionToken) => void,
+  ): Promise<{
+    requestId: number
+    promise: Promise<any>
+    stop: () => void
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    const tokenCallback = onToken
+      ? (error: any, result: any) => {
+          if (error) {
+            console.error('Token callback error:', error)
+            // Handle completion error
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+            return
+          }
+          // Check if this is a token callback or final result
+          if (result) {
+            if (result.token !== undefined) {
+              // This is a token callback
+              onToken(result.requestId, result)
+            } else if (
+              result.text !== undefined ||
+              result.content !== undefined
+            ) {
+              // This is the final result
+              const pendingReq = this.pendingRequests.get(result.requestId)
+              if (pendingReq) {
+                pendingReq.resolve(result)
+                this.pendingRequests.delete(result.requestId)
+              }
+            }
+          }
+        }
+      : undefined
+
+    // Queue the completion immediately (this is synchronous!)
+    const { requestId } = this.context.queueCompletion(
+      options,
+      tokenCallback ||
+        ((error, result) => {
+          if (error) {
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+          } else if (
+            result &&
+            (result.text !== undefined || result.content !== undefined)
+          ) {
+            // Final result for non-streaming
+            const pendingReq = this.pendingRequests.get(result.requestId)
+            if (pendingReq) {
+              pendingReq.resolve(result)
+              this.pendingRequests.delete(result.requestId)
+            }
+          }
+        }),
+    )
+
+    // Create promise for final result
+    const promise = new Promise((resolveResult, rejectResult) => {
+      this.pendingRequests.set(requestId, {
+        resolve: resolveResult,
+        reject: rejectResult,
+      })
+    })
+
+    // Create stop function
+    const stop = () => {
+      this.context.cancelRequest(requestId)
+      const pendingReq = this.pendingRequests.get(requestId)
+      if (pendingReq) {
+        pendingReq.reject(new Error('Request cancelled'))
+        this.pendingRequests.delete(requestId)
+      }
+    }
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+      stop,
+    }
+  }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @returns Object with requestId and promise for result
+   */
+  async embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<{
+    requestId: number
+    promise: Promise<{ embedding: number[] }>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<{ embedding: number[] }>((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the embedding immediately (this is synchronous!)
+    const { requestId } = this.context.queueEmbedding(
+      text,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          resolveResult(result)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @returns Object with requestId and promise for results
+   */
+  async rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<{
+    requestId: number
+    promise: Promise<Array<{ score: number; index: number; document: string }>>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<
+      Array<{ score: number; index: number; document: string }>
+    >((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the rerank immediately (this is synchronous!)
+    const { requestId } = this.context.queueRerank(
+      query,
+      documents,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          // Add document text to results and sort by score
+          const enrichedResults = result.results
+            .map((r: any) => ({
+              ...r,
+              document: documents[r.index],
+            }))
+            .sort((a: any, b: any) => b.score - a.score)
+          resolveResult(enrichedResults)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Check if parallel mode is enabled
+   */
+  isEnabled(): boolean {
+    return this.enabled
+  }
+}
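Since `LlamaParallelAPI` is exported from the package entry point (see the index.ts change above), the embedding and rerank queues can also be driven directly; both resolve through single-shot callbacks, and rerank results come back enriched with the document text and sorted by descending score. A sketch under those assumptions (`ctx` is an assumed live `LlamaContext`; the texts are illustrative):

```ts
import { LlamaParallelAPI, type LlamaContext } from '@fugood/llama.node'

declare const ctx: LlamaContext // assumed: obtained from loadModel()/initLlama()

const api = new LlamaParallelAPI(ctx)
await api.enable({ n_parallel: 4, n_batch: 512 })

const docs = ['llamas graze on grass', 'GPUs batch matmuls', 'the sky is blue']
const [emb, ranked] = await Promise.all([
  api.embedding('grazing animals').then((r) => r.promise),
  api.rerank('what do llamas eat?', docs).then((r) => r.promise),
])
console.log(emb.embedding.length) // embedding dimension
console.log(ranked[0].document)   // highest-scoring document first

api.disable()
```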