@fugood/llama.node 1.2.6 → 1.3.0-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -124,6 +124,8 @@ include_directories(
   ${CMAKE_JS_INC}
   "src/llama.cpp"
   "src/llama.cpp/src"
+  "src/llama.cpp/ggml/include"
+  "src/llama.cpp/ggml/src"
   "src/tools/mtmd"
 )
 
@@ -137,6 +139,7 @@ file(
   "src/LlamaCompletionWorker.h"
   "src/LlamaContext.cpp"
   "src/LlamaContext.h"
+  "src/LlamaContext_parallel.cpp"
   "src/EmbeddingWorker.cpp"
   "src/EmbeddingWorker.h"
   "src/RerankWorker.cpp"
package/lib/binding.js CHANGED
@@ -15,23 +15,13 @@ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (
 }) : function(o, v) {
     o["default"] = v;
 });
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
+var __importStar = (this && this.__importStar) || function (mod) {
+    if (mod && mod.__esModule) return mod;
+    var result = {};
+    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
+    __setModuleDefault(result, mod);
+    return result;
+};
 var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
     function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
     return new (P || (P = Promise))(function (resolve, reject) {
package/lib/binding.ts CHANGED
@@ -25,6 +25,12 @@ export type LlamaModelOptions = {
   n_ctx?: number
   n_batch?: number
   n_ubatch?: number
+  /**
+   * Number of parallel sequences to support (sets n_seq_max).
+   * This determines the maximum number of parallel slots that can be used.
+   * Default: 8
+   */
+  n_parallel?: number
   n_threads?: number
   n_gpu_layers?: number
   flash_attn_type?: 'auto' | 'on' | 'off'
@@ -157,6 +163,36 @@ export type LlamaCompletionOptions = {
   n_probs?: number
 }
 
+/**
+ * Parameters for parallel completion requests (queueCompletion).
+ * Extends LlamaCompletionOptions with parallel-mode specific options.
+ */
+export type LlamaParallelCompletionOptions = LlamaCompletionOptions & {
+  /**
+   * File path to load session state from before processing.
+   * This allows you to resume from a previously saved completion state.
+   * Use with `save_state_path` to enable conversation continuity across requests.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  load_state_path?: string
+
+  /**
+   * File path to save session state to after completion.
+   * The session state will be saved to this file path when the completion finishes.
+   * You can then pass this path to `load_state_path` in a subsequent request to resume.
+   * Example: `'/path/to/session.bin'` or `'file:///path/to/session.bin'`
+   */
+  save_state_path?: string
+
+  /**
+   * Number of tokens to save when saving session state.
+   * If not specified or <= 0, all tokens will be saved.
+   * Use this to limit the size of saved session files.
+   * Example: `512` to save only the last 512 tokens
+   */
+  save_state_size?: number
+}
+
 export type TokenProbability = {
   tok_str: string
   prob: number
@@ -200,6 +236,36 @@ export type LlamaCompletionToken = {
   completion_probabilities?: CompletionProbability[]
 }
 
+/**
+ * Result from a parallel completion request (queueCompletion callback).
+ * Extends the basic completion result with per-slot timing information.
+ */
+export type LlamaParallelCompletionResult = {
+  requestId: number
+  text: string
+  reasoning_content?: string
+  content?: string
+  tool_calls?: ToolCall[]
+  chat_format: number
+  stopped_eos: boolean
+  stopped_limit: boolean
+  stopped_word: boolean
+  context_full: boolean
+  tokens_evaluated: number
+  tokens_predicted: number
+  timings: {
+    cache_n: number
+    prompt_n: number
+    prompt_ms: number
+    prompt_per_token_ms: number
+    prompt_per_second: number
+    predicted_n: number
+    predicted_ms: number
+    predicted_per_token_ms: number
+    predicted_per_second: number
+  }
+}
+
 export type TokenizeResult = {
   tokens: Int32Array
   has_media: boolean
@@ -221,6 +287,14 @@ export type RerankResult = {
   index: number
 }
 
+export type BackendDeviceInfo = {
+  backend: string
+  type: string
+  deviceName: string
+  maxMemorySize: number
+  metadata?: Record<string, any>
+}
+
 export type ModelInfo = {
   desc: string
   nEmbd: number
@@ -271,7 +345,7 @@ export type JinjaFormattedChatResult = {
   prompt: string
   chat_format: number
   grammar: string
-  grammea_lazy: boolean
+  grammar_lazy: boolean
   grammar_triggers: Array<{
     type: number
     value: string
@@ -404,12 +478,76 @@ export interface LlamaContext {
    */
  decodeAudioTokens(tokens: number[]|Int32Array): Promise<Float32Array>
 
+  // Parallel decoding methods
+
+  /**
+   * Enable parallel decoding mode
+   * @param params Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  enableParallelMode(params: { n_parallel?: number, n_batch?: number }): boolean
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disableParallelMode(): void
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options with parallel-specific state management
+   * @param callback Optional callback that receives tokens during generation and final result
+   * @returns Object with requestId
+   */
+  queueCompletion(
+    options: LlamaParallelCompletionOptions,
+    callback?: (error: any, result: LlamaParallelCompletionResult) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueEmbedding(
+    text: string,
+    params?: { embd_normalize?: number },
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @param callback Optional result callback
+   * @returns Object with requestId
+   */
+  queueRerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+    callback?: (error: any, result: any) => void,
+  ): { requestId: number }
+
+  /**
+   * Cancel a queued request
+   * @param requestId Request ID to cancel
+   */
+  cancelRequest(requestId: number): void
+
   // static
   loadModelInfo(path: string, skip: string[]): Promise<GGUFModelInfo>
   toggleNativeLog(
     enable: boolean,
     callback: (level: string, text: string) => void,
   ): void
+  /**
+   * Get information about available backend devices
+   * @returns Array of backend device information
+   */
+  getBackendDevicesInfo(): BackendDeviceInfo[]
 }
 
 export interface Module {
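The declarations above are the full parallel-decoding surface of the native addon. Below is a minimal TypeScript sketch (not taken from the package docs) of how it might be driven, assuming a native `LlamaContext` handle is already available, that `prompt` and `n_predict` exist on `LlamaCompletionOptions` (they are not shown in this diff), and that the file paths are placeholders.

```ts
import type {
  LlamaContext,
  LlamaParallelCompletionOptions,
  LlamaParallelCompletionResult,
} from '@fugood/llama.node'

function runParallel(ctx: LlamaContext) {
  // Turn on parallel decoding with 4 slots sharing a 512-token batch.
  if (!ctx.enableParallelMode({ n_parallel: 4, n_batch: 512 })) {
    throw new Error('parallel mode could not be enabled')
  }

  const options: LlamaParallelCompletionOptions = {
    // `prompt` / `n_predict` are assumed completion options, not shown in this diff.
    prompt: 'Write a haiku about GPUs.',
    n_predict: 64,
    // Persist and resume session state between requests (placeholder path).
    load_state_path: '/tmp/session.bin',
    save_state_path: '/tmp/session.bin',
    save_state_size: 512,
  }

  // queueCompletion returns synchronously with a request id; the callback
  // receives streamed tokens and, at the end, the final result object.
  const { requestId } = ctx.queueCompletion(
    options,
    (err, result: LlamaParallelCompletionResult) => {
      if (err) return console.error(err)
      if (result?.text !== undefined) {
        console.log(`request ${result.requestId} finished:`, result.text)
        ctx.disableParallelMode()
      }
    },
  )

  // A queued request can be aborted by id:
  // ctx.cancelRequest(requestId)
  console.log('queued request', requestId)
}
```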
package/lib/index.js CHANGED
@@ -23,10 +23,12 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = void 0;
+exports.BuildInfo = exports.getBackendDevicesInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
+const parallel_1 = require("./parallel");
+Object.defineProperty(exports, "LlamaParallelAPI", { enumerable: true, get: function () { return parallel_1.LlamaParallelAPI; } });
 __exportStar(require("./binding"), exports);
 exports.MTMD_DEFAULT_MEDIA_MARKER = '<__media__>';
 const mods = {};
@@ -66,6 +68,7 @@ const getJsonSchema = (responseFormat) => {
 class LlamaContextWrapper {
     constructor(nativeCtx) {
         this.ctx = nativeCtx;
+        this.parallel = new parallel_1.LlamaParallelAPI(nativeCtx);
     }
     getSystemInfo() {
         return this.ctx.getSystemInfo();
@@ -138,7 +141,6 @@ class LlamaContextWrapper {
         let tmpl;
         if (template)
             tmpl = template; // Force replace if provided
-        const jsonSchema = getJsonSchema(params === null || params === void 0 ? void 0 : params.response_format);
         const result = this.ctx.getFormattedChat(chat, tmpl, {
             jinja: useJinja,
             response_format: params === null || params === void 0 ? void 0 : params.response_format,
@@ -267,6 +269,14 @@ const loadLlamaModelInfo = (path) => __awaiter(void 0, void 0, void 0, function*
     return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip);
 });
 exports.loadLlamaModelInfo = loadLlamaModelInfo;
+const getBackendDevicesInfo = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* (variant = 'default') {
+    var _a;
+    (_a = mods[variant]) !== null && _a !== void 0 ? _a : (mods[variant] = yield (0, binding_1.loadModule)(variant));
+    refreshNativeLogSetup();
+    const jsonString = mods[variant].LlamaContext.getBackendDevicesInfo();
+    return JSON.parse(jsonString);
+});
+exports.getBackendDevicesInfo = getBackendDevicesInfo;
 exports.BuildInfo = {
     number: version_1.BUILD_NUMBER,
     commit: version_1.BUILD_COMMIT,
package/lib/index.ts CHANGED
@@ -18,8 +18,10 @@ import type {
   GGUFModelInfo,
 } from './binding'
 import { BUILD_NUMBER, BUILD_COMMIT } from './version'
+import { LlamaParallelAPI } from './parallel'
 
 export * from './binding'
+export { LlamaParallelAPI }
 
 export const MTMD_DEFAULT_MEDIA_MARKER = '<__media__>'
 
@@ -78,9 +80,11 @@ export type FormattedChatResult = {
 
 class LlamaContextWrapper {
   ctx: LlamaContext
+  parallel: LlamaParallelAPI
 
   constructor(nativeCtx: LlamaContext) {
     this.ctx = nativeCtx
+    this.parallel = new LlamaParallelAPI(nativeCtx)
   }
 
   getSystemInfo(): string {
@@ -181,7 +185,6 @@ class LlamaContextWrapper {
     const useJinja = this.isJinjaSupported() && params?.jinja
     let tmpl
     if (template) tmpl = template // Force replace if provided
-    const jsonSchema = getJsonSchema(params?.response_format)
 
     const result = this.ctx.getFormattedChat(chat!, tmpl, {
       jinja: useJinja,
@@ -382,6 +385,15 @@ export const loadLlamaModelInfo = async (
   return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip)
 }
 
+export const getBackendDevicesInfo = async (
+  variant: LibVariant = 'default'
+): Promise<import('./binding').BackendDeviceInfo[]> => {
+  mods[variant] ??= await loadModule(variant)
+  refreshNativeLogSetup()
+  const jsonString = mods[variant].LlamaContext.getBackendDevicesInfo()
+  return JSON.parse(jsonString as any)
+}
+
 export const BuildInfo = {
   number: BUILD_NUMBER,
   commit: BUILD_COMMIT,
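For the new `getBackendDevicesInfo` export above, a small usage sketch follows; the `'vulkan'` variant name is only an illustration, since this diff only shows `'default'`:

```ts
import { getBackendDevicesInfo } from '@fugood/llama.node'

const main = async () => {
  // Probe the default library variant for its backends and devices.
  const devices = await getBackendDevicesInfo()
  for (const d of devices) {
    console.log(`${d.backend} / ${d.type}: ${d.deviceName} (maxMemorySize: ${d.maxMemorySize})`)
  }
  // A specific prebuilt variant could be probed too, e.g.:
  // const vulkanDevices = await getBackendDevicesInfo('vulkan')
}

main().catch(console.error)
```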
@@ -0,0 +1,214 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.LlamaParallelAPI = void 0;
+class LlamaParallelAPI {
+    constructor(context) {
+        this.enabled = false;
+        this.pendingRequests = new Map();
+        this.context = context;
+    }
+    /**
+     * Enable parallel decoding mode
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    enable(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            const defaultConfig = { n_parallel: 2, n_batch: 512 };
+            const result = this.context.enableParallelMode(Object.assign(Object.assign({}, defaultConfig), config));
+            this.enabled = result;
+            return result;
+        });
+    }
+    /**
+     * Disable parallel decoding mode
+     */
+    disable() {
+        this.context.disableParallelMode();
+        this.enabled = false;
+    }
+    /**
+     * Configure parallel decoding mode (enables if not already enabled)
+     * @param config Configuration for parallel mode
+     * @returns boolean indicating if successful
+     */
+    configure(config) {
+        return __awaiter(this, void 0, void 0, function* () {
+            return this.enable(config);
+        });
+    }
+    /**
+     * Queue a completion request for parallel processing
+     * @param options Completion options
+     * @param onToken Optional callback for each token
+     * @returns Object with requestId, promise for result, and stop function
+     */
+    completion(options, onToken) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            const tokenCallback = onToken
+                ? (error, result) => {
+                    if (error) {
+                        console.error('Token callback error:', error);
+                        // Handle completion error
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                        return;
+                    }
+                    // Check if this is a token callback or final result
+                    if (result) {
+                        if (result.token !== undefined) {
+                            // This is a token callback
+                            onToken(result.requestId, result);
+                        }
+                        else if (result.text !== undefined ||
+                            result.content !== undefined) {
+                            // This is the final result
+                            const pendingReq = this.pendingRequests.get(result.requestId);
+                            if (pendingReq) {
+                                pendingReq.resolve(result);
+                                this.pendingRequests.delete(result.requestId);
+                            }
+                        }
+                    }
+                }
+                : undefined;
+            // Queue the completion immediately (this is synchronous!)
+            const { requestId } = this.context.queueCompletion(options, tokenCallback ||
+                ((error, result) => {
+                    if (error) {
+                        const pendingReq = this.pendingRequests.get(result === null || result === void 0 ? void 0 : result.requestId);
+                        if (pendingReq) {
+                            pendingReq.reject(error);
+                            this.pendingRequests.delete(result === null || result === void 0 ? void 0 : result.requestId);
+                        }
+                    }
+                    else if (result &&
+                        (result.text !== undefined || result.content !== undefined)) {
+                        // Final result for non-streaming
+                        const pendingReq = this.pendingRequests.get(result.requestId);
+                        if (pendingReq) {
+                            pendingReq.resolve(result);
+                            this.pendingRequests.delete(result.requestId);
+                        }
+                    }
+                }));
+            // Create promise for final result
+            const promise = new Promise((resolveResult, rejectResult) => {
+                this.pendingRequests.set(requestId, {
+                    resolve: resolveResult,
+                    reject: rejectResult,
+                });
+            });
+            // Create stop function
+            const stop = () => {
+                this.context.cancelRequest(requestId);
+                const pendingReq = this.pendingRequests.get(requestId);
+                if (pendingReq) {
+                    pendingReq.reject(new Error('Request cancelled'));
+                    this.pendingRequests.delete(requestId);
+                }
+            };
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+                stop,
+            };
+        });
+    }
+    /**
+     * Queue an embedding request for parallel processing
+     * @param text Text to embed
+     * @param params Optional embedding parameters
+     * @returns Object with requestId and promise for result
+     */
+    embedding(text, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the embedding immediately (this is synchronous!)
+            const { requestId } = this.context.queueEmbedding(text, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    resolveResult(result);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Queue a rerank request for parallel processing
+     * @param query Query text
+     * @param documents Documents to rank
+     * @param params Optional rerank parameters
+     * @returns Object with requestId and promise for results
+     */
+    rerank(query, documents, params) {
+        return __awaiter(this, void 0, void 0, function* () {
+            if (!this.enabled) {
+                throw new Error('Parallel mode is not enabled. Call enable() first.');
+            }
+            // Create promise for result
+            let resolveResult;
+            let rejectResult;
+            const promise = new Promise((res, rej) => {
+                resolveResult = res;
+                rejectResult = rej;
+            });
+            // Queue the rerank immediately (this is synchronous!)
+            const { requestId } = this.context.queueRerank(query, documents, params, (error, result) => {
+                if (error) {
+                    rejectResult(error);
+                }
+                else {
+                    // Add document text to results and sort by score
+                    const enrichedResults = result.results
+                        .map((r) => (Object.assign(Object.assign({}, r), { document: documents[r.index] })))
+                        .sort((a, b) => b.score - a.score);
+                    resolveResult(enrichedResults);
+                }
+            });
+            // Return immediately without wrapping in a Promise
+            return {
+                requestId,
+                promise,
+            };
+        });
+    }
+    /**
+     * Check if parallel mode is enabled
+     */
+    isEnabled() {
+        return this.enabled;
+    }
+}
+exports.LlamaParallelAPI = LlamaParallelAPI;
@@ -0,0 +1,273 @@
+// Parallel decoding API implementation for llama.node
+import type {
+  LlamaContext,
+  LlamaCompletionOptions,
+  LlamaCompletionToken,
+  RerankParams,
+} from './binding'
+
+export class LlamaParallelAPI {
+  private context: LlamaContext
+  private enabled: boolean = false
+  private pendingRequests = new Map<
+    number,
+    {
+      resolve: (value: any) => void
+      reject: (reason?: any) => void
+    }
+  >()
+
+  constructor(context: LlamaContext) {
+    this.context = context
+  }
+
+  /**
+   * Enable parallel decoding mode
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async enable(config?: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    const defaultConfig = { n_parallel: 2, n_batch: 512 }
+    const result = this.context.enableParallelMode({
+      ...defaultConfig,
+      ...config,
+    })
+    this.enabled = result
+    return result
+  }
+
+  /**
+   * Disable parallel decoding mode
+   */
+  disable(): void {
+    this.context.disableParallelMode()
+    this.enabled = false
+  }
+
+  /**
+   * Configure parallel decoding mode (enables if not already enabled)
+   * @param config Configuration for parallel mode
+   * @returns boolean indicating if successful
+   */
+  async configure(config: {
+    n_parallel?: number
+    n_batch?: number
+  }): Promise<boolean> {
+    return this.enable(config)
+  }
+
+  /**
+   * Queue a completion request for parallel processing
+   * @param options Completion options
+   * @param onToken Optional callback for each token
+   * @returns Object with requestId, promise for result, and stop function
+   */
+  async completion(
+    options: LlamaCompletionOptions,
+    onToken?: (requestId: number, data: LlamaCompletionToken) => void,
+  ): Promise<{
+    requestId: number
+    promise: Promise<any>
+    stop: () => void
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    const tokenCallback = onToken
+      ? (error: any, result: any) => {
+          if (error) {
+            console.error('Token callback error:', error)
+            // Handle completion error
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+            return
+          }
+          // Check if this is a token callback or final result
+          if (result) {
+            if (result.token !== undefined) {
+              // This is a token callback
+              onToken(result.requestId, result)
+            } else if (
+              result.text !== undefined ||
+              result.content !== undefined
+            ) {
+              // This is the final result
+              const pendingReq = this.pendingRequests.get(result.requestId)
+              if (pendingReq) {
+                pendingReq.resolve(result)
+                this.pendingRequests.delete(result.requestId)
+              }
+            }
+          }
+        }
+      : undefined
+
+    // Queue the completion immediately (this is synchronous!)
+    const { requestId } = this.context.queueCompletion(
+      options,
+      tokenCallback ||
+        ((error, result) => {
+          if (error) {
+            const pendingReq = this.pendingRequests.get(result?.requestId)
+            if (pendingReq) {
+              pendingReq.reject(error)
+              this.pendingRequests.delete(result?.requestId)
+            }
+          } else if (
+            result &&
+            (result.text !== undefined || result.content !== undefined)
+          ) {
+            // Final result for non-streaming
+            const pendingReq = this.pendingRequests.get(result.requestId)
+            if (pendingReq) {
+              pendingReq.resolve(result)
+              this.pendingRequests.delete(result.requestId)
+            }
+          }
+        }),
+    )
+
+    // Create promise for final result
+    const promise = new Promise((resolveResult, rejectResult) => {
+      this.pendingRequests.set(requestId, {
+        resolve: resolveResult,
+        reject: rejectResult,
+      })
+    })
+
+    // Create stop function
+    const stop = () => {
+      this.context.cancelRequest(requestId)
+      const pendingReq = this.pendingRequests.get(requestId)
+      if (pendingReq) {
+        pendingReq.reject(new Error('Request cancelled'))
+        this.pendingRequests.delete(requestId)
+      }
+    }
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+      stop,
+    }
+  }
+
+  /**
+   * Queue an embedding request for parallel processing
+   * @param text Text to embed
+   * @param params Optional embedding parameters
+   * @returns Object with requestId and promise for result
+   */
+  async embedding(
+    text: string,
+    params?: { embd_normalize?: number },
+  ): Promise<{
+    requestId: number
+    promise: Promise<{ embedding: number[] }>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<{ embedding: number[] }>((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the embedding immediately (this is synchronous!)
+    const { requestId } = this.context.queueEmbedding(
+      text,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          resolveResult(result)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Queue a rerank request for parallel processing
+   * @param query Query text
+   * @param documents Documents to rank
+   * @param params Optional rerank parameters
+   * @returns Object with requestId and promise for results
+   */
+  async rerank(
+    query: string,
+    documents: string[],
+    params?: RerankParams,
+  ): Promise<{
+    requestId: number
+    promise: Promise<Array<{ score: number; index: number; document: string }>>
+  }> {
+    if (!this.enabled) {
+      throw new Error('Parallel mode is not enabled. Call enable() first.')
+    }
+
+    // Create promise for result
+    let resolveResult: (value: any) => void
+    let rejectResult: (reason?: any) => void
+
+    const promise = new Promise<
+      Array<{ score: number; index: number; document: string }>
+    >((res, rej) => {
+      resolveResult = res
+      rejectResult = rej
+    })
+
+    // Queue the rerank immediately (this is synchronous!)
+    const { requestId } = this.context.queueRerank(
+      query,
+      documents,
+      params,
+      (error, result) => {
+        if (error) {
+          rejectResult(error)
+        } else {
+          // Add document text to results and sort by score
+          const enrichedResults = result.results
+            .map((r: any) => ({
+              ...r,
+              document: documents[r.index],
+            }))
+            .sort((a: any, b: any) => b.score - a.score)
+          resolveResult(enrichedResults)
+        }
+      },
+    )
+
+    // Return immediately without wrapping in a Promise
+    return {
+      requestId,
+      promise,
+    }
+  }
+
+  /**
+   * Check if parallel mode is enabled
+   */
+  isEnabled(): boolean {
+    return this.enabled
+  }
+}
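Taken together with the wrapper changes in `lib/index.ts`, this class is reachable as `context.parallel` on a loaded model. A hypothetical end-to-end sketch follows; the model path, the `model` option name, the `prompt` option, and the streamed `token` field are assumptions not confirmed by this diff, and `loadModel` is assumed to resolve to the context wrapper shown earlier.

```ts
import { loadModel } from '@fugood/llama.node'

const main = async () => {
  // `model` / prompt option names are placeholders for illustration.
  const context = await loadModel({ model: './model.gguf', n_parallel: 4 })

  await context.parallel.enable({ n_parallel: 4, n_batch: 512 })

  // Queue two completions; each returns a request id, a promise for the
  // final result, and a stop() function.
  const a = await context.parallel.completion(
    { prompt: 'Name three prime numbers.' },
    (requestId, data) => process.stdout.write(String(data.token ?? '')),
  )
  const b = await context.parallel.completion({
    prompt: 'Translate "hello" to French.',
  })

  const [resultA, resultB] = await Promise.all([a.promise, b.promise])
  console.log('\nA:', resultA.text, '\nB:', resultB.text)

  // Embedding and rerank requests share the same slot queue.
  const { promise: embPromise } = await context.parallel.embedding('hello world')
  console.log('embedding length:', (await embPromise).embedding.length)

  context.parallel.disable()
}

main().catch(console.error)
```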
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.6",
+  "version": "1.3.0-rc.1",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.6",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.6",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.6",
-    "@fugood/node-llama-linux-arm64": "1.2.6",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.6",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.6",
-    "@fugood/node-llama-win32-x64": "1.2.6",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.6",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.6",
-    "@fugood/node-llama-win32-arm64": "1.2.6",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.6",
-    "@fugood/node-llama-darwin-x64": "1.2.6",
-    "@fugood/node-llama-darwin-arm64": "1.2.6"
+    "@fugood/node-llama-linux-x64": "1.3.0-rc.1",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.1",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.1",
+    "@fugood/node-llama-linux-arm64": "1.3.0-rc.1",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.1",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.1",
+    "@fugood/node-llama-win32-x64": "1.3.0-rc.1",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.1",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.1",
+    "@fugood/node-llama-win32-arm64": "1.3.0-rc.1",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.1",
+    "@fugood/node-llama-darwin-x64": "1.3.0-rc.1",
+    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.1"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -89,6 +89,13 @@ Napi::Value LlamaContext::ModelInfo(const Napi::CallbackInfo &info) {
   return metadata;
 }
 
+// getBackendDevicesInfo(): string
+Napi::Value LlamaContext::GetBackendDevicesInfo(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  std::string devices_json = rnllama::get_backend_devices_info();
+  return Napi::String::New(env, devices_json);
+}
+
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
       env, "LlamaContext",
@@ -148,6 +155,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
        StaticMethod<&LlamaContext::ToggleNativeLog>(
            "toggleNativeLog",
            static_cast<napi_property_attributes>(napi_enumerable)),
+       StaticMethod<&LlamaContext::GetBackendDevicesInfo>(
+           "getBackendDevicesInfo",
+           static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::GetMultimodalSupport>(
            "getMultimodalSupport",
            static_cast<napi_property_attributes>(napi_enumerable)),
@@ -168,6 +178,25 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
            static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::DecodeAudioTokens>(
            "decodeAudioTokens",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       // Parallel decoding methods
+       InstanceMethod<&LlamaContext::EnableParallelMode>(
+           "enableParallelMode",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::DisableParallelMode>(
+           "disableParallelMode",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::QueueCompletion>(
+           "queueCompletion",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::QueueEmbedding>(
+           "queueEmbedding",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::QueueRerank>(
+           "queueRerank",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::CancelRequest>(
+           "cancelRequest",
            static_cast<napi_property_attributes>(napi_enumerable))});
   Napi::FunctionReference *constructor = new Napi::FunctionReference();
   *constructor = Napi::Persistent(func);
@@ -217,6 +246,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
   params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
   params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
   params.n_ubatch = get_option<int32_t>(options, "n_ubatch", 512);
+  params.n_parallel = get_option<int32_t>(options, "n_parallel", 1); // Default to 1 for compatibility
   params.embedding = get_option<bool>(options, "embedding", false);
   if (params.embedding) {
     // For non-causal models, batch size must be equal to ubatch size
@@ -288,6 +318,9 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
       }
     }
   }
+  // Initialize validity flag for async callback safety
+  _context_valid = std::make_shared<std::atomic<bool>>(true);
+
   // Use rn-llama context instead of direct session
   _rn_ctx = new llama_rn_context();
   if (!_rn_ctx->loadModel(params)) {
@@ -305,6 +338,11 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
 }
 
 LlamaContext::~LlamaContext() {
+  // Invalidate the context to prevent use-after-free in async callbacks
+  if (_context_valid) {
+    _context_valid->store(false);
+  }
+
   // The DisposeWorker is responsible for cleanup of _rn_ctx
   // If _rn_ctx is still not null here, it means disposal was not properly initiated
   if (_rn_ctx) {
@@ -579,7 +617,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
   // grammar: string
   result.Set("grammar", chatParams.grammar);
   // grammar_lazy: boolean
-  result.Set("grammea_lazy", chatParams.grammar_lazy);
+  result.Set("grammar_lazy", chatParams.grammar_lazy);
   // grammar_triggers: [{ value: string, token: number }]
   Napi::Array grammar_triggers = Napi::Array::New(env);
   for (size_t i = 0; i < chatParams.grammar_triggers.size(); i++) {
@@ -1135,6 +1173,11 @@ Napi::Value LlamaContext::Release(const Napi::CallbackInfo &info) {
    _wip->SetStop();
  }
 
+  // stop_processing_loop
+  if (_rn_ctx && _rn_ctx->slot_manager) {
+    _rn_ctx->slot_manager->stop_processing_loop();
+  }
+
  if (_rn_ctx == nullptr) {
    auto promise = Napi::Promise::Deferred(env);
    promise.Resolve(env.Undefined());
@@ -4,6 +4,10 @@
 #include "rn-llama/rn-llama.h"
 #include "rn-llama/rn-completion.h"
 #include "rn-llama/rn-tts.h"
+#include "rn-llama/rn-slot.h"
+#include "rn-llama/rn-slot-manager.h"
+#include <atomic>
+#include <memory>
 
 using namespace rnllama;
 
@@ -21,6 +25,7 @@ public:
   ~LlamaContext();
   static void ToggleNativeLog(const Napi::CallbackInfo &info);
   static Napi::Value ModelInfo(const Napi::CallbackInfo &info);
+  static Napi::Value GetBackendDevicesInfo(const Napi::CallbackInfo &info);
   static void Init(Napi::Env env, Napi::Object &exports);
 
 private:
@@ -55,10 +60,22 @@ private:
   Napi::Value GetAudioCompletionGuideTokens(const Napi::CallbackInfo &info);
   Napi::Value DecodeAudioTokens(const Napi::CallbackInfo &info);
 
+  // Parallel decoding methods
+  Napi::Value EnableParallelMode(const Napi::CallbackInfo &info);
+  void DisableParallelMode(const Napi::CallbackInfo &info);
+  Napi::Value QueueCompletion(const Napi::CallbackInfo &info);
+  Napi::Value QueueEmbedding(const Napi::CallbackInfo &info);
+  Napi::Value QueueRerank(const Napi::CallbackInfo &info);
+  void CancelRequest(const Napi::CallbackInfo &info);
+
   std::string _info;
   Napi::Object _meta;
   LlamaCompletionWorker *_wip = nullptr;
 
   // Use rn-llama context instead of direct llama.cpp types
   llama_rn_context *_rn_ctx = nullptr;
+
+  // Validity flag for async callbacks to prevent use-after-free
+  // Shared pointer ensures callbacks can safely check if context is still alive
+  std::shared_ptr<std::atomic<bool>> _context_valid;
 };
package/src/common.hpp CHANGED
@@ -16,11 +16,12 @@ static bool is_nil(const Napi::Value &value) {
   return value.IsNull() || value.IsUndefined();
 }
 
-static std::string json_stringify(const Napi::Object &obj) {
-  Napi::Env env = obj.Env();
+// Overload for Napi::Value to handle both arrays and objects
+static std::string json_stringify(const Napi::Value &value) {
+  Napi::Env env = value.Env();
   Napi::Object json = env.Global().Get("JSON").As<Napi::Object>();
   Napi::Function stringify = json.Get("stringify").As<Napi::Function>();
-  return stringify.Call(json, {obj}).As<Napi::String>().ToString();
+  return stringify.Call(json, {value}).As<Napi::String>().ToString();
 }
 
 static void console_log(Napi::Env env, const std::string &message) {
@@ -1760,7 +1760,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
-        string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
+        string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
         [](common_params & params, int value) {
             params.cpuparams.n_threads = value;
             if (params.cpuparams.n_threads <= 0) {
@@ -577,6 +577,10 @@ extern "C" {
         GGML_UNARY_OP_EXP,
         GGML_UNARY_OP_GELU_ERF,
         GGML_UNARY_OP_XIELU,
+        GGML_UNARY_OP_FLOOR,
+        GGML_UNARY_OP_CEIL,
+        GGML_UNARY_OP_ROUND,
+        GGML_UNARY_OP_TRUNC,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -1151,6 +1155,46 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    GGML_API struct ggml_tensor * ggml_floor(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_floor_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_ceil_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    /**
+     * Truncates the fractional part of each element in the tensor (towards zero).
+     * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0
+     * Similar to std::trunc in C/C++.
+     */
+
+    GGML_API struct ggml_tensor * ggml_trunc(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_trunc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+
+
     // xIELU activation function
     // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0)
     // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions
@@ -2184,6 +2184,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_UNARY_OP_HARDSWISH:
         case GGML_UNARY_OP_HARDSIGMOID:
         case GGML_UNARY_OP_EXP:
+        case GGML_UNARY_OP_FLOOR:
+        case GGML_UNARY_OP_CEIL:
+        case GGML_UNARY_OP_ROUND:
+        case GGML_UNARY_OP_TRUNC:
             {
                 n_tasks = 1;
             } break;
@@ -3563,13 +3567,17 @@ void ggml_cpu_init(void) {
 #ifdef GGML_USE_OPENMP
     //if (!getenv("OMP_WAIT_POLICY")) {
     //    // set the wait policy to active, so that OpenMP threads don't sleep
-    //    putenv("OMP_WAIT_POLICY=active");
+    //    setenv("OMP_WAIT_POLICY", "active", 0)
     //}
 
     if (!getenv("KMP_BLOCKTIME")) {
         // set the time to wait before sleeping a thread
         // this is less aggressive than setting the wait policy to active, but should achieve similar results in most cases
-        putenv("KMP_BLOCKTIME=200"); // 200ms
+#ifdef _WIN32
+        _putenv_s("KMP_BLOCKTIME", "200"); // 200ms
+#else
+        setenv("KMP_BLOCKTIME", "200", 0); // 200ms
+#endif
     }
 #endif
 }
@@ -8993,6 +8993,22 @@ void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_exp(params, dst);
             } break;
+        case GGML_UNARY_OP_FLOOR:
+            {
+                ggml_compute_forward_floor(params, dst);
+            } break;
+        case GGML_UNARY_OP_CEIL:
+            {
+                ggml_compute_forward_ceil(params, dst);
+            } break;
+        case GGML_UNARY_OP_ROUND:
+            {
+                ggml_compute_forward_round(params, dst);
+            } break;
+        case GGML_UNARY_OP_TRUNC:
+            {
+                ggml_compute_forward_trunc(params, dst);
+            } break;
         case GGML_UNARY_OP_XIELU:
             {
                 ggml_compute_forward_xielu(params, dst);
@@ -73,6 +73,22 @@ static inline float op_log(float x) {
     return logf(x);
 }
 
+static inline float op_floor(float x) {
+    return floorf(x);
+}
+
+static inline float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
+static inline float op_trunc(float x) {
+    return truncf(x);
+}
+
 template <float (*op)(float), typename src0_t, typename dst_t>
 static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
     constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
@@ -274,6 +290,22 @@ void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_log>(params, dst);
 }
 
+void ggml_compute_forward_floor(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_floor>(params, dst);
+}
+
+void ggml_compute_forward_ceil(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_ceil>(params, dst);
+}
+
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
+void ggml_compute_forward_trunc(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_trunc>(params, dst);
+}
+
 void ggml_compute_forward_xielu(const ggml_compute_params * params, ggml_tensor * dst) {
     const float alpha_n = ggml_get_op_params_f32(dst, 1);
     const float alpha_p = ggml_get_op_params_f32(dst, 2);
@@ -22,6 +22,10 @@ void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct
 void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_floor(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ceil(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_trunc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_xielu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 
 #ifdef __cplusplus
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP, "clip" }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA, "llama" },
    { LLM_ARCH_LLAMA4, "llama4" },
    { LLM_ARCH_DECI, "deci" },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {