@lloyal-labs/lloyal.node 1.0.5-alpha → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/index.d.ts CHANGED
@@ -4,6 +4,48 @@
  * N-API bindings for liblloyal - Node.js native addon for llama.cpp inference
  */

+ /**
+ * GPU variant for binary loading
+ *
+ * Specifies which GPU-accelerated binary to load:
+ * - 'default': CPU-only (works everywhere)
+ * - 'cuda': NVIDIA CUDA (requires libcudart.so/cudart64.dll)
+ * - 'vulkan': Vulkan (AMD/Intel/NVIDIA, requires Vulkan runtime)
+ *
+ * If the requested variant is unavailable (package not installed or
+ * runtime libraries missing), loading automatically falls back to CPU.
+ */
+ export type GpuVariant = 'default' | 'cuda' | 'vulkan';
+
+ /**
+ * Options for binary loading
+ *
+ * Controls which native binary variant is loaded when creating a context.
+ * Use this for explicit GPU variant selection with automatic fallback.
+ */
+ export interface LoadOptions {
+ /**
+ * GPU variant to use
+ *
+ * - 'cuda': NVIDIA CUDA (requires libcudart.so)
+ * - 'vulkan': Vulkan (AMD/Intel/NVIDIA)
+ * - 'default' or undefined: CPU only
+ *
+ * If the requested variant is unavailable (missing runtime libraries),
+ * automatically falls back to CPU with a console warning.
+ *
+ * @example
+ * ```typescript
+ * // Request CUDA with automatic fallback to CPU
+ * const ctx = await createContext(
+ *   { modelPath: './model.gguf' },
+ *   { gpuVariant: 'cuda' }
+ * );
+ * ```
+ */
+ gpuVariant?: GpuVariant;
+ }
+
  /**
  * Pooling type for embedding extraction
  */
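A minimal usage sketch for these loading options, assuming only the exports declared in this diff (`createContext`, `GpuVariant`, `LoadOptions`); reading the variant from `LLOYAL_GPU` follows the environment-variable example shown later in this file:

```typescript
import { createContext, type GpuVariant, type LoadOptions } from '@lloyal-labs/lloyal.node';

// Pick a variant from the environment; 'default' means the CPU-only binary.
const requested = (process.env.LLOYAL_GPU ?? 'default') as GpuVariant;
const loadOptions: LoadOptions = { gpuVariant: requested };

// Per the docs above, missing CUDA/Vulkan runtimes trigger a fallback to CPU
// (with a console warning) rather than an error.
const ctx = await createContext({ modelPath: './model.gguf' }, loadOptions);
```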
@@ -305,9 +347,11 @@ export interface SessionContext {
  * // Creative generation
  * const token = ctx.sample({ temperature: 0.9 });
  *
- * // Constrained to valid JSON
- * ctx.initGrammar(grammar);
+ * // Constrained to valid JSON (handle-based API)
+ * const grammarHandle = ctx.createSampler(grammar);
+ * ctx.applySampler(grammarHandle, ctx.getLogits());
  * const token = ctx.sample({ temperature: 0.7 });
+ * ctx.acceptSamplerToken(grammarHandle, token);
  * ```
  */
  sample(params?: SamplingParams): number;
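Pieced together from the handle-based methods shown above, a hedged sketch of a complete constrained-decode loop; the inline `schema`, `maxTokens`, `promptTokens`, and the single-token `decode()` call are assumptions modeled on the surrounding examples:

```typescript
// Given a SessionContext `ctx` whose prompt tokens are already decoded.
const schema = { type: 'object', properties: { name: { type: 'string' } } };
const grammar = ctx.jsonSchemaToGrammar(JSON.stringify(schema)); // GBNF string
const handle = ctx.createSampler(grammar);

const maxTokens = 64;
let position = promptTokens.length; // next KV position after the prompt (assumed)
for (let i = 0; i < maxTokens; i++) {
  ctx.applySampler(handle, ctx.getLogits());      // mask grammar-invalid tokens
  const token = ctx.sample({ temperature: 0.7 }); // sample from masked logits
  ctx.acceptSamplerToken(handle, token);          // advance the parser state
  await ctx.decode([token], position++);          // write token, refresh logits
}
```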
@@ -566,144 +610,6 @@ export interface SessionContext {
  */
  clearAndReseed(sinks: number[], tail: number[]): Promise<void>;

- // ===== GRAMMAR-CONSTRAINED GENERATION =====
-
- /**
- * Initialize grammar parser (once per generation session)
- *
- * Grammars constrain generation to valid formats (JSON, XML, etc.).
- * Parser tracks state across tokens to enforce rules.
- *
- * Call once before starting constrained generation.
- * Use resetGrammar() to reuse same grammar for new generation.
- *
- * Cost: ~0.1-1ms depending on grammar complexity
- *
- * @param grammarStr GBNF grammar string (EBNF-like syntax)
- * @example
- * ```typescript
- * // Force valid JSON
- * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify({
- *   type: "object",
- *   properties: {
- *     name: { type: "string" },
- *     age: { type: "number" }
- *   }
- * }));
- *
- * ctx.initGrammar(grammar);
- *
- * // Now sample() will only generate valid JSON
- * const token = ctx.sample({ temperature: 0.7 });
- * ```
- */
- initGrammar(grammarStr: string): void;
-
- /**
- * Apply grammar constraints to token scores (modifies in-place)
- *
- * Masks invalid tokens with -Infinity based on parser state.
- * Call after getTokenScores(), before custom sampling.
- *
- * Flow: getTokenScores() → applyGrammar() → sample() → acceptToken()
- *
- * Thread safety: This method is synchronous and modifies the buffer
- * in-place on the JS thread. Safe because it's called sequentially
- * in the generation loop before any async operations.
- *
- * Cost: ~0.1-1ms depending on grammar complexity
- *
- * @param scoresBuffer Buffer from getTokenScores() (modified in-place)
- * @throws Error if grammar not initialized (call initGrammar first)
- * @example
- * ```typescript
- * // Custom sampling with grammar
- * const buffer = ctx.getTokenScores();
- * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
- *
- * // Apply grammar constraints
- * ctx.applyGrammar(buffer);
- *
- * // Now sample from constrained distribution
- * const token = customSample(scores);
- * ctx.acceptToken(token);
- * ```
- */
- applyGrammar(scoresBuffer: Buffer): void;
-
- /**
- * Advance grammar parser with chosen token
- *
- * Updates parser state after sampling.
- * MUST be called AFTER sampling, BEFORE next applyGrammar().
- *
- * This advances the stateful grammar parser through its rules.
- * Without this, grammar constraints will be incorrect.
- *
- * Cost: <0.01ms
- *
- * @param tokenId Token that was sampled
- * @example
- * ```typescript
- * const buffer = ctx.getTokenScores();
- * ctx.applyGrammar(buffer);
- *
- * const scores = new Float32Array(buffer.buffer, buffer.byteOffset, buffer.length / 4);
- * const token = customSample(scores);
- *
- * // MUST call acceptToken to advance parser
- * ctx.acceptToken(token);
- *
- * // Now parser is ready for next token
- * ```
- */
- acceptToken(tokenId: number): void;
-
- /**
- * Reset grammar parser to initial state
- *
- * Call at start of each new generation with same grammar.
- * Parser returns to root state, ready to validate from beginning.
- *
- * Cost: <0.01ms
- *
- * @example
- * ```typescript
- * ctx.initGrammar(jsonGrammar);
- *
- * // First generation
- * while (!done) {
- *   const token = ctx.sample();
- *   // ... generate ...
- * }
- *
- * // Second generation - reuse same grammar
- * ctx.resetGrammar();
- * while (!done) {
- *   const token = ctx.sample();
- *   // ... generate ...
- * }
- * ```
- */
- resetGrammar(): void;
-
- /**
- * Free grammar resources
- *
- * Call when done with constrained generation.
- * Releases parser memory.
- *
- * Cost: <0.01ms
- *
- * @example
- * ```typescript
- * ctx.initGrammar(grammar);
- * // ... do constrained generation ...
- * ctx.freeGrammar();
- * ```
- */
- freeGrammar(): void;
-
  // ===== KV SEQUENCE OPERATIONS =====

  /**
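For code that used the grammar API removed above, a hedged migration mapping onto the handle-based methods that remain in 1.0.7; this diff shows no handle reset or free method, so creating a fresh handle per generation is an assumption:

```typescript
// Removed single-sampler API     →  handle-based replacement
// ctx.initGrammar(grammar)       →  const h = ctx.createSampler(grammar)
// ctx.applyGrammar(scoresBuffer) →  ctx.applySampler(h, ctx.getLogits())
// ctx.acceptToken(token)         →  ctx.acceptSamplerToken(h, token)
// ctx.resetGrammar()             →  assumption: create a fresh handle

const h = ctx.createSampler(grammar);
ctx.applySampler(h, ctx.getLogits());
const token = ctx.sample({ temperature: 0.7 });
ctx.acceptSamplerToken(h, token);
```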
@@ -775,9 +681,7 @@ export interface SessionContext {
  * Create a new grammar sampler (returns handle)
  *
  * Creates an independent grammar sampler instance with its own state.
- *
- * Unlike initGrammar() which uses a single internal sampler, this returns
- * a handle that can be used with applySampler/acceptSamplerToken.
+ * Returns a handle that can be used with applySampler/acceptSamplerToken.
  * Multiple handles can coexist with independent parser states.
  *
  * Cost: ~0.1-1ms depending on grammar complexity
@@ -817,7 +721,6 @@ export interface SessionContext {
  * Accept token to advance grammar parser state (handle-based)
  *
  * Must be called after sampling to advance the grammar parser.
- * This is the handle-based equivalent of acceptToken().
  *
  * @param handle Sampler handle from createSampler()
  * @param tokenId Token that was sampled
@@ -867,13 +770,15 @@ export interface SessionContext {
  * - High surprisal: Model didn't expect this token (low probability)
  *
  * Call after decode() to compute surprisal for any token based on
- * the current logits distribution.
+ * the current logits distribution, or pass captured logits for
+ * offline computation (e.g., best-of-n scoring from prefill logits).
  *
  * @param pickedTokenId - Token ID to compute surprisal for
  * @param base - Logarithm base: "nats" (default) or "bits"
+ * @param logits - Optional Float32Array of logits (uses current context logits if omitted)
  * @returns Surprisal value in specified base
  *
- * @example
+ * @example Current context logits (default)
  * ```typescript
  * await ctx.decode(tokens, position);
  * const token = ctx.sample();
@@ -881,9 +786,18 @@ export interface SessionContext {
  * console.log(`Model surprise: ${surprisal.toFixed(2)} bits`);
  * ```
  *
- * COST: O(1) - direct probability lookup from logits
+ * @example Captured/arbitrary logits (for best-of-n, verification, etc.)
+ * ```typescript
+ * // Capture logits after prefill
+ * const capturedLogits = new Float32Array(ctx.getLogits());
+ *
+ * // Later: compute surprisal from captured logits
+ * const surprisal = ctx.modelSurprisal(token, "nats", capturedLogits);
+ * ```
+ *
+ * COST: O(n_vocab) - softmax normalization required
  */
- modelSurprisal(pickedTokenId: number, base?: 'nats' | 'bits'): number;
+ modelSurprisal(pickedTokenId: number, base?: 'nats' | 'bits', logits?: Float32Array): number;

  /**
  * Compute entropy of the entire logits distribution.
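A hedged sketch of the best-of-n scoring pattern the new `logits` parameter of modelSurprisal() is documented for; `promptTokens` and the candidate token IDs are illustrative inputs:

```typescript
// Freeze prefill logits, then score candidate first tokens against them.
await ctx.decode(promptTokens, 0);
const prefillLogits = new Float32Array(ctx.getLogits()); // copy before decoding further

const candidateTokens = [1024, 2048, 4096]; // illustrative token IDs

// Lower surprisal = the model considered that continuation more plausible.
const scored = candidateTokens.map((token) => ({
  token,
  surprisal: ctx.modelSurprisal(token, 'nats', prefillLogits),
}));
scored.sort((a, b) => a.surprisal - b.surprisal);
const best = scored[0]; // most plausible candidate under the prefill distribution
```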
@@ -892,12 +806,14 @@ export interface SessionContext {
  * - Low entropy: Model is confident (peaked distribution)
  * - High entropy: Model is uncertain (flat distribution)
  *
- * Call after decode() to analyze the current prediction distribution.
+ * Call after decode() to analyze the current prediction distribution,
+ * or pass captured logits for offline analysis.
  *
  * @param base - Logarithm base: "nats" (default), "bits", or "base10"
+ * @param logits - Optional Float32Array of logits (uses current context logits if omitted)
  * @returns Entropy value in specified base
  *
- * @example
+ * @example Current context logits (default)
  * ```typescript
  * await ctx.decode(tokens, position);
  * const entropy = ctx.modelEntropy("bits");
@@ -906,9 +822,15 @@ export interface SessionContext {
  * }
  * ```
  *
+ * @example Captured/arbitrary logits
+ * ```typescript
+ * const capturedLogits = new Float32Array(ctx.getLogits());
+ * const entropy = ctx.modelEntropy("nats", capturedLogits);
+ * ```
+ *
  * COST: O(n_vocab) - must sum over all token probabilities
  */
- modelEntropy(base?: 'nats' | 'bits'): number;
+ modelEntropy(base?: 'nats' | 'bits', logits?: Float32Array): number;

  /**
  * Create a new perplexity tracker.
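One use of the captured-logits overload of modelEntropy(), sketched under the same assumptions as above: snapshot logits at each step during generation, then locate the model's least confident steps afterwards:

```typescript
// Record a logits snapshot per generated token for offline analysis.
const snapshots: Float32Array[] = [];
let position = promptTokens.length; // prompt already decoded (assumed)
for (let i = 0; i < 50; i++) {
  snapshots.push(new Float32Array(ctx.getLogits())); // freeze this step's distribution
  const token = ctx.sample();
  await ctx.decode([token], position++);
}

// Afterwards: flag steps where the distribution was flat (high uncertainty).
const uncertainSteps = snapshots
  .map((logits, step) => ({ step, bits: ctx.modelEntropy('bits', logits) }))
  .filter((s) => s.bits > 3);
```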
@@ -1125,7 +1047,7 @@ export interface SessionContext {
  * Convert JSON schema to GBNF grammar
  *
  * Generates grammar string for constrained JSON generation.
- * Use with initGrammar() or sample({ grammar }).
+ * Use with createSampler() for grammar-constrained generation.
  *
  * Cost: ~1-10ms depending on schema complexity
  *
@@ -1143,7 +1065,7 @@ export interface SessionContext {
  * };
  *
  * const grammar = ctx.jsonSchemaToGrammar(JSON.stringify(schema));
- * ctx.initGrammar(grammar);
+ * const handle = ctx.createSampler(grammar);
  * ```
  */
  jsonSchemaToGrammar(schemaJson: string): string;
@@ -1253,16 +1175,6 @@ export interface SessionContext {

  // ===== NATIVE REFERENCE IMPLEMENTATIONS =====

- /**
- * Compute entropy of current logits distribution
- *
- * Alternative entropy computation using native implementation.
- * Equivalent to modelEntropy("nats") but may be faster.
- *
- * @returns Entropy in nats
- */
- computeEntropy(): number;
-
  /**
  * Sample greedily from current logits
  *
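Since the removed computeEntropy() was documented as equivalent to modelEntropy("nats"), migrating is a one-line substitution:

```typescript
// const entropy = ctx.computeEntropy(); // removed in 1.0.7
const entropy = ctx.modelEntropy('nats'); // documented equivalent
```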
@@ -1299,14 +1211,57 @@ export interface SessionContext {
  * Context becomes unusable after disposal.
  */
  dispose(): void;
+
+ // ===== BRANCH API (internal, wrapped by Branch class) =====
+
+ /** @internal Create a new branch for parallel generation */
+ _branchCreate(seqId: number, position: number, params?: SamplingParams): number;
+
+ /** @internal Fork a branch to a new sequence */
+ _branchFork(handle: number, newSeqId: number): number;
+
+ /** @internal Capture logits into branch's snapshot */
+ _branchCaptureLogits(handle: number): void;
+
+ /** @internal Decode a single token and capture logits */
+ _branchDecodeAndCaptureOne(handle: number, token: number): void;
+
+ /** @internal Sample next token from branch's logits snapshot */
+ _branchSample(handle: number): number;
+
+ /** @internal Accept token (update sampler state for penalties) */
+ _branchAccept(handle: number, token: number): void;
+
+ /** @internal Get branch's sequence ID */
+ _branchGetSeqId(handle: number): number;
+
+ /** @internal Get branch's current position */
+ _branchGetPosition(handle: number): number;
+
+ /** @internal Get branch's perplexity */
+ _branchGetPerplexity(handle: number): number;
+
+ /** @internal Prune branch (remove KV cache entries and free handle) */
+ _branchPrune(handle: number): void;
+
+ /** @internal Destroy branch (free handle without removing KV cache) */
+ _branchDestroy(handle: number): void;
+
+ /** @internal Reseed branch sampler PRNG for diversity after fork */
+ _branchSamplerChainReseed(handle: number, seed: number): void;
  }

  /**
  * Create a new inference context
  *
+ * Loads the appropriate native binary (with automatic GPU fallback) and
+ * creates an inference context for the specified model.
+ *
  * @param options Context creation options
+ * @param loadOptions Optional binary loading options (GPU variant selection)
  * @returns Promise resolving to SessionContext instance
- * @example
+ *
+ * @example Basic usage
  * ```typescript
  * const ctx = await createContext({
  *   modelPath: './model.gguf',
@@ -1322,8 +1277,58 @@ export interface SessionContext {
  * ctx.dispose();
  * }
  * ```
+ *
+ * @example With GPU variant selection
+ * ```typescript
+ * // Request CUDA - falls back to CPU if unavailable
+ * const ctx = await createContext(
+ *   { modelPath: './model.gguf', nCtx: 4096 },
+ *   { gpuVariant: 'cuda' }
+ * );
+ * ```
+ *
+ * @example Using environment variable
+ * ```typescript
+ * // Set LLOYAL_GPU=cuda before running
+ * // createContext will automatically use CUDA if available
+ * const ctx = await createContext({ modelPath: './model.gguf' });
+ * ```
  */
- export function createContext(options: ContextOptions): Promise<SessionContext>;
+ export function createContext(
+   options: ContextOptions,
+   loadOptions?: LoadOptions
+ ): Promise<SessionContext>;
+
+ /**
+ * Load native binary for a specific GPU variant
+ *
+ * Loads the appropriate platform-specific binary with automatic fallback:
+ * 1. Try requested GPU variant (if specified)
+ * 2. Fall back to default (CPU) platform package
+ * 3. Fall back to local build (development: build/Release/lloyal.node)
+ *
+ * Use this for advanced scenarios where you need direct binary access
+ * or want to check variant availability before creating a context.
+ *
+ * @param variant GPU variant: 'cuda', 'vulkan', or undefined for CPU
+ * @returns Native binary module with createContext method
+ * @throws Error if no binary available for the current platform
+ *
+ * @example
+ * ```typescript
+ * // Load default (CPU) binary
+ * const binary = loadBinary();
+ *
+ * // Load CUDA binary (falls back to CPU if unavailable)
+ * const binary = loadBinary('cuda');
+ *
+ * // Create context from loaded binary
+ * const ctx = await binary.createContext({ modelPath: './model.gguf' });
+ * ```
+ */
+ export function loadBinary(variant?: GpuVariant): {
+   createContext(options: ContextOptions): Promise<SessionContext>;
+ };

  /**
  * Safe logits access with automatic lifetime management
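A hedged sketch of the direct-binary-access scenario loadBinary() is documented for; note that per the fallback order above, an unavailable variant falls back with a console warning rather than throwing, so the try/catch only guards the case where no binary exists for the platform at all:

```typescript
import { loadBinary } from '@lloyal-labs/lloyal.node';

let binary: ReturnType<typeof loadBinary>;
try {
  // Resolution order per the docs: requested variant → CPU package → local build.
  binary = loadBinary('cuda');
} catch (err) {
  throw new Error(`no lloyal.node binary available for this platform: ${err}`);
}

const ctx = await binary.createContext({ modelPath: './model.gguf' });
```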
@@ -1386,3 +1391,140 @@ export function withLogits<T>(
  ctx: SessionContext,
  fn: (logits: Float32Array) => T
  ): T;
+
+ /**
+ * Result from Branch.produce()
+ */
+ export interface Produced {
+ /** Sampled token ID */
+ token: number;
+ /** Text representation of the token */
+ text: string;
+ /** Whether this is a stop token (EOS) */
+ isStop: boolean;
+ }
+
+ /**
+ * Forkable inference handle for covalent generation
+ *
+ * A Branch owns everything needed for independent generation: a KV cache
+ * sequence, sampler chain, logits snapshot, and perplexity tracker.
+ *
+ * Forking is cheap: the KV prefix is shared in memory (a metadata-only
+ * operation under unified KV; no KV tensor buffers are copied), so sibling
+ * branches read from the same physical KV entries. Only tokens decoded
+ * after the fork point are exclusive to each branch.
+ *
+ * Branches form trees, not just flat lists. Fork from root for best-of-N,
+ * fork from children for MCTS/beam search, fork from a draft for speculative
+ * decoding.
+ *
+ * The produce/commit protocol separates sampling from state advancement:
+ * produce() samples without writing to KV, letting you inspect the result
+ * before deciding to commit().
+ *
+ * @example Best-of-N with perplexity selection
+ * ```typescript
+ * const root = Branch.create(ctx, 0, tokens.length, { temperature: 0.8 });
+ * root.captureLogits();
+ *
+ * const candidates = [1, 2, 3, 4, 5].map((seqId, i) => {
+ *   const branch = root.fork(seqId);
+ *   branch.reseedSampler(1000 + i);
+ *   return branch;
+ * });
+ *
+ * for (let t = 0; t < 50; t++) {
+ *   for (const branch of candidates) {
+ *     const { token, isStop } = branch.produce();
+ *     if (isStop) continue;
+ *     branch.commit(token);
+ *   }
+ * }
+ *
+ * const best = candidates.reduce((a, b) => a.perplexity < b.perplexity ? a : b);
+ * for (const c of candidates) { if (c !== best) c.prune(); }
+ * ```
+ */
+ export class Branch {
+ /**
+ * Create a root branch at the given position
+ *
+ * The branch takes ownership of the sequence and creates its own sampler
+ * chain from the provided params. Call captureLogits() after prefill to
+ * freeze the logit distribution before forking.
+ *
+ * @param ctx SessionContext to create branch on
+ * @param seqId Sequence ID for this branch
+ * @param position Starting position (typically prompt token count)
+ * @param params Sampling parameters (temperature, topP, etc.)
+ */
+ static create(
+   ctx: SessionContext,
+   seqId: number,
+   position: number,
+   params?: SamplingParams
+ ): Branch;
+
+ /**
+ * Fork this branch to a new sequence
+ *
+ * The child shares the parent's KV prefix in memory (metadata-only under
+ * unified KV; no KV buffer copy). Logits, sampler state, and perplexity
+ * tracker are cloned so the child can diverge independently. Fork from any
+ * branch, root or intermediate, to build arbitrarily deep trees.
+ *
+ * @param newSeqId Sequence ID for the forked branch
+ */
+ fork(newSeqId: number): Branch;
+
+ /** Freeze the current logit distribution into this branch. Essential before fork(). */
+ captureLogits(): void;
+
+ /** Decode a single token, write to KV, and capture resulting logits */
+ decodeAndCaptureOne(token: number): void;
+
+ /** Sample next token from branch's frozen logits snapshot */
+ sample(): number;
+
+ /** Accept token for repeat-penalty tracking */
+ accept(token: number): void;
+
+ /** Discard branch: remove its divergent KV entries and free the handle (use for losers) */
+ prune(): void;
+
+ /** Release handle but keep KV entries intact (use for winners, continue with raw ops) */
+ destroy(): void;
+
+ /**
+ * Reseed the sampler's PRNG for diversity after fork()
+ *
+ * CRITICAL for parallel generation: Without reseeding, all forked branches
+ * produce identical outputs because they share the same PRNG state.
+ *
+ * Only affects stochastic samplers (temperature > 0). Greedy samplers are unchanged.
+ *
+ * @param seed - New seed for the PRNG
+ */
+ reseedSampler(seed: number): void;
+
+ /** Sample next token without advancing state. Inspect before committing. */
+ produce(): Produced;
+
+ /** Accept and advance: write token to KV and update branch state. */
+ commit(token: number): void;
+
+ /** Branch's sequence ID */
+ readonly seqId: number;
+
+ /** Branch's current position */
+ readonly position: number;
+
+ /** Branch's perplexity */
+ readonly perplexity: number;
+
+ /** Internal handle (for debugging) */
+ readonly handle: number;
+
+ /** Whether this branch has been disposed */
+ readonly disposed: boolean;
+ }
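To complement the best-of-N example above, a hedged sketch of the produce/commit protocol on a single branch, inspecting each sampled token before it becomes permanent; `promptTokens`, the stop-string check, and the 200-token cap are illustrative:

```typescript
// Inspect-before-advance: produce() samples from the frozen logits without
// writing to KV; only commit() decodes the token and advances the branch.
const branch = Branch.create(ctx, 0, promptTokens.length, { temperature: 0.7 });
branch.captureLogits(); // freeze prefill logits before sampling

let out = '';
for (let i = 0; i < 200; i++) {
  const { token, text, isStop } = branch.produce(); // no state advanced yet
  if (isStop || (out + text).includes('###')) break; // reject without committing
  branch.commit(token); // write to KV, update sampler state, advance position
  out += text;
}
branch.destroy(); // keep KV entries for reuse; free only the handle
```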