@mlx-node/core 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/index.cjs +766 -0
  2. package/index.d.cts +2728 -0
  3. package/package.json +36 -0
package/index.d.cts ADDED
@@ -0,0 +1,2728 @@
1
+ /* auto-generated by NAPI-RS */
2
+ /* eslint-disable */
3
+ /**
4
+ * Result from batch text generation
5
+ *
6
+ * Contains results for N prompts × G completions per prompt.
7
+ * Results are stored flat in arrays of length N*G, where:
8
+ * - First G elements are completions for prompt 0
9
+ * - Next G elements are completions for prompt 1
10
+ * - etc.
11
+ */
12
+ export declare class BatchGenerationResult {
13
+ /** Get all generated token arrays (N*G arrays) */
14
+ get tokens(): Array<MxArray>;
15
+ /** Get all log probability arrays (N*G arrays) */
16
+ get logprobs(): Array<MxArray>;
17
+ /** Get all decoded texts (N*G strings) */
18
+ get texts(): Array<string>;
19
+ /** Get finish reasons grouped by prompt (N arrays of G finish reasons) */
20
+ get finishReasons(): Array<Array<string>>;
21
+ /** Get token counts grouped by prompt (N arrays of G counts) */
22
+ get tokenCounts(): Array<Array<number>>;
23
+ /** Get number of prompts */
24
+ get numPrompts(): number;
25
+ /** Get group size (completions per prompt) */
26
+ get groupSize(): number;
27
+ }
28
+
29
+ /**
30
+ * Result from the high-level `chat()` API
31
+ *
32
+ * Contains structured responses with:
33
+ * - Tool calls parsed as native JavaScript objects
34
+ * - Thinking/reasoning extracted from `<think>` tags
35
+ * - Clean text with all special tags stripped
36
+ *
37
+ * ## Example
38
+ * ```typescript
39
+ * const result = await model.chat(messages, { tools });
40
+ * console.log(result.text); // Clean response
41
+ * console.log(result.thinking); // Chain-of-thought (if any)
42
+ * console.log(result.toolCalls); // Parsed tool calls
43
+ * ```
44
+ */
45
+ export declare class ChatResult {
46
+ /** Get the cleaned text (tool_call and think tags removed) */
47
+ get text(): string;
48
+ /** Get the extracted tool calls */
49
+ get toolCalls(): Array<ToolCallResult>;
50
+ /**
51
+ * Get the extracted thinking/reasoning content
52
+ *
53
+ * Returns the content from within `<think>...</think>` tags, or null if
54
+ * no thinking tags were present in the response.
55
+ *
56
+ * This is useful for:
57
+ * - Debugging model reasoning
58
+ * - Displaying chain-of-thought to users (optional)
59
+ * - Analyzing model decision-making
60
+ */
61
+ get thinking(): string | null;
62
+ /** Get the generated tokens */
63
+ get tokens(): MxArray;
64
+ /** Get the log probabilities */
65
+ get logprobs(): MxArray;
66
+ /** Get the finish reason ("stop", "length", "tool_calls", or "repetition") */
67
+ get finishReason(): 'stop' | 'length' | 'tool_calls' | 'repetition';
68
+ /** Get the number of tokens generated */
69
+ get numTokens(): number;
70
+ /** Get the raw text before tool call stripping (for debugging) */
71
+ get rawText(): string;
72
+ }
73
+
74
+ /** Result from text generation with detailed metadata */
75
+ export declare class GenerationResult {
76
+ /** Get the decoded text */
77
+ get text(): string;
78
+ /** Get the generated tokens */
79
+ get tokens(): MxArray;
80
+ /** Get the log probabilities */
81
+ get logprobs(): MxArray;
82
+ /** Get the finish reason ("eos", "length", or "repetition") */
83
+ get finishReason(): 'eos' | 'length' | 'repetition';
84
+ /** Get the number of tokens generated */
85
+ get numTokens(): number;
86
+ }
87
+
88
+ /**
89
+ * GRPO Training Engine
90
+ *
91
+ * Complete training engine that runs entirely in Rust.
92
+ */
93
+ export declare class GrpoTrainingEngine {
94
+ /**
95
+ * Create a new training engine from an existing model
96
+ *
97
+ * # Arguments
98
+ * * `model` - The Qwen3 model to train (will be cloned internally)
99
+ * * `config` - Engine configuration
100
+ */
101
+ constructor(model: Qwen3Model, config: GrpoEngineConfig);
102
+ /** Register a built-in reward function */
103
+ registerBuiltinReward(config: BuiltinRewardConfig): void;
104
+ /**
105
+ * Run a training step with provided rewards
106
+ *
107
+ * This method performs the complete training cycle:
108
+ * 1. Generate completions for each prompt (G times per prompt)
109
+ * 2. Use provided rewards to compute advantages
110
+ * 3. Compute GRPO loss and gradients
111
+ * 4. Apply gradients (respecting accumulation steps)
112
+ *
113
+ * # Arguments
114
+ * * `prompts` - Array of chat conversations to use as prompts
115
+ * * `rewards` - Reward values for each completion (num_prompts * group_size)
116
+ *
117
+ * # Returns
118
+ * * Training step metrics
119
+ */
120
+ trainStep(prompts: Array<Array<ChatMessage>>, rewards: Array<number>): Promise<EngineStepMetrics>;
121
+ /**
122
+ * Generate completions without training
123
+ *
124
+ * Use this to generate completions for scoring by external reward functions.
125
+ * Returns completion texts along with the internal token data needed for training.
126
+ */
127
+ generateBatch(prompts: Array<Array<ChatMessage>>): Promise<Array<string>>;
128
+ /**
129
+ * Generate completions with all data needed for training
130
+ *
131
+ * Returns completion texts, tokens, log probabilities, and lengths.
132
+ * Use this when you need to score completions externally and then train.
133
+ */
134
+ generateBatchForTraining(prompts: Array<Array<ChatMessage>>): Promise<GenerateBatchResult>;
135
+ /**
136
+ * Run a training step with pre-generated completions
137
+ *
138
+ * This method performs training using pre-generated completions,
139
+ * eliminating the double-generation issue.
140
+ *
141
+ * # Arguments
142
+ * * `prompts` - Array of chat conversations to use as prompts
143
+ * * `rewards` - Reward values for each completion (num_prompts * group_size)
144
+ * * `generation_result` - Pre-generated completion data from generate_batch_for_training
145
+ *
146
+ * # Returns
147
+ * * Training step metrics
148
+ */
149
+ trainStepWithGenerations(
150
+ prompts: Array<Array<ChatMessage>>,
151
+ rewards: Array<number>,
152
+ generationResult: GenerateBatchResult,
153
+ ): Promise<EngineStepMetrics>;
154
+ /**
155
+ * Unified training step with JS reward callback and optional output recording
156
+ *
157
+ * Same as `train_step_auto` but optionally captures the full RewardOutput data
158
+ * for persistence to an output store database.
159
+ *
160
+ * # Arguments
161
+ * * `prompts` - Array of chat conversations to use as prompts
162
+ * * `reward_fn` - JavaScript function to compute rewards
163
+ * * `record_outputs` - If true, return the serialized RewardOutput JSON
164
+ *
165
+ * # Returns
166
+ * * Training step result including metrics, completions, rewards, and optionally outputs_json
167
+ */
168
+ trainStepAuto(
169
+ prompts: ChatMessage[][],
170
+ rewardFn: (err: Error | null, outputsJson: string) => Promise<number[]>,
171
+ recordOutputs: boolean,
172
+ ): Promise<TrainStepResultWithOutputs>;
173
+ /**
174
+ * Score completions using registered built-in rewards
175
+ *
176
+ * # Arguments
177
+ * * `prompts` - Prompt texts (expanded to match completions)
178
+ * * `completions` - Completion texts to score
179
+ */
180
+ scoreCompletions(prompts: Array<string>, completions: Array<string>): Array<number>;
181
+ /** Get current training step */
182
+ get step(): number;
183
+ /** Get current epoch */
184
+ get epoch(): number;
185
+ /** Start a new epoch */
186
+ startEpoch(): void;
187
+ /** End the current epoch and get metrics */
188
+ endEpoch(epochTimeSecs: number): EngineEpochMetrics;
189
+ /** Reset the engine for a fresh training run */
190
+ reset(): void;
191
+ /** Check if reward registry has any rewards registered */
192
+ get hasBuiltinRewards(): boolean;
193
+ /** Get names of registered reward functions */
194
+ get rewardNames(): Array<string>;
195
+ /** Get current micro-step within gradient accumulation */
196
+ get microStep(): number;
197
+ /**
198
+ * Check if an emergency checkpoint should be saved
199
+ * This flag is set when consecutive NaN gradients reach the threshold
200
+ */
201
+ get needsEmergencySave(): boolean;
202
+ /** Get current NaN gradient count */
203
+ get nanGradientCount(): number;
204
+ /** Clear the emergency save flag (call after saving emergency checkpoint) */
205
+ clearEmergencySaveFlag(): void;
206
+ }
207
+ export type GRPOTrainingEngine = GrpoTrainingEngine;
208
+
209
+ export declare class MxArray {
210
+ static fromInt32(data: Int32Array, shape: BigInt64Array): MxArray;
211
+ static fromInt64(data: BigInt64Array, shape: BigInt64Array): MxArray;
212
+ static fromUint32(data: Uint32Array, shape: BigInt64Array): MxArray;
213
+ static fromFloat32(data: Float32Array, shape: BigInt64Array): MxArray;
214
+ static zeros(shape: BigInt64Array, dtype?: DType | undefined | null): MxArray;
215
+ static scalarFloat(value: number): MxArray;
216
+ static scalarInt(value: number): MxArray;
217
+ static ones(shape: BigInt64Array, dtype?: DType | undefined | null): MxArray;
218
+ static full(shape: BigInt64Array, fillValue: number | MxArray, dtype?: DType | undefined | null): MxArray;
219
+ static linspace(
220
+ start: number,
221
+ stop: number,
222
+ num?: number | undefined | null,
223
+ dtype?: DType | undefined | null,
224
+ ): MxArray;
225
+ static eye(
226
+ n: number,
227
+ m?: number | undefined | null,
228
+ k?: number | undefined | null,
229
+ dtype?: DType | undefined | null,
230
+ ): MxArray;
231
+ static arange(
232
+ start: number,
233
+ stop: number,
234
+ step?: number | undefined | null,
235
+ dtype?: DType | undefined | null,
236
+ ): MxArray;
237
+ reshape(shape: BigInt64Array): MxArray;
238
+ astype(dtype: DType): MxArray;
239
+ /**
240
+ * Create a copy of this array with a new handle.
241
+ * This is useful for parameter loading to avoid handle aliasing issues.
242
+ */
243
+ copy(): MxArray;
244
+ logSoftmax(axis: number): MxArray;
245
+ exp(): MxArray;
246
+ log(): MxArray;
247
+ sum(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
248
+ mean(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
249
+ clip(minimum?: number | undefined | null, maximum?: number | undefined | null): MxArray;
250
+ minimum(other: MxArray): MxArray;
251
+ maximum(other: MxArray): MxArray;
252
+ add(other: MxArray): MxArray;
253
+ sub(other: MxArray): MxArray;
254
+ mul(other: MxArray): MxArray;
255
+ div(other: MxArray): MxArray;
256
+ addScalar(value: number): MxArray;
257
+ mulScalar(value: number): MxArray;
258
+ subScalar(value: number): MxArray;
259
+ divScalar(value: number): MxArray;
260
+ matmul(other: MxArray): MxArray;
261
+ /**
262
+ * Fused matrix multiply-add: D = beta * C + alpha * (self @ B)
263
+ * where self is A. More efficient than separate matmul and add operations.
264
+ * Default: alpha=1.0, beta=1.0, giving D = C + (self @ B)
265
+ */
266
+ addmm(c: MxArray, b: MxArray, alpha?: number | undefined | null, beta?: number | undefined | null): MxArray;
267
+ transpose(axes?: Int32Array | undefined | null): MxArray;
268
+ take(indices: MxArray, axis: number): MxArray;
269
+ takeAlongAxis(indices: MxArray, axis: number): MxArray;
270
+ /**
271
+ * Put values into array at specified indices along an axis
272
+ * Equivalent to: result = array.copy(); result[..., indices] = values
273
+ * This matches MLX's put_along_axis for efficient in-place-style updates
274
+ */
275
+ putAlongAxis(indices: MxArray, values: MxArray, axis: number): MxArray;
276
+ slice(starts: BigInt64Array, stops: BigInt64Array): MxArray;
277
+ /**
278
+ * Concatenate two arrays along an axis
279
+ * Optimized for the common binary concatenation case
280
+ */
281
+ static concatenate(a: MxArray, b: MxArray, axis: number): MxArray;
282
+ /**
283
+ * Concatenate multiple arrays along an axis
284
+ * For concatenating 3 or more arrays
285
+ */
286
+ static concatenateMany(arrays: Array<MxArray>, axis?: number | undefined | null): MxArray;
287
+ sort(axis?: number | undefined | null): MxArray;
288
+ argsort(axis?: number | undefined | null): MxArray;
289
+ partition(kth: number, axis?: number | undefined | null): MxArray;
290
+ argpartition(kth: number, axis?: number | undefined | null): MxArray;
291
+ eval(): void;
292
+ evalAsync(): Promise<undefined>;
293
+ size(): bigint;
294
+ ndim(): number;
295
+ shape(): BigInt64Array;
296
+ /**
297
+ * Get a single dimension from the array shape without copying the entire shape
298
+ * This is more efficient when you only need one dimension
299
+ *
300
+ * Note: axis is u32 because NAPI doesn't support usize, but internally converted to usize
301
+ */
302
+ shapeAt(axis: number): number;
303
+ /**
304
+ * Get batch and sequence length for 2D arrays (common pattern in transformers)
305
+ * More efficient than calling shape() and extracting dimensions
306
+ */
307
+ getBatchSeqLen(): Array<number>;
308
+ /**
309
+ * Get batch, sequence length, and hidden size for 3D arrays (common pattern in transformers)
310
+ * More efficient than calling shape() and extracting dimensions
311
+ */
312
+ getBatchSeqHidden(): Array<number>;
313
+ dtype(): DType;
314
+ /**
315
+ * Copy entire array from GPU to CPU as Float32Array
316
+ *
317
+ * ⚠️ **PERFORMANCE WARNING**: This triggers a FULL GPU→CPU memory transfer!
318
+ *
319
+ * **Performance impact**:
320
+ * - Forces evaluation of lazy operations
321
+ * - Copies entire array from GPU to CPU memory
322
+ * - Can be extremely slow for large arrays
323
+ *
324
+ * **Use sparingly**:
325
+ * - Prefer `item_float32()` for scalars
326
+ * - Prefer `item_at_float32(index)` for single elements
327
+ * - Only use when you truly need all array data on CPU
328
+ *
329
+ * **Acceptable use cases**:
330
+ * - Test validation and assertions
331
+ * - CPU-only operations (e.g., sorting for quantiles)
332
+ * - Final output extraction
333
+ */
334
+ toFloat32(): Float32Array;
335
+ /**
336
+ * Copy entire array from GPU to CPU as Int32Array
337
+ *
338
+ * ⚠️ **PERFORMANCE WARNING**: This triggers a FULL GPU→CPU memory transfer!
339
+ *
340
+ * See `to_float32()` documentation for performance implications and alternatives.
341
+ * Prefer `item_int32()` for scalars.
342
+ */
343
+ toInt32(): Int32Array;
344
+ /**
345
+ * Copy entire array from GPU to CPU as Uint32Array
346
+ *
347
+ * ⚠️ **PERFORMANCE WARNING**: This triggers a FULL GPU→CPU memory transfer!
348
+ *
349
+ * See `to_float32()` documentation for performance implications and alternatives.
350
+ */
351
+ toUint32(): Uint32Array;
352
+ static stack(arrays: Array<MxArray>, axis?: number | undefined | null): MxArray;
353
+ static randomUniform(shape: BigInt64Array, low: number, high: number, dtype?: DType | undefined | null): MxArray;
354
+ static randomNormal(shape: BigInt64Array, mean: number, std: number, dtype?: DType | undefined | null): MxArray;
355
+ static randomBernoulli(shape: BigInt64Array, prob: number): MxArray;
356
+ static randint(shape: BigInt64Array, low: number, high: number): MxArray;
357
+ /**
358
+ * Sample from categorical distribution
359
+ * Takes logits and returns sampled indices
360
+ */
361
+ categorical(axis?: number | undefined | null): MxArray;
362
+ equal(other: MxArray): MxArray;
363
+ notEqual(other: MxArray): MxArray;
364
+ less(other: MxArray): MxArray;
365
+ lessEqual(other: MxArray): MxArray;
366
+ greater(other: MxArray): MxArray;
367
+ greaterEqual(other: MxArray): MxArray;
368
+ logicalAnd(other: MxArray): MxArray;
369
+ logicalOr(other: MxArray): MxArray;
370
+ logicalNot(): MxArray;
371
+ where(x: MxArray, y: MxArray): MxArray;
372
+ argmax(axis: number, keepdims?: boolean | undefined | null): MxArray;
373
+ argmin(axis: number, keepdims?: boolean | undefined | null): MxArray;
374
+ max(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
375
+ min(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
376
+ prod(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
377
+ var(
378
+ axes?: Int32Array | undefined | null,
379
+ keepdims?: boolean | undefined | null,
380
+ ddof?: number | undefined | null,
381
+ ): MxArray;
382
+ std(
383
+ axes?: Int32Array | undefined | null,
384
+ keepdims?: boolean | undefined | null,
385
+ ddof?: number | undefined | null,
386
+ ): MxArray;
387
+ logsumexp(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
388
+ cumsum(axis: number): MxArray;
389
+ cumprod(axis: number): MxArray;
390
+ pad(padWidth: Int32Array, constantValue: number): MxArray;
391
+ roll(shift: number, axis: number): MxArray;
392
+ split(indicesOrSections: number, axis?: number | undefined | null): Array<MxArray>;
393
+ tile(reps: Int32Array): MxArray;
394
+ repeat(repeats: number, axis: number): MxArray;
395
+ squeeze(axes?: Int32Array | undefined | null): MxArray;
396
+ expandDims(axis: number): MxArray;
397
+ broadcastTo(shape: BigInt64Array): MxArray;
398
+ abs(): MxArray;
399
+ negative(): MxArray;
400
+ sign(): MxArray;
401
+ sqrt(): MxArray;
402
+ square(): MxArray;
403
+ power(other: MxArray): MxArray;
404
+ sin(): MxArray;
405
+ cos(): MxArray;
406
+ tan(): MxArray;
407
+ sinh(): MxArray;
408
+ cosh(): MxArray;
409
+ tanh(): MxArray;
410
+ floor(): MxArray;
411
+ ceil(): MxArray;
412
+ round(): MxArray;
413
+ floorDivide(other: MxArray): MxArray;
414
+ remainder(other: MxArray): MxArray;
415
+ reciprocal(): MxArray;
416
+ arcsin(): MxArray;
417
+ arccos(): MxArray;
418
+ arctan(): MxArray;
419
+ log10(): MxArray;
420
+ log2(): MxArray;
421
+ log1p(): MxArray;
422
+ /**
423
+ * Element-wise check for NaN values
424
+ *
425
+ * Returns a boolean array where True indicates the element is NaN.
426
+ * This is a GPU-native operation that avoids CPU data transfer.
427
+ */
428
+ isnan(): MxArray;
429
+ /**
430
+ * Element-wise check for Inf values
431
+ *
432
+ * Returns a boolean array where True indicates the element is +Inf or -Inf.
433
+ * This is a GPU-native operation that avoids CPU data transfer.
434
+ */
435
+ isinf(): MxArray;
436
+ /**
437
+ * Element-wise check for finite values
438
+ *
439
+ * Returns a boolean array where True indicates the element is finite (not NaN and not Inf).
440
+ * This is a GPU-native operation that avoids CPU data transfer.
441
+ */
442
+ isfinite(): MxArray;
443
+ }
444
+
445
+ /** NAPI-exported reward registry wrapper */
446
+ export declare class NativeRewardRegistry {
447
+ /** Create a new reward registry */
448
+ constructor();
449
+ /** Register a built-in reward function */
450
+ register(config: BuiltinRewardConfig): void;
451
+ /** Score a single completion */
452
+ score(prompt: string, completion: string): number;
453
+ /** Score a batch of completions */
454
+ scoreBatch(prompts: Array<string>, completions: Array<string>): Array<number>;
455
+ /** Check if registry is empty */
456
+ get isEmpty(): boolean;
457
+ /** Get registered reward names */
458
+ get names(): Array<string>;
459
+ /** Set whether to normalize scores */
460
+ setNormalize(normalize: boolean): void;
461
+ }
462
+
463
+ /**
464
+ * OutputStore - Persistence layer for training outputs
465
+ *
466
+ * Stores all model outputs during GRPO training for debugging and research.
467
+ * Supports local SQLite files.
468
+ */
469
+ export declare class OutputStore {
470
+ /** Create a new output store with local SQLite file */
471
+ static local(path: string): Promise<OutputStore>;
472
+ /** Create from config object */
473
+ static fromConfig(config: OutputStoreConfig): Promise<OutputStore>;
474
+ /** Start a new training run */
475
+ startRun(modelName: string, modelPath: string | undefined | null, config: string): Promise<string>;
476
+ /** Start a new training run with a name */
477
+ startRunWithName(
478
+ name: string | undefined | null,
479
+ modelName: string,
480
+ modelPath: string | undefined | null,
481
+ config: string,
482
+ ): Promise<string>;
483
+ /** End the current training run */
484
+ endRun(status: string): Promise<void>;
485
+ /** Get current run ID */
486
+ currentRunId(): Promise<string | null>;
487
+ /** Find a run by name */
488
+ findRunByName(name: string): Promise<TrainingRunRecord | null>;
489
+ /** Resume an existing run (sets status to running and makes it current) */
490
+ resumeRun(runId: string): Promise<void>;
491
+ /** Delete all steps after a given step number (for resume cleanup) */
492
+ deleteStepsAfter(runId: string, afterStep: number): Promise<number>;
493
+ /**
494
+ * Delete all records after a given step (for checkpoint resume)
495
+ *
496
+ * Cascades through: training_steps → generations → tool_calls, and logs.
497
+ * Use this when resuming from checkpoint to ensure clean database state.
498
+ */
499
+ deleteAllAfterStep(runId: string, afterStep: number): Promise<CleanupStats>;
500
+ /**
501
+ * Get recent step metrics for TUI sparkline restoration
502
+ *
503
+ * Returns metrics ordered by step (oldest first) for easy insertion into VecDeque.
504
+ */
505
+ getRecentStepMetrics(runId: string, limit: number): Promise<Array<StepMetricSummary>>;
506
+ /**
507
+ * Get aggregate statistics for a training run
508
+ *
509
+ * Returns pre-computed aggregates for restoring TUI state on resume.
510
+ */
511
+ getRunAggregates(runId: string): Promise<RunAggregates>;
512
+ /**
513
+ * Get recent generations for sample panel restoration
514
+ *
515
+ * Returns generations ordered by step DESC, reward DESC (most recent high-reward first).
516
+ */
517
+ getRecentGenerations(runId: string, limit: number): Promise<Array<GenerationRecord>>;
518
+ /** Get store configuration */
519
+ get config(): OutputStoreConfig;
520
+ /** Record from RewardOutput JSON (direct integration with training engine) */
521
+ recordStepFromOutputs(
522
+ step: number,
523
+ metrics: EngineStepMetrics,
524
+ outputsJson: string,
525
+ rewards: Array<number>,
526
+ groupSize: number,
527
+ ): Promise<number>;
528
+ /**
529
+ * Record a complete training step with all generations and tool calls
530
+ *
531
+ * Lower-level API for direct control over step recording.
532
+ */
533
+ recordStep(
534
+ step: StepRecord,
535
+ generations: Array<GenerationRecord>,
536
+ toolCalls: Array<Array<ToolCallRecord>>,
537
+ ): Promise<number>;
538
+ /** Flush any pending writes */
539
+ flush(): Promise<void>;
540
+ /** List all training runs */
541
+ listRuns(limit?: number | undefined | null, status?: string | undefined | null): Promise<Array<TrainingRunRecord>>;
542
+ /** Get a specific run */
543
+ getRun(runId: string): Promise<TrainingRunRecord | null>;
544
+ /** Get step summaries for a run */
545
+ getStepSummaries(
546
+ runId: string,
547
+ startStep?: number | undefined | null,
548
+ endStep?: number | undefined | null,
549
+ ): Promise<Array<StepSummary>>;
550
+ /** Get all generations for a step */
551
+ getGenerations(runId: string, step: number): Promise<Array<GenerationWithToolCalls>>;
552
+ /** Get top/bottom generations by reward */
553
+ getGenerationsByReward(
554
+ runId: string,
555
+ topN?: number | undefined | null,
556
+ bottomN?: number | undefined | null,
557
+ stepRange?: Array<number> | undefined | null,
558
+ ): Promise<Array<GenerationWithToolCalls>>;
559
+ /** Get generations with specific finish reason */
560
+ getGenerationsByFinishReason(
561
+ runId: string,
562
+ finishReason: string,
563
+ limit?: number | undefined | null,
564
+ ): Promise<Array<GenerationWithToolCalls>>;
565
+ /** Get generations containing tool calls */
566
+ getGenerationsWithToolCalls(
567
+ runId: string,
568
+ toolName?: string | undefined | null,
569
+ status?: string | undefined | null,
570
+ limit?: number | undefined | null,
571
+ ): Promise<Array<GenerationWithToolCalls>>;
572
+ /** Search generations by text content */
573
+ searchGenerations(
574
+ runId: string,
575
+ query: string,
576
+ searchIn?: string | undefined | null,
577
+ limit?: number | undefined | null,
578
+ ): Promise<Array<GenerationWithToolCalls>>;
579
+ /** Get reward distribution statistics */
580
+ getRewardStats(runId: string, stepRange?: Array<number> | undefined | null): Promise<RewardStats>;
581
+ /** Export to JSONL file */
582
+ exportJsonl(runId: string, outputPath: string, includeToolCalls?: boolean | undefined | null): Promise<number>;
583
+ /** Execute raw SQL query (for advanced users) */
584
+ queryRaw(sql: string): Promise<string>;
585
+ }
586
+
587
+ /**
588
+ * Qwen3 Model with automatic differentiation support
589
+ *
590
+ * Uses interior mutability (RwLock) for layers, final_norm, and lm_head
591
+ * to allow gradient application without deep cloning the model.
592
+ * This eliminates the previous ~4GB memory overhead from clone_for_session().
593
+ */
594
+ export declare class Qwen3Model {
595
+ /** Create a new Qwen3 model with the given configuration */
596
+ constructor(config: Qwen3Config);
597
+ /**
598
+ * Forward pass through the model
599
+ *
600
+ * # Arguments
601
+ * * `input_ids` - Token IDs, shape: [batch_size, seq_len]
602
+ *
603
+ * # Returns
604
+ * * Logits, shape: [batch_size, seq_len, vocab_size]
605
+ */
606
+ forward(inputIds: MxArray): MxArray;
607
+ /**
608
+ * Initialize KV caches for incremental generation
609
+ *
610
+ * Creates one KV cache per transformer layer. Call this before starting generation.
611
+ */
612
+ initKvCaches(): void;
613
+ /**
614
+ * Reset all KV caches
615
+ *
616
+ * Clears cached key-value states. Call this between different generation sequences.
617
+ */
618
+ resetKvCaches(): void;
619
+ /** Check if paged attention is enabled for this model */
620
+ hasPagedAttention(): boolean;
621
+ /**
622
+ * Get paged attention memory statistics (if enabled)
623
+ *
624
+ * Returns memory usage statistics for the paged KV cache.
625
+ */
626
+ pagedCacheStats(): PagedCacheStats | null;
627
+ /**
628
+ * Get scheduler statistics (if paged attention is enabled)
629
+ *
630
+ * Returns the number of waiting, running, and completed sequences.
631
+ */
632
+ schedulerStats(): SchedulerStatsNapi | null;
633
+ /**
634
+ * Forward pass with KV caching for incremental generation
635
+ *
636
+ * # Arguments
637
+ * * `input_ids` - Token IDs, shape: [batch_size, seq_len]
638
+ * * `use_cache` - Whether to use KV caching (must call init_kv_caches() first)
639
+ *
640
+ * # Returns
641
+ * * Logits, shape: [batch_size, seq_len, vocab_size]
642
+ */
643
+ forwardWithCache(inputIds: MxArray, useCache: boolean): MxArray;
644
+ /**
645
+ * Forward pass with paged attention for memory-efficient inference.
646
+ *
647
+ * This method uses block-based KV cache management via Metal kernels for:
648
+ * - Variable-length sequences with efficient memory usage
649
+ * - Continuous batching with dynamic batch composition
650
+ * - Long context support beyond GPU memory limits
651
+ *
652
+ * # Arguments
653
+ * * `input_ids` - Token IDs, shape: [num_seqs, 1] for decode
654
+ * * `slot_mapping` - Slot indices for cache updates, shape: [num_seqs]
655
+ * * `seq_ids` - Sequence IDs in the batch (for looking up block tables/context lens)
656
+ * * `positions` - Token positions for RoPE, shape: [num_seqs] (per-sequence positions)
657
+ *
658
+ * # Returns
659
+ * * Logits, shape: [num_seqs, 1, vocab_size] for decode
660
+ */
661
+ forwardPaged(inputIds: MxArray, slotMapping: MxArray, seqIds: Array<number>, positions: MxArray): MxArray;
662
+ /**
663
+ * Prefill a sequence using standard attention and write K/V to paged cache.
664
+ *
665
+ * This method should be called before `step_paged_generation()` for each
666
+ * new prompt. It runs the full forward pass using standard attention
667
+ * (which is faster for long sequences), then writes the K/V cache to
668
+ * the paged cache for subsequent decode steps.
669
+ *
670
+ * # Arguments
671
+ * * `prompt_tokens` - Token IDs for the prompt (as u32 array)
672
+ * * `seq_id` - Sequence ID (obtained from scheduler)
673
+ *
674
+ * # Returns
675
+ * * Logits for the last token, shape: [1, vocab_size]
676
+ */
677
+ prefillPaged(promptTokens: Array<number>, seqId: number): MxArray;
678
+ /**
679
+ * Add a request to the paged attention scheduler.
680
+ *
681
+ * The scheduler queues requests and allocates blocks for KV cache.
682
+ * Use `step_paged_generation()` to process the scheduled batch.
683
+ *
684
+ * Note: The actual sequence ID is assigned during scheduling, not when the
685
+ * request is added. Use the `request_id` to track your requests through
686
+ * the generation process.
687
+ *
688
+ * # Arguments
689
+ * * `request_id` - Unique identifier for the request (returned in outputs)
690
+ * * `prompt_tokens` - Token IDs for the prompt
691
+ * * `max_new_tokens` - Maximum new tokens to generate
692
+ * * `priority` - Optional priority (higher = scheduled first)
693
+ *
694
+ * # Returns
695
+ * * Number of pending requests in the queue
696
+ */
697
+ addPagedRequest(
698
+ requestId: string,
699
+ promptTokens: Array<number>,
700
+ maxNewTokens: number,
701
+ priority?: number | undefined | null,
702
+ ): number;
703
+ /**
704
+ * Schedule and execute one step of paged generation.
705
+ *
706
+ * This method:
707
+ * 1. Schedules the next batch of sequences
708
+ * 2. Runs forward pass with paged attention
709
+ * 3. Samples next tokens
710
+ * 4. Returns the generated tokens for each sequence
711
+ *
712
+ * # Arguments
713
+ * * `config` - Generation configuration (temperature, top_k, etc.)
714
+ *
715
+ * # Returns
716
+ * * `PagedGenerationStep` with token outputs for each sequence
717
+ */
718
+ stepPagedGeneration(config?: GenerationConfig | undefined | null): PagedGenerationStep | null;
719
+ /**
720
+ * Get completed sequences from the scheduler.
721
+ *
722
+ * Call this after `step_paged_generation()` returns outputs with `is_finished: true`.
723
+ */
724
+ getCompletedSequences(): Array<PagedCompletedSequence>;
725
+ /** Check if the scheduler has pending work. */
726
+ hasPagedWork(): boolean;
727
+ /** Get model configuration */
728
+ getConfig(): Qwen3Config;
729
+ /**
730
+ * Generate tokens using speculative decoding with a draft model.
731
+ *
732
+ * Speculative decoding uses a smaller draft model to generate tokens speculatively,
733
+ * then verifies them with the target model in a single forward pass. This can achieve
734
+ * 2-3x speedup when the draft model has high acceptance rate.
735
+ *
736
+ * # Algorithm
737
+ * 1. Draft model generates N tokens speculatively (cheap forward passes)
738
+ * 2. Target model (self) verifies all N tokens in one forward pass
739
+ * 3. Accept/reject using rejection sampling
740
+ * 4. On rejection, resample from adjusted distribution
741
+ * 5. Rewind caches and continue
742
+ *
743
+ * # Arguments
744
+ * * `draft_model` - Smaller model for speculative generation (should share tokenizer)
745
+ * * `input_ids` - Input token IDs [1, seq_len]
746
+ * * `config` - Generation configuration (includes num_draft_tokens)
747
+ *
748
+ * # Returns
749
+ * GenerationResult with tokens, logprobs, and speculative stats in finish_reason
750
+ *
751
+ * # Example (TypeScript)
752
+ * ```typescript
753
+ * const targetModel = await ModelLoader.loadPretrained('qwen3-7b');
754
+ * const draftModel = await ModelLoader.loadPretrained('qwen3-0.5b');
755
+ *
756
+ * const result = targetModel.generateSpeculativeSync(draftModel, inputIds, {
757
+ * numDraftTokens: 5,
758
+ * maxNewTokens: 100,
759
+ * temperature: 0.7,
760
+ * });
761
+ * ```
762
+ */
763
+ generateSpeculativeSync(
764
+ draftModel: Qwen3Model,
765
+ inputIds: MxArray,
766
+ config?: GenerationConfig | undefined | null,
767
+ ): GenerationResult;
768
+ /** Count total number of parameters in the model */
769
+ numParameters(): number;
770
+ /**
771
+ * Get all model parameters as a dictionary mapping names to arrays
772
+ *
773
+ * This matches the TypeScript API for compatibility
774
+ */
775
+ getParameters(): Record<string, MxArray>;
776
+ /** Load parameters from a dictionary */
777
+ loadParameters(params: Record<string, MxArray>): void;
778
+ /**
779
+ * Compute forward pass and loss (for evaluation)
780
+ *
781
+ * # Arguments
782
+ * * `input_ids` - Input token IDs, shape: [batch_size, seq_len]
783
+ * * `labels` - Target token IDs, shape: [batch_size, seq_len]
784
+ *
785
+ * # Returns
786
+ * * Scalar loss value
787
+ */
788
+ computeLoss(inputIds: MxArray, labels: MxArray): MxArray;
789
+ /**
790
+ * Compute loss and gradients using a hybrid approach
791
+ *
792
+ * This implementation computes gradients for the output layers and uses
793
+ * numerical approximations for other parameters. This is sufficient to
794
+ * demonstrate that training works while we build out full MLX autograd integration.
795
+ *
796
+ * # Arguments
797
+ * * `input_ids` - Input token IDs, shape: [batch_size, seq_len]
798
+ * * `labels` - Target token IDs, shape: [batch_size, seq_len]
799
+ *
800
+ * # Returns
801
+ * * A tuple of (loss, gradients_dict) where gradients_dict maps parameter names to gradient arrays
802
+ *
803
+ * # Phase 6A Status
804
+ * Current implementation computes:
805
+ * - ✅ Exact gradients for LM head (output layer)
806
+ * - ⚠️ Numerical approximations for other layers
807
+ *
808
+ * Future: Full MLX autograd will compute exact gradients for all 250+ parameters
809
+ */
810
+ computeLossAndGradients(inputIds: MxArray, labels: MxArray): [MxArray, Record<string, MxArray>];
811
+ /**
812
+ * Complete GRPO training step using MLX Autograd (RECOMMENDED)
813
+ *
814
+ * This method uses automatic differentiation to compute gradients, eliminating
815
+ * the need for manual backward pass implementation. This is the preferred approach.
816
+ *
817
+ * # Arguments
818
+ * * `prompt_tokens` - Prompt token sequences [batch_size, seq_len] (1D arrays)
819
+ * * `completion_tokens` - Completion sequences [batch*G, completion_len] (1D arrays)
820
+ * * `completion_logprobs` - Logprobs from generation [batch*G, completion_len] (1D arrays)
821
+ * * `rewards` - Reward scores for each completion [batch*G]
822
+ * * `group_size` - Number of completions per prompt (G)
823
+ * * `config` - GRPO loss configuration
824
+ * * `learning_rate` - Learning rate for parameter updates
825
+ *
826
+ * # Returns
827
+ * * Tuple of (loss_value, metrics_dict)
828
+ */
829
+ trainStepGrpoAutograd(
830
+ promptTokens: Array<MxArray>,
831
+ completionTokens: Array<MxArray>,
832
+ completionLogprobs: Array<MxArray>,
833
+ rewards: Float64Array,
834
+ groupSize: number,
835
+ config: GrpoLossConfig,
836
+ learningRate: number,
837
+ ): [number, Record<string, number>];
838
+ /**
839
+ * Compute gradients only without applying them (for gradient accumulation)
840
+ *
841
+ * This method computes GRPO loss and gradients but does NOT update parameters.
842
+ * Used for gradient accumulation where gradients are summed across multiple
843
+ * micro-batches before applying them.
844
+ *
845
+ * # Arguments
846
+ * * `prompt_tokens` - Prompt token sequences [batch_size, seq_len] (1D arrays)
847
+ * * `completion_tokens` - Completion sequences [batch*G, completion_len] (1D arrays)
848
+ * * `completion_logprobs` - Logprobs from generation [batch*G, completion_len] (1D arrays)
849
+ * * `rewards` - Reward scores for each completion [batch*G]
850
+ * * `group_size` - Number of completions per prompt (G)
851
+ * * `config` - GRPO loss configuration
852
+ *
853
+ * # Returns
854
+ * * Tuple of (loss_value, gradients_dict, metrics_dict)
855
+ */
856
+ computeGradientsOnlyGrpoAutograd(
857
+ promptTokens: Array<MxArray>,
858
+ completionTokens: Array<MxArray>,
859
+ completionLogprobs: Array<MxArray>,
860
+ rewards: Float64Array,
861
+ groupSize: number,
862
+ config: GrpoLossConfig,
863
+ ): [number, Record<string, MxArray>, Record<string, number>];
864
+ /**
865
+ * Accumulate gradients into existing gradient dictionary
866
+ *
867
+ * This is a helper method for gradient accumulation. It adds new_gradients
868
+ * to accumulated_gradients element-wise.
869
+ *
870
+ * # Arguments
871
+ * * `accumulated_gradients` - Existing accumulated gradients (will be modified in-place conceptually, but returns new dict)
872
+ * * `new_gradients` - New gradients to add
873
+ *
874
+ * # Returns
875
+ * * Updated gradient dictionary with accumulated values
876
+ */
877
+ static accumulateGradients(
878
+ accumulatedGradients: Record<string, MxArray>,
879
+ newGradients: Record<string, MxArray>,
880
+ ): Record<string, MxArray>;
881
+ /**
882
+ * Complete GRPO training step using manual gradients (Legacy)
883
+ *
884
+ * This method performs a full GRPO training iteration:
885
+ * 1. Takes completions (already generated) with their logprobs and rewards
886
+ * 2. Computes advantages
887
+ * 3. Computes GRPO loss and gradients
888
+ * 4. Updates model parameters
889
+ *
890
+ * NOTE: Use train_step_grpo_autograd instead for automatic differentiation.
891
+ *
892
+ * # Arguments
893
+ * * `prompt_tokens` - Prompt token sequences [batch_size, seq_len] (1D arrays)
894
+ * * `completion_tokens` - Completion sequences [batch*G, completion_len] (1D arrays)
895
+ * * `completion_logprobs` - Logprobs from generation [batch*G, completion_len] (1D arrays)
896
+ * * `rewards` - Reward scores for each completion [batch*G]
897
+ * * `group_size` - Number of completions per prompt (G)
898
+ * * `config` - GRPO loss configuration
899
+ * * `learning_rate` - Learning rate for parameter updates
900
+ *
901
+ * # Returns
902
+ * * Tuple of (loss_value, metrics_dict)
903
+ */
904
+ trainStepGrpo(
905
+ promptTokens: Array<MxArray>,
906
+ completionTokens: Array<MxArray>,
907
+ completionLogprobs: Array<MxArray>,
908
+ rewards: Float64Array,
909
+ groupSize: number,
910
+ config: GrpoLossConfig,
911
+ learningRate: number,
912
+ ): [number, Record<string, number>];
913
+ /**
914
+ * Apply gradients to model parameters
915
+ *
916
+ * # Arguments
917
+ * * `gradients` - Dictionary mapping parameter names to gradient arrays
918
+ * * `learning_rate` - Learning rate for gradient descent
919
+ *
920
+ * This performs a simple SGD update: param = param - lr * grad
921
+ * Only updates parameters that have gradients; others remain unchanged.
922
+ *
923
+ * IMPORTANT: This function preserves the original dtype of parameters.
924
+ * The learning rate scalar is cast to match param dtype to prevent
925
+ * promotion to float32 during arithmetic operations.
926
+ */
927
+ applyGradients(gradients: Record<string, MxArray>, learningRate: number): void;
928
+ /**
929
+ * Text-to-text generation with integrated tokenization
930
+ *
931
+ * This is a high-level API that handles chat template formatting, tokenization,
932
+ * generation, and decoding internally. It takes chat messages, applies the ChatML
933
+ * template, generates tokens, and decodes them back to text.
934
+ *
935
+ * # Arguments
936
+ * * `messages` - Array of chat messages with role and content
937
+ * * `config` - Generation configuration
938
+ *
939
+ * # Returns
940
+ * * GenerationResult with text, tokens, logprobs, finish reason, and token count
941
+ *
942
+ * # Example
943
+ * ```typescript
944
+ * const model = await Qwen3Model.loadPretrained("path/to/model");
945
+ * const messages = [
946
+ * { role: "user", content: "What is 2+2?" }
947
+ * ];
948
+ * const result = await model.generate(messages, {
949
+ * maxNewTokens: 50,
950
+ * temperature: 0.8,
951
+ * topP: 0.95,
952
+ * });
953
+ * console.log(result.text); // Decoded text output
954
+ * console.log(result.tokens); // Token IDs (for GRPO)
955
+ * console.log(result.logprobs); // Log probabilities (for GRPO)
956
+ * ```
957
+ */
958
+ generate(messages: Array<ChatMessage>, config?: GenerationConfig | undefined | null): Promise<GenerationResult>;
959
+ /**
960
+ * High-level chat API with structured response parsing
961
+ *
962
+ * The primary API for conversational AI. Handles:
963
+ * - Chat message formatting with Jinja2 templates
964
+ * - Tool/function calling with structured output
965
+ * - Thinking extraction from `<think>` tags
966
+ * - Clean response text with all special tags stripped
967
+ *
968
+ * ## `chat()` vs `generate()`
969
+ *
970
+ * | Feature | `chat()` | `generate()` |
971
+ * |---------|----------|--------------|
972
+ * | **Purpose** | Conversational AI with tools | Raw text generation |
973
+ * | **Input** | Chat messages | Token IDs (MxArray) |
974
+ * | **Tool Support** | Built-in parsing | None |
975
+ * | **Thinking** | Extracts `<think>` content | Raw text only |
976
+ * | **Output** | Structured `ChatResult` | Basic `GenerationResult` |
977
+ * | **Use Case** | Chat apps, agents, assistants | Training, low-level control |
978
+ *
979
+ * ## When to use `chat()`
980
+ * - Building conversational applications
981
+ * - Need tool/function calling
982
+ * - Want structured responses with thinking separated
983
+ * - Working with chat message format
984
+ *
985
+ * ## When to use `generate()`
986
+ * - Training and fine-tuning (need raw logprobs)
987
+ * - Custom tokenization pipeline
988
+ * - Low-level generation control
989
+ * - Non-chat use cases
990
+ *
991
+ * # Arguments
992
+ * * `messages` - Array of chat messages (user/assistant/system roles)
993
+ * * `config` - Chat configuration including optional tools and generation params
994
+ *
995
+ * # Returns
996
+ * * `ChatResult` containing:
997
+ * - `text`: Clean response (tool_call and think tags stripped)
998
+ * - `thinking`: Extracted chain-of-thought reasoning (or null)
999
+ * - `toolCalls`: Parsed tool calls with native JS object arguments
1000
+ * - `finishReason`: "stop" | "length" | "tool_calls"
1001
+ * - `rawText`: Original text before processing (for debugging)
1002
+ *
1003
+ * # Example
1004
+ * ```typescript
1005
+ * // Simple chat
1006
+ * const result = await model.chat(messages);
1007
+ * console.log(result.text);
1008
+ *
1009
+ * // With tools
1010
+ * const result = await model.chat(messages, {
1011
+ * tools: [{ type: 'function', function: { name: 'get_weather' } }],
1012
+ * maxNewTokens: 2048,
1013
+ * temperature: 0.7,
1014
+ * });
1015
+ *
1016
+ * // Handle tool calls
1017
+ * for (const call of result.toolCalls) {
1018
+ * if (call.status === 'ok') {
1019
+ * console.log(call.name, call.arguments); // Arguments is a JS object!
1020
+ * }
1021
+ * }
1022
+ *
1023
+ * // Access thinking (chain-of-thought)
1024
+ * if (result.thinking) {
1025
+ * console.log('Model reasoning:', result.thinking);
1026
+ * }
1027
+ * ```
1028
+ */
1029
+ chat(messages: Array<ChatMessage>, config?: ChatConfig | undefined | null): Promise<ChatResult>;
1030
+ /**
1031
+ * Generate multiple completions for multiple prompts in batch
1032
+ *
1033
+ * This is an optimized method for GRPO training that generates G completions
1034
+ * for each of N prompts. It performs all tokenization, generation, and decoding
1035
+ * in 3 blocking tasks instead of N*(1+2G) tasks.
1036
+ *
1037
+ * # Arguments
1038
+ * * `prompts` - Array of N prompt message arrays
1039
+ * * `group_size` - Number of completions (G) to generate per prompt
1040
+ * * `config` - Generation configuration (sampling params, etc.)
1041
+ *
1042
+ * # Returns
1043
+ * * BatchGenerationResult containing N*G completions with:
1044
+ * - tokens: Flat array of N*G token arrays
1045
+ * - logprobs: Flat array of N*G logprob arrays
1046
+ * - texts: Flat array of N*G decoded texts
1047
+ * - finish_reasons: N arrays of G finish reasons
1048
+ * - token_counts: N arrays of G token counts
1049
+ *
1050
+ * # Performance
1051
+ * For N=10 prompts, G=8 completions:
1052
+ * - Old approach: N*(1 tokenize + G generate + G decode) = 10*(1+8+8) = 170 blocking tasks
1053
+ * - New approach: 1 tokenize + N*G generate + 1 decode = 1+80+1 = 82 blocking tasks (2.1x reduction)
1054
+ *
1055
+ * # Example
1056
+ * ```typescript
1057
+ * const result = await model.generateBatch(
1058
+ * [messages1, messages2, ...], // N prompts
1059
+ * 8, // G completions per prompt
1060
+ * config
1061
+ * );
1062
+ * ```
1063
+ */
1064
+ generateBatch(
1065
+ prompts: Array<Array<ChatMessage>>,
1066
+ groupSize: number,
1067
+ config?: GenerationConfig | undefined | null,
1068
+ ): Promise<BatchGenerationResult>;
1069
+ /**
1070
+ * Decode token IDs to text using the internal tokenizer
1071
+ *
1072
+ * Helper method for decoding generated tokens. The model must have been loaded
1073
+ * via load_pretrained() to have a tokenizer available.
1074
+ *
1075
+ * # Arguments
1076
+ * * `token_ids` - Token IDs to decode as Uint32Array
1077
+ * * `skip_special_tokens` - Whether to skip special tokens (default: true)
1078
+ *
1079
+ * # Returns
1080
+ * * Decoded text string
1081
+ */
1082
+ decode(tokenIds: Uint32Array, skipSpecialTokens?: boolean | undefined | null): Promise<string>;
1083
+ /**
1084
+ * Apply chat template and encode to token IDs
1085
+ *
1086
+ * Formats messages using ChatML format (or Jinja2 template with tools) and encodes to tokens.
1087
+ * The model must have been loaded via load_pretrained() to have a tokenizer available.
1088
+ *
1089
+ * # Arguments
1090
+ * * `messages` - Array of chat messages
1091
+ * * `add_generation_prompt` - Whether to add generation prompt (default: true)
1092
+ * * `tools` - Optional array of tool definitions for function calling
1093
+ * * `enable_thinking` - Optional flag to enable thinking mode (<think> tags)
1094
+ *
1095
+ * # Returns
1096
+ * * Encoded token IDs as Uint32Array
1097
+ */
1098
+ applyChatTemplate(
1099
+ messages: Array<ChatMessage>,
1100
+ addGenerationPrompt?: boolean | undefined | null,
1101
+ tools?: Array<ToolDefinition> | undefined | null,
1102
+ enableThinking?: boolean | undefined | null,
1103
+ ): Promise<Uint32Array>;
1104
+ /**
1105
+ * Load a pretrained model from disk
1106
+ *
1107
+ * This loads a model from a directory containing:
1108
+ * - config.json: Model configuration
1109
+ * - weights.mlx (optional): MLX format weights with data arrays
1110
+ * - weights.safetensors (optional): SafeTensors format (not yet supported)
1111
+ *
1112
+ * # Arguments
1113
+ * * `model_path` - Path to the model directory
1114
+ *
1115
+ * # Returns
1116
+ * * A fully initialized Qwen3Model with loaded weights
1117
+ */
1118
+ static loadPretrained(modelPath: string): Promise<Qwen3Model>;
1119
+ /**
1120
+ * Save model configuration and weights to disk
1121
+ *
1122
+ * This saves:
1123
+ * - config.json: Model configuration
1124
+ * - weights.safetensors: Full model weights in SafeTensors format
1125
+ * - weights.mlx: Parameter metadata (for reference)
1126
+ *
1127
+ * # Arguments
1128
+ * * `save_path` - Directory to save the model
1129
+ */
1130
+ saveModel(savePath: string): Promise<undefined>;
1131
+ /**
1132
+ * Validate that a set of parameters has all required weights with correct shapes
1133
+ *
1134
+ * This is useful for validating parameters before loading them into a model,
1135
+ * or for checking that saved weights are valid before training.
1136
+ *
1137
+ * # Arguments
1138
+ * * `params` - HashMap of parameter names to MxArray values
1139
+ *
1140
+ * # Returns
1141
+ * * Ok(()) if all validations pass
1142
+ * * Err with descriptive message if validation fails
1143
+ */
1144
+ validateParameters(params: Record<string, MxArray>): void;
1145
+ }
1146
+
1147
+ /** Qwen3 Tokenizer class with NAPI bindings */
1148
+ export declare class Qwen3Tokenizer {
1149
+ /**
1150
+ * Load tokenizer from tokenizer.json file
1151
+ *
1152
+ * # Arguments
1153
+ * `tokenizer_path` - Path to a tokenizer.json file, e.g. "../.cache/assets/tokenizers/qwen3_tokenizer.json" (required — the parameter has no default)
1154
+ *
1155
+ * # Example
1156
+ * ```typescript
1157
+ * const tokenizer = await Qwen3Tokenizer.fromPretrained('./qwen3_tokenizer.json');
1158
+ * const tokens = await tokenizer.encode("Hello, world!");
1159
+ * ```
1160
+ */
1161
+ static fromPretrained(tokenizerPath: string): Promise<Qwen3Tokenizer>;
1162
+ /**
1163
+ * Encode text to token IDs
1164
+ *
1165
+ * # Arguments
1166
+ * * `text` - Text to encode
1167
+ * * `add_special_tokens` - Whether to add special tokens (default: true)
1168
+ *
1169
+ * # Returns
1170
+ * Array of token IDs as Uint32Array
1171
+ *
1172
+ * # Example
1173
+ * ```typescript
1174
+ * const tokens = await tokenizer.encode("Hello, world!");
1175
+ * console.log(tokens); // Uint32Array [9906, 11, 1879, 0] (IDs are illustrative)
1176
+ * ```
1177
+ */
1178
+ encode(text: string, addSpecialTokens?: boolean | undefined | null): Promise<Uint32Array>;
1179
+ /**
1180
+ * Encode multiple texts in batch
1181
+ *
1182
+ * # Arguments
1183
+ * * `texts` - Array of texts to encode
1184
+ * * `add_special_tokens` - Whether to add special tokens (default: true)
1185
+ *
1186
+ * # Returns
1187
+ * Array of Uint32Arrays, one for each text
1188
+ */
1189
+ encodeBatch(texts: Array<string>, addSpecialTokens?: boolean | undefined | null): Promise<Array<Uint32Array>>;
1190
+ /**
1191
+ * Decode token IDs to text
1192
+ *
1193
+ * # Arguments
1194
+ * * `token_ids` - Token IDs to decode
1195
+ * * `skip_special_tokens` - Whether to skip special tokens (default: true)
1196
+ *
1197
+ * # Returns
1198
+ * Decoded text string
1199
+ *
1200
+ * # Example
1201
+ * ```typescript
1202
+ * const text = await tokenizer.decode(new Uint32Array([9906, 11, 1879, 0]));
1203
+ * console.log(text); // "Hello, world!"
1204
+ * ```
1205
+ */
1206
+ decode(tokenIds: Uint32Array, skipSpecialTokens?: boolean | undefined | null): Promise<string>;
1207
+ /**
1208
+ * Decode multiple token sequences in batch
1209
+ *
1210
+ * # Arguments
1211
+ * * `token_ids_batch` - Array of token ID arrays to decode
1212
+ * * `skip_special_tokens` - Whether to skip special tokens (default: true)
1213
+ *
1214
+ * # Returns
1215
+ * Array of decoded text strings
1216
+ */
1217
+ decodeBatch(
1218
+ tokenIdsBatch: Array<Uint32Array>,
1219
+ skipSpecialTokens?: boolean | undefined | null,
1220
+ ): Promise<Array<string>>;
1221
+ /**
1222
+ * Apply chat template to messages and encode
1223
+ *
1224
+ * Supports both simple ChatML format and full Jinja2 template rendering with tools.
1225
+ * When tools are provided or a chat template exists, uses Jinja2 rendering.
1226
+ * Otherwise falls back to simple ChatML format.
1227
+ *
1228
+ * # Arguments
1229
+ * * `messages` - Array of chat messages
1230
+ * * `add_generation_prompt` - Whether to add assistant prompt at end (default: true)
1231
+ * * `tools` - Optional array of tool definitions for function calling
1232
+ * * `enable_thinking` - Optional flag to enable thinking mode (<think> tags)
1233
+ *
1234
+ * # Returns
1235
+ * Encoded token IDs ready for model input
1236
+ *
1237
+ * # Example
1238
+ * ```typescript
1239
+ * const messages = [
1240
+ * { role: "system", content: "You are a helpful assistant." },
1241
+ * { role: "user", content: "What is 2+2?" }
1242
+ * ];
1243
+ * const tokens = await tokenizer.applyChatTemplate(messages, true);
1244
+ *
1245
+ * // With tools
1246
+ * const tools = [{
1247
+ * type: "function",
1248
+ * function: { name: "get_weather", description: "Get weather info" }
1249
+ * }];
1250
+ * const tokensWithTools = await tokenizer.applyChatTemplate(messages, true, tools);
1251
+ * ```
1252
+ */
1253
+ applyChatTemplate(
1254
+ messages: Array<ChatMessage>,
1255
+ addGenerationPrompt?: boolean | undefined | null,
1256
+ tools?: Array<ToolDefinition> | undefined | null,
1257
+ enableThinking?: boolean | undefined | null,
1258
+ ): Promise<Uint32Array>;
1259
+ /** Get vocabulary size */
1260
+ vocabSize(): number;
1261
+ /** Get PAD token ID */
1262
+ getPadTokenId(): number;
1263
+ /** Get EOS token ID */
1264
+ getEosTokenId(): number;
1265
+ /** Get BOS token ID (if exists) */
1266
+ getBosTokenId(): number | null;
1267
+ /** Convert token ID to string */
1268
+ idToToken(id: number): string | null;
1269
+ /** Convert token string to ID */
1270
+ tokenToId(token: string): number | null;
1271
+ /** Get the special token for IM_START */
1272
+ getImStartToken(): string;
1273
+ /** Get the special token for IM_END */
1274
+ getImEndToken(): string;
1275
+ /** Get the special token for ENDOFTEXT (used as PAD) */
1276
+ getEndoftextToken(): string;
1277
+ }
1278
+
1279
+ /** SFT Training Engine */
1280
+ export declare class SftTrainingEngine {
1281
+ /** Create a new SFT training engine */
1282
+ constructor(model: Qwen3Model, config: SftEngineConfig);
1283
+ /** Run a single training step; resolves with this step's SftStepMetrics */
1284
+ trainStep(inputIds: MxArray, labels: MxArray): Promise<SftStepMetrics>;
1285
+ /** Get current step number */
1286
+ getStep(): number;
1287
+ /** Get current epoch */
1288
+ getEpoch(): number;
1289
+ /**
1290
+ * Flush any accumulated gradients at epoch end
1291
+ *
1292
+ * When stepsPerEpoch % gradient_accumulation_steps != 0, there may be
1293
+ * leftover gradients from the final micro-batches. This method applies
1294
+ * them with proper averaging, matching TRL behavior. NOTE(review): presumably returns true when leftover gradients existed and were applied — confirm against the native implementation.
1295
+ */
1296
+ flushGradients(): boolean;
1297
+ /**
1298
+ * Compute the resume position given current state and dataset info
1299
+ *
1300
+ * This centralizes all resume logic in Rust for correctness.
1301
+ * Uses i64 math internally to avoid overflow on long runs.
1302
+ */
1303
+ computeResumePosition(stepsPerEpoch: number): ResumePosition;
1304
+ /** Check if emergency save is needed */
1305
+ needsEmergencySave(): boolean;
1306
+ /** Clear emergency save flag */
1307
+ clearEmergencySave(): void;
1308
+ /**
1309
+ * Signal start of a new epoch
1310
+ *
1311
+ * Takes the epoch number directly from TypeScript to ensure synchronization.
1312
+ * The epoch is 0-indexed to match the TypeScript training loop.
1313
+ */
1314
+ startEpoch(epoch: number): void;
1315
+ /** End current epoch and return metrics */
1316
+ endEpoch(epochTimeSecs: number): SftEpochMetrics;
1317
+ /** Reset training state (for new training run) */
1318
+ reset(): void;
1319
+ /** Restore training state (for resuming from checkpoint) */
1320
+ restoreState(step: number, epoch: number): void;
1321
+ /** Get the underlying model for checkpointing */
1322
+ getModel(): Qwen3Model;
1323
+ }
1324
+
1325
+ /**
1326
+ * A tensor that tracks gradients for automatic differentiation
1327
+ *
1328
+ * This is a wrapper around MxArray that provides:
1329
+ * - Gradient tracking
1330
+ * - Automatic gradient accumulation
1331
+ * - Integration with manual backward passes
1332
+ */
1333
+ export declare class Tensor {
1334
+ /** Create a tensor from float32 data */
1335
+ static fromFloat32(data: Float32Array, shape: BigInt64Array, requiresGrad?: boolean | undefined | null): Tensor;
1336
+ /** Create a tensor from int32 data */
1337
+ static fromInt32(data: Int32Array, shape: BigInt64Array, requiresGrad?: boolean | undefined | null): Tensor;
1338
+ /** Get the shape of the underlying data */
1339
+ dataShape(): BigInt64Array;
1340
+ /** Get the shape of the gradient (if it exists) */
1341
+ gradShape(): BigInt64Array | null;
1342
+ /** Check if gradient exists */
1343
+ hasGrad(): boolean;
1344
+ /** Check if this tensor requires gradients */
1345
+ get requiresGrad(): boolean;
1346
+ /** Set whether this tensor requires gradients */
1347
+ set requiresGrad(requiresGrad: boolean);
1348
+ /** Zero out the gradient */
1349
+ zeroGrad(): void;
1350
+ /**
1351
+ * Accumulate gradient
1352
+ *
1353
+ * If gradient already exists, add to it. Otherwise, set it.
1354
+ * Note: This takes ownership of the gradient array.
1355
+ */
1356
+ accumulateGrad(grad: MxArray): void;
1357
+ /** Get the shape of the tensor */
1358
+ shape(): BigInt64Array;
1359
+ /** Convert data to Float32 array */
1360
+ toFloat32(): Float32Array;
1361
+ /** Convert gradient to Float32 array (if it exists) */
1362
+ gradToFloat32(): Float32Array | null;
1363
+ /** Convert data to Int32 array */
1364
+ toInt32(): Int32Array;
1365
+ /**
1366
+ * Detach this tensor from the computation graph
1367
+ *
1368
+ * Returns a new tensor with the same data but no gradient tracking
1369
+ */
1370
+ detach(): Tensor;
1371
+ /** Create a tensor of zeros */
1372
+ static zeros(
1373
+ shape: BigInt64Array,
1374
+ dtype?: DType | undefined | null,
1375
+ requiresGrad?: boolean | undefined | null,
1376
+ ): Tensor;
1377
+ /** Create a tensor of ones */
1378
+ static ones(
1379
+ shape: BigInt64Array,
1380
+ dtype?: DType | undefined | null,
1381
+ requiresGrad?: boolean | undefined | null,
1382
+ ): Tensor;
1383
+ /** Evaluate the underlying array (NOTE(review): presumably forces any pending lazy MLX computation — confirm) */
1384
+ eval(): void;
1385
+ }
1386
+
1387
+ /** Result from VLM chat */
1388
+ export declare class VlmChatResult {
1389
+ /** Get the response text */
1390
+ get text(): string;
1391
+ /** Get the generated tokens */
1392
+ get tokens(): MxArray;
1393
+ /** Get the log probabilities */
1394
+ get logprobs(): MxArray;
1395
+ /** Get the finish reason ('stop', 'length', or 'repetition') */
1396
+ get finishReason(): 'stop' | 'length' | 'repetition';
1397
+ /** Get the number of tokens generated */
1398
+ get numTokens(): number;
1399
+ }
1400
+ export type VLMChatResult = VlmChatResult;
1401
+
1402
+ /**
1403
+ * Vision-Language Model
1404
+ *
1405
+ * A generic VLM for OCR and document understanding tasks.
1406
+ * Currently supports PaddleOCR-VL architecture (vision encoder + ERNIE language model).
1407
+ */
1408
+ export declare class VLModel {
1409
+ /** Create a new PaddleOCR-VL model */
1410
+ constructor(config: ModelConfig);
1411
+ /** Set the tokenizer */
1412
+ setTokenizer(tokenizer: Qwen3Tokenizer): void;
1413
+ /** Check if tokenizer is available */
1414
+ get hasTokenizer(): boolean;
1415
+ /**
1416
+ * Chat with the VLM model
1417
+ *
1418
+ * High-level API for conversational interaction with images.
1419
+ *
1420
+ * # Arguments
1421
+ * * `messages` - Chat messages (role + content)
1422
+ * * `config` - Chat configuration (including image_paths for automatic processing)
1423
+ *
1424
+ * # Returns
1425
+ * * VLMChatResult with generated text
1426
+ *
1427
+ * # Example
1428
+ * ```typescript
1429
+ * const result = model.chat(
1430
+ * [{ role: 'user', content: 'Describe this image.' }],
1431
+ * { imagePaths: ['./photo.jpg'], maxNewTokens: 256 }
1432
+ * );
1433
+ * ```
1434
+ */
1435
+ chat(messages: Array<VlmChatMessage>, config?: VlmChatConfig | undefined | null): VlmChatResult;
1436
+ /**
1437
+ * Simple OCR: extract text from an image file
1438
+ *
1439
+ * Convenience method that processes an image and extracts all text.
1440
+ *
1441
+ * # Arguments
1442
+ * * `image_path` - Path to the image file
1443
+ * * `prompt` - Optional custom prompt (default: "Extract all text from this image.")
1444
+ *
1445
+ * # Returns
1446
+ * * Extracted text as a string (this call is synchronous — it returns string, not a Promise)
1447
+ *
1448
+ * # Example
1449
+ * ```typescript
1450
+ * const text = model.ocr('./receipt.jpg');
1451
+ * console.log(text);
1452
+ * ```
1453
+ */
1454
+ ocr(imagePath: string, prompt?: string | undefined | null): string;
1455
+ /**
1456
+ * Get input embeddings with vision features merged
1457
+ *
1458
+ * # Arguments
1459
+ * * `input_ids` - Token IDs [batch, seq_len]
1460
+ * * `pixel_values` - Optional image patches [batch, seq, channels, patch_h, patch_w]
1461
+ * * `image_grid_thw` - Optional grid dimensions [num_images, 3]
1462
+ *
1463
+ * # Returns
1464
+ * * Input embeddings with vision features inserted at image token positions
1465
+ */
1466
+ getInputEmbeddings(
1467
+ inputIds: MxArray,
1468
+ pixelValues?: MxArray | undefined | null,
1469
+ imageGridThw?: MxArray | undefined | null,
1470
+ ): MxArray;
1471
+ /**
1472
+ * Forward pass
1473
+ *
1474
+ * # Arguments
1475
+ * * `input_ids` - Token IDs [batch, seq_len]
1476
+ * * `pixel_values` - Optional image patches
1477
+ * * `image_grid_thw` - Optional grid dimensions
1478
+ * * `mask` - Optional attention mask
1479
+ *
1480
+ * # Returns
1481
+ * * Logits [batch, seq_len, vocab_size]
1482
+ */
1483
+ forward(
1484
+ inputIds: MxArray,
1485
+ pixelValues?: MxArray | undefined | null,
1486
+ imageGridThw?: MxArray | undefined | null,
1487
+ mask?: MxArray | undefined | null,
1488
+ ): MxArray;
1489
+ /**
1490
+ * Generate text tokens given input tokens and optional image
1491
+ *
1492
+ * Uses KV caching for efficient generation - each step only processes the
1493
+ * new token(s) while reusing cached key-value states from previous tokens.
1494
+ * Vision features are computed once at the start and cached.
1495
+ *
1496
+ * # Arguments
1497
+ * * `input_ids` - Input token IDs [1, seq_len]
1498
+ * * `pixel_values` - Optional image patches [1, num_patches, C, H, W]
1499
+ * * `image_grid_thw` - Optional grid dimensions [1, 3]
1500
+ * * `config` - Generation configuration
1501
+ *
1502
+ * # Returns
1503
+ * * GenerationResult with tokens, logprobs, and finish reason
1504
+ */
1505
+ generate(
1506
+ inputIds: MxArray,
1507
+ pixelValues?: MxArray | undefined | null,
1508
+ imageGridThw?: MxArray | undefined | null,
1509
+ config?: GenerationConfig | undefined | null,
1510
+ ): GenerationResult;
1511
+ /** Get model configuration */
1512
+ get config(): ModelConfig;
1513
+ /** Check if model is fully initialized */
1514
+ get isInitialized(): boolean;
1515
+ /**
1516
+ * Load a VLM from disk
1517
+ *
1518
+ * Loads a model from a directory containing:
1519
+ * - config.json: Model configuration
1520
+ * - model.safetensors or model-*.safetensors: Model weights in SafeTensors format
1521
+ *
1522
+ * # Arguments
1523
+ * * `model_path` - Path to the model directory
1524
+ *
1525
+ * # Returns
1526
+ * * A fully initialized VLModel with loaded weights
1527
+ *
1528
+ * # Example
1529
+ * ```typescript
1530
+ * import { VLModel } from '@mlx-node/vlm';
1531
+ * const model = await VLModel.load('./models/paddleocr-vl');
1532
+ * const result = model.chat(messages, { imagePaths: ['./image.jpg'] });
1533
+ * ```
1534
+ */
1535
+ static load(modelPath: string): Promise<VLModel>;
1536
+ /**
1537
+ * Load model configuration from disk without loading weights
1538
+ *
1539
+ * This is useful for inspecting model configuration before loading the full model.
1540
+ *
1541
+ * # Arguments
1542
+ * * `model_path` - Path to the model directory containing config.json
1543
+ *
1544
+ * # Returns
1545
+ * * ModelConfig with vision and text configuration
1546
+ *
1547
+ * # Example
1548
+ * ```typescript
1549
+ * import { VLModel } from '@mlx-node/vlm';
1550
+ * const config = await VLModel.loadConfig('./models/paddleocr-vl');
1551
+ * console.log(config.visionConfig.hiddenSize);
1552
+ * ```
1553
+ */
1554
+ static loadConfig(modelPath: string): Promise<ModelConfig>;
1555
+ }
1556
+
1557
+ /**
1558
+ * Build RewardOutput array from generation results.
1559
+ *
1560
+ * Parses tool calls and thinking from completions, creating structured outputs
1561
+ * aligned with the ChatResult structure.
1562
+ *
1563
+ * # Arguments
1564
+ * * `prompts` - Array of prompt texts (one per unique prompt, will be expanded by group_size)
1565
+ * * `completions` - Array of completion texts (prompts.len() * group_size total)
1566
+ * * `token_counts` - Array of token counts for each completion
1567
+ * * `finish_reasons` - Array of finish reasons from generation ("eos", "length", "stop", "repetition")
1568
+ * * `group_size` - Number of completions per prompt
1569
+ *
1570
+ * # Returns
1571
+ * Array of RewardOutput objects with structured completion data
1572
+ *
1573
+ * # Example
1574
+ * ```typescript
1575
+ * import { buildRewardOutputs } from '@mlx-node/core';
1576
+ *
1577
+ * const outputs = buildRewardOutputs(
1578
+ * ['What is 2+2?'], // prompts
1579
+ * ['<think>Let me calculate</think>\n\n4', '4'], // completions (group_size=2)
1580
+ *
1581
+ *
1582
+ * [10, 5], // token counts
1583
+ * ['eos', 'length'], // finish reasons
1584
+ * 2 // group_size
1585
+ * );
1586
+ *
1587
+ * outputs[0].completion.thinking; // "Let me calculate"
1588
+ * outputs[0].completion.text; // "4"
1589
+ * outputs[0].completion.finishReason; // "eos"
1590
+ * ```
1591
+ */
1592
+ export declare function buildRewardOutputs(
1593
+ prompts: Array<string>,
1594
+ completions: Array<string>,
1595
+ tokenCounts: Array<number>,
1596
+ finishReasons: Array<string>,
1597
+ groupSize: number,
1598
+ ): Array<RewardOutput>;
1599
+
1600
+ /** Configuration for built-in rewards */
1601
+ export interface BuiltinRewardConfig {
1602
+ /** Type of reward function */
1603
+ rewardType: BuiltinRewardType;
1604
+ /** Weight for this reward (default 1.0) */
1605
+ weight?: number;
1606
+ /** Allowed tool names (for ToolUse) */
1607
+ allowedTools?: Array<string>;
1608
+ /** Required tags (for XmlFormat) */
1609
+ requiredTags?: Array<string>;
1610
+ /** Minimum length (for Length) */
1611
+ minLength?: number;
1612
+ /** Maximum length (for Length) */
1613
+ maxLength?: number;
1614
+ /** Use character count vs word count (for Length) */
1615
+ useChars?: boolean;
1616
+ /** Required JSON fields (for JsonSchema) */
1617
+ requiredFields?: Array<string>;
1618
+ /** Whether tool call is required (for ToolUse) */
1619
+ required?: boolean;
1620
+ }
1621
+
1622
+ /** Built-in reward function types */
1623
+ export declare const enum BuiltinRewardType {
1624
+ /** Tool use validation */
1625
+ ToolUse = 'ToolUse',
1626
+ /** XML format validation */
1627
+ XmlFormat = 'XmlFormat',
1628
+ /** Length-based scoring */
1629
+ Length = 'Length',
1630
+ /** JSON schema validation */
1631
+ JsonSchema = 'JsonSchema',
1632
+ }
1633
+
1634
+ /**
1635
+ * Configuration for the high-level `chat()` API
1636
+ *
1637
+ * Combines tool definitions with generation parameters in a single config object.
1638
+ * Tools are optional - when not provided, `chat()` works as a simple conversational API.
1639
+ *
1640
+ * ## Example
1641
+ * ```typescript
1642
+ * // Simple chat (no tools)
1643
+ * const result = await model.chat(messages);
1644
+ *
1645
+ * // With tools
1646
+ * const result = await model.chat(messages, {
1647
+ * tools: [weatherTool, searchTool],
1648
+ * maxNewTokens: 2048,
1649
+ * temperature: 0.7,
1650
+ * });
1651
+ * ```
1652
+ */
1653
+ export interface ChatConfig {
1654
+ /**
1655
+ * Tool definitions for function calling (optional)
1656
+ *
1657
+ * When provided, the model can invoke these tools during generation.
1658
+ * Tool calls are parsed and returned in `ChatResult.toolCalls`.
1659
+ */
1660
+ tools?: Array<ToolDefinition>;
1661
+ /** Maximum number of new tokens to generate (default: 2048 for chat) */
1662
+ maxNewTokens?: number;
1663
+ /** Sampling temperature (0 = greedy, higher = more random) (default: 0.7) */
1664
+ temperature?: number;
1665
+ /** Top-k sampling: keep only top k tokens (0 = disabled) (default: 0) */
1666
+ topK?: number;
1667
+ /** Top-p (nucleus) sampling: keep tokens with cumulative prob < p (default: 0.9) */
1668
+ topP?: number;
1669
+ /** Min-p sampling: keep tokens with prob > min_p * max_prob (default: 0.0) */
1670
+ minP?: number;
1671
+ /** Repetition penalty factor (1.0 = no penalty) (default: 1.0) */
1672
+ repetitionPenalty?: number;
1673
+ /** Number of recent tokens to consider for repetition penalty (default: 20) */
1674
+ repetitionContextSize?: number;
1675
+ /** Stop if same token repeats this many times consecutively (default: 16) */
1676
+ maxConsecutiveTokens?: number;
1677
+ /** Stop if an n-gram pattern repeats this many times (default: 8) */
1678
+ maxNgramRepeats?: number;
1679
+ /** N-gram size for repetition detection (default: 3) */
1680
+ ngramSize?: number;
1681
+ /** EOS token ID (generation stops when this is generated) */
1682
+ eosTokenId?: number;
1683
+ /** Whether to return log probabilities (default: true) */
1684
+ returnLogprobs?: boolean;
1685
+ }
1686
+
1687
+ /** Chat message with tool calling support */
1688
+ export interface ChatMessage {
1689
+ /** Role: "system", "user", "assistant", or "tool" */
1690
+ role: string;
1691
+ /** Message content */
1692
+ content: string;
1693
+ /** Tool calls made by the assistant (for assistant messages) */
1694
+ toolCalls?: Array<ToolCall>;
1695
+ /** Tool call ID this message is responding to (for tool messages) */
1696
+ toolCallId?: string;
1697
+ /** Reasoning content for thinking mode (used with <think> tags) */
1698
+ reasoningContent?: string;
1699
+ }
1700
+
1701
+ /** Chat message role */
1702
+ export declare const enum ChatRole {
1703
+ /** User message */
1704
+ User = 'User',
1705
+ /** Assistant response */
1706
+ Assistant = 'Assistant',
1707
+ /** System prompt */
1708
+ System = 'System',
1709
+ }
1710
+
1711
+ /** Statistics about cleanup operations (NAPI wrapper) */
1712
+ export interface CleanupStats {
1713
+ /** Number of training steps deleted */
1714
+ stepsDeleted: number;
1715
+ /** Number of generations deleted */
1716
+ generationsDeleted: number;
1717
+ /** Number of tool calls deleted */
1718
+ toolCallsDeleted: number;
1719
+ /** Number of logs deleted */
1720
+ logsDeleted: number;
1721
+ }
1722
+
1723
+ /**
1724
+ * Structured completion information aligned with ChatResult.
1725
+ * Contains pre-parsed tool calls, thinking, and clean text.
1726
+ */
1727
+ export interface CompletionInfo {
1728
+ /** Clean text with <tool_call> and <think> tags removed */
1729
+ text: string;
1730
+ /** Raw output before tag stripping (for debugging/XML parsing) */
1731
+ rawText: string;
1732
+ /** Parsed tool calls (arguments are already JS objects) */
1733
+ toolCalls: Array<ToolCallResult>;
1734
+ /** Extracted thinking/reasoning from <think> tags (null if none) */
1735
+ thinking?: string;
1736
+ /** Number of tokens generated */
1737
+ numTokens: number;
1738
+ /** Finish reason: "stop" | "length" | "tool_calls" */
1739
+ finishReason: string;
1740
+ }
1741
+
1742
+ export interface ConversionOptions {
1743
+ /** Input directory containing model files (config.json, model.safetensors) */
1744
+ inputDir: string;
1745
+ /** Output directory for converted model */
1746
+ outputDir: string;
1747
+ /** Target dtype for conversion (default: "float32") */
1748
+ dtype?: string;
1749
+ /** Whether to verbose logging (default: false) */
1750
+ verbose?: boolean;
1751
+ }
1752
+
1753
+ export interface ConversionResult {
1754
+ /** Number of tensors converted */
1755
+ numTensors: number;
1756
+ /** Total number of parameters */
1757
+ numParameters: number;
1758
+ /** Output model path */
1759
+ outputPath: string;
1760
+ /** List of converted tensor names */
1761
+ tensorNames: Array<string>;
1762
+ }
1763
+
1764
+ /**
1765
+ * Convert a HuggingFace SafeTensors model to MLX format
1766
+ *
1767
+ * This function:
1768
+ * 1. Loads SafeTensors model from input directory
1769
+ * 2. Converts all tensors to specified dtype (default: float32)
1770
+ * 3. Saves converted model to output directory
1771
+ * 4. Copies config.json and tokenizer files
1772
+ *
1773
+ * # Arguments
1774
+ * * `options` - Conversion options (input_dir, output_dir, dtype, verbose)
1775
+ *
1776
+ * # Returns
1777
+ * * ConversionResult with statistics about the conversion
1778
+ *
1779
+ * # Example
1780
+ * ```typescript
1781
+ * import { convertModel } from '../../index.cjs';
1782
+ *
1783
+ * const result = await convertModel({
1784
+ * inputDir: '.cache/models/qwen3-0.6b',
1785
+ * outputDir: '.cache/models/qwen3-0.6b-mlx',
1786
+ * dtype: 'float32',
1787
+ * verbose: true
1788
+ * });
1789
+ *
1790
+ * console.log(`Converted ${result.numTensors} tensors (${result.numParameters} parameters)`);
1791
+ * ```
1792
+ */
1793
+ export declare function convertModel(options: ConversionOptions): Promise<ConversionResult>;
1794
+
1795
+ export declare function convertParquetToJsonl(inputPath: string, outputPath: string): void;
1796
+
1797
+ /** Create a default PaddleOCR-VL 1.5 configuration (JS factory function) */
1798
+ export declare function createPaddleocrVlConfig(): ModelConfig;
1799
+
1800
+ /** Document element - either a table or paragraph */
1801
+ export interface DocumentElement {
1802
+ elementType: ElementType;
1803
+ /** Table data (only present if element_type is Table) */
1804
+ table?: Table;
1805
+ /** Paragraph data (only present if element_type is Paragraph) */
1806
+ paragraph?: Paragraph;
1807
+ }
1808
+
1809
+ export declare const enum DType {
1810
+ Float32 = 0,
1811
+ Int32 = 1,
1812
+ Float16 = 2,
1813
+ BFloat16 = 3,
1814
+ Uint32 = 4,
1815
+ }
1816
+
1817
+ /** Document element type */
1818
+ export declare const enum ElementType {
1819
+ Table = 'Table',
1820
+ Paragraph = 'Paragraph',
1821
+ }
1822
+
1823
+ /** Metrics from a training epoch */
1824
+ export interface EngineEpochMetrics {
1825
+ /** Epoch number */
1826
+ epoch: number;
1827
+ /** Average loss for the epoch */
1828
+ avgLoss: number;
1829
+ /** Average reward for the epoch */
1830
+ avgReward: number;
1831
+ /** Total steps in the epoch */
1832
+ totalSteps: number;
1833
+ /** Total tokens processed */
1834
+ totalTokens: number;
1835
+ /** Time for the epoch (seconds) */
1836
+ epochTimeSecs: number;
1837
+ }
1838
+
1839
+ /** Metrics from a single training step */
1840
+ export interface EngineStepMetrics {
1841
+ /** Current step number */
1842
+ step: number;
1843
+ /** GRPO loss value */
1844
+ loss: number;
1845
+ /** Mean reward across completions */
1846
+ meanReward: number;
1847
+ /** Standard deviation of rewards */
1848
+ stdReward: number;
1849
+ /** Mean advantage value */
1850
+ meanAdvantage: number;
1851
+ /** Standard deviation of advantages */
1852
+ stdAdvantage: number;
1853
+ /** Total tokens generated this step */
1854
+ totalTokens: number;
1855
+ /** Whether gradients were applied */
1856
+ gradientsApplied: boolean;
1857
+ /** Time for generation (ms) */
1858
+ generationTimeMs: number;
1859
+ /** Time for training (ms) */
1860
+ trainingTimeMs: number;
1861
+ /** Peak memory usage this step (MB) */
1862
+ peakMemoryMb: number;
1863
+ /** Active memory at end of step (MB) */
1864
+ activeMemoryMb: number;
1865
+ }
1866
+
1867
+ /** Format parsed document according to config */
1868
+ export declare function formatDocument(doc: ParsedDocument, config?: ParserConfig | undefined | null): string;
1869
+
1870
+ /** Function definition for tool calling */
1871
+ export interface FunctionDefinition {
1872
+ /** Name of the function */
1873
+ name: string;
1874
+ /** Description of what the function does */
1875
+ description?: string;
1876
+ /** Parameter schema */
1877
+ parameters?: FunctionParameters;
1878
+ }
1879
+
1880
+ /** Function parameters schema (JSON Schema subset) */
1881
+ export interface FunctionParameters {
1882
+ /** Type (usually "object") */
1883
+ type: string;
1884
+ /** JSON string of property definitions */
1885
+ properties?: string;
1886
+ /** List of required parameter names */
1887
+ required?: Array<string>;
1888
+ }
1889
+
1890
+ /** Result from generate_batch_for_training with all data needed for training */
1891
+ export interface GenerateBatchResult {
1892
+ /** Generated completion texts */
1893
+ completionTexts: Array<string>;
1894
+ /** Completion token IDs (flattened, concatenated) */
1895
+ completionTokens: Array<number>;
1896
+ /** Completion log probabilities (flattened, concatenated) */
1897
+ completionLogprobs: Array<number>;
1898
+ /** Lengths of each completion (for reconstruction) */
1899
+ completionLengths: Array<number>;
1900
+ /** Finish reasons for each completion ("eos", "length", or "repetition") */
1901
+ finishReasons: Array<string>;
1902
+ }
1903
+
1904
+ /** Configuration for text generation */
1905
+ export interface GenerationConfig {
1906
+ /** Maximum number of new tokens to generate (default: 100) */
1907
+ maxNewTokens?: number;
1908
+ /** Sampling temperature (0 = greedy, higher = more random) (default: 1.0) */
1909
+ temperature?: number;
1910
+ /** Top-k sampling: keep only top k tokens (0 = disabled) (default: 0) */
1911
+ topK?: number;
1912
+ /** Top-p (nucleus) sampling: keep tokens with cumulative prob < p (default: 1.0) */
1913
+ topP?: number;
1914
+ /** Min-p sampling: keep tokens with prob > min_p * max_prob (default: 0.0) */
1915
+ minP?: number;
1916
+ /** Repetition penalty factor (1.0 = no penalty, 1.1-1.5 typical) (default: 1.0) */
1917
+ repetitionPenalty?: number;
1918
+ /**
1919
+ * Number of recent tokens to consider for repetition penalty (default: 20)
1920
+ * Matches mlx-lm default. Larger values catch longer patterns but use more memory
1921
+ */
1922
+ repetitionContextSize?: number;
1923
+ /**
1924
+ * Stop if same token repeats this many times consecutively (default: 16)
1925
+ * Set to 0 to disable. Prevents OOM from degenerate repetitive generation.
1926
+ */
1927
+ maxConsecutiveTokens?: number;
1928
+ /**
1929
+ * Stop if an n-gram pattern repeats this many times (default: 8)
1930
+ * Set to 0 to disable. Detects patterns like "A B A B A B A B".
1931
+ */
1932
+ maxNgramRepeats?: number;
1933
+ /**
1934
+ * N-gram size for repetition detection (default: 3)
1935
+ * Used with max_ngram_repeats to detect repeating patterns.
1936
+ */
1937
+ ngramSize?: number;
1938
+ /** EOS token ID (generation stops when this is generated) */
1939
+ eosTokenId?: number;
1940
+ /** Whether to return log probabilities (always true for GRPO) */
1941
+ returnLogprobs?: boolean;
1942
+ /**
1943
+ * Prefill step size for chunked processing of long prompts (default: 2048)
1944
+ * When the prompt length exceeds this value, it will be processed in chunks
1945
+ * to improve memory efficiency and enable async pipelining.
1946
+ * Set to 0 to disable chunking and process the entire prompt at once.
1947
+ */
1948
+ prefillStepSize?: number;
1949
+ /**
1950
+ * KV cache quantization bits (default: 16 = no quantization)
1951
+ * - 16: Full precision (bfloat16/float16), no quantization
1952
+ * - 8: 8-bit quantization, ~2x memory savings, minimal quality loss
1953
+ * - 4: 4-bit quantization, ~4x memory savings, some quality degradation
1954
+ *
1955
+ * Quantized KV cache is useful for long sequences where memory becomes a bottleneck.
1956
+ * Note: Adds dequantization overhead per forward pass.
1957
+ */
1958
+ kvCacheBits?: number;
1959
+ /**
1960
+ * KV cache quantization group size (default: 64)
1961
+ * Number of elements per quantization group. Smaller groups = better accuracy
1962
+ * but more overhead from storing scales/biases.
1963
+ * Only used when kv_cache_bits is 4 or 8.
1964
+ */
1965
+ kvCacheGroupSize?: number;
1966
+ /**
1967
+ * Number of draft tokens to generate speculatively (default: 5)
1968
+ * Only used when a draft model is provided for speculative decoding.
1969
+ * Higher values can increase throughput but may reduce acceptance rate.
1970
+ */
1971
+ numDraftTokens?: number;
1972
+ }
1973
+
1974
+ /** A generation record (NAPI wrapper) */
1975
+ export interface GenerationRecord {
1976
+ batchIndex: number;
1977
+ groupIndex: number;
1978
+ prompt: string;
1979
+ expectedAnswer?: string;
1980
+ completionText: string;
1981
+ completionRaw: string;
1982
+ thinking?: string;
1983
+ numTokens: number;
1984
+ finishReason: string;
1985
+ reward: number;
1986
+ }
1987
+
1988
+ /** A generation with its associated tool calls (NAPI wrapper) */
1989
+ export interface GenerationWithToolCalls {
1990
+ generation: GenerationRecord;
1991
+ toolCalls: Array<ToolCallRecord>;
1992
+ }
1993
+
1994
+ /** Get expected weight keys for PaddleOCR-VL model */
1995
+ export declare function getExpectedWeightKeys(): Array<string>;
1996
+
1997
+ /** Configuration for the GRPO training engine */
1998
+ export interface GrpoEngineConfig {
1999
+ /** Learning rate (default: 1e-6) */
2000
+ learningRate?: number;
2001
+ /** Gradient accumulation steps (default: 1) */
2002
+ gradientAccumulationSteps?: number;
2003
+ /** Maximum gradient norm for clipping (default: 1.0) */
2004
+ gradientClipNorm?: number;
2005
+ /**
2006
+ * Maximum gradient value for element-wise clipping (default: 1.0)
2007
+ * This clamps individual gradient elements to [-value, value]
2008
+ */
2009
+ gradientClipValue?: number;
2010
+ /** Number of completions per prompt (default: 4) */
2011
+ groupSize?: number;
2012
+ /** PPO clipping epsilon (default: 0.2) */
2013
+ clipEpsilon?: number;
2014
+ /** KL divergence coefficient (default: 0.0) */
2015
+ klCoef?: number;
2016
+ /** Loss type: "grpo", "dapo", "dr_grpo", "bnpo" (default: "grpo") */
2017
+ lossType?: string;
2018
+ /**
2019
+ * Maximum completion length for both generation and training (default: 256)
2020
+ * Matches Python TRL's max_completion_length config.
2021
+ */
2022
+ maxCompletionLength?: number;
2023
+ /** Sampling temperature (default: 0.8) */
2024
+ temperature?: number;
2025
+ /** Top-p (nucleus) sampling (default: 0.95) */
2026
+ topP?: number;
2027
+ /** Top-k sampling (optional) */
2028
+ topK?: number;
2029
+ /** Repetition penalty (default: 1.1) */
2030
+ repetitionPenalty?: number;
2031
+ /**
2032
+ * Maximum allowed NaN gradient occurrences before stopping training (default: 100)
2033
+ * When exceeded, training will stop with an error to prevent model corruption.
2034
+ */
2035
+ maxNanGradients?: number;
2036
+ /**
2037
+ * Consecutive NaN gradients that trigger emergency checkpoint (default: 5)
2038
+ * When reached, the needs_emergency_save flag is set for the TypeScript layer.
2039
+ */
2040
+ emergencySaveThreshold?: number;
2041
+ /**
2042
+ * Enable detailed NaN/Inf detection with per-element counts (default: false)
2043
+ * When false (default), uses GPU-native has_nan_or_inf() which only transfers a single
2044
+ * boolean to CPU. When true, transfers the entire gradient tensor to CPU for detailed
2045
+ * per-element analysis - useful for debugging but has significant performance overhead
2046
+ * for large models (e.g., 2.4GB for Qwen3-0.6B).
2047
+ */
2048
+ verboseNanDetection?: boolean;
2049
+ /**
2050
+ * Enable thinking mode for Qwen3 models (default: true)
2051
+ * When false, adds empty <think></think> tags to disable model thinking.
2052
+ * This is useful for tool-use training where you want direct outputs.
2053
+ */
2054
+ enableThinking?: boolean;
2055
+ /**
2056
+ * Tool definitions for function calling
2057
+ * When provided, tools are included in the chat template so the model
2058
+ * can generate tool calls. This is essential for tool-use training.
2059
+ */
2060
+ tools?: Array<ToolDefinition>;
2061
+ /**
2062
+ * Batch chunk size for LM head computation (memory optimization).
2063
+ * When set, the LM head (hidden_states -> logits) is computed in chunks
2064
+ * of this size to reduce peak memory usage.
2065
+ * Default: None (no chunking, full batch at once)
2066
+ * Recommended: 2 for batch_size >= 4 with large vocabularies (e.g., 151936)
2067
+ * This reduces peak memory from ~1.2GB to ~300MB for Qwen3 (vocab=151936).
2068
+ */
2069
+ lmHeadChunkSize?: number;
2070
+ /**
2071
+ * Batch chunk size for transformer forward pass (memory optimization).
2072
+ * When set, the transformer layers process the batch in chunks of this size,
2073
+ * reducing peak memory from O(batch × heads × seq²) for attention.
2074
+ * Default: None (no chunking, full batch at once)
2075
+ * Recommended: 4 for batch_size >= 4 with groupSize >= 4
2076
+ * Memory savings: ~70-80% for batch=4, groupSize=4 (16 sequences → 4 at a time)
2077
+ */
2078
+ forwardChunkSize?: number;
2079
+ /**
2080
+ * Chunk size for vocabulary dimension in cross-entropy computation.
2081
+ * When computing logsumexp over large vocabularies (e.g., Qwen3's 151,936 tokens),
2082
+ * the computation is split into chunks of this size to reduce peak memory usage.
2083
+ * Default: 65536 (2^16)
2084
+ * Recommended: 65536 for Qwen3 (vocab=151936) splits into 3 chunks
2085
+ * Set to a larger value to reduce chunking overhead or smaller for tighter memory constraints.
2086
+ */
2087
+ vocabChunkSize?: number;
2088
+ /**
2089
+ * Enable true parallel batch generation (default: false).
2090
+ * When true, all N*G sequences are processed in parallel using batched FFI
2091
+ * with per-sequence RoPE offsets. This provides 2-4x speedup for GRPO training.
2092
+ * When false, uses the sequential generation (process one prompt at a time,
2093
+ * then expand KV cache for G completions).
2094
+ */
2095
+ useParallelBatchGeneration?: boolean;
2096
+ }
2097
+
2098
+ /** Configuration for GRPO loss computation */
2099
+ export interface GrpoLossConfig {
2100
+ /** Lower clipping bound (default: 0.2, means clip to [1-0.2, 1+epsilon_high]) */
2101
+ epsilonLow: number;
2102
+ /** Upper clipping bound (default: same as epsilon_low) */
2103
+ epsilonHigh?: number;
2104
+ /** KL divergence penalty coefficient (default: 0.0, no penalty) */
2105
+ beta: number;
2106
+ /** Loss aggregation type: "grpo", "bnpo", "dr_grpo", or "dapo" */
2107
+ lossType: string;
2108
+ /** Importance sampling level: "token" or "sequence" */
2109
+ importanceSamplingLevel: string;
2110
+ /**
2111
+ * Maximum completion length (legacy, no longer used by dr_grpo)
2112
+ * Kept for backwards compatibility but ignored in current implementation.
2113
+ */
2114
+ maxCompletionLength?: number;
2115
+ /** Total number of items in batch across all processes (needed for dapo) */
2116
+ numItemsInBatch?: number;
2117
+ /** Current gradient accumulation step (for loss scaling) */
2118
+ gradientAccumulationSteps: number;
2119
+ /**
2120
+ * Batch chunk size for LM head computation (memory optimization).
2121
+ * When set, the LM head (hidden_states -> logits) is computed in chunks
2122
+ * of this size to reduce peak memory usage.
2123
+ * Default: None (no chunking, full batch at once)
2124
+ * Recommended: 2 for batch_size >= 4 with large vocabularies (e.g., 151936)
2125
+ */
2126
+ lmHeadChunkSize?: number;
2127
+ /**
2128
+ * Batch chunk size for transformer forward pass (memory optimization).
2129
+ * When set, the transformer layers process the batch in chunks of this size,
2130
+ * reducing peak memory from O(batch × heads × seq²) for attention.
2131
+ * Default: None (no chunking, full batch at once)
2132
+ * Recommended: 4 for batch_size >= 4 with groupSize >= 4
2133
+ * Memory savings: ~70-80% for batch=4, groupSize=4 (16 sequences → 4 at a time)
2134
+ */
2135
+ forwardChunkSize?: number;
2136
+ /**
2137
+ * Chunk size for vocabulary dimension in cross-entropy computation.
2138
+ * When computing logsumexp over large vocabularies (e.g., Qwen3's 151,936 tokens),
2139
+ * the computation is split into chunks of this size to reduce peak memory usage.
2140
+ * Default: 65536 (2^16)
2141
+ * Recommended: 65536 for Qwen3 (vocab=151936) splits into 3 chunks
2142
+ */
2143
+ vocabChunkSize?: number;
2144
+ }
2145
+
2146
+ /** Full model configuration */
2147
+ export interface ModelConfig {
2148
+ visionConfig: VisionConfig;
2149
+ textConfig: TextConfig;
2150
+ modelType: string;
2151
+ ignoreIndex: number;
2152
+ imageTokenId: number;
2153
+ videoTokenId: number;
2154
+ visionStartTokenId: number;
2155
+ visionEndTokenId: number;
2156
+ eosTokenId: number;
2157
+ }
2158
+
2159
+ /** Output format options */
2160
+ export declare const enum OutputFormat {
2161
+ /** Raw output with minimal processing */
2162
+ Raw = 'Raw',
2163
+ /** Plain text with aligned columns */
2164
+ Plain = 'Plain',
2165
+ /** Markdown tables */
2166
+ Markdown = 'Markdown',
2167
+ /** HTML tables */
2168
+ Html = 'Html',
2169
+ }
2170
+
2171
+ /** Configuration for creating an OutputStore connection */
2172
+ export interface OutputStoreConfig {
2173
+ /** Local SQLite file path (e.g., "training_outputs.db") */
2174
+ localPath: string;
2175
+ }
2176
+
2177
+ /** Paged attention memory statistics (NAPI-compatible) */
2178
+ export interface PagedCacheStats {
2179
+ /** Total number of blocks in the pool */
2180
+ totalBlocks: number;
2181
+ /** Number of free blocks */
2182
+ freeBlocks: number;
2183
+ /** Number of allocated blocks */
2184
+ allocatedBlocks: number;
2185
+ /** Total memory in MB */
2186
+ totalMemoryMb: number;
2187
+ /** Used memory in MB */
2188
+ usedMemoryMb: number;
2189
+ /** Utilization percentage */
2190
+ utilizationPercent: number;
2191
+ }
2192
+
2193
+ /** A completed sequence from paged generation */
2194
+ export interface PagedCompletedSequence {
2195
+ /** Original request ID */
2196
+ requestId: string;
2197
+ /** All generated tokens (excluding prompt) */
2198
+ tokens: Array<number>;
2199
+ /** Reason for completion ("eos", "max_tokens", etc.) */
2200
+ finishReason: string;
2201
+ }
2202
+
2203
+ /** Result of a paged generation step */
2204
+ export interface PagedGenerationStep {
2205
+ /** Token outputs for each sequence in the batch */
2206
+ outputs: Array<PagedTokenOutput>;
2207
+ /** Number of sequences that were in prefill phase */
2208
+ numPrefill: number;
2209
+ /** Number of sequences that were in decode phase */
2210
+ numDecode: number;
2211
+ }
2212
+
2213
+ /** Output from a single token generation step in paged attention */
2214
+ export interface PagedTokenOutput {
2215
+ /** Sequence ID in the scheduler */
2216
+ seqId: number;
2217
+ /** Request ID for this sequence */
2218
+ requestId: string;
2219
+ /** Generated token ID */
2220
+ token: number;
2221
+ /** Log probability of the token (f64 for NAPI compatibility) */
2222
+ logprob: number;
2223
+ /** Whether this sequence has finished */
2224
+ isFinished: boolean;
2225
+ }
2226
+
2227
+ /** A text paragraph */
2228
+ export interface Paragraph {
2229
+ content: string;
2230
+ }
2231
+
2232
+ /** Parsed document structure */
2233
+ export interface ParsedDocument {
2234
+ elements: Array<DocumentElement>;
2235
+ }
2236
+
2237
+ /**
2238
+ * Parse and format PaddleOCR-VL response in one step
2239
+ *
2240
+ * Convenience function that parses the VLM output and formats it
2241
+ * according to the specified configuration.
2242
+ *
2243
+ * # Arguments
2244
+ * * `text` - Raw VLM output containing table tokens
2245
+ * * `config` - Optional parser configuration (format, trim_cells, etc.)
2246
+ *
2247
+ * # Returns
2248
+ * * Formatted string in the requested format (markdown, plain, html, raw)
2249
+ *
2250
+ * # Example
2251
+ * ```typescript
2252
+ * import { parsePaddleResponse } from '@mlx-node/core';
2253
+ *
2254
+ * // Parse and format as markdown (default)
2255
+ * const markdown = parsePaddleResponse(vlmResult.text);
2256
+ *
2257
+ * // Parse and format as HTML
2258
+ * const html = parsePaddleResponse(vlmResult.text, { format: 'html' });
2259
+ *
2260
+ * // Parse and format as plain text
2261
+ * const plain = parsePaddleResponse(vlmResult.text, { format: 'plain' });
2262
+ * ```
2263
+ */
2264
+ export declare function parsePaddleResponse(text: string, config?: ParserConfig | undefined | null): string;
2265
+
2266
+ /** Parser configuration */
2267
+ export interface ParserConfig {
2268
+ /** Output format (default: 'markdown') */
2269
+ format?: OutputFormat;
2270
+ /** Whether to trim whitespace from cells (default: true) */
2271
+ trimCells?: boolean;
2272
+ /** Whether to collapse empty rows (default: true) */
2273
+ collapseEmptyRows?: boolean;
2274
+ }
2275
+
2276
+ /**
2277
+ * Parse tool calls from text (NAPI export)
2278
+ *
2279
+ * Extracts tool calls from model-generated text and returns both the cleaned text
2280
+ * and the parsed tool calls.
2281
+ *
2282
+ * # Example
2283
+ * ```typescript
2284
+ * import { parseToolCallsFromText } from '@mlx-node/core';
2285
+ *
2286
+ * const result = parseToolCallsFromText('<tool_call>{"name": "search", "arguments": {"q": "test"}}</tool_call>');
2287
+ * console.log(result.text); // ""
2288
+ * console.log(result.toolCalls[0].name); // "search"
2289
+ * console.log(result.toolCalls[0].arguments.q); // "test"
2290
+ * ```
2291
+ */
2292
+ export declare function parseToolCallsFromText(text: string): ParseToolCallsResult;
2293
+
2294
+ /** Result of parsing tool calls from text */
2295
+ export interface ParseToolCallsResult {
2296
+ /** Cleaned text with tool_call tags removed */
2297
+ text: string;
2298
+ /** Parsed tool calls */
2299
+ toolCalls: Array<ToolCallResult>;
2300
+ }
2301
+
2302
+ /** Parse VLM output into structured document */
2303
+ export declare function parseVlmOutput(text: string): ParsedDocument;
2304
+
2305
+ /** Qwen3 model configuration */
2306
+ export interface Qwen3Config {
2307
+ vocabSize: number;
2308
+ hiddenSize: number;
2309
+ numLayers: number;
2310
+ numHeads: number;
2311
+ numKvHeads: number;
2312
+ intermediateSize: number;
2313
+ rmsNormEps: number;
2314
+ ropeTheta: number;
2315
+ maxPositionEmbeddings: number;
2316
+ headDim: number;
2317
+ useQkNorm: boolean;
2318
+ tieWordEmbeddings: boolean;
2319
+ padTokenId: number;
2320
+ eosTokenId: number;
2321
+ bosTokenId: number;
2322
+ /**
2323
+ * Enable paged attention for memory-efficient inference.
2324
+ * Default: false (use standard KVCache)
2325
+ */
2326
+ usePagedAttention?: boolean | undefined;
2327
+ /**
2328
+ * GPU memory budget for paged KV cache in megabytes.
2329
+ * Only used when use_paged_attention is true.
2330
+ * Default: 2048 (2GB)
2331
+ */
2332
+ pagedCacheMemoryMb?: number | undefined;
2333
+ /**
2334
+ * Block size for paged attention (tokens per block).
2335
+ * Only used when use_paged_attention is true.
2336
+ * Default: 16
2337
+ */
2338
+ pagedBlockSize?: number | undefined;
2339
+ /**
2340
+ * Use FP8 cache for 2x memory reduction (experimental).
2341
+ * Only used when use_paged_attention is true.
2342
+ * Default: false
2343
+ */
2344
+ useFp8Cache?: boolean | undefined;
2345
+ }
2346
+
2347
+ /** Result of resume position computation */
2348
+ export interface ResumePosition {
2349
+ /** Epoch to start from (0-indexed) */
2350
+ startEpoch: number;
2351
+ /** Batch index within epoch to start from */
2352
+ startBatchIdx: number;
2353
+ /** Whether we're at an epoch boundary */
2354
+ isEpochBoundary: boolean;
2355
+ }
2356
+
2357
+ /**
2358
+ * Reward function input for a single completion.
2359
+ * Provides all context needed to compute a reward score.
2360
+ */
2361
+ export interface RewardOutput {
2362
+ /** The input prompt text */
2363
+ prompt: string;
2364
+ /** Structured completion data aligned with ChatResult */
2365
+ completion: CompletionInfo;
2366
+ }
2367
+
2368
+ /** Reward distribution statistics (NAPI wrapper) */
2369
+ export interface RewardStats {
2370
+ count: number;
2371
+ mean: number;
2372
+ std: number;
2373
+ min: number;
2374
+ max: number;
2375
+ median: number;
2376
+ p25: number;
2377
+ p75: number;
2378
+ }
2379
+
2380
+ /** Aggregate statistics for a training run for resume state (NAPI wrapper) */
2381
+ export interface RunAggregates {
2382
+ /** Best (highest) reward seen */
2383
+ bestReward: number;
2384
+ /** Average reward */
2385
+ avgReward: number;
2386
+ /** Total reward count */
2387
+ rewardCount: number;
2388
+ /** Best (lowest) loss seen */
2389
+ bestLoss: number;
2390
+ /** Average loss */
2391
+ avgLoss: number;
2392
+ /** Total loss count */
2393
+ lossCount: number;
2394
+ /** Total tokens generated */
2395
+ totalTokens: number;
2396
+ /** Current step number */
2397
+ currentStep: number;
2398
+ /** Average generation time (milliseconds) */
2399
+ avgGenerationTimeMs: number;
2400
+ /** Average training time (milliseconds) */
2401
+ avgTrainingTimeMs: number;
2402
+ }
2403
+
2404
+ /**
2405
+ * Configuration for sampling strategies
2406
+ * ⚡ PERFORMANCE: Made Copy to avoid cloning on every token
2407
+ */
2408
+ export interface SamplingConfig {
2409
+ /** Temperature for softmax (default: 1.0). Lower = more deterministic */
2410
+ temperature?: number;
2411
+ /** Number of top tokens to keep (top-k sampling). 0 = disabled */
2412
+ topK?: number;
2413
+ /** Cumulative probability threshold (top-p/nucleus sampling). 1.0 = disabled */
2414
+ topP?: number;
2415
+ /** Minimum probability threshold relative to max (min-p sampling). 0 = disabled */
2416
+ minP?: number;
2417
+ }
2418
+
2419
+ /** Scheduler statistics (NAPI-compatible) */
2420
+ export interface SchedulerStatsNapi {
2421
+ /** Number of requests waiting to be scheduled */
2422
+ numWaiting: number;
2423
+ /** Number of sequences currently running */
2424
+ numRunning: number;
2425
+ /** Number of completed sequences */
2426
+ numCompleted: number;
2427
+ /** Number of sequences in prefill phase */
2428
+ numPrefill: number;
2429
+ /** Number of sequences in decode phase */
2430
+ numDecode: number;
2431
+ /** Total tokens across all running sequences */
2432
+ totalRunningTokens: number;
2433
+ }
2434
+
2435
+ /** Configuration for the SFT training engine */
2436
+ export interface SftEngineConfig {
2437
+ /** Learning rate (default: 2e-5) */
2438
+ learningRate?: number;
2439
+ /** Gradient accumulation steps (default: 1) */
2440
+ gradientAccumulationSteps?: number;
2441
+ /** Maximum gradient norm for clipping (default: 1.0) */
2442
+ gradientClipNorm?: number;
2443
+ /** Maximum gradient value for element-wise clipping (optional) */
2444
+ gradientClipValue?: number;
2445
+ /** Weight decay (L2 regularization) (default: 0.01) */
2446
+ weightDecay?: number;
2447
+ /** Label smoothing factor (default: 0.0) */
2448
+ labelSmoothing?: number;
2449
+ /** Steps between heavy cleanup (default: 25) */
2450
+ heavyCleanupInterval?: number;
2451
+ /** Maximum allowed NaN gradient occurrences (default: 100) */
2452
+ maxNanGradients?: number;
2453
+ /** Consecutive NaN gradients that trigger emergency checkpoint (default: 5) */
2454
+ emergencySaveThreshold?: number;
2455
+ /** Compute token accuracy (requires extra forward pass) (default: false) */
2456
+ computeAccuracy?: boolean;
2457
+ /**
2458
+ * Enable detailed NaN/Inf detection with per-element counts (default: false)
2459
+ * When false (default), uses GPU-native has_nan_or_inf() which only transfers a single
2460
+ * boolean to CPU. When true, transfers the entire gradient tensor to CPU for detailed
2461
+ * per-element analysis - useful for debugging but has significant performance overhead.
2462
+ */
2463
+ verboseNanDetection?: boolean;
2464
+ }
2465
+
2466
+ /** Metrics from a training epoch */
2467
+ export interface SftEpochMetrics {
2468
+ /** Epoch number */
2469
+ epoch: number;
2470
+ /** Average loss for the epoch */
2471
+ avgLoss: number;
2472
+ /** Total steps in the epoch */
2473
+ totalSteps: number;
2474
+ /** Total tokens processed */
2475
+ totalTokens: number;
2476
+ /** Time for the epoch (seconds) */
2477
+ epochTimeSecs: number;
2478
+ }
2479
+
2480
+ /** Metrics from a single training step */
2481
+ export interface SftStepMetrics {
2482
+ /** Current step number */
2483
+ step: number;
2484
+ /** Cross-entropy loss value */
2485
+ loss: number;
2486
+ /** Total tokens processed this step (non-ignored) */
2487
+ totalTokens: number;
2488
+ /** Token-level accuracy (if compute_accuracy enabled) */
2489
+ tokenAccuracy?: number;
2490
+ /** Whether gradients were applied (vs accumulated) */
2491
+ gradientsApplied: boolean;
2492
+ /** Time for training step (ms) */
2493
+ trainingTimeMs: number;
2494
+ }
2495
+
2496
+ /** Metrics from a single training step for sparkline restoration (NAPI wrapper) */
2497
+ export interface StepMetricSummary {
2498
+ /** Step number */
2499
+ step: number;
2500
+ /** Loss value */
2501
+ loss: number;
2502
+ /** Mean reward (GRPO) */
2503
+ meanReward: number;
2504
+ /** Mean advantage (GRPO) */
2505
+ meanAdvantage: number;
2506
+ /** Std advantage (GRPO) - indicates reward variance within groups */
2507
+ stdAdvantage: number;
2508
+ /** Perplexity (SFT, optional) */
2509
+ perplexity?: number;
2510
+ /** Token accuracy (SFT, optional) */
2511
+ tokenAccuracy?: number;
2512
+ /** Total tokens this step */
2513
+ totalTokens: number;
2514
+ /** Time for generation phase (milliseconds) */
2515
+ generationTimeMs?: number;
2516
+ /** Time for training phase (milliseconds) */
2517
+ trainingTimeMs?: number;
2518
+ }
2519
+
2520
+ /** A training step record (NAPI wrapper) */
2521
+ export interface StepRecord {
2522
+ runId: string;
2523
+ step: number;
2524
+ epoch?: number;
2525
+ loss: number;
2526
+ meanReward: number;
2527
+ stdReward: number;
2528
+ meanAdvantage?: number;
2529
+ stdAdvantage: number;
2530
+ totalTokens?: number;
2531
+ generationTimeMs?: number;
2532
+ trainingTimeMs?: number;
2533
+ gradientsApplied: boolean;
2534
+ }
2535
+
2536
+ /** Summary of a training step (NAPI wrapper) */
2537
+ export interface StepSummary {
2538
+ step: number;
2539
+ loss: number;
2540
+ meanReward: number;
2541
+ numGenerations: number;
2542
+ numToolCalls: number;
2543
+ eosCount: number;
2544
+ lengthCount: number;
2545
+ }
2546
+
2547
+ /** A table structure */
2548
+ export interface Table {
2549
+ rows: Array<TableRow>;
2550
+ }
2551
+
2552
+ /** A single cell in a table */
2553
+ export interface TableCell {
2554
+ content: string;
2555
+ isEmpty: boolean;
2556
+ }
2557
+
2558
+ /** A row in a table */
2559
+ export interface TableRow {
2560
+ cells: Array<TableCell>;
2561
+ }
2562
+
2563
+ /** Language model (text decoder) configuration */
2564
+ export interface TextConfig {
2565
+ modelType: string;
2566
+ hiddenSize: number;
2567
+ numHiddenLayers: number;
2568
+ intermediateSize: number;
2569
+ numAttentionHeads: number;
2570
+ rmsNormEps: number;
2571
+ vocabSize: number;
2572
+ numKeyValueHeads: number;
2573
+ maxPositionEmbeddings: number;
2574
+ ropeTheta: number;
2575
+ ropeTraditional: boolean;
2576
+ useBias: boolean;
2577
+ headDim: number;
2578
+ /**
2579
+ * Multimodal RoPE sections: [temporal, height, width]
2580
+ * These define how the head_dim is split for 3D position encoding
2581
+ */
2582
+ mropeSection: Array<number>;
2583
+ }
2584
+
2585
+ /** Tool call made by an assistant */
2586
+ export interface ToolCall {
2587
+ /** Optional unique identifier for the tool call */
2588
+ id?: string;
2589
+ /** Name of the tool/function to call */
2590
+ name: string;
2591
+ /** JSON string of arguments to pass to the tool */
2592
+ arguments: string;
2593
+ }
2594
+
2595
+ /** A tool call record (NAPI wrapper) */
2596
+ export interface ToolCallRecord {
2597
+ callIndex: number;
2598
+ status: string;
2599
+ toolName?: string;
2600
+ arguments?: string;
2601
+ rawContent: string;
2602
+ errorMessage?: string;
2603
+ }
2604
+
2605
+ /** Structured tool call with parsed arguments */
2606
+ export interface ToolCallResult {
2607
+ /** Unique identifier for this tool call (format: call_<uuid>) */
2608
+ id: string;
2609
+ /** Name of the tool/function to call */
2610
+ name: string;
2611
+ /**
2612
+ * Parsed arguments as native object (serde_json::Value -> JS object)
2613
+ *
2614
+ * When status is "ok", this contains the parsed arguments object.
2615
+ * When status is "parse_error", this contains the original unparsed string.
2616
+ * Otherwise, this is an empty object {}.
2617
+ */
2618
+ arguments: Record<string, unknown> | string;
2619
+ /**
2620
+ * Parsing status: "ok" | "invalid_json" | "missing_name" | "parse_error"
2621
+ *
2622
+ * - "ok": Successfully parsed tool call
2623
+ * - "invalid_json": The tool_call tag content was not valid JSON
2624
+ * - "missing_name": Valid JSON but no "name" field
2625
+ * - "parse_error": Valid JSON but the "arguments" string field couldn't be parsed as JSON
2626
+ */
2627
+ status: string;
2628
+ /** Error message if status != "ok" */
2629
+ error?: string;
2630
+ /**
2631
+ * Raw content from <tool_call> tag (preserved for debugging/persistence)
2632
+ * Defaults to empty string for backward compatibility with older JSON
2633
+ */
2634
+ rawContent: string;
2635
+ }
2636
+
2637
+ /** OpenAI-compatible tool definition */
2638
+ export interface ToolDefinition {
2639
+ /** Tool type (currently only "function" is supported) */
2640
+ type: string;
2641
+ /** Function definition */
2642
+ function: FunctionDefinition;
2643
+ }
2644
+
2645
+ /** A training run record (NAPI wrapper) */
2646
+ export interface TrainingRunRecord {
2647
+ id: string;
2648
+ name?: string;
2649
+ modelName: string;
2650
+ modelPath?: string;
2651
+ config: string;
2652
+ startedAt: number;
2653
+ endedAt?: number;
2654
+ totalSteps: number;
2655
+ status: string;
2656
+ }
2657
+
2658
+ /** Result from train_step_auto including metrics, completions, and rewards */
2659
+ export interface TrainStepResult {
2660
+ /** Training metrics */
2661
+ metrics: EngineStepMetrics;
2662
+ /** Generated completion texts (for TUI logging) */
2663
+ completions: Array<string>;
2664
+ /** Computed reward values (for TUI logging) */
2665
+ rewards: Array<number>;
2666
+ }
2667
+
2668
+ /** Result from train_step_auto_with_recording including optional full RewardOutput data */
2669
+ export interface TrainStepResultWithOutputs {
2670
+ /** Training metrics */
2671
+ metrics: EngineStepMetrics;
2672
+ /** Generated completion texts (for TUI logging) */
2673
+ completions: Array<string>;
2674
+ /** Computed reward values (for TUI logging) */
2675
+ rewards: Array<number>;
2676
+ /**
2677
+ * Full RewardOutput data as JSON (only populated when record_outputs is true)
2678
+ * This enables zero-copy persistence of training outputs
2679
+ */
2680
+ outputsJson?: string;
2681
+ /** Actual token counts for each completion (for accurate TUI display) */
2682
+ completionLengths: Array<number>;
2683
+ }
2684
+
2685
+ /** Vision encoder configuration */
2686
+ export interface VisionConfig {
2687
+ modelType: string;
2688
+ hiddenSize: number;
2689
+ intermediateSize: number;
2690
+ numHiddenLayers: number;
2691
+ numAttentionHeads: number;
2692
+ numChannels: number;
2693
+ imageSize: number;
2694
+ patchSize: number;
2695
+ hiddenAct: string;
2696
+ layerNormEps: number;
2697
+ attentionDropout: number;
2698
+ spatialMergeSize: number;
2699
+ }
2700
+
2701
+ /** Configuration for VLM chat */
2702
+ export interface VlmChatConfig {
2703
+ /**
2704
+ * Image paths to process (alternative to passing pre-processed images)
2705
+ * These will be automatically processed using the ImageProcessor
2706
+ */
2707
+ imagePaths?: Array<string>;
2708
+ /** Maximum number of new tokens to generate (default: 512) */
2709
+ maxNewTokens?: number;
2710
+ /** Sampling temperature (0 = greedy, higher = more random) (default: 0.0 for OCR) */
2711
+ temperature?: number;
2712
+ /** Top-k sampling (default: 0) */
2713
+ topK?: number;
2714
+ /** Top-p (nucleus) sampling (default: 1.0) */
2715
+ topP?: number;
2716
+ /** Repetition penalty (default: 1.5) */
2717
+ repetitionPenalty?: number;
2718
+ /** Whether to return log probabilities (default: false) */
2719
+ returnLogprobs?: boolean;
2720
+ }
2721
+
2722
+ /** A chat message with optional image */
2723
+ export interface VlmChatMessage {
2724
+ /** Role of the message sender */
2725
+ role: ChatRole;
2726
+ /** Text content of the message */
2727
+ content: string;
2728
+ }