@mlx-node/core 0.0.0 → 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/README.md +106 -0
  2. package/index.cjs +71 -53
  3. package/index.d.cts +965 -280
  4. package/package.json +16 -5
package/index.d.cts CHANGED
@@ -26,49 +26,81 @@ export declare class BatchGenerationResult {
26
26
  get groupSize(): number;
27
27
  }
28
28
 
29
+ /** Handle returned by `chat_stream()` to control an in-progress streaming generation. */
30
+ export declare class ChatStreamHandle {
31
+ cancel(): void;
32
+ }
33
+
29
34
  /**
30
- * Result from the high-level `chat()` API
35
+ * PP-DocLayoutV3 full model for document layout analysis.
31
36
  *
32
- * Contains structured responses with:
33
- * - Tool calls parsed as native JavaScript objects
34
- * - Thinking/reasoning extracted from `<think>` tags
35
- * - Clean text with all special tags stripped
37
+ * Combines HGNetV2 backbone, hybrid encoder, and RT-DETR decoder
38
+ * with mask-enhanced attention and reading order prediction.
36
39
  *
37
- * ## Example
38
- * ```typescript
39
- * const result = await model.chat(messages, { tools });
40
- * console.log(result.text); // Clean response
41
- * console.log(result.thinking); // Chain-of-thought (if any)
42
- * console.log(result.toolCalls); // Parsed tool calls
43
- * ```
40
+ * Weights must be downloaded from `PaddlePaddle/PP-DocLayoutV3_safetensors` on HuggingFace.
41
+ * The regular `PaddlePaddle/PP-DocLayoutV3` repo uses PaddlePaddle format and is not compatible.
44
42
  */
45
- export declare class ChatResult {
46
- /** Get the cleaned text (tool_call and think tags removed) */
47
- get text(): string;
48
- /** Get the extracted tool calls */
49
- get toolCalls(): Array<ToolCallResult>;
43
+ export declare class DocLayoutModel {
50
44
  /**
51
- * Get the extracted thinking/reasoning content
45
+ * Load a PP-DocLayoutV3 model from a directory containing `config.json` and `model.safetensors`.
52
46
  *
53
- * Returns the content from within `<think>...</think>` tags, or null if
54
- * no thinking tags were present in the response.
47
+ * The model directory should be cloned from `PaddlePaddle/PP-DocLayoutV3_safetensors` on HuggingFace.
55
48
  *
56
- * This is useful for:
57
- * - Debugging model reasoning
58
- * - Displaying chain-of-thought to users (optional)
59
- * - Analyzing model decision-making
49
+ * # Arguments
50
+ * * `model_path` - Path to model directory
51
+ *
52
+ * # Returns
53
+ * * Initialized DocLayoutModel ready for inference
60
54
  */
61
- get thinking(): string | null;
62
- /** Get the generated tokens */
63
- get tokens(): MxArray;
64
- /** Get the log probabilities */
65
- get logprobs(): MxArray;
66
- /** Get the finish reason ("stop", "length", "tool_calls", or "repetition") */
67
- get finishReason(): 'stop' | 'length' | 'tool_calls' | 'repetition';
68
- /** Get the number of tokens generated */
69
- get numTokens(): number;
70
- /** Get the raw text before tool call stripping (for debugging) */
71
- get rawText(): string;
55
+ static load(modelPath: string): DocLayoutModel;
56
+ /**
57
+ * Detect document layout elements in an image.
58
+ *
59
+ * # Arguments
60
+ * * `image_data` - Encoded image bytes (PNG/JPEG)
61
+ * * `threshold` - Optional confidence threshold (default 0.5)
62
+ *
63
+ * # Returns
64
+ * * Vec of LayoutElements sorted by reading order
65
+ */
66
+ detect(imageData: Buffer, threshold?: number | undefined | null): Array<LayoutElement>;
67
+ }
68
+ export type PPDocLayoutV3Model = DocLayoutModel;
69
+
70
+ /**
71
+ * PP-LCNet_x1_0 Document Orientation Classification model.
72
+ *
73
+ * Classifies document images into 4 orientation classes (0/90/180/270 degrees).
74
+ * Uses depthwise separable convolutions with HardSwish activation.
75
+ */
76
+ export declare class DocOrientationModel {
77
+ /** Load a DocOrientationModel from a directory containing model.safetensors and config.json. */
78
+ static load(modelPath: string): DocOrientationModel;
79
+ /**
80
+ * Classify the orientation of a document image.
81
+ *
82
+ * Returns the detected orientation angle (0, 90, 180, 270) and confidence.
83
+ */
84
+ classify(imageData: Buffer): OrientationResult;
85
+ /**
86
+ * Classify orientation and return the corrected (upright) image bytes.
87
+ *
88
+ * Returns classification result plus corrected PNG image bytes.
89
+ */
90
+ classifyAndRotate(imageData: Buffer): ClassifyRotateResult;
91
+ }
92
+
93
+ /**
94
+ * UVDoc Document Unwarping model.
95
+ *
96
+ * Predicts a 2D displacement field and applies it to correct perspective
97
+ * distortion in camera-captured documents.
98
+ */
99
+ export declare class DocUnwarpModel {
100
+ /** Load a DocUnwarpModel from a directory containing model.safetensors. */
101
+ static load(modelPath: string): DocUnwarpModel;
102
+ /** Unwarp a document image and return the corrected image bytes. */
103
+ unwarp(imageData: Buffer): UnwarpResult;
72
104
  }
73
105
 
74
106
  /** Result from text generation with detailed metadata */
@@ -79,8 +111,8 @@ export declare class GenerationResult {
79
111
  get tokens(): MxArray;
80
112
  /** Get the log probabilities */
81
113
  get logprobs(): MxArray;
82
- /** Get the finish reason ("eos", "length", or "repetition") */
83
- get finishReason(): 'eos' | 'length' | 'repetition';
114
+ /** Get the finish reason ("stop", "length", or "repetition") */
115
+ get finishReason(): 'stop' | 'length' | 'repetition';
84
116
  /** Get the number of tokens generated */
85
117
  get numTokens(): number;
86
118
  }
@@ -92,13 +124,17 @@ export declare class GenerationResult {
92
124
  */
93
125
  export declare class GrpoTrainingEngine {
94
126
  /**
95
- * Create a new training engine from an existing model
127
+ * Create a new training engine from a Qwen3 model
96
128
  *
97
129
  * # Arguments
98
130
  * * `model` - The Qwen3 model to train (will be cloned internally)
99
131
  * * `config` - Engine configuration
100
132
  */
101
133
  constructor(model: Qwen3Model, config: GrpoEngineConfig);
134
+ /** Create a new training engine from a Qwen3.5 dense model */
135
+ static fromQwen35(model: Qwen3_5Model, config: GrpoEngineConfig): GrpoTrainingEngine;
136
+ /** Create a new training engine from a Qwen3.5 MoE model */
137
+ static fromQwen35Moe(model: Qwen3_5MoeModel, config: GrpoEngineConfig): GrpoTrainingEngine;
102
138
  /** Register a built-in reward function */
103
139
  registerBuiltinReward(config: BuiltinRewardConfig): void;
104
140
  /**
@@ -203,10 +239,39 @@ export declare class GrpoTrainingEngine {
203
239
  get nanGradientCount(): number;
204
240
  /** Clear the emergency save flag (call after saving emergency checkpoint) */
205
241
  clearEmergencySaveFlag(): void;
242
+ /**
243
+ * Save optimizer state (moment tensors + step) to a SafeTensors file.
244
+ *
245
+ * The step counter is stored in the `__metadata__` field.
246
+ * Each parameter's first moment (m) and second moment (v) are stored as
247
+ * `{param_name}.m` and `{param_name}.v` tensors.
248
+ *
249
+ * No-op if the engine uses SGD (no optimizer state to save).
250
+ */
251
+ saveOptimizerState(path: string): void;
252
+ /**
253
+ * Load optimizer state (moment tensors + step) from a SafeTensors file.
254
+ *
255
+ * Restores the step counter from metadata and sets first/second moment
256
+ * tensors for each parameter found in the file.
257
+ *
258
+ * No-op if the engine uses SGD (no optimizer to restore).
259
+ */
260
+ loadOptimizerState(path: string): void;
206
261
  }
207
262
  export type GRPOTrainingEngine = GrpoTrainingEngine;
208
263
 
209
264
  export declare class MxArray {
265
+ equal(other: MxArray): MxArray;
266
+ notEqual(other: MxArray): MxArray;
267
+ less(other: MxArray): MxArray;
268
+ lessEqual(other: MxArray): MxArray;
269
+ greater(other: MxArray): MxArray;
270
+ greaterEqual(other: MxArray): MxArray;
271
+ logicalAnd(other: MxArray): MxArray;
272
+ logicalOr(other: MxArray): MxArray;
273
+ logicalNot(): MxArray;
274
+ where(x: MxArray, y: MxArray): MxArray;
210
275
  static fromInt32(data: Int32Array, shape: BigInt64Array): MxArray;
211
276
  static fromInt64(data: BigInt64Array, shape: BigInt64Array): MxArray;
212
277
  static fromUint32(data: Uint32Array, shape: BigInt64Array): MxArray;
@@ -234,60 +299,12 @@ export declare class MxArray {
234
299
  step?: number | undefined | null,
235
300
  dtype?: DType | undefined | null,
236
301
  ): MxArray;
237
- reshape(shape: BigInt64Array): MxArray;
238
302
  astype(dtype: DType): MxArray;
239
303
  /**
240
304
  * Create a copy of this array with a new handle.
241
305
  * This is useful for parameter loading to avoid handle aliasing issues.
242
306
  */
243
307
  copy(): MxArray;
244
- logSoftmax(axis: number): MxArray;
245
- exp(): MxArray;
246
- log(): MxArray;
247
- sum(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
248
- mean(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
249
- clip(minimum?: number | undefined | null, maximum?: number | undefined | null): MxArray;
250
- minimum(other: MxArray): MxArray;
251
- maximum(other: MxArray): MxArray;
252
- add(other: MxArray): MxArray;
253
- sub(other: MxArray): MxArray;
254
- mul(other: MxArray): MxArray;
255
- div(other: MxArray): MxArray;
256
- addScalar(value: number): MxArray;
257
- mulScalar(value: number): MxArray;
258
- subScalar(value: number): MxArray;
259
- divScalar(value: number): MxArray;
260
- matmul(other: MxArray): MxArray;
261
- /**
262
- * Fused matrix multiply-add: D = beta * C + alpha * (self @ B)
263
- * where self is A. More efficient than separate matmul and add operations.
264
- * Default: alpha=1.0, beta=1.0, giving D = C + (self @ B)
265
- */
266
- addmm(c: MxArray, b: MxArray, alpha?: number | undefined | null, beta?: number | undefined | null): MxArray;
267
- transpose(axes?: Int32Array | undefined | null): MxArray;
268
- take(indices: MxArray, axis: number): MxArray;
269
- takeAlongAxis(indices: MxArray, axis: number): MxArray;
270
- /**
271
- * Put values into array at specified indices along an axis
272
- * Equivalent to: result = array.copy(); result[..., indices] = values
273
- * This matches MLX's put_along_axis for efficient in-place-style updates
274
- */
275
- putAlongAxis(indices: MxArray, values: MxArray, axis: number): MxArray;
276
- slice(starts: BigInt64Array, stops: BigInt64Array): MxArray;
277
- /**
278
- * Concatenate two arrays along an axis
279
- * Optimized for the common binary concatenation case
280
- */
281
- static concatenate(a: MxArray, b: MxArray, axis: number): MxArray;
282
- /**
283
- * Concatenate multiple arrays along an axis
284
- * For concatenating 3 or more arrays
285
- */
286
- static concatenateMany(arrays: Array<MxArray>, axis?: number | undefined | null): MxArray;
287
- sort(axis?: number | undefined | null): MxArray;
288
- argsort(axis?: number | undefined | null): MxArray;
289
- partition(kth: number, axis?: number | undefined | null): MxArray;
290
- argpartition(kth: number, axis?: number | undefined | null): MxArray;
291
308
  eval(): void;
292
309
  evalAsync(): Promise<undefined>;
293
310
  size(): bigint;
@@ -314,7 +331,7 @@ export declare class MxArray {
314
331
  /**
315
332
  * Copy entire array from GPU to CPU as Float32Array
316
333
  *
317
- * ⚠吅 **PERFORMANCE WARNING**: This triggers a FULL GPUCPU memory transfer!
334
+ * **PERFORMANCE WARNING**: This triggers a FULL GPU->CPU memory transfer!
318
335
  *
319
336
  * **Performance impact**:
320
337
  * - Forces evaluation of lazy operations
@@ -335,7 +352,7 @@ export declare class MxArray {
335
352
  /**
336
353
  * Copy entire array from GPU to CPU as Int32Array
337
354
  *
338
- * ⚠吅 **PERFORMANCE WARNING**: This triggers a FULL GPUCPU memory transfer!
355
+ * **PERFORMANCE WARNING**: This triggers a FULL GPU->CPU memory transfer!
339
356
  *
340
357
  * See `to_float32()` documentation for performance implications and alternatives.
341
358
  * Prefer `item_int32()` for scalars.
@@ -344,57 +361,32 @@ export declare class MxArray {
344
361
  /**
345
362
  * Copy entire array from GPU to CPU as Uint32Array
346
363
  *
347
- * ⚠吅 **PERFORMANCE WARNING**: This triggers a FULL GPUCPU memory transfer!
364
+ * **PERFORMANCE WARNING**: This triggers a FULL GPU->CPU memory transfer!
348
365
  *
349
366
  * See `to_float32()` documentation for performance implications and alternatives.
350
367
  */
351
368
  toUint32(): Uint32Array;
352
- static stack(arrays: Array<MxArray>, axis?: number | undefined | null): MxArray;
353
- static randomUniform(shape: BigInt64Array, low: number, high: number, dtype?: DType | undefined | null): MxArray;
354
- static randomNormal(shape: BigInt64Array, mean: number, std: number, dtype?: DType | undefined | null): MxArray;
355
- static randomBernoulli(shape: BigInt64Array, prob: number): MxArray;
356
- static randint(shape: BigInt64Array, low: number, high: number): MxArray;
369
+ logSoftmax(axis: number): MxArray;
370
+ exp(): MxArray;
371
+ log(): MxArray;
372
+ clip(minimum?: number | undefined | null, maximum?: number | undefined | null): MxArray;
373
+ minimum(other: MxArray): MxArray;
374
+ maximum(other: MxArray): MxArray;
375
+ add(other: MxArray): MxArray;
376
+ sub(other: MxArray): MxArray;
377
+ mul(other: MxArray): MxArray;
378
+ div(other: MxArray): MxArray;
379
+ addScalar(value: number): MxArray;
380
+ mulScalar(value: number): MxArray;
381
+ subScalar(value: number): MxArray;
382
+ divScalar(value: number): MxArray;
383
+ matmul(other: MxArray): MxArray;
357
384
  /**
358
- * Sample from categorical distribution
359
- * Takes logits and returns sampled indices
385
+ * Fused matrix multiply-add: D = beta * C + alpha * (self @ B)
386
+ * where self is A. More efficient than separate matmul and add operations.
387
+ * Default: alpha=1.0, beta=1.0, giving D = C + (self @ B)
360
388
  */
361
- categorical(axis?: number | undefined | null): MxArray;
362
- equal(other: MxArray): MxArray;
363
- notEqual(other: MxArray): MxArray;
364
- less(other: MxArray): MxArray;
365
- lessEqual(other: MxArray): MxArray;
366
- greater(other: MxArray): MxArray;
367
- greaterEqual(other: MxArray): MxArray;
368
- logicalAnd(other: MxArray): MxArray;
369
- logicalOr(other: MxArray): MxArray;
370
- logicalNot(): MxArray;
371
- where(x: MxArray, y: MxArray): MxArray;
372
- argmax(axis: number, keepdims?: boolean | undefined | null): MxArray;
373
- argmin(axis: number, keepdims?: boolean | undefined | null): MxArray;
374
- max(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
375
- min(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
376
- prod(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
377
- var(
378
- axes?: Int32Array | undefined | null,
379
- keepdims?: boolean | undefined | null,
380
- ddof?: number | undefined | null,
381
- ): MxArray;
382
- std(
383
- axes?: Int32Array | undefined | null,
384
- keepdims?: boolean | undefined | null,
385
- ddof?: number | undefined | null,
386
- ): MxArray;
387
- logsumexp(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
388
- cumsum(axis: number): MxArray;
389
- cumprod(axis: number): MxArray;
390
- pad(padWidth: Int32Array, constantValue: number): MxArray;
391
- roll(shift: number, axis: number): MxArray;
392
- split(indicesOrSections: number, axis?: number | undefined | null): Array<MxArray>;
393
- tile(reps: Int32Array): MxArray;
394
- repeat(repeats: number, axis: number): MxArray;
395
- squeeze(axes?: Int32Array | undefined | null): MxArray;
396
- expandDims(axis: number): MxArray;
397
- broadcastTo(shape: BigInt64Array): MxArray;
389
+ addmm(c: MxArray, b: MxArray, alpha?: number | undefined | null, beta?: number | undefined | null): MxArray;
398
390
  abs(): MxArray;
399
391
  negative(): MxArray;
400
392
  sign(): MxArray;
@@ -440,6 +432,69 @@ export declare class MxArray {
440
432
  * This is a GPU-native operation that avoids CPU data transfer.
441
433
  */
442
434
  isfinite(): MxArray;
435
+ static randomUniform(shape: BigInt64Array, low: number, high: number, dtype?: DType | undefined | null): MxArray;
436
+ static randomNormal(shape: BigInt64Array, mean: number, std: number, dtype?: DType | undefined | null): MxArray;
437
+ static randomBernoulli(shape: BigInt64Array, prob: number): MxArray;
438
+ static randint(shape: BigInt64Array, low: number, high: number): MxArray;
439
+ /**
440
+ * Sample from categorical distribution
441
+ * Takes logits and returns sampled indices
442
+ */
443
+ categorical(axis?: number | undefined | null): MxArray;
444
+ sum(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
445
+ mean(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
446
+ argmax(axis: number, keepdims?: boolean | undefined | null): MxArray;
447
+ argmin(axis: number, keepdims?: boolean | undefined | null): MxArray;
448
+ max(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
449
+ min(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
450
+ prod(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
451
+ var(
452
+ axes?: Int32Array | undefined | null,
453
+ keepdims?: boolean | undefined | null,
454
+ ddof?: number | undefined | null,
455
+ ): MxArray;
456
+ std(
457
+ axes?: Int32Array | undefined | null,
458
+ keepdims?: boolean | undefined | null,
459
+ ddof?: number | undefined | null,
460
+ ): MxArray;
461
+ logsumexp(axes?: Int32Array | undefined | null, keepdims?: boolean | undefined | null): MxArray;
462
+ cumsum(axis: number): MxArray;
463
+ cumprod(axis: number): MxArray;
464
+ reshape(shape: BigInt64Array): MxArray;
465
+ transpose(axes?: Int32Array | undefined | null): MxArray;
466
+ take(indices: MxArray, axis: number): MxArray;
467
+ takeAlongAxis(indices: MxArray, axis: number): MxArray;
468
+ /**
469
+ * Put values into array at specified indices along an axis
470
+ * Equivalent to: result = array.copy(); result[..., indices] = values
471
+ * This matches MLX's put_along_axis for efficient in-place-style updates
472
+ */
473
+ putAlongAxis(indices: MxArray, values: MxArray, axis: number): MxArray;
474
+ slice(starts: BigInt64Array, stops: BigInt64Array): MxArray;
475
+ /**
476
+ * Concatenate two arrays along an axis
477
+ * Optimized for the common binary concatenation case
478
+ */
479
+ static concatenate(a: MxArray, b: MxArray, axis: number): MxArray;
480
+ /**
481
+ * Concatenate multiple arrays along an axis
482
+ * For concatenating 3 or more arrays
483
+ */
484
+ static concatenateMany(arrays: Array<MxArray>, axis?: number | undefined | null): MxArray;
485
+ sort(axis?: number | undefined | null): MxArray;
486
+ argsort(axis?: number | undefined | null): MxArray;
487
+ partition(kth: number, axis?: number | undefined | null): MxArray;
488
+ argpartition(kth: number, axis?: number | undefined | null): MxArray;
489
+ static stack(arrays: Array<MxArray>, axis?: number | undefined | null): MxArray;
490
+ pad(padWidth: Int32Array, constantValue: number): MxArray;
491
+ roll(shift: number, axis: number): MxArray;
492
+ split(indicesOrSections: number, axis?: number | undefined | null): Array<MxArray>;
493
+ tile(reps: Int32Array): MxArray;
494
+ repeat(repeats: number, axis: number): MxArray;
495
+ squeeze(axes?: Int32Array | undefined | null): MxArray;
496
+ expandDims(axis: number): MxArray;
497
+ broadcastTo(shape: BigInt64Array): MxArray;
443
498
  }
444
499
 
445
500
  /** NAPI-exported reward registry wrapper */
@@ -584,6 +639,128 @@ export declare class OutputStore {
584
639
  queryRaw(sql: string): Promise<string>;
585
640
  }
586
641
 
642
+ /**
643
+ * Qwen3.5 Model -- hybrid linear/full attention with optional MoE.
644
+ *
645
+ * Uses interior mutability (RwLock) for layers, final_norm, lm_head, and caches
646
+ * to allow async generation via spawn_blocking without blocking the Node.js event loop.
647
+ * This matches the pattern used by Qwen3Model.
648
+ */
649
+ export declare class Qwen35Model {
650
+ /** Create a new Qwen3.5 model with the given configuration. */
651
+ constructor(config: Qwen35Config);
652
+ /** Initialize caches for incremental generation. */
653
+ initCaches(): void;
654
+ /** Reset all caches. */
655
+ resetCaches(): void;
656
+ /**
657
+ * Forward pass through the model.
658
+ *
659
+ * # Arguments
660
+ * * `input_ids` - Token IDs [B, T]
661
+ *
662
+ * # Returns
663
+ * Logits [B, T, vocab_size]
664
+ */
665
+ forward(inputIds: MxArray): MxArray;
666
+ /** Forward pass with cache for incremental generation. */
667
+ forwardWithCache(inputIds: MxArray): MxArray;
668
+ /**
669
+ * Load a pretrained model from a directory.
670
+ *
671
+ * Expects the directory to contain:
672
+ * - config.json
673
+ * - model.safetensors (or model-*.safetensors)
674
+ * - tokenizer.json + tokenizer_config.json
675
+ */
676
+ static load(path: string): Promise<Qwen35Model>;
677
+ /**
678
+ * Generate text from a prompt token sequence.
679
+ *
680
+ * Runs generation on a worker thread via spawn_blocking to avoid
681
+ * blocking the Node.js event loop.
682
+ */
683
+ generate(promptTokens: MxArray, config: Qwen35GenerationConfig): Promise<Qwen35GenerationResult>;
684
+ /**
685
+ * Chat API with tool calling support.
686
+ *
687
+ * Runs tokenization + generation on a worker thread via spawn_blocking
688
+ * to avoid blocking the Node.js event loop.
689
+ */
690
+ chat(messages: Array<ChatMessage>, config?: ChatConfig | undefined | null): Promise<ChatResult>;
691
+ /**
692
+ * Streaming chat API with tool calling support.
693
+ *
694
+ * Same as `chat()` but streams tokens one-by-one via the callback.
695
+ * Returns a `ChatStreamHandle` immediately; generation runs in background.
696
+ * Call `handle.cancel()` to abort generation early.
697
+ */
698
+ chatStream(
699
+ messages: ChatMessage[],
700
+ config: ChatConfig | null,
701
+ callback: (err: Error | null, chunk: ChatStreamChunk) => void,
702
+ ): Promise<ChatStreamHandle>;
703
+ /** Get the number of parameters in the model. */
704
+ numParameters(): number;
705
+ /**
706
+ * Save the model weights and configuration to a directory.
707
+ *
708
+ * This saves:
709
+ * - config.json: Model configuration (with model_type for detectModelType)
710
+ * - weights.safetensors: Full model weights in SafeTensors format
711
+ * - weights.mlx: Parameter metadata (for reference)
712
+ *
713
+ * # Arguments
714
+ * * `save_path` - Directory to save the model
715
+ */
716
+ saveModel(savePath: string): Promise<undefined>;
717
+ }
718
+ export type Qwen3_5Model = Qwen35Model;
719
+
720
+ /**
721
+ * Qwen3.5 MoE Model -- hybrid linear/full attention with Mixture-of-Experts.
722
+ *
723
+ * Supports C++ MoE forward path (non-compiled, builds fresh graph per step)
724
+ * when weights are registered via `register_moe_weights_with_cpp`.
725
+ * Falls back to Rust forward_inner path for test models without stored weights.
726
+ */
727
+ export declare class Qwen35MoeModel {
728
+ constructor(config: Qwen35MoeConfig);
729
+ initCaches(): void;
730
+ resetCaches(): void;
731
+ forward(inputIds: MxArray): MxArray;
732
+ forwardWithCache(inputIds: MxArray): MxArray;
733
+ static load(path: string): Promise<Qwen35MoeModel>;
734
+ generate(promptTokens: MxArray, config: Qwen35MoeGenerationConfig): Promise<Qwen35MoeGenerationResult>;
735
+ chat(messages: Array<ChatMessage>, config?: ChatConfig | undefined | null): Promise<ChatResult>;
736
+ /**
737
+ * Streaming chat API with tool calling support.
738
+ *
739
+ * Same as `chat()` but streams tokens one-by-one via the callback.
740
+ * Returns a `ChatStreamHandle` immediately; generation runs in background.
741
+ * Call `handle.cancel()` to abort generation early.
742
+ */
743
+ chatStream(
744
+ messages: ChatMessage[],
745
+ config: ChatConfig | null,
746
+ callback: (err: Error | null, chunk: ChatStreamChunk) => void,
747
+ ): Promise<ChatStreamHandle>;
748
+ numParameters(): number;
749
+ /**
750
+ * Save the model weights and configuration to a directory.
751
+ *
752
+ * This saves:
753
+ * - config.json: Model configuration (with model_type for detectModelType)
754
+ * - weights.safetensors: Full model weights in SafeTensors format
755
+ * - weights.mlx: Parameter metadata (for reference)
756
+ *
757
+ * # Arguments
758
+ * * `save_path` - Directory to save the model
759
+ */
760
+ saveModel(savePath: string): Promise<undefined>;
761
+ }
762
+ export type Qwen3_5MoeModel = Qwen35MoeModel;
763
+
587
764
  /**
588
765
  * Qwen3 Model with automatic differentiation support
589
766
  *
@@ -750,8 +927,8 @@ export declare class Qwen3Model {
750
927
  *
751
928
  * # Example (TypeScript)
752
929
  * ```typescript
753
- * const targetModel = await ModelLoader.loadPretrained('qwen3-7b');
754
- * const draftModel = await ModelLoader.loadPretrained('qwen3-0.5b');
930
+ * const targetModel = await loadModel('qwen3-7b');
931
+ * const draftModel = await loadModel('qwen3-0.5b');
755
932
  *
756
933
  * const result = targetModel.generateSpeculativeSync(draftModel, inputIds, {
757
934
  * numDraftTokens: 5,
@@ -941,7 +1118,7 @@ export declare class Qwen3Model {
941
1118
  *
942
1119
  * # Example
943
1120
  * ```typescript
944
- * const model = await Qwen3Model.loadPretrained("path/to/model");
1121
+ * const model = await Qwen3Model.load("path/to/model");
945
1122
  * const messages = [
946
1123
  * { role: "user", content: "What is 2+2?" }
947
1124
  * ];
@@ -1070,7 +1247,7 @@ export declare class Qwen3Model {
1070
1247
  * Decode token IDs to text using the internal tokenizer
1071
1248
  *
1072
1249
  * Helper method for decoding generated tokens. The model must have been loaded
1073
- * via load_pretrained() to have a tokenizer available.
1250
+ * via load() to have a tokenizer available.
1074
1251
  *
1075
1252
  * # Arguments
1076
1253
  * * `token_ids` - Token IDs to decode as Uint32Array
@@ -1084,7 +1261,7 @@ export declare class Qwen3Model {
1084
1261
  * Apply chat template and encode to token IDs
1085
1262
  *
1086
1263
  * Formats messages using ChatML format (or Jinja2 template with tools) and encodes to tokens.
1087
- * The model must have been loaded via load_pretrained() to have a tokenizer available.
1264
+ * The model must have been loaded via load() to have a tokenizer available.
1088
1265
  *
1089
1266
  * # Arguments
1090
1267
  * * `messages` - Array of chat messages
@@ -1115,7 +1292,7 @@ export declare class Qwen3Model {
1115
1292
  * # Returns
1116
1293
  * * A fully initialized Qwen3Model with loaded weights
1117
1294
  */
1118
- static loadPretrained(modelPath: string): Promise<Qwen3Model>;
1295
+ static load(modelPath: string): Promise<Qwen3Model>;
1119
1296
  /**
1120
1297
  * Save model configuration and weights to disk
1121
1298
  *
@@ -1278,8 +1455,12 @@ export declare class Qwen3Tokenizer {
1278
1455
 
1279
1456
  /** SFT Training Engine */
1280
1457
  export declare class SftTrainingEngine {
1281
- /** Create a new SFT training engine */
1458
+ /** Create a new SFT training engine from a Qwen3 model */
1282
1459
  constructor(model: Qwen3Model, config: SftEngineConfig);
1460
+ /** Create a new SFT training engine from a Qwen3.5 dense model */
1461
+ static fromQwen35(model: Qwen35Model, config: SftEngineConfig): SftTrainingEngine;
1462
+ /** Create a new SFT training engine from a Qwen3.5 MoE model */
1463
+ static fromQwen35Moe(model: Qwen35MoeModel, config: SftEngineConfig): SftTrainingEngine;
1283
1464
  /** Run a single training step */
1284
1465
  trainStep(inputIds: MxArray, labels: MxArray): Promise<SftStepMetrics>;
1285
1466
  /** Get current step number */
@@ -1318,8 +1499,12 @@ export declare class SftTrainingEngine {
1318
1499
  reset(): void;
1319
1500
  /** Restore training state (for resuming from checkpoint) */
1320
1501
  restoreState(step: number, epoch: number): void;
1321
- /** Get the underlying model for checkpointing */
1502
+ /** Get the underlying Qwen3 model for checkpointing */
1322
1503
  getModel(): Qwen3Model;
1504
+ /** Get the underlying Qwen3.5 dense model for checkpointing */
1505
+ getQwen35Model(): Qwen35Model;
1506
+ /** Get the underlying Qwen3.5 MoE model for checkpointing */
1507
+ getQwen35MoeModel(): Qwen35MoeModel;
1323
1508
  }
1324
1509
 
1325
1510
  /**
@@ -1363,25 +1548,112 @@ export declare class Tensor {
1363
1548
  /** Convert to Int32 array */
1364
1549
  toInt32(): Int32Array;
1365
1550
  /**
1366
- * Detach this tensor from the computation graph
1551
+ * Detach this tensor from the computation graph
1552
+ *
1553
+ * Returns a new tensor with the same data but no gradient tracking
1554
+ */
1555
+ detach(): Tensor;
1556
+ /** Create a tensor of zeros */
1557
+ static zeros(
1558
+ shape: BigInt64Array,
1559
+ dtype?: DType | undefined | null,
1560
+ requiresGrad?: boolean | undefined | null,
1561
+ ): Tensor;
1562
+ /** Create a tensor of ones */
1563
+ static ones(
1564
+ shape: BigInt64Array,
1565
+ dtype?: DType | undefined | null,
1566
+ requiresGrad?: boolean | undefined | null,
1567
+ ): Tensor;
1568
+ /** Evaluate the underlying array */
1569
+ eval(): void;
1570
+ }
1571
+
1572
+ /**
1573
+ * PP-OCRv5 Text Detection model (DBNet with PPHGNetV2 backbone).
1574
+ *
1575
+ * Detects text lines in document images and returns bounding boxes.
1576
+ */
1577
+ export declare class TextDetModel {
1578
+ /**
1579
+ * Load a TextDetModel from a directory containing model.safetensors.
1580
+ *
1581
+ * # Arguments
1582
+ * * `model_path` - Path to model directory
1583
+ */
1584
+ static load(modelPath: string): TextDetModel;
1585
+ /**
1586
+ * Detect text lines in an image.
1587
+ *
1588
+ * # Arguments
1589
+ * * `image_data` - Encoded image bytes (PNG/JPEG)
1590
+ * * `threshold` - Optional detection threshold (default from config, typically 0.3)
1591
+ *
1592
+ * # Returns
1593
+ * * Vec of TextBox with bounding boxes and confidence scores
1594
+ */
1595
+ detect(imageData: Buffer, threshold?: number | undefined | null): Array<TextBox>;
1596
+ /**
1597
+ * Detect text lines from raw RGB pixel data.
1598
+ *
1599
+ * # Arguments
1600
+ * * `rgb_data` - Raw RGB pixel data
1601
+ * * `width` - Image width
1602
+ * * `height` - Image height
1603
+ * * `threshold` - Optional detection threshold (default from config)
1604
+ *
1605
+ * # Returns
1606
+ * * Vec of TextBox with bounding boxes and confidence scores
1607
+ */
1608
+ detectCrop(rgbData: Uint8Array, width: number, height: number, threshold?: number | undefined | null): Array<TextBox>;
1609
+ }
1610
+
1611
+ /**
1612
+ * PP-OCRv5 Text Recognition model (PPHGNetV2 + SVTR + CTC).
1613
+ *
1614
+ * Recognizes text from cropped text line images.
1615
+ */
1616
+ export declare class TextRecModel {
1617
+ /**
1618
+ * Load a TextRecModel from a directory containing model.safetensors.
1619
+ *
1620
+ * # Arguments
1621
+ * * `model_path` - Path to model directory
1622
+ * * `dict_path` - Path to character dictionary text file
1623
+ */
1624
+ static load(modelPath: string, dictPath: string): TextRecModel;
1625
+ /**
1626
+ * Recognize text from encoded image bytes.
1627
+ *
1628
+ * # Arguments
1629
+ * * `image_data` - Encoded image bytes (PNG/JPEG)
1630
+ *
1631
+ * # Returns
1632
+ * * RecResult with recognized text and confidence score
1633
+ */
1634
+ recognize(imageData: Buffer): RecResult;
1635
+ /**
1636
+ * Recognize text from multiple encoded images.
1367
1637
  *
1368
- * Returns a new tensor with the same data but no gradient tracking
1638
+ * # Arguments
1639
+ * * `images` - Vec of encoded image bytes (PNG/JPEG)
1640
+ *
1641
+ * # Returns
1642
+ * * Vec of RecResult with recognized text and confidence scores
1369
1643
  */
1370
- detach(): Tensor;
1371
- /** Create a tensor of zeros */
1372
- static zeros(
1373
- shape: BigInt64Array,
1374
- dtype?: DType | undefined | null,
1375
- requiresGrad?: boolean | undefined | null,
1376
- ): Tensor;
1377
- /** Create a tensor of ones */
1378
- static ones(
1379
- shape: BigInt64Array,
1380
- dtype?: DType | undefined | null,
1381
- requiresGrad?: boolean | undefined | null,
1382
- ): Tensor;
1383
- /** Evaluate the underlying array */
1384
- eval(): void;
1644
+ recognizeBatch(images: Array<Buffer>): Array<RecResult>;
1645
+ /**
1646
+ * Recognize text from raw RGB crop data.
1647
+ *
1648
+ * # Arguments
1649
+ * * `rgb_data` - Raw RGB pixel data of a cropped text line
1650
+ * * `width` - Image width
1651
+ * * `height` - Image height
1652
+ *
1653
+ * # Returns
1654
+ * * RecResult with recognized text and confidence score
1655
+ */
1656
+ recognizeCrop(rgbData: Uint8Array, width: number, height: number): RecResult;
1385
1657
  }
1386
1658
 
1387
1659
  /** Result from VLM chat */
@@ -1419,27 +1691,27 @@ export declare class VLModel {
1419
1691
  *
1420
1692
  * # Arguments
1421
1693
  * * `messages` - Chat messages (role + content)
1422
- * * `config` - Chat configuration (including image_paths for automatic processing)
1694
+ * * `config` - Chat configuration (including images for automatic processing)
1423
1695
  *
1424
1696
  * # Returns
1425
1697
  * * VLMChatResult with generated text
1426
1698
  *
1427
1699
  * # Example
1428
1700
  * ```typescript
1429
- * const result = model.chat(
1701
+ * const result = await model.chat(
1430
1702
  * [{ role: 'user', content: 'Describe this image.' }],
1431
- * { imagePaths: ['./photo.jpg'], maxNewTokens: 256 }
1703
+ * { images: [readFileSync('./photo.jpg')], maxNewTokens: 256 }
1432
1704
  * );
1433
1705
  * ```
1434
1706
  */
1435
- chat(messages: Array<VlmChatMessage>, config?: VlmChatConfig | undefined | null): VlmChatResult;
1707
+ chat(messages: Array<VlmChatMessage>, config?: VlmChatConfig | undefined | null): Promise<VlmChatResult>;
1436
1708
  /**
1437
- * Simple OCR: extract text from an image file
1709
+ * Simple OCR: extract text from encoded image bytes
1438
1710
  *
1439
1711
  * Convenience method that processes an image and extracts all text.
1440
1712
  *
1441
1713
  * # Arguments
1442
- * * `image_path` - Path to the image file
1714
+ * * `image_data` - Encoded image bytes (PNG/JPEG)
1443
1715
  * * `prompt` - Optional custom prompt (default: "Extract all text from this image.")
1444
1716
  *
1445
1717
  * # Returns
@@ -1447,11 +1719,11 @@ export declare class VLModel {
1447
1719
  *
1448
1720
  * # Example
1449
1721
  * ```typescript
1450
- * const text = await model.ocr('./receipt.jpg');
1722
+ * const text = await model.ocr(imageBuffer);
1451
1723
  * console.log(text);
1452
1724
  * ```
1453
1725
  */
1454
- ocr(imagePath: string, prompt?: string | undefined | null): string;
1726
+ ocr(imageData: Buffer, prompt?: string | undefined | null): Promise<string>;
1455
1727
  /**
1456
1728
  * Get input embeddings with vision features merged
1457
1729
  *
@@ -1507,7 +1779,40 @@ export declare class VLModel {
1507
1779
  pixelValues?: MxArray | undefined | null,
1508
1780
  imageGridThw?: MxArray | undefined | null,
1509
1781
  config?: GenerationConfig | undefined | null,
1510
- ): GenerationResult;
1782
+ ): Promise<GenerationResult>;
1783
+ /**
1784
+ * Batch OCR: extract text from multiple images simultaneously
1785
+ *
1786
+ * Processes N images with sequential prefill + batched decode for ~N× decode throughput.
1787
+ *
1788
+ * # Arguments
1789
+ * * `images` - Encoded image buffers
1790
+ * * `config` - Optional chat configuration (shared across all items)
1791
+ *
1792
+ * # Returns
1793
+ * * Vec of extracted text strings, one per image
1794
+ *
1795
+ * # Example
1796
+ * ```typescript
1797
+ * import { readFileSync } from 'fs';
1798
+ * const images = ['page1.jpg', 'page2.jpg'].map(p => readFileSync(p));
1799
+ * const texts = await model.ocrBatch(images);
1800
+ * ```
1801
+ */
1802
+ ocrBatch(images: Array<Buffer>, config?: VlmChatConfig | undefined | null): Promise<Array<string>>;
1803
+ /**
1804
+ * Batch chat: process multiple items simultaneously
1805
+ *
1806
+ * Sequential prefill + batched decode. Each item can have different images/prompts.
1807
+ *
1808
+ * # Arguments
1809
+ * * `batch` - Batch items, each with messages and optional images
1810
+ * * `config` - Optional shared chat configuration
1811
+ *
1812
+ * # Returns
1813
+ * * Vec of VLMChatResult, one per batch item
1814
+ */
1815
+ batch(batch: Array<VlmBatchItem>, config?: VlmChatConfig | undefined | null): Promise<Array<VlmChatResult>>;
1511
1816
  /** Get model configuration */
1512
1817
  get config(): ModelConfig;
1513
1818
  /** Check if model is fully initialized */
@@ -1529,7 +1834,7 @@ export declare class VLModel {
1529
1834
  * ```typescript
1530
1835
  * import { VLModel } from '@mlx-node/vlm';
1531
1836
  * const model = await VLModel.load('./models/paddleocr-vl');
1532
- * const result = model.chat(messages, { imagePaths: ['./image.jpg'] });
1837
+ * const result = await model.chat(messages, { images: [readFileSync('./image.jpg')] });
1533
1838
  * ```
1534
1839
  */
1535
1840
  static load(modelPath: string): Promise<VLModel>;
@@ -1559,35 +1864,6 @@ export declare class VLModel {
1559
1864
  *
1560
1865
  * Parses tool calls and thinking from completions, creating structured outputs
1561
1866
  * aligned with the ChatResult structure.
1562
- *
1563
- * # Arguments
1564
- * * `prompts` - Array of prompt texts (one per unique prompt, will be expanded by group_size)
1565
- * * `completions` - Array of completion texts (prompts.len() * group_size total)
1566
- * * `token_counts` - Array of token counts for each completion
1567
- * * `finish_reasons` - Array of finish reasons from generation ("eos", "length", "stop", "repetition")
1568
- * * `group_size` - Number of completions per prompt
1569
- *
1570
- * # Returns
1571
- * Array of RewardOutput objects with structured completion data
1572
- *
1573
- * # Example
1574
- * ```typescript
1575
- * import { buildRewardOutputs } from '@mlx-node/core';
1576
- *
1577
- * const outputs = buildRewardOutputs(
1578
- * ['What is 2+2?'], // prompts
1579
- * ['<think>Let me calculate</think>
1580
-
1581
- 4', '4'], // completions (group_size=2)
1582
- * [10, 5], // token counts
1583
- * ['eos', 'length'], // finish reasons
1584
- * 2 // group_size
1585
- * );
1586
- *
1587
- * outputs[0].completion.thinking; // "Let me calculate"
1588
- * outputs[0].completion.text; // "4"
1589
- * outputs[0].completion.finishReason; // "eos"
1590
- * ```
1591
1867
  */
1592
1868
  export declare function buildRewardOutputs(
1593
1869
  prompts: Array<string>,
@@ -1627,67 +1903,41 @@ export declare const enum BuiltinRewardType {
1627
1903
  XmlFormat = 'XmlFormat',
1628
1904
  /** Length-based scoring */
1629
1905
  Length = 'Length',
1630
- /** JSON schema validation */
1906
+ /** JSON format validation (brace matching + field name check, not full JSON parsing) */
1631
1907
  JsonSchema = 'JsonSchema',
1632
1908
  }
1633
1909
 
1634
- /**
1635
- * Configuration for the high-level `chat()` API
1636
- *
1637
- * Combines tool definitions with generation parameters in a single config object.
1638
- * Tools are optional - when not provided, `chat()` works as a simple conversational API.
1639
- *
1640
- * ## Example
1641
- * ```typescript
1642
- * // Simple chat (no tools)
1643
- * const result = await model.chat(messages);
1644
- *
1645
- * // With tools
1646
- * const result = await model.chat(messages, {
1647
- * tools: [weatherTool, searchTool],
1648
- * maxNewTokens: 2048,
1649
- * temperature: 0.7,
1650
- * });
1651
- * ```
1652
- */
1910
+ /** Unified chat configuration shared by all model variants (Qwen3, Qwen3.5, Qwen3.5 MoE). */
1653
1911
  export interface ChatConfig {
1912
+ maxNewTokens?: number | undefined;
1913
+ temperature?: number | undefined;
1914
+ topK?: number | undefined;
1915
+ topP?: number | undefined;
1916
+ minP?: number | undefined;
1917
+ /** Repetition penalty (1.0 = disabled). Penalizes tokens already in context. */
1918
+ repetitionPenalty?: number | undefined;
1919
+ /** Size of the context window for repetition penalty (default: 256) */
1920
+ repetitionContextSize?: number | undefined;
1921
+ /** Max consecutive identical tokens before stopping (default: 16, 0 = disabled) */
1922
+ maxConsecutiveTokens?: number | undefined;
1923
+ /** Max n-gram repetitions before stopping (default: 3, 0 = disabled) */
1924
+ maxNgramRepeats?: number | undefined;
1925
+ /** Max pattern size for n-gram repetition detection (default: 64) */
1926
+ ngramSize?: number | undefined;
1927
+ tools?: Array<ToolDefinition>;
1654
1928
  /**
1655
- * Tool definitions for function calling (optional)
1656
- *
1657
- * When provided, the model can invoke these tools during generation.
1658
- * Tool calls are parsed and returned in `ChatResult.toolCalls`.
1929
+ * Enable thinking mode (Qwen3's <think> tags). Default: true (model thinks naturally).
1930
+ * Set to false to suppress thinking by injecting empty <think></think> tags.
1659
1931
  */
1660
- tools?: Array<ToolDefinition>;
1661
- /** Maximum number of new tokens to generate (default: 2048 for chat) */
1662
- maxNewTokens?: number;
1663
- /** Sampling temperature (0 = greedy, higher = more random) (default: 0.7) */
1664
- temperature?: number;
1665
- /** Top-k sampling: keep only top k tokens (0 = disabled) (default: 0) */
1666
- topK?: number;
1667
- /** Top-p (nucleus) sampling: keep tokens with cumulative prob < p (default: 0.9) */
1668
- topP?: number;
1669
- /** Min-p sampling: keep tokens with prob > min_p * max_prob (default: 0.0) */
1670
- minP?: number;
1671
- /** Repetition penalty factor (1.0 = no penalty) (default: 1.0) */
1672
- repetitionPenalty?: number;
1673
- /** Number of recent tokens to consider for repetition penalty (default: 20) */
1674
- repetitionContextSize?: number;
1675
- /** Stop if same token repeats this many times consecutively (default: 16) */
1676
- maxConsecutiveTokens?: number;
1677
- /** Stop if an n-gram pattern repeats this many times (default: 8) */
1678
- maxNgramRepeats?: number;
1679
- /** N-gram size for repetition detection (default: 3) */
1680
- ngramSize?: number;
1681
- /** EOS token ID (generation stops when this is generated) */
1682
- eosTokenId?: number;
1683
- /** Whether to return log probabilities (default: true) */
1684
- returnLogprobs?: boolean;
1932
+ enableThinking?: boolean | undefined;
1933
+ /** When true, include performance metrics (TTFT, prefill tok/s, decode tok/s) in the result */
1934
+ reportPerformance?: boolean | undefined;
1685
1935
  }
1686
1936
 
1687
1937
  /** Chat message with tool calling support */
1688
1938
  export interface ChatMessage {
1689
1939
  /** Role: "system", "user", "assistant", or "tool" */
1690
- role: string;
1940
+ role: 'system' | 'user' | 'assistant' | 'tool' | (string & {});
1691
1941
  /** Message content */
1692
1942
  content: string;
1693
1943
  /** Tool calls made by the assistant (for assistant messages) */
@@ -1696,16 +1946,57 @@ export interface ChatMessage {
1696
1946
  toolCallId?: string;
1697
1947
  /** Reasoning content for thinking mode (used with <think> tags) */
1698
1948
  reasoningContent?: string;
1949
+ /** Image data for VLM models (encoded image bytes: PNG/JPEG, passed as Uint8Array/Buffer) */
1950
+ images?: Array<Uint8Array> | undefined;
1951
+ }
1952
+
1953
+ /** Unified chat result shared by all model variants (Qwen3, Qwen3.5, Qwen3.5 MoE). */
1954
+ export interface ChatResult {
1955
+ text: string;
1956
+ toolCalls: Array<ToolCallResult>;
1957
+ thinking?: string;
1958
+ numTokens: number;
1959
+ finishReason: string;
1960
+ rawText: string;
1961
+ /** Performance metrics (present when `reportPerformance: true` in config) */
1962
+ performance?: PerformanceMetrics;
1699
1963
  }
1700
1964
 
1701
- /** Chat message role */
1965
+ /** Chat message role (lowercase values matching standard convention) */
1702
1966
  export declare const enum ChatRole {
1703
1967
  /** User message */
1704
- User = 'User',
1968
+ User = 'user',
1705
1969
  /** Assistant response */
1706
- Assistant = 'Assistant',
1970
+ Assistant = 'assistant',
1707
1971
  /** System prompt */
1708
- System = 'System',
1972
+ System = 'system',
1973
+ /** Tool response */
1974
+ Tool = 'tool',
1975
+ }
1976
+
1977
+ /** A single chunk emitted during streaming chat generation. */
1978
+ export interface ChatStreamChunk {
1979
+ text: string;
1980
+ done: boolean;
1981
+ finishReason?: string;
1982
+ toolCalls?: Array<ToolCallResult>;
1983
+ thinking?: string;
1984
+ numTokens?: number;
1985
+ rawText?: string;
1986
+ /** Performance metrics (only present in the final chunk when `reportPerformance: true`) */
1987
+ performance?: PerformanceMetrics;
1988
+ }
1989
+
1990
+ /** Result from classify_and_rotate: orientation info + corrected image bytes. */
1991
+ export interface ClassifyRotateResult {
1992
+ /** Detected rotation angle (0, 90, 180, or 270 degrees) */
1993
+ angle: number;
1994
+ /** Confidence score */
1995
+ score: number;
1996
+ /** Angle label as string */
1997
+ label: string;
1998
+ /** Corrected image as PNG bytes (or original bytes if angle=0) */
1999
+ image: Buffer;
1709
2000
  }
1710
2001
 
1711
2002
  /** Statistics about cleanup operations (NAPI wrapper) */
@@ -1748,6 +2039,26 @@ export interface ConversionOptions {
1748
2039
  dtype?: string;
1749
2040
  /** Whether to verbose logging (default: false) */
1750
2041
  verbose?: boolean;
2042
+ /** Model type for model-specific weight sanitization (e.g., "paddleocr-vl") */
2043
+ modelType?: string;
2044
+ /** Enable quantization of converted weights */
2045
+ quantize?: boolean;
2046
+ /** Quantization bits: 4 (default) or 8 */
2047
+ quantBits?: number;
2048
+ /** Quantization group size (default: 64 for affine, 32 for mxfp8) */
2049
+ quantGroupSize?: number;
2050
+ /** Quantization mode: "affine" (default) or "mxfp8" */
2051
+ quantMode?: string;
2052
+ /**
2053
+ * Quantization recipe for per-layer mixed-bit quantization.
2054
+ * Options: mixed_2_6, mixed_3_4, mixed_3_6, mixed_4_6, qwen3_5
2055
+ */
2056
+ quantRecipe?: string;
2057
+ /**
2058
+ * Path to an imatrix GGUF file for AWQ-style pre-scaling.
2059
+ * Improves quantization quality by amplifying important weight channels.
2060
+ */
2061
+ imatrixPath?: string;
1751
2062
  }
1752
2063
 
1753
2064
  export interface ConversionResult {
@@ -1761,6 +2072,10 @@ export interface ConversionResult {
1761
2072
  tensorNames: Array<string>;
1762
2073
  }
1763
2074
 
2075
+ export declare function convertForeignWeights(options: ForeignConversionOptions): ForeignConversionResult;
2076
+
2077
+ export declare function convertGgufToSafetensors(options: GgufConversionOptions): Promise<GgufConversionResult>;
2078
+
1764
2079
  /**
1765
2080
  * Convert a HuggingFace SafeTensors model to MLX format
1766
2081
  *
@@ -1806,12 +2121,31 @@ export interface DocumentElement {
1806
2121
  paragraph?: Paragraph;
1807
2122
  }
1808
2123
 
2124
+ /**
2125
+ * Convert a ParsedDocument to an XLSX buffer.
2126
+ *
2127
+ * Each Table element becomes a separate worksheet with bold headers.
2128
+ * Paragraph elements are collected into a "Text" worksheet.
2129
+ *
2130
+ * # Example
2131
+ * ```typescript
2132
+ * import { parseVlmOutput, documentToXlsx } from '@mlx-node/core';
2133
+ * import { writeFileSync } from 'fs';
2134
+ *
2135
+ * const doc = parseVlmOutput(vlmResult.text);
2136
+ * const buffer = documentToXlsx(doc);
2137
+ * writeFileSync('output.xlsx', buffer);
2138
+ * ```
2139
+ */
2140
+ export declare function documentToXlsx(doc: ParsedDocument): Buffer;
2141
+
1809
2142
  export declare const enum DType {
1810
2143
  Float32 = 0,
1811
2144
  Int32 = 1,
1812
2145
  Float16 = 2,
1813
2146
  BFloat16 = 3,
1814
2147
  Uint32 = 4,
2148
+ Uint8 = 5,
1815
2149
  }
1816
2150
 
1817
2151
  /** Document element type */
@@ -1864,6 +2198,23 @@ export interface EngineStepMetrics {
1864
2198
  activeMemoryMb: number;
1865
2199
  }
1866
2200
 
2201
+ export interface ForeignConversionOptions {
2202
+ /** Path to the input weights file (.pdparams, .pkl, .pt, .pth) */
2203
+ inputPath: string;
2204
+ /** Output directory for model.safetensors + config.json */
2205
+ outputDir: string;
2206
+ /** Model type: "pp-lcnet-ori" or "uvdoc" */
2207
+ modelType: string;
2208
+ /** Enable verbose logging */
2209
+ verbose?: boolean;
2210
+ }
2211
+
2212
+ export interface ForeignConversionResult {
2213
+ numTensors: number;
2214
+ outputPath: string;
2215
+ tensorNames: Array<string>;
2216
+ }
2217
+
1867
2218
  /** Format parsed document according to config */
1868
2219
  export declare function formatDocument(doc: ParsedDocument, config?: ParserConfig | undefined | null): string;
1869
2220
 
@@ -1897,13 +2248,13 @@ export interface GenerateBatchResult {
1897
2248
  completionLogprobs: Array<number>;
1898
2249
  /** Lengths of each completion (for reconstruction) */
1899
2250
  completionLengths: Array<number>;
1900
- /** Finish reasons for each completion ("eos", "length", or "repetition") */
2251
+ /** Finish reasons for each completion ("stop", "length", or "repetition") */
1901
2252
  finishReasons: Array<string>;
1902
2253
  }
1903
2254
 
1904
2255
  /** Configuration for text generation */
1905
2256
  export interface GenerationConfig {
1906
- /** Maximum number of new tokens to generate (default: 100) */
2257
+ /** Maximum number of new tokens to generate (default: 2048) */
1907
2258
  maxNewTokens?: number;
1908
2259
  /** Sampling temperature (0 = greedy, higher = more random) (default: 1.0) */
1909
2260
  temperature?: number;
@@ -1926,13 +2277,15 @@ export interface GenerationConfig {
1926
2277
  */
1927
2278
  maxConsecutiveTokens?: number;
1928
2279
  /**
1929
- * Stop if an n-gram pattern repeats this many times (default: 8)
1930
- * Set to 0 to disable. Detects patterns like "A B A B A B A B".
2280
+ * Stop if a pattern repeats this many times consecutively (default: 3)
2281
+ * Set to 0 to disable. Detects patterns like "A B A B A B".
2282
+ * Uses range-based detection: checks all pattern sizes from 2 to ngram_size.
1931
2283
  */
1932
2284
  maxNgramRepeats?: number;
1933
2285
  /**
1934
- * N-gram size for repetition detection (default: 3)
1935
- * Used with max_ngram_repeats to detect repeating patterns.
2286
+ * Maximum pattern size for repetition detection (default: 64)
2287
+ * All pattern sizes from 2 up to this value are checked each decode step.
2288
+ * Larger values catch long phrase-level repetition common in small models.
1936
2289
  */
1937
2290
  ngramSize?: number;
1938
2291
  /** EOS token ID (generation stops when this is generated) */
@@ -1971,6 +2324,33 @@ export interface GenerationConfig {
1971
2324
  numDraftTokens?: number;
1972
2325
  }
1973
2326
 
2327
+ export interface GenerationProfile {
2328
+ /** Label identifying the decode loop variant. */
2329
+ label: string;
2330
+ /** Model type (e.g. "qwen3_5", "qwen3_5_moe", "qwen3"). */
2331
+ modelType: string;
2332
+ /** Number of tokens generated. */
2333
+ numTokens: number;
2334
+ /** Number of prompt tokens. */
2335
+ promptTokens: number;
2336
+ /** Prefill wall-clock time (ms). */
2337
+ prefillMs: number;
2338
+ /** Decode wall-clock time (ms). */
2339
+ decodeMs: number;
2340
+ /** Total wall-clock time (prefill + decode) (ms). */
2341
+ totalMs: number;
2342
+ /** Tokens per second (decode only). */
2343
+ tokensPerSecond: number;
2344
+ /** Time to first token (ms) — from decode loop start to first token extracted. */
2345
+ timeToFirstTokenMs: number;
2346
+ /** Per-phase breakdown. */
2347
+ phases: Array<PhaseProfile>;
2348
+ /** Memory snapshot before generation. */
2349
+ memoryBefore?: MemorySnapshot;
2350
+ /** Memory snapshot after generation. */
2351
+ memoryAfter?: MemorySnapshot;
2352
+ }
2353
+
1974
2354
  /** A generation record (NAPI wrapper) */
1975
2355
  export interface GenerationRecord {
1976
2356
  batchIndex: number;
@@ -1994,6 +2374,62 @@ export interface GenerationWithToolCalls {
1994
2374
  /** Get expected weight keys for PaddleOCR-VL model */
1995
2375
  export declare function getExpectedWeightKeys(): Array<string>;
1996
2376
 
2377
+ /** Retrieve all collected profiling data as a `ProfilingSession`. */
2378
+ export declare function getProfilingData(): ProfilingSession;
2379
+
2380
+ export interface GgufConversionOptions {
2381
+ /** Path to the GGUF file */
2382
+ inputPath: string;
2383
+ /** Output directory for converted SafeTensors model */
2384
+ outputDir: string;
2385
+ /** Target dtype: "float32", "float16", "bfloat16" (default: keep original) */
2386
+ dtype?: string;
2387
+ /** Enable verbose logging */
2388
+ verbose?: boolean;
2389
+ /** Enable quantization of converted weights */
2390
+ quantize?: boolean;
2391
+ /** Quantization bits (default: 4) */
2392
+ quantBits?: number;
2393
+ /** Quantization group size (default: 64) */
2394
+ quantGroupSize?: number;
2395
+ /** Quantization mode: "affine" or "mxfp8" */
2396
+ quantMode?: string;
2397
+ /**
2398
+ * Quantization recipe for per-layer mixed-bit quantization.
2399
+ * Options: mixed_2_6, mixed_3_4, mixed_3_6, mixed_4_6, qwen3_5, unsloth
2400
+ */
2401
+ quantRecipe?: string;
2402
+ /**
2403
+ * Path to an imatrix GGUF file for AWQ-style pre-scaling.
2404
+ * Improves quantization quality by amplifying important weight channels.
2405
+ */
2406
+ imatrixPath?: string;
2407
+ /**
2408
+ * Output filename (default: "model.safetensors").
2409
+ * Useful for saving vision weights separately (e.g., "vision.safetensors").
2410
+ */
2411
+ outputFilename?: string;
2412
+ /**
2413
+ * When true, remap LLM weight keys for VLM compatibility:
2414
+ * "model.X" → "language_model.model.X", "lm_head.X" → "language_model.lm_head.X"
2415
+ * This makes the safetensors compatible with mlx-vlm.
2416
+ */
2417
+ vlmKeyPrefix?: boolean;
2418
+ }
2419
+
2420
+ export interface GgufConversionResult {
2421
+ numTensors: number;
2422
+ numParameters: number;
2423
+ outputPath: string;
2424
+ tensorNames: Array<string>;
2425
+ sourceFormat: string;
2426
+ }
2427
+
2428
+ export interface GpuInfo {
2429
+ /** GPU architecture generation (M1=13, M2=14, M3=15, M4=16, M5=17). */
2430
+ architectureGen: number;
2431
+ }
2432
+
1997
2433
  /** Configuration for the GRPO training engine */
1998
2434
  export interface GrpoEngineConfig {
1999
2435
  /** Learning rate (default: 1e-6) */
@@ -2093,6 +2529,24 @@ export interface GrpoEngineConfig {
2093
2529
  * then expand KV cache for G completions).
2094
2530
  */
2095
2531
  useParallelBatchGeneration?: boolean;
2532
+ /**
2533
+ * Enable gradient checkpointing (default: true).
2534
+ * When true, each transformer layer's activations are discarded during the forward
2535
+ * pass and recomputed during backward, reducing peak memory from O(num_layers) to O(1)
2536
+ * for intermediate states. For Qwen3.5 0.8B, this reduces autograd peak from ~105GB to ~11GB.
2537
+ * The trade-off is ~30% more compute (one extra forward pass per layer during backward).
2538
+ */
2539
+ gradientCheckpointing?: boolean;
2540
+ /** Optimizer type: "sgd" or "adamw" (default: "adamw") */
2541
+ optimizerType?: string;
2542
+ /** AdamW beta1 (default: 0.9) */
2543
+ adamwBeta1?: number;
2544
+ /** AdamW beta2 (default: 0.999) */
2545
+ adamwBeta2?: number;
2546
+ /** AdamW epsilon (default: 1e-8) */
2547
+ adamwEps?: number;
2548
+ /** Weight decay for AdamW (default: 0.01) */
2549
+ weightDecay?: number;
2096
2550
  }
2097
2551
 
2098
2552
  /** Configuration for GRPO loss computation */
@@ -2143,6 +2597,32 @@ export interface GrpoLossConfig {
2143
2597
  vocabChunkSize?: number;
2144
2598
  }
2145
2599
 
2600
+ /** Check whether profiling is currently enabled. */
2601
+ export declare function isProfilingEnabled(): boolean;
2602
+
2603
+ /** A single detected layout element. */
2604
+ export interface LayoutElement {
2605
+ /** Detection confidence score */
2606
+ score: number;
2607
+ /** Class label ID (0-24) */
2608
+ label: number;
2609
+ /** Human-readable label name (e.g., "title", "text", "table") */
2610
+ labelName: string;
2611
+ /** Bounding box in original image coordinates [x1, y1, x2, y2] */
2612
+ bbox: Array<number>;
2613
+ /** Reading order index (0 = first element to read) */
2614
+ order: number;
2615
+ }
2616
+
2617
+ export interface MemorySnapshot {
2618
+ /** Active (non-cached) memory in bytes. */
2619
+ activeBytes: number;
2620
+ /** Peak memory usage in bytes. */
2621
+ peakBytes: number;
2622
+ /** Cache memory in bytes. */
2623
+ cacheBytes: number;
2624
+ }
2625
+
2146
2626
  /** Full model configuration */
2147
2627
  export interface ModelConfig {
2148
2628
  visionConfig: VisionConfig;
@@ -2156,8 +2636,18 @@ export interface ModelConfig {
2156
2636
  eosTokenId: number;
2157
2637
  }
2158
2638
 
2639
+ /** Result from document orientation classification. */
2640
+ export interface OrientationResult {
2641
+ /** Detected rotation angle (0, 90, 180, or 270 degrees) */
2642
+ angle: number;
2643
+ /** Confidence score */
2644
+ score: number;
2645
+ /** Angle label as string */
2646
+ label: string;
2647
+ }
2648
+
2159
2649
  /** Output format options */
2160
- export declare const enum OutputFormat {
2650
+ export enum OutputFormat {
2161
2651
  /** Raw output with minimal processing */
2162
2652
  Raw = 'Raw',
2163
2653
  /** Plain text with aligned columns */
@@ -2166,6 +2656,8 @@ export declare const enum OutputFormat {
2166
2656
  Markdown = 'Markdown',
2167
2657
  /** HTML tables */
2168
2658
  Html = 'Html',
2659
+ /** JSON structured output */
2660
+ Json = 'Json',
2169
2661
  }
2170
2662
 
2171
2663
  /** Configuration for creating an OutputStore connection */
@@ -2196,7 +2688,7 @@ export interface PagedCompletedSequence {
2196
2688
  requestId: string;
2197
2689
  /** All generated tokens (excluding prompt) */
2198
2690
  tokens: Array<number>;
2199
- /** Reason for completion ("eos", "max_tokens", etc.) */
2691
+ /** Reason for completion ("stop", "length", "repetition", "tool_calls") */
2200
2692
  finishReason: string;
2201
2693
  }
2202
2694
 
@@ -2273,22 +2765,7 @@ export interface ParserConfig {
2273
2765
  collapseEmptyRows?: boolean;
2274
2766
  }
2275
2767
 
2276
- /**
2277
- * Parse tool calls from text (NAPI export)
2278
- *
2279
- * Extracts tool calls from model-generated text and returns both the cleaned text
2280
- * and the parsed tool calls.
2281
- *
2282
- * # Example
2283
- * ```typescript
2284
- * import { parseToolCallsFromText } from '@mlx-node/core';
2285
- *
2286
- * const result = parseToolCallsFromText('<tool_call>{"name": "search", "arguments": {"q": "test"}}</tool_call>');
2287
- * console.log(result.text); // ""
2288
- * console.log(result.toolCalls[0].name); // "search"
2289
- * console.log(result.toolCalls[0].arguments.q); // "test"
2290
- * ```
2291
- */
2768
+ /** Parse tool calls from text (NAPI export) */
2292
2769
  export declare function parseToolCallsFromText(text: string): ParseToolCallsResult;
2293
2770
 
2294
2771
  /** Result of parsing tool calls from text */
@@ -2302,6 +2779,162 @@ export interface ParseToolCallsResult {
2302
2779
  /** Parse VLM output into structured document */
2303
2780
  export declare function parseVlmOutput(text: string): ParsedDocument;
2304
2781
 
2782
+ /**
2783
+ * Lightweight performance metrics returned by chat/chatStream when
2784
+ * `reportPerformance: true` is set in the config.
2785
+ */
2786
+ export interface PerformanceMetrics {
2787
+ /**
2788
+ * Time to first token (ms) — wall-clock from generation start to
2789
+ * first token extracted. Includes tokenization, prefill (lazy graph
2790
+ * construction + first GPU eval), and first sample.
2791
+ */
2792
+ ttftMs: number;
2793
+ /** Prefill throughput: prompt_tokens / (ttft_ms / 1000). */
2794
+ prefillTokensPerSecond: number;
2795
+ /**
2796
+ * Decode throughput: (generated_tokens - 1) / decode_time.
2797
+ * Excludes the first token (counted as prefill).
2798
+ */
2799
+ decodeTokensPerSecond: number;
2800
+ }
2801
+
2802
+ export interface PhaseProfile {
2803
+ /** Phase name (e.g. "forward", "sample", "eval_token"). */
2804
+ name: string;
2805
+ /** Total wall-clock time spent in this phase (ms). */
2806
+ totalMs: number;
2807
+ /** Average time per invocation (µs). */
2808
+ avgUsPerToken: number;
2809
+ /** Number of invocations. */
2810
+ count: number;
2811
+ }
2812
+
2813
+ export interface ProfilingSession {
2814
+ /** GPU hardware info. */
2815
+ gpuInfo: GpuInfo;
2816
+ /** Total session duration (ms). */
2817
+ totalDurationMs: number;
2818
+ /** Individual generation profiles. */
2819
+ generations: Array<GenerationProfile>;
2820
+ /** Aggregate summary. */
2821
+ summary: ProfilingSummary;
2822
+ }
2823
+
2824
+ export interface ProfilingSummary {
2825
+ /** Total tokens generated across all generations. */
2826
+ totalTokens: number;
2827
+ /** Total prompt tokens across all generations. */
2828
+ totalPromptTokens: number;
2829
+ /** Average tokens per second. */
2830
+ avgTokensPerSecond: number;
2831
+ /** Average time to first token (ms). */
2832
+ avgTimeToFirstTokenMs: number;
2833
+ /** Average prefill time (ms). */
2834
+ avgPrefillMs: number;
2835
+ }
2836
+
2837
+ /**
2838
+ * Qwen3.5 model configuration (dense variant).
2839
+ *
2840
+ * For MoE models, use `Qwen3_5MoeConfig` from `qwen3_5_moe`.
2841
+ */
2842
+ export interface Qwen35Config {
2843
+ vocabSize: number;
2844
+ hiddenSize: number;
2845
+ numLayers: number;
2846
+ numHeads: number;
2847
+ numKvHeads: number;
2848
+ intermediateSize: number;
2849
+ rmsNormEps: number;
2850
+ headDim: number;
2851
+ tieWordEmbeddings: boolean;
2852
+ attentionBias: boolean;
2853
+ maxPositionEmbeddings: number;
2854
+ padTokenId: number;
2855
+ eosTokenId: number;
2856
+ bosTokenId: number;
2857
+ linearNumValueHeads: number;
2858
+ linearNumKeyHeads: number;
2859
+ linearKeyHeadDim: number;
2860
+ linearValueHeadDim: number;
2861
+ linearConvKernelDim: number;
2862
+ fullAttentionInterval: number;
2863
+ partialRotaryFactor: number;
2864
+ ropeTheta: number;
2865
+ }
2866
+
2867
+ /** Generation configuration for Qwen3.5 */
2868
+ export interface Qwen35GenerationConfig {
2869
+ maxNewTokens: number;
2870
+ temperature?: number | undefined;
2871
+ topK?: number | undefined;
2872
+ topP?: number | undefined;
2873
+ minP?: number | undefined;
2874
+ }
2875
+
2876
+ /** Generation result */
2877
+ export interface Qwen35GenerationResult {
2878
+ tokens: Array<number>;
2879
+ text: string;
2880
+ numTokens: number;
2881
+ finishReason: string;
2882
+ }
2883
+
2884
+ /**
2885
+ * Qwen3.5 MoE model configuration.
2886
+ *
2887
+ * Contains all fields including MoE-specific ones (num_experts, etc.).
2888
+ */
2889
+ export interface Qwen35MoeConfig {
2890
+ vocabSize: number;
2891
+ hiddenSize: number;
2892
+ numLayers: number;
2893
+ numHeads: number;
2894
+ numKvHeads: number;
2895
+ intermediateSize: number;
2896
+ rmsNormEps: number;
2897
+ headDim: number;
2898
+ tieWordEmbeddings: boolean;
2899
+ attentionBias: boolean;
2900
+ maxPositionEmbeddings: number;
2901
+ padTokenId: number;
2902
+ eosTokenId: number;
2903
+ bosTokenId: number;
2904
+ linearNumValueHeads: number;
2905
+ linearNumKeyHeads: number;
2906
+ linearKeyHeadDim: number;
2907
+ linearValueHeadDim: number;
2908
+ linearConvKernelDim: number;
2909
+ fullAttentionInterval: number;
2910
+ partialRotaryFactor: number;
2911
+ ropeTheta: number;
2912
+ numExperts: number;
2913
+ numExpertsPerTok: number;
2914
+ decoderSparseStep: number;
2915
+ sharedExpertIntermediateSize?: number | undefined;
2916
+ moeIntermediateSize?: number | undefined;
2917
+ normTopkProb: boolean;
2918
+ mlpOnlyLayers?: number[] | undefined;
2919
+ }
2920
+
2921
+ /** Generation configuration for Qwen3.5 MoE */
2922
+ export interface Qwen35MoeGenerationConfig {
2923
+ maxNewTokens: number;
2924
+ temperature?: number | undefined;
2925
+ topK?: number | undefined;
2926
+ topP?: number | undefined;
2927
+ minP?: number | undefined;
2928
+ }
2929
+
2930
+ /** Generation result */
2931
+ export interface Qwen35MoeGenerationResult {
2932
+ tokens: Array<number>;
2933
+ text: string;
2934
+ numTokens: number;
2935
+ finishReason: string;
2936
+ }
2937
+
2305
2938
  /** Qwen3 model configuration */
2306
2939
  export interface Qwen3Config {
2307
2940
  vocabSize: number;
@@ -2344,6 +2977,17 @@ export interface Qwen3Config {
2344
2977
  useFp8Cache?: boolean | undefined;
2345
2978
  }
2346
2979
 
2980
+ /** Result of text recognition. */
2981
+ export interface RecResult {
2982
+ /** Recognized text */
2983
+ text: string;
2984
+ /** Confidence score (mean character probability) */
2985
+ score: number;
2986
+ }
2987
+
2988
+ /** Clear all collected profiling data and reset session timer. */
2989
+ export declare function resetProfilingData(): void;
2990
+
2347
2991
  /** Result of resume position computation */
2348
2992
  export interface ResumePosition {
2349
2993
  /** Epoch to start from (0-indexed) */
@@ -2416,6 +3060,20 @@ export interface SamplingConfig {
2416
3060
  minP?: number;
2417
3061
  }
2418
3062
 
3063
+ /**
3064
+ * Parse VLM output and save directly as XLSX file.
3065
+ *
3066
+ * Convenience function that parses VLM output and writes it to an XLSX file.
3067
+ *
3068
+ * # Example
3069
+ * ```typescript
3070
+ * import { saveToXlsx } from '@mlx-node/core';
3071
+ *
3072
+ * saveToXlsx(vlmResult.text, 'output.xlsx');
3073
+ * ```
3074
+ */
3075
+ export declare function saveToXlsx(text: string, filePath: string): void;
3076
+
2419
3077
  /** Scheduler statistics (NAPI-compatible) */
2420
3078
  export interface SchedulerStatsNapi {
2421
3079
  /** Number of requests waiting to be scheduled */
@@ -2432,6 +3090,9 @@ export interface SchedulerStatsNapi {
2432
3090
  totalRunningTokens: number;
2433
3091
  }
2434
3092
 
3093
+ /** Enable or disable profiling globally. */
3094
+ export declare function setProfilingEnabled(enabled: boolean): void;
3095
+
2435
3096
  /** Configuration for the SFT training engine */
2436
3097
  export interface SftEngineConfig {
2437
3098
  /** Learning rate (default: 2e-5) */
@@ -2461,6 +3122,11 @@ export interface SftEngineConfig {
2461
3122
  * per-element analysis - useful for debugging but has significant performance overhead.
2462
3123
  */
2463
3124
  verboseNanDetection?: boolean;
3125
+ /**
3126
+ * Enable gradient checkpointing to reduce memory (default: true)
3127
+ * Trades ~30% more compute for O(1) layer memory instead of O(num_layers).
3128
+ */
3129
+ gradientCheckpointing?: boolean;
2464
3130
  }
2465
3131
 
2466
3132
  /** Metrics from a training epoch */
@@ -2560,6 +3226,14 @@ export interface TableRow {
2560
3226
  cells: Array<TableCell>;
2561
3227
  }
2562
3228
 
3229
+ /** A detected text bounding box. */
3230
+ export interface TextBox {
3231
+ /** Bounding box in original image coordinates [x1, y1, x2, y2] */
3232
+ bbox: Array<number>;
3233
+ /** Detection confidence score (mean probability inside box) */
3234
+ score: number;
3235
+ }
3236
+
2563
3237
  /** Language model (text decoder) configuration */
2564
3238
  export interface TextConfig {
2565
3239
  modelType: string;
@@ -2682,6 +3356,12 @@ export interface TrainStepResultWithOutputs {
2682
3356
  completionLengths: Array<number>;
2683
3357
  }
2684
3358
 
3359
+ /** Result from document unwarping. */
3360
+ export interface UnwarpResult {
3361
+ /** Unwarped image as PNG bytes */
3362
+ image: Buffer;
3363
+ }
3364
+
2685
3365
  /** Vision encoder configuration */
2686
3366
  export interface VisionConfig {
2687
3367
  modelType: string;
@@ -2698,13 +3378,18 @@ export interface VisionConfig {
2698
3378
  spatialMergeSize: number;
2699
3379
  }
2700
3380
 
3381
+ /** A batch item for VLM batch inference */
3382
+ export interface VlmBatchItem {
3383
+ /** Chat messages for this item */
3384
+ messages: Array<VlmChatMessage>;
3385
+ /** Encoded image buffers for this item (one image per item for OCR) */
3386
+ images?: Array<Buffer>;
3387
+ }
3388
+
2701
3389
  /** Configuration for VLM chat */
2702
3390
  export interface VlmChatConfig {
2703
- /**
2704
- * Image paths to process (alternative to passing pre-processed images)
2705
- * These will be automatically processed using the ImageProcessor
2706
- */
2707
- imagePaths?: Array<string>;
3391
+ /** Encoded image buffers to process (PNG/JPEG bytes) */
3392
+ images?: Array<Buffer>;
2708
3393
  /** Maximum number of new tokens to generate (default: 512) */
2709
3394
  maxNewTokens?: number;
2710
3395
  /** Sampling temperature (0 = greedy, higher = more random) (default: 0.0 for OCR) */