npm - react-native-executorch - Versions diffs - 0.9.0 → 0.9.2 - Mend

react-native-executorch 0.9.0 → 0.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (106) hide show

package/src/constants/modelRegistry.ts CHANGED Viewed

@@ -38,7 +38,7 @@ import { RnExecutorchErrorCode } from '../errors/ErrorCodes';
  * compile-time error.
  * @category Utils
  */
-export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn';
+export type Backend = 'xnnpack' | 'coreml' | 'vulkan' | 'qnn' | 'mlx';
 /**
  * Options for a `models` accessor call.
@@ -78,7 +78,7 @@ type ConfigOf<V> = Extract<
 >;
 type BackendsOf<V> = Extract<keyof V, Backend>;
-const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'vulkan', 'qnn'];
+const BACKEND_ORDER: Backend[] = ['xnnpack', 'coreml', 'mlx', 'vulkan', 'qnn'];
 function firstBackend(variants: AnyVariantMap): Backend {
   for (const b of BACKEND_ORDER) {
@@ -181,6 +181,33 @@ function tts<C extends TextToSpeechModelConfig>(c: C): () => C {
 // Per-backend variant maps for models that ship more than one backend.
 // ─────────────────────────────────────────────────────────────────────────────
+const GEMMA4_E2B_VARIANTS = {
+  mlx: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_MLX_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+  xnnpack: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_XNNPACK_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+  vulkan: {
+    base: {
+      modelName: 'gemma4-e2b' as const,
+      modelSource: M.GEMMA4_E2B_VULKAN_MODEL,
+      tokenizerSource: M.GEMMA4_E2B_TOKENIZER,
+      tokenizerConfigSource: M.GEMMA4_E2B_TOKENIZER_CONFIG,
+    },
+  },
+};
 const EFFICIENTNET_V2_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -249,6 +276,31 @@ const RF_DETR_NANO_SEG_VARIANTS = {
   },
 };
+// RF-DETR Keypoint (pose estimation) — BETA preview. All three backends
+// (xnnpack, coreml, mlx) ship fp32 (non-quantized) weights; this entry may be
+// re-exported under a different constant once more RF-DETR keypoint weights
+// are released.
+const RF_DETR_KEYPOINT_PREVIEW_VARIANTS = {
+  xnnpack: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL,
+    },
+  },
+  coreml: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL,
+    },
+  },
+  mlx: {
+    base: {
+      modelName: 'rfdetr-keypoint-preview' as const,
+      modelSource: M.RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL,
+    },
+  },
+};
 const FASTSAM_S_VARIANTS = {
   xnnpack: {
     base: {
@@ -496,10 +548,15 @@ export const models = {
       M.LFM2_5_1_2B_INSTRUCT_QUANTIZED
     ),
     bielik_v3_0_1_5b: pair(M.BIELIK_V3_0_1_5B, M.BIELIK_V3_0_1_5B_QUANTIZED),
+    gemma4_e2b: variant(GEMMA4_E2B_VARIANTS, {
+      ios: 'mlx',
+      android: 'vulkan',
+    }),
     // Multimodal LLMs — same hook/module as plain LLMs, listed here so users
     // pick a model by capability ("LLM") rather than by modality.
     lfm2_5_vl_1_6b: base(M.LFM2_5_VL_1_6B_QUANTIZED),
     lfm2_5_vl_450m: base(M.LFM2_5_VL_450M_QUANTIZED),
+    gemma4_e2b_multimodal: base(M.GEMMA4_E2B_MM),
   },
   classification: {
     efficientnet_v2_s: variant(EFFICIENTNET_V2_S_VARIANTS),
@@ -521,6 +578,9 @@ export const models = {
   },
   pose_estimation: {
     yolo26n: base(M.YOLO26N_POSE),
+    // BETA preview — may be re-exported under a different constant once a
+    // stable RF-DETR keypoint model ships.
+    rfdetr_keypoint_preview: variant(RF_DETR_KEYPOINT_PREVIEW_VARIANTS),
   },
   semantic_segmentation: {
     deeplab_v3_resnet50: pair(

package/src/constants/modelUrls.ts CHANGED Viewed

@@ -125,6 +125,47 @@ export const QWEN3_0_6B_QUANTIZED = {
   generationConfig: QWEN3_GENERATION_CONFIG,
 } as const;
+// GEMMA 4 — separate HF repo; tokenizer files live at the e2b root and are
+// shared by all backend variants.
+const GEMMA4_E2B_PREFIX = `${URL_PREFIX}-gemma-4/${VERSION_TAG}/e2b`;
+export const GEMMA4_E2B_MLX_MODEL = `${GEMMA4_E2B_PREFIX}/mlx/gemma4_e2b_mlx_int4.pte`;
+export const GEMMA4_E2B_XNNPACK_MODEL = `${GEMMA4_E2B_PREFIX}/xnnpack/gemma_4_e2b_xnnpack_8da4w.pte`;
+export const GEMMA4_E2B_VULKAN_MODEL = `${GEMMA4_E2B_PREFIX}/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
+export const GEMMA4_E2B_TOKENIZER = `${GEMMA4_E2B_PREFIX}/tokenizer.json`;
+export const GEMMA4_E2B_TOKENIZER_CONFIG = `${GEMMA4_E2B_PREFIX}/tokenizer_config.json`;
+const GEMMA4_E2B_MODEL =
+  Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MODEL : GEMMA4_E2B_MLX_MODEL;
+const GEMMA4_E2B_MLX_MM = `${URL_PREFIX}-gemma-4-multimodal/${VERSION_TAG}/e2b/mlx/gemma4_e2b_mlx_int4.pte`;
+const GEMMA4_E2B_VULKAN_MM = `${URL_PREFIX}-gemma-4-multimodal/${VERSION_TAG}/e2b/vulkan/gemma_4_e2b_vulkan_8da4w.pte`;
+/**
+ * @category Models - LLM
+ */
+export const GEMMA4_E2B = {
+  modelName: 'gemma4-e2b',
+  modelSource: GEMMA4_E2B_MODEL,
+  tokenizerSource: GEMMA4_E2B_TOKENIZER,
+  tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
+} as const;
+/**
+ * @category Models - LLM Multimodal
+ */
+export const GEMMA4_E2B_MM = {
+  modelName: 'gemma4-e2b-multimodal',
+  modelSource:
+    Platform.OS === `android` ? GEMMA4_E2B_VULKAN_MM : GEMMA4_E2B_MLX_MM,
+  tokenizerSource: GEMMA4_E2B_TOKENIZER,
+  tokenizerConfigSource: GEMMA4_E2B_TOKENIZER_CONFIG,
+  capabilities: ['vision', 'audio'],
+  audioConfig: {
+    samplesPerBlock: 7680,
+    tokensPerBlock: 12,
+  },
+} as const;
 /**
  * @category Models - LLM
  */
@@ -690,6 +731,28 @@ export const YOLO26N_POSE = {
   modelSource: YOLO26N_POSE_MODEL,
 } as const;
+// RF-DETR Keypoint (pose estimation) — BETA preview.
+// NOTE: served from the `preview/` path (shipping as part of a patch release).
+// The URLs below use VERSION_TAG, not PREVIOUS_VERSION_TAG — TODO confirm tag.
+// Preview export: may be re-exported under a different constant once stable.
+export const RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/xnnpack/rfdetr_keypoint_preview_xnnpack_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/coreml/rfdetr_keypoint_preview_coreml_fp32.pte`;
+export const RF_DETR_KEYPOINT_PREVIEW_MLX_FP32_MODEL = `${URL_PREFIX}-rfdetr-keypoint/${VERSION_TAG}/preview/mlx/rfdetr_keypoint_preview_mlx_fp32.pte`;
+const RF_DETR_KEYPOINT_PREVIEW_MODEL =
+  Platform.OS === 'ios'
+    ? RF_DETR_KEYPOINT_PREVIEW_COREML_FP32_MODEL
+    : RF_DETR_KEYPOINT_PREVIEW_XNNPACK_FP32_MODEL;
+/**
+ * @category Models - Pose Estimation
+ * @beta Preview export — may be re-exported under a different constant once a
+ * stable RF-DETR keypoint model ships.
+ */
+export const RF_DETR_KEYPOINT_PREVIEW = {
+  modelName: 'rfdetr-keypoint-preview',
+  modelSource: RF_DETR_KEYPOINT_PREVIEW_MODEL,
+} as const;
 // Style transfer
 /**
  * Builds the four `(backend, precision)` URLs for a single style-transfer style.
@@ -816,27 +879,27 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
 // S2T
 export const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
-export const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
+export const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp16.pte`;
 export const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
-export const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
+export const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/coreml/whisper_base_en_coreml_fp16.pte`;
 export const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
-export const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
+export const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/coreml/whisper_small_en_coreml_fp16.pte`;
 export const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
-export const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
+export const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/coreml/whisper_tiny_coreml_fp16.pte`;
 export const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
-export const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
+export const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/coreml/whisper_base_coreml_fp16.pte`;
 export const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`;
 export const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
-export const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
+export const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/coreml/whisper_small_coreml_fp16.pte`;
 /**
  * @category Models - Speech To Text

package/src/controllers/LLMController.ts CHANGED Viewed

@@ -1,11 +1,11 @@
-import { ResourceSource } from '../types/common';
 import { ResourceFetcher } from '../utils/ResourceFetcher';
 import { Template } from '@huggingface/jinja';
 import { DEFAULT_CHAT_CONFIG } from '../constants/llmDefaults';
 import {
+  AudioConfig,
   ChatConfig,
   GenerationConfig,
-  LLMCapability,
+  LLMModel,
   LLMTool,
   Message,
   SPECIAL_TOKENS,
@@ -30,6 +30,7 @@ export class LLMController {
   private messageHistoryCallback: (messageHistory: Message[]) => void;
   private isReadyCallback: (isReady: boolean) => void;
   private isGeneratingCallback: (isGenerating: boolean) => void;
+  private audioConfig: AudioConfig | undefined;
   constructor({
     tokenCallback,
@@ -72,18 +73,10 @@ export class LLMController {
   }
   public async load({
-    modelSource,
-    tokenizerSource,
-    tokenizerConfigSource,
-    capabilities,
-    defaultGenerationConfig,
+    model,
     onDownloadProgressCallback,
   }: {
-    modelSource: ResourceSource;
-    tokenizerSource: ResourceSource;
-    tokenizerConfigSource: ResourceSource;
-    capabilities?: readonly LLMCapability[];
-    defaultGenerationConfig?: GenerationConfig;
+    model: LLMModel;
     onDownloadProgressCallback?: (downloadProgress: number) => void;
   }) {
     // reset inner state when loading new model
@@ -94,13 +87,13 @@ export class LLMController {
     try {
       const tokenizersPromise = ResourceFetcher.fetch(
         undefined,
-        tokenizerSource,
-        tokenizerConfigSource
+        model.tokenizerSource,
+        model.tokenizerConfigSource
       );
       const modelPromise = ResourceFetcher.fetch(
         onDownloadProgressCallback,
-        modelSource
+        model.modelSource
       );
       const [tokenizersResults, modelResult] = await Promise.all([
@@ -124,16 +117,18 @@ export class LLMController {
         this.nativeModule.unload();
       }
+      this.audioConfig = model.audioConfig;
       this.nativeModule = await global.loadLLM(
         modelPath,
         tokenizerPath,
-        capabilities ?? []
+        model.capabilities ?? []
       );
-      if (defaultGenerationConfig) {
+      if (model.generationConfig) {
         // Apply model-specific recommended sampling defaults before flipping
         // isReady so callers that react to it see the right config on first
         // send. User-provided `configure()` calls still override these.
-        this.applyGenerationConfig(defaultGenerationConfig);
+        this.applyGenerationConfig(model.generationConfig);
       }
       this.isReadyCallback(true);
       this.onToken = (data: string) => {
@@ -236,6 +231,17 @@ export class LLMController {
     return token;
   }
+  private getAudioToken(): string {
+    const token = this.tokenizerConfig.audio_token;
+    if (!token) {
+      throw new RnExecutorchError(
+        RnExecutorchErrorCode.InvalidConfig,
+        "Tokenizer config is missing 'audio_token'. Audio-capable models require tokenizerConfigSource with an 'audio_token' field."
+      );
+    }
+    return token;
+  }
   private filterSpecialTokens(text: string): string {
     let filtered = text;
     if (
@@ -244,6 +250,12 @@ export class LLMController {
     ) {
       filtered = filtered.replaceAll(this.tokenizerConfig.eos_token, '');
     }
+    if (
+      SPECIAL_TOKENS.EOT_TOKEN in this.tokenizerConfig &&
+      this.tokenizerConfig.eot_token
+    ) {
+      filtered = filtered.replaceAll(this.tokenizerConfig.eot_token, '');
+    }
     if (
       SPECIAL_TOKENS.PAD_TOKEN in this.tokenizerConfig &&
       this.tokenizerConfig.pad_token
@@ -269,25 +281,37 @@ export class LLMController {
     this.isGeneratingCallback(false);
   }
-  public async forward(input: string, imagePaths?: string[]): Promise<string> {
+  public async forward(
+    input: string,
+    imagePaths?: string[],
+    audioWaveforms?: Float32Array[]
+  ): Promise<string> {
     if (!this._isReady) {
       throw new RnExecutorchError(RnExecutorchErrorCode.ModuleNotLoaded);
     }
     if (this._isGenerating) {
       throw new RnExecutorchError(RnExecutorchErrorCode.ModelGenerating);
     }
+    const hasImages = !!imagePaths && imagePaths.length > 0;
+    const hasAudio = !!audioWaveforms && audioWaveforms.length > 0;
     try {
       this.isGeneratingCallback(true);
       this.nativeModule.reset();
-      const response =
-        imagePaths && imagePaths.length > 0
-          ? await this.nativeModule.generateMultimodal(
-              input,
-              imagePaths.map(normalizeImagePath),
-              this.getImageToken(),
-              this.onToken
-            )
-          : await this.nativeModule.generate(input, this.onToken);
+      let response: string;
+      if (hasImages || hasAudio) {
+        response = await this.nativeModule.generateMultimodal(
+          input,
+          this.onToken,
+          {
+            imagePaths: hasImages ? imagePaths!.map(normalizeImagePath) : null,
+            imageToken: hasImages ? this.getImageToken() : null,
+            audioWaveforms: hasAudio ? audioWaveforms! : null,
+            audioToken: hasAudio ? this.getAudioToken() : null,
+          }
+        );
+      } else {
+        response = await this.nativeModule.generate(input, this.onToken);
+      }
       return this.filterSpecialTokens(response);
     } catch (e) {
       throw parseUnknownError(e);
@@ -355,7 +379,9 @@ export class LLMController {
     const imagePaths = messages
       .filter((m) => m.mediaPath)
       .map((m) => m.mediaPath!);
+    const audioWaveforms = messages
+      .filter((m) => m.audioWaveform)
+      .map((m) => m.audioWaveform!);
     const renderedChat: string = this.applyChatTemplate(
       messages,
       this.tokenizerConfig,
@@ -365,19 +391,22 @@ export class LLMController {
     return await this.forward(
       renderedChat,
-      imagePaths.length > 0 ? imagePaths : undefined
+      imagePaths.length > 0 ? imagePaths : undefined,
+      audioWaveforms.length > 0 ? audioWaveforms : undefined
     );
   }
   public async sendMessage(
     message: string,
-    media?: { imagePath?: string }
+    media?: { imagePath?: string; audioBuffer?: Float32Array }
   ): Promise<string> {
     const mediaPath = media?.imagePath;
+    const audioBuffer = media?.audioBuffer;
     const newMessage: Message = {
       content: message,
       role: 'user',
       ...(mediaPath ? { mediaPath } : {}),
+      ...(audioBuffer ? { audioWaveform: audioBuffer } : {}),
     };
     const updatedHistory = [...this._messageHistory, newMessage];
     this.messageHistoryCallback(updatedHistory);
@@ -392,7 +421,22 @@ export class LLMController {
       );
       const textTokens = this.nativeModule.countTextTokens(rendered);
       const imageCount = messages.filter((m) => m.mediaPath).length;
-      return textTokens + imageCount * (visualTokenCount - 1);
+      // Audio soft-token expansion: audio_encoder pads samples to
+      // multiples of this.audioConfig.samplesPerBlock (7680 @ 16 kHz) and emits
+      // this.audioConfig.tokensPerBlock (~12) soft tokens per padded block. The
+      // rendered template only contributes 1 token for the audio placeholder,
+      // so add (expansion - 1) per audio message to match prefill consumption.
+      const audioTokenExpansion = messages.reduce((acc, m) => {
+        if (!m.audioWaveform) return acc;
+        const kBlocks = Math.max(
+          1,
+          Math.ceil(m.audioWaveform.length / this.audioConfig!.samplesPerBlock)
+        );
+        return acc + (this.audioConfig!.tokensPerBlock * kBlocks - 1);
+      }, 0);
+      return (
+        textTokens + imageCount * (visualTokenCount - 1) + audioTokenExpansion
+      );
     };
     const maxContextLength = this.nativeModule.getMaxContextLength();
     const messageHistoryWithPrompt =
@@ -497,12 +541,17 @@ function normalizeImagePath(path: string): string {
  * @returns Messages with image-bearing turns rewritten to structured content.
  */
 function messagesForChatTemplate(messages: Message[]): any[] {
-  return messages.map((m) =>
-    m.mediaPath && typeof m.content === 'string'
-      ? {
-          ...m,
-          content: [{ type: 'image' }, { type: 'text', text: m.content }],
-        }
-      : m
-  );
+  return messages.map((m) => {
+    if (typeof m.content !== 'string') return m;
+    const hasImage = !!m.mediaPath;
+    const hasAudio = !!m.audioWaveform;
+    if (!hasImage && !hasAudio) return m;
+    const parts: any[] = [];
+    if (hasImage) parts.push({ type: 'image' });
+    if (hasAudio) parts.push({ type: 'audio' });
+    parts.push({ type: 'text', text: m.content });
+    // Drop the Float32Array on the clone only — passing it into the Jinja
+    // template engine slows render past 3s. The original `m` is not mutated.
+    return { ...m, content: parts, audioWaveform: undefined };
+  });
 }

package/src/hooks/natural_language_processing/useLLM.ts CHANGED Viewed

@@ -58,11 +58,7 @@ export function useLLM({
     (async () => {
       try {
         await controllerInstance.load({
-          modelSource: model.modelSource,
-          tokenizerSource: model.tokenizerSource,
-          tokenizerConfigSource: model.tokenizerConfigSource!,
-          capabilities: model.capabilities,
-          defaultGenerationConfig: model.generationConfig,
+          model: model,
           onDownloadProgressCallback: setDownloadProgress,
         });
       } catch (e) {
@@ -106,7 +102,10 @@ export function useLLM({
   );
   const sendMessage = useCallback(
-    (message: string, media?: { imagePath?: string }) => {
+    (
+      message: string,
+      media?: { imagePath?: string; audioBuffer?: Float32Array }
+    ) => {
       setResponse('');
       return controllerInstance.sendMessage(message, media);
     },

package/src/modules/computer_vision/PoseEstimationModule.ts CHANGED Viewed

@@ -29,8 +29,20 @@ const YOLO_POSE_CONFIG = {
   defaultKeypointThreshold: 0.5,
 } satisfies PoseEstimationConfig<typeof CocoKeypoint>;
+// RF-DETR keypoint preview (BETA). Unlike yolo26n-pose's multi-method
+// `forward_<size>` export, this ships a single `forward` method — omitting
+// availableInputSizes/defaultInputSize makes forward() dispatch to plain
+// `forward`. May be renamed once a stable model ships.
+const RFDETR_KEYPOINT_CONFIG = {
+  keypointMap: CocoKeypoint,
+  preprocessorConfig: undefined,
+  defaultDetectionThreshold: 0.5,
+  defaultKeypointThreshold: 0.5,
+} satisfies PoseEstimationConfig<typeof CocoKeypoint>;
 const ModelConfigs = {
   'yolo26n-pose': YOLO_POSE_CONFIG,
+  'rfdetr-keypoint-preview': RFDETR_KEYPOINT_CONFIG,
 } as const satisfies Record<
   PoseEstimationModelName,
   PoseEstimationConfig<LabelEnum>

package/src/modules/natural_language_processing/LLMModule.ts CHANGED Viewed

@@ -3,6 +3,7 @@ import { Logger } from '../../common/Logger';
 import { parseUnknownError } from '../../errors/errorUtils';
 import { ResourceSource } from '../../types/common';
 import {
+  AudioConfig,
   LLMCapability,
   LLMConfig,
   LLMModelName,
@@ -51,6 +52,7 @@ export class LLMModule {
       tokenizerSource: ResourceSource;
       tokenizerConfigSource: ResourceSource;
       capabilities?: readonly LLMCapability[];
+      audioConfig?: AudioConfig;
     },
     onDownloadProgress: (progress: number) => void = () => {},
     tokenCallback?: (token: string) => void,
@@ -59,10 +61,14 @@ export class LLMModule {
     const instance = new LLMModule({ tokenCallback, messageHistoryCallback });
     try {
       await instance.controller.load({
-        modelSource: namedSources.modelSource,
-        tokenizerSource: namedSources.tokenizerSource,
-        tokenizerConfigSource: namedSources.tokenizerConfigSource,
-        capabilities: namedSources.capabilities,
+        model: {
+          modelName: namedSources.modelName,
+          modelSource: namedSources.modelSource,
+          tokenizerSource: namedSources.tokenizerSource,
+          tokenizerConfigSource: namedSources.tokenizerConfigSource,
+          capabilities: namedSources.capabilities,
+          audioConfig: namedSources.audioConfig,
+        },
         onDownloadProgressCallback: onDownloadProgress,
       });
       return instance;
@@ -140,10 +146,15 @@ export class LLMModule {
    * If you want a simple chat with the model, then consider using `sendMessage`.
    * @param input - Raw input string containing the prompt and conversation history.
    * @param imagePaths - Optional array of local image paths for multimodal inference. Each entry may be either `file:///absolute/path` or `/absolute/path` — the controller normalizes the path before passing it to native code.
+   * @param audioWaveforms - Optional array of 16kHz waveforms of audio recordings for multimodal inference.
    * @returns The generated response as a string.
    */
-  async forward(input: string, imagePaths?: string[]): Promise<string> {
-    return await this.controller.forward(input, imagePaths);
+  async forward(
+    input: string,
+    imagePaths?: string[],
+    audioWaveforms?: Float32Array[]
+  ): Promise<string> {
+    return await this.controller.forward(input, imagePaths, audioWaveforms);
   }
   /**
@@ -162,12 +173,12 @@ export class LLMModule {
    * After model responds it will call `messageHistoryCallback()` containing both user message and model response.
    * It also returns them.
    * @param message - The message string to send.
-   * @param media - Optional media object containing a local image path for multimodal models.
+   * @param media - Optional media object containing a local image path or 16kHz waveform of an audio recording for multimodal models.
    * @returns - Updated message history including the new user message and model response.
    */
   async sendMessage(
     message: string,
-    media?: { imagePath?: string }
+    media?: { imagePath?: string; audioBuffer?: Float32Array }
   ): Promise<Message[]> {
     await this.controller.sendMessage(message, media);
     return this.controller.messageHistory;