npm - expo-ai-kit - Versions diffs - 0.2.1 → 0.3.1 - Mend

expo-ai-kit 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +13 -1
package/android/build.gradle +2 -2
package/android/src/main/java/expo/modules/aikit/GemmaInferenceClient.kt +61 -47
package/build/models.d.ts +4 -5
package/build/models.d.ts.map +1 -1
package/build/models.js +12 -12
package/build/models.js.map +1 -1
package/package.json +10 -2
package/src/models.ts +16 -17

package/README.md CHANGED Viewed

@@ -2,6 +2,8 @@
 On-device AI for Expo apps. Run language models locally—no API keys, no cloud, just native intelligence.
+**Now with Gemma 4 support** — Download and run Google's [Gemma 4](https://blog.google/technology/developers/gemma-4/) E2B (2.3B) and E4B (4.5B) models directly on Android devices via [LiteRT-LM](https://ai.google.dev/edge/litert-lm). Full on-device inference with GPU acceleration, streaming, and zero cloud dependency.
 [![npm version](https://img.shields.io/npm/v/expo-ai-kit.svg)](https://www.npmjs.com/package/expo-ai-kit)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -14,6 +16,15 @@ On-device AI for Expo apps. Run language models locally—no API keys, no cloud,
 | iOS 26+ | [Apple Foundation Models](https://developer.apple.com/documentation/FoundationModels) |
 | Android (supported devices) | [ML Kit Prompt API](https://developers.google.com/ml-kit/genai#prompt-device) |
+### Downloadable Models (Gemma 4)
+| Platform | Status |
+|----------|--------|
+| Android | Gemma 4 E2B (2.3B) and E4B (4.5B) via [LiteRT-LM](https://ai.google.dev/edge/litert-lm) |
+| iOS | Coming soon — waiting for LiteRT-LM Swift APIs from Google |
+> **Note:** iOS downloadable model support (Gemma 4 E2B/E4B) is planned for a future release. We are waiting for Google to ship native Swift APIs for LiteRT-LM. Built-in Apple Foundation Models work on iOS 26+ today.
 ### Unsupported
 | Platform | Fallback Behavior |
@@ -26,7 +37,8 @@ On-device AI for Expo apps. Run language models locally—no API keys, no cloud,
 - **Privacy-first** — All inference happens on-device; no data leaves the user's device
 - **Zero latency** — No network round-trips required
 - **Free forever** — No API costs, rate limits, or subscriptions
-- **Native performance** — Built on Apple Foundation Models (iOS) and Google ML Kit Prompt API (Android)
+- **Gemma 4 on-device** — Download and run Gemma 4 E2B/E4B models directly on Android with GPU acceleration
+- **Native performance** — Built on Apple Foundation Models (iOS), ML Kit (Android), and LiteRT-LM (Gemma 4)
 - **Multi-turn conversations** — Full conversation context support
 - **Streaming support** — Progressive token streaming for responsive UIs
 - **Simple API** — Core functions plus prompt helpers for common tasks

package/android/build.gradle CHANGED Viewed

@@ -43,6 +43,6 @@ android {
 }
 dependencies {
-  implementation "com.google.mlkit:genai-prompt:1.0.0-alpha1"
-  implementation "com.google.mediapipe:tasks-genai:0.10.24"
+  implementation "com.google.mlkit:genai-prompt:1.0.0-beta2"
+  implementation "com.google.ai.edge.litertlm:litertlm-android:+"
 }

package/android/src/main/java/expo/modules/aikit/GemmaInferenceClient.kt CHANGED Viewed

@@ -1,8 +1,10 @@
 package expo.modules.aikit
 import android.content.Context
-import com.google.mediapipe.tasks.genai.llminference.LlmInference
-import kotlinx.coroutines.CompletableDeferred
+import com.google.ai.edge.litertlm.Engine
+import com.google.ai.edge.litertlm.EngineConfig
+import com.google.ai.edge.litertlm.Conversation
+import com.google.ai.edge.litertlm.Backend
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.sync.Mutex
 import kotlinx.coroutines.sync.withLock
@@ -16,7 +18,7 @@ import java.net.URL
 import java.security.MessageDigest
 /**
- * Wrapper around MediaPipe LlmInference for Gemma 4 models.
+ * Wrapper around LiteRT-LM Engine for Gemma 4 models.
  *
  * Concurrency model:
  * - A Mutex guards all state transitions (load, unload, inference).
@@ -28,7 +30,8 @@ import java.security.MessageDigest
 class GemmaInferenceClient(private val context: Context) {
   private val mutex = Mutex()
-  private var llmInference: LlmInference? = null
+  private var engine: Engine? = null
+  private var conversation: Conversation? = null
   private var loadedModelId: String? = null
   @Volatile
@@ -39,33 +42,50 @@ class GemmaInferenceClient(private val context: Context) {
   // -------------------------------------------------------------------------
   /**
-   * Load a model into memory. Unloads any previously loaded model first.
+   * Load a model into memory using LiteRT-LM Engine.
+   * Unloads any previously loaded model first.
    * Caller is responsible for emitting onModelStateChange events.
    */
   suspend fun loadModel(modelId: String, modelPath: String) = mutex.withLock {
     // Unload previous model if different
     if (loadedModelId != null && loadedModelId != modelId) {
-      llmInference?.close()
-      llmInference = null
+      conversation?.close()
+      engine?.close()
+      conversation = null
+      engine = null
       loadedModelId = null
     }
-    if (loadedModelId == modelId && llmInference != null) {
+    if (loadedModelId == modelId && engine != null) {
       return@withLock // Already loaded
     }
     try {
-      val options = LlmInference.LlmInferenceOptions.builder()
-        .setModelPath(modelPath)
-        .build()
-      llmInference = LlmInference.createFromOptions(context, options)
-      loadedModelId = modelId
+      withContext(Dispatchers.IO) {
+        val engineConfig = EngineConfig(
+          modelPath = modelPath,
+          backend = Backend.GPU()
+        )
+        val newEngine = Engine(engineConfig)
+        newEngine.initialize()
+        val newConversation = newEngine.createConversation()
+        engine = newEngine
+        conversation = newConversation
+        loadedModelId = modelId
+      }
     } catch (e: OutOfMemoryError) {
-      llmInference = null
+      conversation?.close()
+      engine?.close()
+      conversation = null
+      engine = null
       loadedModelId = null
       throw RuntimeException("INFERENCE_OOM:$modelId:Device does not have enough memory to load model")
     } catch (e: Exception) {
-      llmInference = null
+      conversation?.close()
+      engine?.close()
+      conversation = null
+      engine = null
       loadedModelId = null
       throw RuntimeException("MODEL_LOAD_FAILED:$modelId:${e.message}")
     }
@@ -75,14 +95,16 @@ class GemmaInferenceClient(private val context: Context) {
    * Unload the current model from memory.
    */
   suspend fun unloadModel() = mutex.withLock {
-    llmInference?.close()
-    llmInference = null
+    conversation?.close()
+    engine?.close()
+    conversation = null
+    engine = null
     loadedModelId = null
   }
   fun getLoadedModelId(): String? = loadedModelId
-  fun isModelLoaded(): Boolean = llmInference != null
+  fun isModelLoaded(): Boolean = engine != null
   // -------------------------------------------------------------------------
   // Inference
@@ -93,14 +115,14 @@ class GemmaInferenceClient(private val context: Context) {
    * The mutex ensures this cannot run concurrently with load/unload.
    */
   suspend fun generateText(prompt: String, systemPrompt: String): String = mutex.withLock {
-    val inference = llmInference
+    val conv = conversation
       ?: throw RuntimeException("MODEL_NOT_DOWNLOADED:${loadedModelId ?: "unknown"}:No model loaded")
     val fullPrompt = buildFullPrompt(prompt, systemPrompt)
     try {
       withContext(Dispatchers.IO) {
-        inference.generateResponse(fullPrompt)
+        conv.sendMessage(contents = fullPrompt).toString()
       }
     } catch (e: OutOfMemoryError) {
       throw RuntimeException("INFERENCE_OOM:${loadedModelId ?: "unknown"}:Out of memory during inference")
@@ -113,47 +135,37 @@ class GemmaInferenceClient(private val context: Context) {
    * Generate a streaming response. The onChunk callback receives
    * (token=delta, accumulatedText=full, isDone) matching the PromptApiClient contract.
    *
-   * MediaPipe's generateResponseAsync passes accumulated text in its partial result
-   * listener, so we diff against previousText to extract the delta token.
-   *
-   * We use a CompletableDeferred to keep the mutex held until streaming completes,
-   * preventing concurrent load/unload during active inference.
+   * LiteRT-LM's sendMessageAsync() returns a Flow<Message>. Each emission
+   * contains accumulated text, so we diff against previousText to extract
+   * the delta token.
    */
   suspend fun generateTextStream(
     prompt: String,
     systemPrompt: String,
     onChunk: (token: String, accumulatedText: String, isDone: Boolean) -> Unit
   ) = mutex.withLock {
-    val inference = llmInference
+    val conv = conversation
       ?: throw RuntimeException("MODEL_NOT_DOWNLOADED:${loadedModelId ?: "unknown"}:No model loaded")
     val fullPrompt = buildFullPrompt(prompt, systemPrompt)
     try {
       withContext(Dispatchers.IO) {
-        val completion = CompletableDeferred<String>()
         var previousText = ""
-        // MediaPipe streaming: generateResponseAsync calls the listener with
-        // accumulated text (not deltas). We normalize to match PromptApiClient's
-        // (token=delta, accumulatedText=full, isDone) contract.
-        inference.generateResponseAsync(fullPrompt) { partialResult, done ->
-          val accumulated = partialResult ?: ""
+        conv.sendMessageAsync(contents = fullPrompt).collect { message ->
+          val accumulated = message.toString()
           val token = if (accumulated.length > previousText.length) {
             accumulated.substring(previousText.length)
           } else {
             ""
           }
           previousText = accumulated
-          onChunk(token, accumulated, done)
-          if (done) {
-            completion.complete(accumulated)
-          }
+          onChunk(token, accumulated, false)
         }
-        // Wait until streaming finishes so the mutex stays held
-        completion.await()
+        // Final done event for consistency with PromptApiClient
+        onChunk("", previousText, true)
       }
     } catch (e: OutOfMemoryError) {
       throw RuntimeException("INFERENCE_OOM:${loadedModelId ?: "unknown"}:Out of memory during inference")
@@ -189,8 +201,8 @@ class GemmaInferenceClient(private val context: Context) {
         val modelsDir = File(context.filesDir, "models")
         modelsDir.mkdirs()
-        val targetFile = File(modelsDir, "$modelId.gguf")
-        val tempFile = File(modelsDir, "$modelId.gguf.tmp")
+        val targetFile = File(modelsDir, "$modelId.litertlm")
+        val tempFile = File(modelsDir, "$modelId.litertlm.tmp")
         try {
           val connection = URL(url).openConnection() as HttpURLConnection
@@ -256,17 +268,19 @@ class GemmaInferenceClient(private val context: Context) {
   suspend fun deleteModelFile(modelId: String) = mutex.withLock {
     // Unload if this model is currently loaded
     if (loadedModelId == modelId) {
-      llmInference?.close()
-      llmInference = null
+      conversation?.close()
+      engine?.close()
+      conversation = null
+      engine = null
       loadedModelId = null
     }
-    val modelFile = File(context.filesDir, "models/$modelId.gguf")
+    val modelFile = File(context.filesDir, "models/$modelId.litertlm")
     if (modelFile.exists()) {
       modelFile.delete()
     }
     // Also clean up any partial downloads
-    val tempFile = File(context.filesDir, "models/$modelId.gguf.tmp")
+    val tempFile = File(context.filesDir, "models/$modelId.litertlm.tmp")
     if (tempFile.exists()) {
       tempFile.delete()
     }
@@ -276,14 +290,14 @@ class GemmaInferenceClient(private val context: Context) {
    * Check if a model file exists on disk.
    */
   fun isModelFileDownloaded(modelId: String): Boolean {
-    return File(context.filesDir, "models/$modelId.gguf").exists()
+    return File(context.filesDir, "models/$modelId.litertlm").exists()
   }
   /**
    * Get the file path for a downloaded model.
    */
   fun getModelFilePath(modelId: String): String {
-    return File(context.filesDir, "models/$modelId.gguf").absolutePath
+    return File(context.filesDir, "models/$modelId.litertlm").absolutePath
   }
   // -------------------------------------------------------------------------

package/build/models.d.ts CHANGED Viewed

@@ -14,7 +14,7 @@ export type ModelRegistryEntry = {
     parameterCount: string;
     /** Quantization variant */
     quantization: string;
-    /** URL to download the GGUF model file */
+    /** URL to download the LiteRT-LM model file */
     downloadUrl: string;
     /** SHA256 hash for integrity verification after download */
     sha256: string;
@@ -23,10 +23,9 @@ export type ModelRegistryEntry = {
     /**
      * Practical context window (max tokens) for this model on constrained devices.
      *
-     * These are conservative defaults, NOT the base model's theoretical max (128k).
-     * On a memory-constrained mobile device running quantized inference, KV cache
-     * cannot fit the full 128k context. These values should be benchmarked and
-     * adjusted during Phase 2 testing with real devices.
+     * These are conservative defaults, NOT the base model's theoretical max.
+     * These values should be benchmarked and adjusted during testing with
+     * real devices.
      */
     contextWindow: number;
     /** Minimum device RAM in bytes required to run this model */

package/build/models.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,MAAM,kBAAkB,GAAG;IAC/B,6DAA6D;IAC7D,EAAE,EAAE,MAAM,CAAC;IACX,gCAAgC;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,2BAA2B;IAC3B,YAAY,EAAE,MAAM,CAAC;IACrB~~,0CAA0C~~;~~IAC1C~~,WAAW,EAAE,MAAM,CAAC;IACpB,4DAA4D;IAC5D,MAAM,EAAE,MAAM,CAAC;IACf,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB~~;;;;;;;OAOG~~;IACH,aAAa,EAAE,MAAM,CAAC;IACtB,6DAA6D;IAC7D,WAAW,EAAE,MAAM,CAAC;IACpB,sCAAsC;IACtC,kBAAkB,EAAE,CAAC,KAAK,GAAG,SAAS,CAAC,EAAE,CAAC;CAC3C,CAAC;AAEF,eAAO,MAAM,cAAc,EAAE,kBAAkB,EA+B9C,CAAC;AAEF;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,kBAAkB,GAAG,SAAS,CAEhF"}
1	+ {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,MAAM,kBAAkB,GAAG;IAC/B,6DAA6D;IAC7D,EAAE,EAAE,MAAM,CAAC;IACX,gCAAgC;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,2BAA2B;IAC3B,YAAY,EAAE,MAAM,CAAC;IACrB,+CAA+C;IAC/C,WAAW,EAAE,MAAM,CAAC;IACpB,4DAA4D;IAC5D,MAAM,EAAE,MAAM,CAAC;IACf,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;;;OAMG;IACH,aAAa,EAAE,MAAM,CAAC;IACtB,6DAA6D;IAC7D,WAAW,EAAE,MAAM,CAAC;IACpB,sCAAsC;IACtC,kBAAkB,EAAE,CAAC,KAAK,GAAG,SAAS,CAAC,EAAE,CAAC;CAC3C,CAAC;AAEF,eAAO,MAAM,cAAc,EAAE,kBAAkB,EA+B9C,CAAC;AAEF;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,kBAAkB,GAAG,SAAS,CAEhF"}

package/build/models.js CHANGED Viewed

@@ -10,29 +10,29 @@ export const MODEL_REGISTRY = [
         id: 'gemma-e2b',
         name: 'Gemma 4 E2B',
         parameterCount: '2.3B',
-        quantization: 'Q4_K_M',
-        downloadUrl: 'https://huggingface.co/google/gemma-4-e2b-it-GGUF/resolve/main/gemma-4-e2b-it-Q4_K_M.gguf',
+        quantization: 'mixed-2/4/8-bit',
+        downloadUrl: 'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm',
         sha256: '', // TODO: Fill with actual hash once model file is verified
-        sizeBytes: 1_400_000_000, // ~1.4GB
-        // Conservative limit for 4GB RAM devices. Base model supports 128k but
-        // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
+        sizeBytes: 2_580_000_000, // ~2.58GB
+        // Conservative limit for 4GB RAM devices.
+        // TODO: Benchmark during Phase 2 testing.
         contextWindow: 8_000,
         minRamBytes: 4_000_000_000, // 4GB
-        supportedPlatforms: ['ios', 'android'],
+        supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
     },
     {
         id: 'gemma-e4b',
         name: 'Gemma 4 E4B',
         parameterCount: '4.5B',
-        quantization: 'Q4_K_M',
-        downloadUrl: 'https://huggingface.co/google/gemma-4-e4b-it-GGUF/resolve/main/gemma-4-e4b-it-Q4_K_M.gguf',
+        quantization: 'mixed-4/8-bit',
+        downloadUrl: 'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm',
         sha256: '', // TODO: Fill with actual hash once model file is verified
-        sizeBytes: 2_800_000_000, // ~2.8GB
-        // Conservative limit for 6GB RAM devices. Base model supports 128k but
-        // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
+        sizeBytes: 3_650_000_000, // ~3.65GB
+        // Conservative limit for 6GB RAM devices.
+        // TODO: Benchmark during Phase 2 testing.
         contextWindow: 16_000,
         minRamBytes: 6_000_000_000, // 6GB
-        supportedPlatforms: ['ios', 'android'],
+        supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
     },
 ];
 /**

package/build/models.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;~~AAgCH~~,MAAM,CAAC,MAAM,cAAc,GAAyB;IAClD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,~~QAAQ~~;~~QACtB~~,WAAW,EACT,~~2FAA2F~~;~~QAC7F~~,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,~~SAAS~~;~~QACnC~~,~~uEAAuE~~;~~QACvE~~,~~8DAA8D~~;~~QAC9D~~,aAAa,EAAE,KAAK;QACpB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,~~KAAK~~,EAAE,~~SAAS,CAAC~~;~~KACvC~~;IACD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,~~QAAQ~~;~~QACtB~~,WAAW,EACT,~~2FAA2F~~;~~QAC7F~~,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,~~SAAS~~;~~QACnC~~,~~uEAAuE~~;~~QACvE~~,~~8DAA8D~~;~~QAC9D~~,aAAa,EAAE,MAAM;QACrB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,~~KAAK~~,EAAE,~~SAAS,CAAC~~;~~KACvC~~;CACF,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAe;IAC9C,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AACtD,CAAC","sourcesContent":["/*\n Model Registry\n \n Defines all downloadable models known to expo-ai-kit.\n * getDownloadableModels() reads from this registry and enriches\n * each entry with on-device status from the native layer.\n /\n\nexport type ModelRegistryEntry = {\n /* Unique model identifier used in setModel/downloadModel /\n id: string;\n /* Human-readable model name /\n name: string;\n /* Parameter count label /\n parameterCount: string;\n /* Quantization variant /\n quantization: string;\n /* URL to download the ~~GGUF~~ model file /\n downloadUrl: string;\n /* SHA256 hash for integrity verification after download /\n sha256: string;\n /* Download file size in bytes /\n sizeBytes: number;\n /\n Practical context window (max tokens) for this model on constrained devices.\n \n These are conservative defaults, NOT the base model's theoretical max ~~(128k)~~.\n * ~~On a memory-constrained mobile device running quantized inference, KV cache\n * cannot fit the full 
128k context.~~ These values should be benchmarked and\n * adjusted during ~~Phase 2~~ testing with real devices.\n /\n contextWindow: number;\n /* Minimum device RAM in bytes required to run this model /\n minRamBytes: number;\n /* Platforms this model can run on /\n supportedPlatforms: ('ios' \| 'android')[];\n};\n\nexport const MODEL_REGISTRY: ModelRegistryEntry[] = [\n {\n id: 'gemma-e2b',\n name: 'Gemma 4 E2B',\n parameterCount: '2.3B',\n quantization: '~~Q4_K_M~~',\n downloadUrl:\n 'https://huggingface.co/~~google~~/gemma-4-~~e2b~~-it-~~GGUF~~/resolve/main/gemma-4-~~e2b~~-it~~-Q4_K_M~~.~~gguf~~',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: ~~1_400_000_000~~, // ~1.~~4GB~~\n // Conservative limit for 4GB RAM devices~~. Base model supports 128k but\~~n // ~~KV cache won't fit.~~ TODO: Benchmark during Phase 2 testing.\n contextWindow: 8_000,\n minRamBytes: 4_000_000_000, // 4GB\n supportedPlatforms: ['~~ios~~', ~~'android'],\~~n },\n {\n id: 'gemma-e4b',\n name: 'Gemma 4 E4B',\n parameterCount: '4.5B',\n quantization: '~~Q4_K_M~~',\n downloadUrl:\n 'https://huggingface.co/~~google~~/gemma-4-~~e4b~~-it-~~GGUF~~/resolve/main/gemma-4-~~e4b~~-it~~-Q4_K_M~~.~~gguf~~',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: ~~2_800_000_000~~, // ~2.~~8GB~~\n // Conservative limit for 6GB RAM devices~~. Base model supports 128k but\~~n // ~~KV cache won't fit.~~ TODO: Benchmark during Phase 2 testing.\n contextWindow: 16_000,\n minRamBytes: 6_000_000_000, // 6GB\n supportedPlatforms: ['~~ios~~', ~~'android'],\~~n },\n];\n\n/\n Look up a model registry entry by ID.\n * Returns undefined if not found.\n */\nexport function getRegistryEntry(modelId: string): ModelRegistryEntry \| undefined {\n return MODEL_REGISTRY.find((m) => m.id === modelId);\n}\n"]}
1	+ {"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AA+BH,MAAM,CAAC,MAAM,cAAc,GAAyB;IAClD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,iBAAiB;QAC/B,WAAW,EACT,uGAAuG;QACzG,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,UAAU;QACpC,0CAA0C;QAC1C,0CAA0C;QAC1C,aAAa,EAAE,KAAK;QACpB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,SAAS,CAAC,EAAE,uCAAuC;KACzE;IACD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,eAAe;QAC7B,WAAW,EACT,uGAAuG;QACzG,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,UAAU;QACpC,0CAA0C;QAC1C,0CAA0C;QAC1C,aAAa,EAAE,MAAM;QACrB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,SAAS,CAAC,EAAE,uCAAuC;KACzE;CACF,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAe;IAC9C,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AACtD,CAAC","sourcesContent":["/*\n Model Registry\n \n Defines all downloadable models known to expo-ai-kit.\n * getDownloadableModels() reads from this registry and enriches\n * each entry with on-device status from the native layer.\n /\n\nexport type ModelRegistryEntry = {\n /* Unique model identifier used in setModel/downloadModel /\n id: string;\n /* Human-readable model name /\n name: string;\n /* Parameter count label /\n parameterCount: string;\n /* Quantization variant /\n quantization: string;\n /* URL to download the LiteRT-LM model file /\n downloadUrl: string;\n /* SHA256 hash for integrity verification after download /\n sha256: string;\n /* Download file size in bytes /\n sizeBytes: number;\n /\n Practical context window (max tokens) for this model on constrained devices.\n \n These are conservative defaults, NOT the base model's theoretical max.\n * These values should be benchmarked and adjusted during testing with\n * real devices.\n /\n contextWindow: number;\n /* Minimum device RAM in bytes required to run this model /\n minRamBytes: number;\n /* 
Platforms this model can run on /\n supportedPlatforms: ('ios' \| 'android')[];\n};\n\nexport const MODEL_REGISTRY: ModelRegistryEntry[] = [\n {\n id: 'gemma-e2b',\n name: 'Gemma 4 E2B',\n parameterCount: '2.3B',\n quantization: 'mixed-2/4/8-bit',\n downloadUrl:\n 'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: 2_580_000_000, // ~2.58GB\n // Conservative limit for 4GB RAM devices.\n // TODO: Benchmark during Phase 2 testing.\n contextWindow: 8_000,\n minRamBytes: 4_000_000_000, // 4GB\n supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs\n },\n {\n id: 'gemma-e4b',\n name: 'Gemma 4 E4B',\n parameterCount: '4.5B',\n quantization: 'mixed-4/8-bit',\n downloadUrl:\n 'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: 3_650_000_000, // ~3.65GB\n // Conservative limit for 6GB RAM devices.\n // TODO: Benchmark during Phase 2 testing.\n contextWindow: 16_000,\n minRamBytes: 6_000_000_000, // 6GB\n supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs\n },\n];\n\n/\n Look up a model registry entry by ID.\n * Returns undefined if not found.\n */\nexport function getRegistryEntry(modelId: string): ModelRegistryEntry \| undefined {\n return MODEL_REGISTRY.find((m) => m.id === modelId);\n}\n"]}

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "expo-ai-kit",
-  "version": "0.2.1",
-  "description": "Expo AI Kit module",
+  "version": "0.3.1",
+  "description": "On-device AI for Expo apps — run Gemma 4, Apple Foundation Models, and ML Kit locally with zero API keys",
   "main": "build/index.js",
   "types": "build/index.d.ts",
   "files": [
@@ -31,6 +31,14 @@
   "keywords": [
     "react-native",
     "expo",
+    "gemma",
+    "gemma-4",
+    "on-device-ai",
+    "llm",
+    "litert",
+    "apple-foundation-models",
+    "ml-kit",
+    "local-inference",
     "expo-ai-kit",
     "ExpoAiKit"
   ],

package/src/models.ts CHANGED Viewed

@@ -15,7 +15,7 @@ export type ModelRegistryEntry = {
   parameterCount: string;
   /** Quantization variant */
   quantization: string;
-  /** URL to download the GGUF model file */
+  /** URL to download the LiteRT-LM model file */
   downloadUrl: string;
   /** SHA256 hash for integrity verification after download */
   sha256: string;
@@ -24,10 +24,9 @@ export type ModelRegistryEntry = {
   /**
    * Practical context window (max tokens) for this model on constrained devices.
    *
-   * These are conservative defaults, NOT the base model's theoretical max (128k).
-   * On a memory-constrained mobile device running quantized inference, KV cache
-   * cannot fit the full 128k context. These values should be benchmarked and
-   * adjusted during Phase 2 testing with real devices.
+   * These are conservative defaults, NOT the base model's theoretical max.
+   * These values should be benchmarked and adjusted during testing with
+   * real devices.
    */
   contextWindow: number;
   /** Minimum device RAM in bytes required to run this model */
@@ -41,31 +40,31 @@ export const MODEL_REGISTRY: ModelRegistryEntry[] = [
     id: 'gemma-e2b',
     name: 'Gemma 4 E2B',
     parameterCount: '2.3B',
-    quantization: 'Q4_K_M',
+    quantization: 'mixed-2/4/8-bit',
     downloadUrl:
-      'https://huggingface.co/google/gemma-4-e2b-it-GGUF/resolve/main/gemma-4-e2b-it-Q4_K_M.gguf',
+      'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm',
     sha256: '', // TODO: Fill with actual hash once model file is verified
-    sizeBytes: 1_400_000_000, // ~1.4GB
-    // Conservative limit for 4GB RAM devices. Base model supports 128k but
-    // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
+    sizeBytes: 2_580_000_000, // ~2.58GB
+    // Conservative limit for 4GB RAM devices.
+    // TODO: Benchmark during Phase 2 testing.
     contextWindow: 8_000,
     minRamBytes: 4_000_000_000, // 4GB
-    supportedPlatforms: ['ios', 'android'],
+    supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
   },
   {
     id: 'gemma-e4b',
     name: 'Gemma 4 E4B',
     parameterCount: '4.5B',
-    quantization: 'Q4_K_M',
+    quantization: 'mixed-4/8-bit',
     downloadUrl:
-      'https://huggingface.co/google/gemma-4-e4b-it-GGUF/resolve/main/gemma-4-e4b-it-Q4_K_M.gguf',
+      'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm',
     sha256: '', // TODO: Fill with actual hash once model file is verified
-    sizeBytes: 2_800_000_000, // ~2.8GB
-    // Conservative limit for 6GB RAM devices. Base model supports 128k but
-    // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
+    sizeBytes: 3_650_000_000, // ~3.65GB
+    // Conservative limit for 6GB RAM devices.
+    // TODO: Benchmark during Phase 2 testing.
     contextWindow: 16_000,
     minRamBytes: 6_000_000_000, // 6GB
-    supportedPlatforms: ['ios', 'android'],
+    supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
   },
 ];