expo-ai-kit 0.2.1 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  On-device AI for Expo apps. Run language models locally—no API keys, no cloud, just native intelligence.
4
4
 
5
+ **Now with Gemma 4 support** — Download and run Google's [Gemma 4](https://blog.google/technology/developers/gemma-4/) E2B (2.3B) and E4B (4.5B) models directly on Android devices via [LiteRT-LM](https://ai.google.dev/edge/litert-lm). Full on-device inference with GPU acceleration, streaming, and zero cloud dependency.
6
+
5
7
  [![npm version](https://img.shields.io/npm/v/expo-ai-kit.svg)](https://www.npmjs.com/package/expo-ai-kit)
6
8
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
9
 
@@ -14,6 +16,15 @@ On-device AI for Expo apps. Run language models locally—no API keys, no cloud,
14
16
  | iOS 26+ | [Apple Foundation Models](https://developer.apple.com/documentation/FoundationModels) |
15
17
  | Android (supported devices) | [ML Kit Prompt API](https://developers.google.com/ml-kit/genai#prompt-device) |
16
18
 
19
+ ### Downloadable Models (Gemma 4)
20
+
21
+ | Platform | Status |
22
+ |----------|--------|
23
+ | Android | Gemma 4 E2B (2.3B) and E4B (4.5B) via [LiteRT-LM](https://ai.google.dev/edge/litert-lm) |
24
+ | iOS | Coming soon — waiting for LiteRT-LM Swift APIs from Google |
25
+
26
+ > **Note:** iOS downloadable model support (Gemma 4 E2B/E4B) is planned for a future release. We are waiting for Google to ship native Swift APIs for LiteRT-LM. Built-in Apple Foundation Models work on iOS 26+ today.
27
+
17
28
  ### Unsupported
18
29
 
19
30
  | Platform | Fallback Behavior |
@@ -26,7 +37,8 @@ On-device AI for Expo apps. Run language models locally—no API keys, no cloud,
26
37
  - **Privacy-first** — All inference happens on-device; no data leaves the user's device
27
38
  - **Zero network latency** — No network round-trips required
28
39
  - **Free forever** — No API costs, rate limits, or subscriptions
29
- - **Native performance** — Built on Apple Foundation Models (iOS) and Google ML Kit Prompt API (Android)
40
+ - **Gemma 4 on-device** — Download and run Gemma 4 E2B/E4B models directly on Android with GPU acceleration
41
+ - **Native performance** — Built on Apple Foundation Models (iOS), the ML Kit Prompt API (Android), and LiteRT-LM (Gemma 4 on Android)
30
42
  - **Multi-turn conversations** — Full conversation context support
31
43
  - **Streaming support** — Progressive token streaming for responsive UIs
32
44
  - **Simple API** — Core functions plus prompt helpers for common tasks
@@ -43,6 +43,6 @@ android {
43
43
  }
44
44
 
45
45
  dependencies {
46
- implementation "com.google.mlkit:genai-prompt:1.0.0-alpha1"
47
- implementation "com.google.mediapipe:tasks-genai:0.10.24"
46
+ implementation "com.google.mlkit:genai-prompt:1.0.0-beta2"
47
+ implementation "com.google.ai.edge.litertlm:litertlm-android:+"
48
48
  }
@@ -1,8 +1,10 @@
1
1
  package expo.modules.aikit
2
2
 
3
3
  import android.content.Context
4
- import com.google.mediapipe.tasks.genai.llminference.LlmInference
5
- import kotlinx.coroutines.CompletableDeferred
4
+ import com.google.ai.edge.litertlm.Engine
5
+ import com.google.ai.edge.litertlm.EngineConfig
6
+ import com.google.ai.edge.litertlm.Conversation
7
+ import com.google.ai.edge.litertlm.Backend
6
8
  import kotlinx.coroutines.Dispatchers
7
9
  import kotlinx.coroutines.sync.Mutex
8
10
  import kotlinx.coroutines.sync.withLock
@@ -16,7 +18,7 @@ import java.net.URL
16
18
  import java.security.MessageDigest
17
19
 
18
20
  /**
19
- * Wrapper around MediaPipe LlmInference for Gemma 4 models.
21
+ * Wrapper around LiteRT-LM Engine for Gemma 4 models.
20
22
  *
21
23
  * Concurrency model:
22
24
  * - A Mutex guards all state transitions (load, unload, inference).
@@ -28,7 +30,8 @@ import java.security.MessageDigest
28
30
  class GemmaInferenceClient(private val context: Context) {
29
31
 
30
32
  private val mutex = Mutex()
31
- private var llmInference: LlmInference? = null
33
+ private var engine: Engine? = null
34
+ private var conversation: Conversation? = null
32
35
  private var loadedModelId: String? = null
33
36
 
34
37
  @Volatile
@@ -39,33 +42,50 @@ class GemmaInferenceClient(private val context: Context) {
39
42
  // -------------------------------------------------------------------------
40
43
 
41
44
  /**
42
- * Load a model into memory. Unloads any previously loaded model first.
45
+ * Load a model into memory using LiteRT-LM Engine.
46
+ * Unloads any previously loaded model first.
43
47
  * Caller is responsible for emitting onModelStateChange events.
44
48
  */
45
49
  suspend fun loadModel(modelId: String, modelPath: String) = mutex.withLock {
46
50
  // Unload the previously loaded model if it differs from the requested one
47
51
  if (loadedModelId != null && loadedModelId != modelId) {
48
- llmInference?.close()
49
- llmInference = null
52
+ conversation?.close()
53
+ engine?.close()
54
+ conversation = null
55
+ engine = null
50
56
  loadedModelId = null
51
57
  }
52
58
 
53
- if (loadedModelId == modelId && llmInference != null) {
59
+ if (loadedModelId == modelId && engine != null) {
54
60
  return@withLock // Already loaded
55
61
  }
56
62
 
57
63
  try {
58
- val options = LlmInference.LlmInferenceOptions.builder()
59
- .setModelPath(modelPath)
60
- .build()
61
- llmInference = LlmInference.createFromOptions(context, options)
62
- loadedModelId = modelId
64
+ withContext(Dispatchers.IO) {
65
+ val engineConfig = EngineConfig(
66
+ modelPath = modelPath,
67
+ backend = Backend.GPU()
68
+ )
69
+ val newEngine = Engine(engineConfig)
70
+ newEngine.initialize()
71
+ val newConversation = newEngine.createConversation()
72
+
73
+ engine = newEngine
74
+ conversation = newConversation
75
+ loadedModelId = modelId
76
+ }
63
77
  } catch (e: OutOfMemoryError) {
64
- llmInference = null
78
+ conversation?.close()
79
+ engine?.close()
80
+ conversation = null
81
+ engine = null
65
82
  loadedModelId = null
66
83
  throw RuntimeException("INFERENCE_OOM:$modelId:Device does not have enough memory to load model")
67
84
  } catch (e: Exception) {
68
- llmInference = null
85
+ conversation?.close()
86
+ engine?.close()
87
+ conversation = null
88
+ engine = null
69
89
  loadedModelId = null
70
90
  throw RuntimeException("MODEL_LOAD_FAILED:$modelId:${e.message}")
71
91
  }
@@ -75,14 +95,16 @@ class GemmaInferenceClient(private val context: Context) {
75
95
  * Unload the current model from memory.
76
96
  */
77
97
  suspend fun unloadModel() = mutex.withLock {
78
- llmInference?.close()
79
- llmInference = null
98
+ conversation?.close()
99
+ engine?.close()
100
+ conversation = null
101
+ engine = null
80
102
  loadedModelId = null
81
103
  }
82
104
 
83
105
  fun getLoadedModelId(): String? = loadedModelId
84
106
 
85
- fun isModelLoaded(): Boolean = llmInference != null
107
+ fun isModelLoaded(): Boolean = engine != null
86
108
 
87
109
  // -------------------------------------------------------------------------
88
110
  // Inference
@@ -93,14 +115,14 @@ class GemmaInferenceClient(private val context: Context) {
93
115
  * The mutex ensures this cannot run concurrently with load/unload.
94
116
  */
95
117
  suspend fun generateText(prompt: String, systemPrompt: String): String = mutex.withLock {
96
- val inference = llmInference
118
+ val conv = conversation
97
119
  ?: throw RuntimeException("MODEL_NOT_DOWNLOADED:${loadedModelId ?: "unknown"}:No model loaded")
98
120
 
99
121
  val fullPrompt = buildFullPrompt(prompt, systemPrompt)
100
122
 
101
123
  try {
102
124
  withContext(Dispatchers.IO) {
103
- inference.generateResponse(fullPrompt)
125
+ conv.sendMessage(contents = fullPrompt).toString()
104
126
  }
105
127
  } catch (e: OutOfMemoryError) {
106
128
  throw RuntimeException("INFERENCE_OOM:${loadedModelId ?: "unknown"}:Out of memory during inference")
@@ -113,47 +135,37 @@ class GemmaInferenceClient(private val context: Context) {
113
135
  * Generate a streaming response. The onChunk callback receives
114
136
  * (token=delta, accumulatedText=full, isDone) matching the PromptApiClient contract.
115
137
  *
116
- * MediaPipe's generateResponseAsync passes accumulated text in its partial result
117
- * listener, so we diff against previousText to extract the delta token.
118
- *
119
- * We use a CompletableDeferred to keep the mutex held until streaming completes,
120
- * preventing concurrent load/unload during active inference.
138
+ * LiteRT-LM's sendMessageAsync() returns a Flow<Message>. Each emission
139
+ * contains accumulated text, so we diff against previousText to extract
140
+ * the delta token.
121
141
  */
122
142
  suspend fun generateTextStream(
123
143
  prompt: String,
124
144
  systemPrompt: String,
125
145
  onChunk: (token: String, accumulatedText: String, isDone: Boolean) -> Unit
126
146
  ) = mutex.withLock {
127
- val inference = llmInference
147
+ val conv = conversation
128
148
  ?: throw RuntimeException("MODEL_NOT_DOWNLOADED:${loadedModelId ?: "unknown"}:No model loaded")
129
149
 
130
150
  val fullPrompt = buildFullPrompt(prompt, systemPrompt)
131
151
 
132
152
  try {
133
153
  withContext(Dispatchers.IO) {
134
- val completion = CompletableDeferred<String>()
135
154
  var previousText = ""
136
155
 
137
- // MediaPipe streaming: generateResponseAsync calls the listener with
138
- // accumulated text (not deltas). We normalize to match PromptApiClient's
139
- // (token=delta, accumulatedText=full, isDone) contract.
140
- inference.generateResponseAsync(fullPrompt) { partialResult, done ->
141
- val accumulated = partialResult ?: ""
156
+ conv.sendMessageAsync(contents = fullPrompt).collect { message ->
157
+ val accumulated = message.toString()
142
158
  val token = if (accumulated.length > previousText.length) {
143
159
  accumulated.substring(previousText.length)
144
160
  } else {
145
161
  ""
146
162
  }
147
163
  previousText = accumulated
148
- onChunk(token, accumulated, done)
149
-
150
- if (done) {
151
- completion.complete(accumulated)
152
- }
164
+ onChunk(token, accumulated, false)
153
165
  }
154
166
 
155
- // Wait until streaming finishes so the mutex stays held
156
- completion.await()
167
+ // Final done event for consistency with PromptApiClient
168
+ onChunk("", previousText, true)
157
169
  }
158
170
  } catch (e: OutOfMemoryError) {
159
171
  throw RuntimeException("INFERENCE_OOM:${loadedModelId ?: "unknown"}:Out of memory during inference")
@@ -189,8 +201,8 @@ class GemmaInferenceClient(private val context: Context) {
189
201
  val modelsDir = File(context.filesDir, "models")
190
202
  modelsDir.mkdirs()
191
203
 
192
- val targetFile = File(modelsDir, "$modelId.gguf")
193
- val tempFile = File(modelsDir, "$modelId.gguf.tmp")
204
+ val targetFile = File(modelsDir, "$modelId.litertlm")
205
+ val tempFile = File(modelsDir, "$modelId.litertlm.tmp")
194
206
 
195
207
  try {
196
208
  val connection = URL(url).openConnection() as HttpURLConnection
@@ -256,17 +268,19 @@ class GemmaInferenceClient(private val context: Context) {
256
268
  suspend fun deleteModelFile(modelId: String) = mutex.withLock {
257
269
  // Unload if this model is currently loaded
258
270
  if (loadedModelId == modelId) {
259
- llmInference?.close()
260
- llmInference = null
271
+ conversation?.close()
272
+ engine?.close()
273
+ conversation = null
274
+ engine = null
261
275
  loadedModelId = null
262
276
  }
263
277
 
264
- val modelFile = File(context.filesDir, "models/$modelId.gguf")
278
+ val modelFile = File(context.filesDir, "models/$modelId.litertlm")
265
279
  if (modelFile.exists()) {
266
280
  modelFile.delete()
267
281
  }
268
282
  // Also clean up any partial downloads
269
- val tempFile = File(context.filesDir, "models/$modelId.gguf.tmp")
283
+ val tempFile = File(context.filesDir, "models/$modelId.litertlm.tmp")
270
284
  if (tempFile.exists()) {
271
285
  tempFile.delete()
272
286
  }
@@ -276,14 +290,14 @@ class GemmaInferenceClient(private val context: Context) {
276
290
  * Check if a model file exists on disk.
277
291
  */
278
292
  fun isModelFileDownloaded(modelId: String): Boolean {
279
- return File(context.filesDir, "models/$modelId.gguf").exists()
293
+ return File(context.filesDir, "models/$modelId.litertlm").exists()
280
294
  }
281
295
 
282
296
  /**
283
297
  * Get the file path for a downloaded model.
284
298
  */
285
299
  fun getModelFilePath(modelId: String): String {
286
- return File(context.filesDir, "models/$modelId.gguf").absolutePath
300
+ return File(context.filesDir, "models/$modelId.litertlm").absolutePath
287
301
  }
288
302
 
289
303
  // -------------------------------------------------------------------------
package/build/models.d.ts CHANGED
@@ -14,7 +14,7 @@ export type ModelRegistryEntry = {
14
14
  parameterCount: string;
15
15
  /** Quantization variant */
16
16
  quantization: string;
17
- /** URL to download the GGUF model file */
17
+ /** URL to download the LiteRT-LM model file */
18
18
  downloadUrl: string;
19
19
  /** SHA256 hash for integrity verification after download */
20
20
  sha256: string;
@@ -23,10 +23,9 @@ export type ModelRegistryEntry = {
23
23
  /**
24
24
  * Practical context window (max tokens) for this model on constrained devices.
25
25
  *
26
- * These are conservative defaults, NOT the base model's theoretical max (128k).
27
- * On a memory-constrained mobile device running quantized inference, KV cache
28
- * cannot fit the full 128k context. These values should be benchmarked and
29
- * adjusted during Phase 2 testing with real devices.
26
+ * These are conservative defaults, NOT the base model's theoretical max.
27
+ * These values should be benchmarked and adjusted during testing with
28
+ * real devices.
30
29
  */
31
30
  contextWindow: number;
32
31
  /** Minimum device RAM in bytes required to run this model */
@@ -1 +1 @@
1
- {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,MAAM,kBAAkB,GAAG;IAC/B,6DAA6D;IAC7D,EAAE,EAAE,MAAM,CAAC;IACX,gCAAgC;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,2BAA2B;IAC3B,YAAY,EAAE,MAAM,CAAC;IACrB,0CAA0C;IAC1C,WAAW,EAAE,MAAM,CAAC;IACpB,4DAA4D;IAC5D,MAAM,EAAE,MAAM,CAAC;IACf,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;;;;OAOG;IACH,aAAa,EAAE,MAAM,CAAC;IACtB,6DAA6D;IAC7D,WAAW,EAAE,MAAM,CAAC;IACpB,sCAAsC;IACtC,kBAAkB,EAAE,CAAC,KAAK,GAAG,SAAS,CAAC,EAAE,CAAC;CAC3C,CAAC;AAEF,eAAO,MAAM,cAAc,EAAE,kBAAkB,EA+B9C,CAAC;AAEF;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,kBAAkB,GAAG,SAAS,CAEhF"}
1
+ {"version":3,"file":"models.d.ts","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,MAAM,MAAM,kBAAkB,GAAG;IAC/B,6DAA6D;IAC7D,EAAE,EAAE,MAAM,CAAC;IACX,gCAAgC;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,4BAA4B;IAC5B,cAAc,EAAE,MAAM,CAAC;IACvB,2BAA2B;IAC3B,YAAY,EAAE,MAAM,CAAC;IACrB,+CAA+C;IAC/C,WAAW,EAAE,MAAM,CAAC;IACpB,4DAA4D;IAC5D,MAAM,EAAE,MAAM,CAAC;IACf,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;;;OAMG;IACH,aAAa,EAAE,MAAM,CAAC;IACtB,6DAA6D;IAC7D,WAAW,EAAE,MAAM,CAAC;IACpB,sCAAsC;IACtC,kBAAkB,EAAE,CAAC,KAAK,GAAG,SAAS,CAAC,EAAE,CAAC;CAC3C,CAAC;AAEF,eAAO,MAAM,cAAc,EAAE,kBAAkB,EA+B9C,CAAC;AAEF;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,OAAO,EAAE,MAAM,GAAG,kBAAkB,GAAG,SAAS,CAEhF"}
package/build/models.js CHANGED
@@ -10,29 +10,29 @@ export const MODEL_REGISTRY = [
10
10
  id: 'gemma-e2b',
11
11
  name: 'Gemma 4 E2B',
12
12
  parameterCount: '2.3B',
13
- quantization: 'Q4_K_M',
14
- downloadUrl: 'https://huggingface.co/google/gemma-4-e2b-it-GGUF/resolve/main/gemma-4-e2b-it-Q4_K_M.gguf',
13
+ quantization: 'mixed-2/4/8-bit',
14
+ downloadUrl: 'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm',
15
15
  sha256: '', // TODO: Fill with actual hash once model file is verified
16
- sizeBytes: 1_400_000_000, // ~1.4GB
17
- // Conservative limit for 4GB RAM devices. Base model supports 128k but
18
- // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
16
+ sizeBytes: 2_580_000_000, // ~2.58GB
17
+ // Conservative limit for 4GB RAM devices.
18
+ // TODO: Benchmark during Phase 2 testing.
19
19
  contextWindow: 8_000,
20
20
  minRamBytes: 4_000_000_000, // 4GB
21
- supportedPlatforms: ['ios', 'android'],
21
+ supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
22
22
  },
23
23
  {
24
24
  id: 'gemma-e4b',
25
25
  name: 'Gemma 4 E4B',
26
26
  parameterCount: '4.5B',
27
- quantization: 'Q4_K_M',
28
- downloadUrl: 'https://huggingface.co/google/gemma-4-e4b-it-GGUF/resolve/main/gemma-4-e4b-it-Q4_K_M.gguf',
27
+ quantization: 'mixed-4/8-bit',
28
+ downloadUrl: 'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm',
29
29
  sha256: '', // TODO: Fill with actual hash once model file is verified
30
- sizeBytes: 2_800_000_000, // ~2.8GB
31
- // Conservative limit for 6GB RAM devices. Base model supports 128k but
32
- // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
30
+ sizeBytes: 3_650_000_000, // ~3.65GB
31
+ // Conservative limit for 6GB RAM devices.
32
+ // TODO: Benchmark during Phase 2 testing.
33
33
  contextWindow: 16_000,
34
34
  minRamBytes: 6_000_000_000, // 6GB
35
- supportedPlatforms: ['ios', 'android'],
35
+ supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
36
36
  },
37
37
  ];
38
38
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAgCH,MAAM,CAAC,MAAM,cAAc,GAAyB;IAClD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,QAAQ;QACtB,WAAW,EACT,2FAA2F;QAC7F,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,SAAS;QACnC,uEAAuE;QACvE,8DAA8D;QAC9D,aAAa,EAAE,KAAK;QACpB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,KAAK,EAAE,SAAS,CAAC;KACvC;IACD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,QAAQ;QACtB,WAAW,EACT,2FAA2F;QAC7F,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,SAAS;QACnC,uEAAuE;QACvE,8DAA8D;QAC9D,aAAa,EAAE,MAAM;QACrB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,KAAK,EAAE,SAAS,CAAC;KACvC;CACF,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAe;IAC9C,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AACtD,CAAC","sourcesContent":["/**\n * Model Registry\n *\n * Defines all downloadable models known to expo-ai-kit.\n * getDownloadableModels() reads from this registry and enriches\n * each entry with on-device status from the native layer.\n */\n\nexport type ModelRegistryEntry = {\n /** Unique model identifier used in setModel/downloadModel */\n id: string;\n /** Human-readable model name */\n name: string;\n /** Parameter count label */\n parameterCount: string;\n /** Quantization variant */\n quantization: string;\n /** URL to download the GGUF model file */\n downloadUrl: string;\n /** SHA256 hash for integrity verification after download */\n sha256: string;\n /** Download file size in bytes */\n sizeBytes: number;\n /**\n * Practical context window (max tokens) for this model on constrained devices.\n *\n * These are conservative defaults, NOT the base model's theoretical max (128k).\n * On a memory-constrained mobile device running quantized inference, KV cache\n * cannot fit the full 128k context. 
These values should be benchmarked and\n * adjusted during Phase 2 testing with real devices.\n */\n contextWindow: number;\n /** Minimum device RAM in bytes required to run this model */\n minRamBytes: number;\n /** Platforms this model can run on */\n supportedPlatforms: ('ios' | 'android')[];\n};\n\nexport const MODEL_REGISTRY: ModelRegistryEntry[] = [\n {\n id: 'gemma-e2b',\n name: 'Gemma 4 E2B',\n parameterCount: '2.3B',\n quantization: 'Q4_K_M',\n downloadUrl:\n 'https://huggingface.co/google/gemma-4-e2b-it-GGUF/resolve/main/gemma-4-e2b-it-Q4_K_M.gguf',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: 1_400_000_000, // ~1.4GB\n // Conservative limit for 4GB RAM devices. Base model supports 128k but\n // KV cache won't fit. TODO: Benchmark during Phase 2 testing.\n contextWindow: 8_000,\n minRamBytes: 4_000_000_000, // 4GB\n supportedPlatforms: ['ios', 'android'],\n },\n {\n id: 'gemma-e4b',\n name: 'Gemma 4 E4B',\n parameterCount: '4.5B',\n quantization: 'Q4_K_M',\n downloadUrl:\n 'https://huggingface.co/google/gemma-4-e4b-it-GGUF/resolve/main/gemma-4-e4b-it-Q4_K_M.gguf',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: 2_800_000_000, // ~2.8GB\n // Conservative limit for 6GB RAM devices. Base model supports 128k but\n // KV cache won't fit. TODO: Benchmark during Phase 2 testing.\n contextWindow: 16_000,\n minRamBytes: 6_000_000_000, // 6GB\n supportedPlatforms: ['ios', 'android'],\n },\n];\n\n/**\n * Look up a model registry entry by ID.\n * Returns undefined if not found.\n */\nexport function getRegistryEntry(modelId: string): ModelRegistryEntry | undefined {\n return MODEL_REGISTRY.find((m) => m.id === modelId);\n}\n"]}
1
+ {"version":3,"file":"models.js","sourceRoot":"","sources":["../src/models.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AA+BH,MAAM,CAAC,MAAM,cAAc,GAAyB;IAClD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,iBAAiB;QAC/B,WAAW,EACT,uGAAuG;QACzG,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,UAAU;QACpC,0CAA0C;QAC1C,0CAA0C;QAC1C,aAAa,EAAE,KAAK;QACpB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,SAAS,CAAC,EAAE,uCAAuC;KACzE;IACD;QACE,EAAE,EAAE,WAAW;QACf,IAAI,EAAE,aAAa;QACnB,cAAc,EAAE,MAAM;QACtB,YAAY,EAAE,eAAe;QAC7B,WAAW,EACT,uGAAuG;QACzG,MAAM,EAAE,EAAE,EAAE,0DAA0D;QACtE,SAAS,EAAE,aAAa,EAAE,UAAU;QACpC,0CAA0C;QAC1C,0CAA0C;QAC1C,aAAa,EAAE,MAAM;QACrB,WAAW,EAAE,aAAa,EAAE,MAAM;QAClC,kBAAkB,EAAE,CAAC,SAAS,CAAC,EAAE,uCAAuC;KACzE;CACF,CAAC;AAEF;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,OAAe;IAC9C,OAAO,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,OAAO,CAAC,CAAC;AACtD,CAAC","sourcesContent":["/**\n * Model Registry\n *\n * Defines all downloadable models known to expo-ai-kit.\n * getDownloadableModels() reads from this registry and enriches\n * each entry with on-device status from the native layer.\n */\n\nexport type ModelRegistryEntry = {\n /** Unique model identifier used in setModel/downloadModel */\n id: string;\n /** Human-readable model name */\n name: string;\n /** Parameter count label */\n parameterCount: string;\n /** Quantization variant */\n quantization: string;\n /** URL to download the LiteRT-LM model file */\n downloadUrl: string;\n /** SHA256 hash for integrity verification after download */\n sha256: string;\n /** Download file size in bytes */\n sizeBytes: number;\n /**\n * Practical context window (max tokens) for this model on constrained devices.\n *\n * These are conservative defaults, NOT the base model's theoretical max.\n * These values should be benchmarked and adjusted during testing with\n * real devices.\n */\n contextWindow: number;\n /** Minimum device RAM in bytes required to run this model */\n 
minRamBytes: number;\n /** Platforms this model can run on */\n supportedPlatforms: ('ios' | 'android')[];\n};\n\nexport const MODEL_REGISTRY: ModelRegistryEntry[] = [\n {\n id: 'gemma-e2b',\n name: 'Gemma 4 E2B',\n parameterCount: '2.3B',\n quantization: 'mixed-2/4/8-bit',\n downloadUrl:\n 'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: 2_580_000_000, // ~2.58GB\n // Conservative limit for 4GB RAM devices.\n // TODO: Benchmark during Phase 2 testing.\n contextWindow: 8_000,\n minRamBytes: 4_000_000_000, // 4GB\n supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs\n },\n {\n id: 'gemma-e4b',\n name: 'Gemma 4 E4B',\n parameterCount: '4.5B',\n quantization: 'mixed-4/8-bit',\n downloadUrl:\n 'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm',\n sha256: '', // TODO: Fill with actual hash once model file is verified\n sizeBytes: 3_650_000_000, // ~3.65GB\n // Conservative limit for 6GB RAM devices.\n // TODO: Benchmark during Phase 2 testing.\n contextWindow: 16_000,\n minRamBytes: 6_000_000_000, // 6GB\n supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs\n },\n];\n\n/**\n * Look up a model registry entry by ID.\n * Returns undefined if not found.\n */\nexport function getRegistryEntry(modelId: string): ModelRegistryEntry | undefined {\n return MODEL_REGISTRY.find((m) => m.id === modelId);\n}\n"]}
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "expo-ai-kit",
3
- "version": "0.2.1",
4
- "description": "Expo AI Kit module",
3
+ "version": "0.3.1",
4
+ "description": "On-device AI for Expo apps — run Gemma 4, Apple Foundation Models, and ML Kit locally with zero API keys",
5
5
  "main": "build/index.js",
6
6
  "types": "build/index.d.ts",
7
7
  "files": [
@@ -31,6 +31,14 @@
31
31
  "keywords": [
32
32
  "react-native",
33
33
  "expo",
34
+ "gemma",
35
+ "gemma-4",
36
+ "on-device-ai",
37
+ "llm",
38
+ "litert",
39
+ "apple-foundation-models",
40
+ "ml-kit",
41
+ "local-inference",
34
42
  "expo-ai-kit",
35
43
  "ExpoAiKit"
36
44
  ],
package/src/models.ts CHANGED
@@ -15,7 +15,7 @@ export type ModelRegistryEntry = {
15
15
  parameterCount: string;
16
16
  /** Quantization variant */
17
17
  quantization: string;
18
- /** URL to download the GGUF model file */
18
+ /** URL to download the LiteRT-LM model file */
19
19
  downloadUrl: string;
20
20
  /** SHA256 hash for integrity verification after download */
21
21
  sha256: string;
@@ -24,10 +24,9 @@ export type ModelRegistryEntry = {
24
24
  /**
25
25
  * Practical context window (max tokens) for this model on constrained devices.
26
26
  *
27
- * These are conservative defaults, NOT the base model's theoretical max (128k).
28
- * On a memory-constrained mobile device running quantized inference, KV cache
29
- * cannot fit the full 128k context. These values should be benchmarked and
30
- * adjusted during Phase 2 testing with real devices.
27
+ * These are conservative defaults, NOT the base model's theoretical max.
28
+ * These values should be benchmarked and adjusted during testing with
29
+ * real devices.
31
30
  */
32
31
  contextWindow: number;
33
32
  /** Minimum device RAM in bytes required to run this model */
@@ -41,31 +40,31 @@ export const MODEL_REGISTRY: ModelRegistryEntry[] = [
41
40
  id: 'gemma-e2b',
42
41
  name: 'Gemma 4 E2B',
43
42
  parameterCount: '2.3B',
44
- quantization: 'Q4_K_M',
43
+ quantization: 'mixed-2/4/8-bit',
45
44
  downloadUrl:
46
- 'https://huggingface.co/google/gemma-4-e2b-it-GGUF/resolve/main/gemma-4-e2b-it-Q4_K_M.gguf',
45
+ 'https://huggingface.co/litert-community/gemma-4-E2B-it-litert-lm/resolve/main/gemma-4-E2B-it.litertlm',
47
46
  sha256: '', // TODO: Fill with actual hash once model file is verified
48
- sizeBytes: 1_400_000_000, // ~1.4GB
49
- // Conservative limit for 4GB RAM devices. Base model supports 128k but
50
- // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
47
+ sizeBytes: 2_580_000_000, // ~2.58GB
48
+ // Conservative limit for 4GB RAM devices.
49
+ // TODO: Benchmark during Phase 2 testing.
51
50
  contextWindow: 8_000,
52
51
  minRamBytes: 4_000_000_000, // 4GB
53
- supportedPlatforms: ['ios', 'android'],
52
+ supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
54
53
  },
55
54
  {
56
55
  id: 'gemma-e4b',
57
56
  name: 'Gemma 4 E4B',
58
57
  parameterCount: '4.5B',
59
- quantization: 'Q4_K_M',
58
+ quantization: 'mixed-4/8-bit',
60
59
  downloadUrl:
61
- 'https://huggingface.co/google/gemma-4-e4b-it-GGUF/resolve/main/gemma-4-e4b-it-Q4_K_M.gguf',
60
+ 'https://huggingface.co/litert-community/gemma-4-E4B-it-litert-lm/resolve/main/gemma-4-E4B-it.litertlm',
62
61
  sha256: '', // TODO: Fill with actual hash once model file is verified
63
- sizeBytes: 2_800_000_000, // ~2.8GB
64
- // Conservative limit for 6GB RAM devices. Base model supports 128k but
65
- // KV cache won't fit. TODO: Benchmark during Phase 2 testing.
62
+ sizeBytes: 3_650_000_000, // ~3.65GB
63
+ // Conservative limit for 6GB RAM devices.
64
+ // TODO: Benchmark during Phase 2 testing.
66
65
  contextWindow: 16_000,
67
66
  minRamBytes: 6_000_000_000, // 6GB
68
- supportedPlatforms: ['ios', 'android'],
67
+ supportedPlatforms: ['android'], // iOS waiting for LiteRT-LM Swift APIs
69
68
  },
70
69
  ];
71
70