@dvai-bridge/android-mediapipe-core 4.0.0 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,482 +1,482 @@
1
- package co.deepvoiceai.bridge.mediapipe.core
2
-
3
- import co.deepvoiceai.bridge.shared.core.DvaiHandlers
4
- import co.deepvoiceai.bridge.shared.core.HandlerContext
5
- import co.deepvoiceai.bridge.shared.core.HandlerResponse
6
- import kotlinx.coroutines.Dispatchers
7
- import kotlinx.coroutines.channels.awaitClose
8
- import kotlinx.coroutines.flow.Flow
9
- import kotlinx.coroutines.flow.callbackFlow
10
- import kotlinx.coroutines.flow.flow
11
- import kotlinx.coroutines.sync.Mutex
12
- import kotlinx.coroutines.sync.withLock
13
- import kotlinx.coroutines.withContext
14
- import kotlinx.serialization.json.Json
15
- import kotlinx.serialization.json.JsonArray
16
- import kotlinx.serialization.json.JsonNull
17
- import kotlinx.serialization.json.JsonObject
18
- import kotlinx.serialization.json.JsonPrimitive
19
- import kotlinx.serialization.json.addJsonObject
20
- import kotlinx.serialization.json.booleanOrNull
21
- import kotlinx.serialization.json.buildJsonObject
22
- import kotlinx.serialization.json.contentOrNull
23
- import kotlinx.serialization.json.intOrNull
24
- import kotlinx.serialization.json.put
25
- import kotlinx.serialization.json.putJsonArray
26
- import kotlinx.serialization.json.putJsonObject
27
- import java.util.UUID
28
-
29
- /**
30
- * OpenAI-compatible handler set for the MediaPipe LLM backend on Android.
31
- * Wires `openAIMessagesToPrompt` -> `bridge.completePrompt` (sync) or
32
- * `bridge.completePromptAsync` (streaming) -> OpenAI response shape.
33
- *
34
- * Phase 1 scope (Task 46): text + optional image input on vision-capable
35
- * Gemma tasks (e.g. Gemma 3n vision variants). Audio (`input_audio`) is
36
- * permanently rejected — MediaPipe `tasks-genai` has no audio path. Image
37
- * support is gated behind [visionCapable]: when `false`, `image_url` parts
38
- * return a 400 pointing at the model's lack of vision capability rather than
39
- * silently ignoring them. PluginState (Task 48) toggles the flag from the
40
- * caller-supplied `visionEnabled` start option.
41
- *
42
- * ## Streaming envelope
43
- *
44
- * Emits one role-only delta frame, then one content delta frame per MediaPipe
45
- * progress callback (with `finish_reason: "stop"` on the final frame), then a
46
- * literal `[DONE]` terminator. Frame count therefore varies with the number of
47
- * tokens generated — there is no fixed envelope size. Server-side buffering in
48
- * [HandlerDispatch] still collects everything before flush in Phase 1, so
49
- * clients see all frames together; per-token streaming lands when dispatch
50
- * grows a flush-per-chunk path.
51
- *
52
- * ## Streaming envelope parity (with [LlamaHandlers])
53
- *
54
- * The two backends emit slightly different shapes — both valid per OpenAI's
55
- * spec, but worth documenting so readers don't assume identical behavior:
56
- *
57
- * - [LlamaHandlers] emits: role / content / **separate empty-delta finish
58
- * frame with `finish_reason: "stop"`** / `[DONE]` (fixed 4-frame shape).
59
- * - [MediaPipeHandlers] emits: role / content₁ … content_N (last frame
60
- * carries `finish_reason: "stop"` alongside its content) / `[DONE]`
61
- * (variable frame count).
62
- *
63
- * Clients that accumulate `delta.content` see the full text in both cases.
64
- * Clients that gate on `finish_reason` see it on the trailing chunk in both
65
- * cases — just empty-delta in Llama, content-bearing in MediaPipe. The
66
- * asymmetry is intentional: LlamaHandlers wraps a single completePrompt call
67
- * with no intra-token signal, while MediaPipe's progress callback already
68
- * surfaces the `done` flag inline with the final content chunk.
69
- *
70
- * All bridge-touching paths are serialized via [bridgeMutex] because
71
- * `LlmInferenceSession` is not safe to use from multiple concurrent callers.
72
- */
73
- class MediaPipeHandlers(
74
- private val bridge: MediaPipeBridgeApi,
75
- private val modelId: String,
76
- /**
77
- * `true` when the loaded MediaPipe `.task` bundle is a vision-capable
78
- * Gemma variant AND PluginState has wired the bridge with
79
- * `visionEnabled = true`. Defaults to `false` so non-vision deployments
80
- * (the common case) reject image parts with a clear 400.
81
- */
82
- private val visionCapable: Boolean = false,
83
- ) : DvaiHandlers {
84
- private val bridgeMutex = Mutex()
85
-
86
- override suspend fun handleChatCompletion(body: JsonObject, ctx: HandlerContext): HandlerResponse {
87
- val messages = body["messages"] as? JsonArray
88
- ?: return HandlerResponse.Error(400, "Missing 'messages' field")
89
-
90
- // Walk content parts up-front: collect images for vision-capable
91
- // models, reject audio (always unsupported), and reject image_url for
92
- // non-vision models. Mirrors LlamaHandlers/FoundationHandlers ordering.
93
- // Images are collected as raw bytes; ByteArray → MPImage conversion
94
- // happens inside the bridge implementation.
95
- val images = mutableListOf<ByteArray>()
96
- for (msg in messages) {
97
- val msgObj = msg as? JsonObject ?: continue
98
- val content = msgObj["content"] as? JsonArray ?: continue
99
- for (part in content) {
100
- val partObj = part as? JsonObject ?: continue
101
- val type = (partObj["type"] as? JsonPrimitive)?.contentOrNull
102
- if (type == "image_url") {
103
- if (!visionCapable) {
104
- return HandlerResponse.Error(
105
- 400,
106
- "Image input requires a vision-capable MediaPipe model. " +
107
- "Loaded model has no vision capability — pass " +
108
- "`visionEnabled: true` to start() with a Gemma 3n " +
109
- "vision-capable .task bundle to enable image input.",
110
- )
111
- }
112
- val urlStr = (partObj["image_url"] as? JsonObject)
113
- ?.get("url")
114
- ?.let { it as? JsonPrimitive }
115
- ?.contentOrNull
116
- if (urlStr.isNullOrEmpty()) {
117
- return HandlerResponse.Error(
118
- 400,
119
- "image_url part missing 'url' field",
120
- )
121
- }
122
- val bytes = try {
123
- withContext(Dispatchers.IO) { ImageDecoder.resolve(urlStr) }
124
- } catch (e: Exception) {
125
- // Fetch / decode failure — 502 per spec §8.5 wording.
126
- return HandlerResponse.Error(
127
- 502,
128
- "Failed to fetch image: ${e.message ?: "unknown error"}",
129
- )
130
- }
131
- images.add(bytes)
132
- }
133
- if (type == "input_audio") {
134
- return HandlerResponse.Error(
135
- 400,
136
- "Audio input not supported on MediaPipe LLM " +
137
- "(no audio-capable tasks-genai task).",
138
- )
139
- }
140
- }
141
- }
142
-
143
- if (messages.isEmpty()) {
144
- return HandlerResponse.Error(400, "Empty 'messages' array")
145
- }
146
-
147
- val prompt = openAIMessagesToPrompt(messages)
148
- val isStream = (body["stream"] as? JsonPrimitive)?.booleanOrNull ?: false
149
-
150
- val id = "chatcmpl-mp-" + UUID.randomUUID().toString().take(20).lowercase()
151
- val created = System.currentTimeMillis() / 1000L
152
-
153
- if (isStream) {
154
- return HandlerResponse.Sse(
155
- buildChatStreamFrames(id = id, created = created, prompt = prompt, images = images),
156
- )
157
- }
158
-
159
- val text = try {
160
- bridgeMutex.withLock {
161
- withContext(Dispatchers.IO) { bridge.completePrompt(prompt, images) }
162
- }
163
- } catch (e: Exception) {
164
- return HandlerResponse.Error(500, e.message ?: "Inference failed")
165
- }
166
-
167
- val response = buildJsonObject {
168
- put("id", id)
169
- put("object", "chat.completion")
170
- put("created", created)
171
- put("model", modelId)
172
- putJsonArray("choices") {
173
- addJsonObject {
174
- put("index", 0)
175
- putJsonObject("message") {
176
- put("role", "assistant")
177
- put("content", text)
178
- }
179
- put("finish_reason", "stop")
180
- }
181
- }
182
- putJsonObject("usage") {
183
- put("prompt_tokens", 0)
184
- put("completion_tokens", 0)
185
- put("total_tokens", 0)
186
- }
187
- }
188
- return HandlerResponse.Json(200, response)
189
- }
190
-
191
- override suspend fun handleCompletion(body: JsonObject, ctx: HandlerContext): HandlerResponse {
192
- val promptField = body["prompt"]
193
- val prompt: String = when {
194
- promptField == null || promptField is JsonNull -> ""
195
- promptField is JsonPrimitive && promptField.contentOrNull != null -> promptField.content
196
- promptField is JsonArray -> promptField.joinToString("\n") {
197
- (it as? JsonPrimitive)?.contentOrNull ?: ""
198
- }
199
- else -> return HandlerResponse.Error(400, "'prompt' must be a string or array of strings")
200
- }
201
-
202
- val chatBody = buildJsonObject {
203
- for ((k, v) in body) {
204
- if (k == "prompt") continue
205
- put(k, v)
206
- }
207
- putJsonArray("messages") {
208
- addJsonObject {
209
- put("role", "user")
210
- put("content", prompt)
211
- }
212
- }
213
- }
214
-
215
- val chatResp = handleChatCompletion(chatBody, ctx)
216
- return when (chatResp) {
217
- is HandlerResponse.Json -> {
218
- // Smart cast across the shared-core module boundary doesn't work;
219
- // bind to a local val and cast once. See LlamaHandlers.kt for the
220
- // same fix.
221
- val respBody = chatResp.body
222
- if (chatResp.status != 200 || respBody !is JsonObject) {
223
- chatResp
224
- } else {
225
- HandlerResponse.Json(200, chatToLegacyCompletion(respBody))
226
- }
227
- }
228
- is HandlerResponse.Sse -> {
229
- val model = (body["model"] as? JsonPrimitive)?.contentOrNull ?: modelId
230
- HandlerResponse.Sse(
231
- flow {
232
- chatResp.flow.collect { chunk ->
233
- emit(adaptChunkToLegacy(chunk, model))
234
- }
235
- },
236
- )
237
- }
238
- is HandlerResponse.Error -> chatResp
239
- }
240
- }
241
-
242
- override suspend fun handleEmbeddings(body: JsonObject, ctx: HandlerContext): HandlerResponse =
243
- HandlerResponse.Error(
244
- 400,
245
- "Embeddings not supported on MediaPipe LLM. " +
246
- "Use capacitorBackend: \"llama\" with nativeEmbeddingMode: true.",
247
- )
248
-
249
- override suspend fun handleModels(ctx: HandlerContext): HandlerResponse =
250
- HandlerResponse.Json(
251
- 200,
252
- buildJsonObject {
253
- put("object", "list")
254
- putJsonArray("data") {
255
- addJsonObject {
256
- put("id", ctx.modelId)
257
- put("object", "model")
258
- put("owned_by", "google-mediapipe")
259
- }
260
- }
261
- },
262
- )
263
-
264
- // ----- Streaming -----
265
-
266
- /**
267
- * Build the SSE envelope: role frame + N content frames (last carries
268
- * `finish_reason: "stop"`) + `[DONE]` terminator. Frame count varies with
269
- * token count — see the "Streaming envelope parity" note in this class's
270
- * KDoc for the documented divergence from [LlamaHandlers].
271
- *
272
- * Acquires [bridgeMutex] for the lifetime of the stream and releases it
273
- * in [awaitClose] — guarantees serialization with non-streaming requests
274
- * AND mutex release on either successful completion or coroutine
275
- * cancellation.
276
- */
277
- private fun buildChatStreamFrames(
278
- id: String,
279
- created: Long,
280
- prompt: String,
281
- images: List<ByteArray>,
282
- ): Flow<String> = callbackFlow {
283
- // Serialize against any other bridge use for the entire stream lifetime.
284
- // Track ownership explicitly via a local flag so we never depend on the
285
- // racy `Mutex.isLocked` snapshot read for unlock decisions.
286
- var unlocked = false
287
- bridgeMutex.lock()
288
- fun safeUnlock() {
289
- if (!unlocked) {
290
- unlocked = true
291
- bridgeMutex.unlock()
292
- }
293
- }
294
-
295
- // Role chunk (first frame of the envelope).
296
- val roleChunk = buildJsonObject {
297
- put("id", id); put("object", "chat.completion.chunk")
298
- put("created", created); put("model", modelId)
299
- putJsonArray("choices") {
300
- addJsonObject {
301
- put("index", 0)
302
- putJsonObject("delta") { put("role", "assistant") }
303
- }
304
- }
305
- }
306
- trySend("data: $roleChunk\n\n")
307
-
308
- val handle: AutoCloseable = try {
309
- bridge.completePromptAsync(prompt, images) { partial, done ->
310
- // Content-delta chunk for every (partial, done) pair. When `done`
311
- // is true the final frame carries finish_reason="stop"; otherwise
312
- // finish_reason is null.
313
- val chunk = buildJsonObject {
314
- put("id", id); put("object", "chat.completion.chunk")
315
- put("created", created); put("model", modelId)
316
- putJsonArray("choices") {
317
- addJsonObject {
318
- put("index", 0)
319
- putJsonObject("delta") { put("content", partial) }
320
- if (done) {
321
- put("finish_reason", "stop")
322
- } else {
323
- put("finish_reason", JsonNull)
324
- }
325
- }
326
- }
327
- }
328
- trySend("data: $chunk\n\n")
329
- if (done) {
330
- trySend("data: [DONE]\n\n")
331
- close()
332
- }
333
- }
334
- } catch (e: Exception) {
335
- // Generation failed to start. Emit an error chunk + [DONE] so the
336
- // client sees a well-formed SSE close, then complete the flow with
337
- // the exception (collector receives it).
338
- //
339
- // finish_reason uses the OpenAI-standard "stop" value; the failure
340
- // signal is conveyed via the sibling `error` field. (OpenAI's spec
341
- // restricts finish_reason to stop|length|tool_calls|content_filter|
342
- // function_call|null, so we don't invent an "error" value.)
343
- val errChunk = buildJsonObject {
344
- put("id", id); put("object", "chat.completion.chunk")
345
- put("created", created); put("model", modelId)
346
- putJsonArray("choices") {
347
- addJsonObject {
348
- put("index", 0)
349
- putJsonObject("delta") { /* empty */ }
350
- put("finish_reason", "stop")
351
- }
352
- }
353
- putJsonObject("error") { put("message", e.message ?: "Inference failed") }
354
- }
355
- trySend("data: $errChunk\n\n")
356
- trySend("data: [DONE]\n\n")
357
- close(e)
358
- // Release the mutex synchronously — awaitClose still runs but
359
- // there is no AutoCloseable handle to call close() on.
360
- safeUnlock()
361
- return@callbackFlow
362
- }
363
-
364
- awaitClose {
365
- try {
366
- handle.close()
367
- } catch (_: Throwable) { /* best-effort */ }
368
- safeUnlock()
369
- }
370
- }
371
-
372
- // ----- Helpers -----
373
-
374
- /**
375
- * Flatten OpenAI chat messages into a single `role: content` newline-joined
376
- * prompt string. Multimodal content arrays are reduced to their `text`
377
- * parts (image / audio parts are rejected before this method is reached).
378
- */
379
- private fun openAIMessagesToPrompt(messages: JsonArray): String =
380
- messages.mapNotNull { msg ->
381
- val msgObj = msg as? JsonObject ?: return@mapNotNull null
382
- val role = (msgObj["role"] as? JsonPrimitive)?.contentOrNull ?: "user"
383
- val content = msgObj["content"]
384
- when (content) {
385
- is JsonPrimitive -> "$role: ${content.contentOrNull ?: ""}"
386
- is JsonArray -> {
387
- val texts = content.mapNotNull inner@{ part ->
388
- val partObj = part as? JsonObject ?: return@inner null
389
- if ((partObj["type"] as? JsonPrimitive)?.contentOrNull == "text") {
390
- (partObj["text"] as? JsonPrimitive)?.contentOrNull
391
- } else {
392
- null
393
- }
394
- }
395
- if (texts.isNotEmpty()) "$role: ${texts.joinToString(" ")}" else null
396
- }
397
- else -> null
398
- }
399
- }.joinToString("\n")
400
-
401
- /** Mirrors `chatToLegacyCompletion()` from `packages/dvai-bridge-core`. */
402
- private fun chatToLegacyCompletion(chat: JsonObject): JsonObject = buildJsonObject {
403
- val chatId = (chat["id"] as? JsonPrimitive)?.contentOrNull ?: ""
404
- val cmplId = if (chatId.isEmpty()) {
405
- "cmpl-${System.currentTimeMillis() / 1000L}"
406
- } else {
407
- chatId.replace("chatcmpl-", "cmpl-")
408
- }
409
- put("id", cmplId)
410
- put("object", "text_completion")
411
- chat["created"]?.let { put("created", it) }
412
- ?: put("created", System.currentTimeMillis() / 1000L)
413
- put("model", (chat["model"] as? JsonPrimitive)?.contentOrNull ?: modelId)
414
- putJsonArray("choices") {
415
- val chatChoices = chat["choices"] as? JsonArray ?: JsonArray(emptyList())
416
- for (c in chatChoices) {
417
- val co = c as? JsonObject ?: continue
418
- addJsonObject {
419
- val msg = co["message"] as? JsonObject
420
- put("text", (msg?.get("content") as? JsonPrimitive)?.contentOrNull ?: "")
421
- put("index", (co["index"] as? JsonPrimitive)?.intOrNull ?: 0)
422
- put(
423
- "finish_reason",
424
- (co["finish_reason"] as? JsonPrimitive)?.contentOrNull ?: "stop",
425
- )
426
- put("logprobs", JsonNull)
427
- }
428
- }
429
- }
430
- val usage = chat["usage"] as? JsonObject
431
- if (usage != null) {
432
- put("usage", usage)
433
- } else {
434
- putJsonObject("usage") {
435
- put("prompt_tokens", 0)
436
- put("completion_tokens", 0)
437
- put("total_tokens", 0)
438
- }
439
- }
440
- }
441
-
442
- /** Adapt a single SSE frame from chat.completion.chunk -> text_completion.chunk. */
443
- private fun adaptChunkToLegacy(chunk: String, model: String): String {
444
- val trimmed = chunk.trim()
445
- if (!trimmed.startsWith("data:")) return chunk
446
- val payload = trimmed.removePrefix("data:").trim()
447
- if (payload == "[DONE]") return "data: [DONE]\n\n"
448
- val parsed = try {
449
- Json.parseToJsonElement(payload) as? JsonObject ?: return chunk
450
- } catch (_: Exception) {
451
- return chunk
452
- }
453
- val chatId = (parsed["id"] as? JsonPrimitive)?.contentOrNull ?: ""
454
- val id = chatId.replace("chatcmpl-", "cmpl-")
455
- val legacy = buildJsonObject {
456
- put("id", id)
457
- put("object", "text_completion.chunk")
458
- parsed["created"]?.let { put("created", it) }
459
- ?: put("created", System.currentTimeMillis() / 1000L)
460
- put("model", (parsed["model"] as? JsonPrimitive)?.contentOrNull ?: model)
461
- putJsonArray("choices") {
462
- val chatChoices = parsed["choices"] as? JsonArray ?: JsonArray(emptyList())
463
- for (c in chatChoices) {
464
- val co = c as? JsonObject ?: continue
465
- addJsonObject {
466
- val delta = co["delta"] as? JsonObject
467
- put("text", (delta?.get("content") as? JsonPrimitive)?.contentOrNull ?: "")
468
- put("index", (co["index"] as? JsonPrimitive)?.intOrNull ?: 0)
469
- val fr = co["finish_reason"]
470
- if (fr is JsonPrimitive && fr.contentOrNull != null) {
471
- put("finish_reason", fr.content)
472
- } else {
473
- put("finish_reason", JsonNull)
474
- }
475
- put("logprobs", JsonNull)
476
- }
477
- }
478
- }
479
- }
480
- return "data: $legacy\n\n"
481
- }
482
- }
1
+ package co.deepvoiceai.bridge.mediapipe.core
2
+
3
+ import co.deepvoiceai.bridge.shared.core.DvaiHandlers
4
+ import co.deepvoiceai.bridge.shared.core.HandlerContext
5
+ import co.deepvoiceai.bridge.shared.core.HandlerResponse
6
+ import kotlinx.coroutines.Dispatchers
7
+ import kotlinx.coroutines.channels.awaitClose
8
+ import kotlinx.coroutines.flow.Flow
9
+ import kotlinx.coroutines.flow.callbackFlow
10
+ import kotlinx.coroutines.flow.flow
11
+ import kotlinx.coroutines.sync.Mutex
12
+ import kotlinx.coroutines.sync.withLock
13
+ import kotlinx.coroutines.withContext
14
+ import kotlinx.serialization.json.Json
15
+ import kotlinx.serialization.json.JsonArray
16
+ import kotlinx.serialization.json.JsonNull
17
+ import kotlinx.serialization.json.JsonObject
18
+ import kotlinx.serialization.json.JsonPrimitive
19
+ import kotlinx.serialization.json.addJsonObject
20
+ import kotlinx.serialization.json.booleanOrNull
21
+ import kotlinx.serialization.json.buildJsonObject
22
+ import kotlinx.serialization.json.contentOrNull
23
+ import kotlinx.serialization.json.intOrNull
24
+ import kotlinx.serialization.json.put
25
+ import kotlinx.serialization.json.putJsonArray
26
+ import kotlinx.serialization.json.putJsonObject
27
+ import java.util.UUID
28
+
29
+ /**
30
+ * OpenAI-compatible handler set for the MediaPipe LLM backend on Android.
31
+ * Wires `openAIMessagesToPrompt` -> `bridge.completePrompt` (sync) or
32
+ * `bridge.completePromptAsync` (streaming) -> OpenAI response shape.
33
+ *
34
+ * Phase 1 scope (Task 46): text + optional image input on vision-capable
35
+ * Gemma tasks (e.g. Gemma 3n vision variants). Audio (`input_audio`) is
36
+ * permanently rejected — MediaPipe `tasks-genai` has no audio path. Image
37
+ * support is gated behind [visionCapable]: when `false`, `image_url` parts
38
+ * return a 400 pointing at the model's lack of vision capability rather than
39
+ * silently ignoring them. PluginState (Task 48) toggles the flag from the
40
+ * caller-supplied `visionEnabled` start option.
41
+ *
42
+ * ## Streaming envelope
43
+ *
44
+ * Emits one role-only delta frame, then one content delta frame per MediaPipe
45
+ * progress callback (with `finish_reason: "stop"` on the final frame), then a
46
+ * literal `[DONE]` terminator. Frame count therefore varies with the number of
47
+ * tokens generated — there is no fixed envelope size. Server-side buffering in
48
+ * [HandlerDispatch] still collects everything before flush in Phase 1, so
49
+ * clients see all frames together; per-token streaming lands when dispatch
50
+ * grows a flush-per-chunk path.
51
+ *
52
+ * ## Streaming envelope parity (with [LlamaHandlers])
53
+ *
54
+ * The two backends emit slightly different shapes — both valid per OpenAI's
55
+ * spec, but worth documenting so readers don't assume identical behavior:
56
+ *
57
+ * - [LlamaHandlers] emits: role / content / **separate empty-delta finish
58
+ * frame with `finish_reason: "stop"`** / `[DONE]` (fixed 4-frame shape).
59
+ * - [MediaPipeHandlers] emits: role / content₁ … content_N (last frame
60
+ * carries `finish_reason: "stop"` alongside its content) / `[DONE]`
61
+ * (variable frame count).
62
+ *
63
+ * Clients that accumulate `delta.content` see the full text in both cases.
64
+ * Clients that gate on `finish_reason` see it on the trailing chunk in both
65
+ * cases — just empty-delta in Llama, content-bearing in MediaPipe. The
66
+ * asymmetry is intentional: LlamaHandlers wraps a single completePrompt call
67
+ * with no intra-token signal, while MediaPipe's progress callback already
68
+ * surfaces the `done` flag inline with the final content chunk.
69
+ *
70
+ * All bridge-touching paths are serialized via [bridgeMutex] because
71
+ * `LlmInferenceSession` is not safe to use from multiple concurrent callers.
72
+ */
73
+ class MediaPipeHandlers(
74
+ private val bridge: MediaPipeBridgeApi,
75
+ private val modelId: String,
76
+ /**
77
+ * `true` when the loaded MediaPipe `.task` bundle is a vision-capable
78
+ * Gemma variant AND PluginState has wired the bridge with
79
+ * `visionEnabled = true`. Defaults to `false` so non-vision deployments
80
+ * (the common case) reject image parts with a clear 400.
81
+ */
82
+ private val visionCapable: Boolean = false,
83
+ ) : DvaiHandlers {
84
+ private val bridgeMutex = Mutex()
85
+
86
+ override suspend fun handleChatCompletion(body: JsonObject, ctx: HandlerContext): HandlerResponse {
87
+ val messages = body["messages"] as? JsonArray
88
+ ?: return HandlerResponse.Error(400, "Missing 'messages' field")
89
+
90
+ // Walk content parts up-front: collect images for vision-capable
91
+ // models, reject audio (always unsupported), and reject image_url for
92
+ // non-vision models. Mirrors LlamaHandlers/FoundationHandlers ordering.
93
+ // Images are collected as raw bytes; ByteArray → MPImage conversion
94
+ // happens inside the bridge implementation.
95
+ val images = mutableListOf<ByteArray>()
96
+ for (msg in messages) {
97
+ val msgObj = msg as? JsonObject ?: continue
98
+ val content = msgObj["content"] as? JsonArray ?: continue
99
+ for (part in content) {
100
+ val partObj = part as? JsonObject ?: continue
101
+ val type = (partObj["type"] as? JsonPrimitive)?.contentOrNull
102
+ if (type == "image_url") {
103
+ if (!visionCapable) {
104
+ return HandlerResponse.Error(
105
+ 400,
106
+ "Image input requires a vision-capable MediaPipe model. " +
107
+ "Loaded model has no vision capability — pass " +
108
+ "`visionEnabled: true` to start() with a Gemma 3n " +
109
+ "vision-capable .task bundle to enable image input.",
110
+ )
111
+ }
112
+ val urlStr = (partObj["image_url"] as? JsonObject)
113
+ ?.get("url")
114
+ ?.let { it as? JsonPrimitive }
115
+ ?.contentOrNull
116
+ if (urlStr.isNullOrEmpty()) {
117
+ return HandlerResponse.Error(
118
+ 400,
119
+ "image_url part missing 'url' field",
120
+ )
121
+ }
122
+ val bytes = try {
123
+ withContext(Dispatchers.IO) { ImageDecoder.resolve(urlStr) }
124
+ } catch (e: Exception) {
125
+ // Fetch / decode failure — 502 per spec §8.5 wording.
126
+ return HandlerResponse.Error(
127
+ 502,
128
+ "Failed to fetch image: ${e.message ?: "unknown error"}",
129
+ )
130
+ }
131
+ images.add(bytes)
132
+ }
133
+ if (type == "input_audio") {
134
+ return HandlerResponse.Error(
135
+ 400,
136
+ "Audio input not supported on MediaPipe LLM " +
137
+ "(no audio-capable tasks-genai task).",
138
+ )
139
+ }
140
+ }
141
+ }
142
+
143
+ if (messages.isEmpty()) {
144
+ return HandlerResponse.Error(400, "Empty 'messages' array")
145
+ }
146
+
147
+ val prompt = openAIMessagesToPrompt(messages)
148
+ val isStream = (body["stream"] as? JsonPrimitive)?.booleanOrNull ?: false
149
+
150
+ val id = "chatcmpl-mp-" + UUID.randomUUID().toString().take(20).lowercase()
151
+ val created = System.currentTimeMillis() / 1000L
152
+
153
+ if (isStream) {
154
+ return HandlerResponse.Sse(
155
+ buildChatStreamFrames(id = id, created = created, prompt = prompt, images = images),
156
+ )
157
+ }
158
+
159
+ val text = try {
160
+ bridgeMutex.withLock {
161
+ withContext(Dispatchers.IO) { bridge.completePrompt(prompt, images) }
162
+ }
163
+ } catch (e: Exception) {
164
+ return HandlerResponse.Error(500, e.message ?: "Inference failed")
165
+ }
166
+
167
+ val response = buildJsonObject {
168
+ put("id", id)
169
+ put("object", "chat.completion")
170
+ put("created", created)
171
+ put("model", modelId)
172
+ putJsonArray("choices") {
173
+ addJsonObject {
174
+ put("index", 0)
175
+ putJsonObject("message") {
176
+ put("role", "assistant")
177
+ put("content", text)
178
+ }
179
+ put("finish_reason", "stop")
180
+ }
181
+ }
182
+ putJsonObject("usage") {
183
+ put("prompt_tokens", 0)
184
+ put("completion_tokens", 0)
185
+ put("total_tokens", 0)
186
+ }
187
+ }
188
+ return HandlerResponse.Json(200, response)
189
+ }
190
+
191
+ override suspend fun handleCompletion(body: JsonObject, ctx: HandlerContext): HandlerResponse {
192
+ val promptField = body["prompt"]
193
+ val prompt: String = when {
194
+ promptField == null || promptField is JsonNull -> ""
195
+ promptField is JsonPrimitive && promptField.contentOrNull != null -> promptField.content
196
+ promptField is JsonArray -> promptField.joinToString("\n") {
197
+ (it as? JsonPrimitive)?.contentOrNull ?: ""
198
+ }
199
+ else -> return HandlerResponse.Error(400, "'prompt' must be a string or array of strings")
200
+ }
201
+
202
+ val chatBody = buildJsonObject {
203
+ for ((k, v) in body) {
204
+ if (k == "prompt") continue
205
+ put(k, v)
206
+ }
207
+ putJsonArray("messages") {
208
+ addJsonObject {
209
+ put("role", "user")
210
+ put("content", prompt)
211
+ }
212
+ }
213
+ }
214
+
215
+ val chatResp = handleChatCompletion(chatBody, ctx)
216
+ return when (chatResp) {
217
+ is HandlerResponse.Json -> {
218
+ // Smart cast across the shared-core module boundary doesn't work;
219
+ // bind to a local val and cast once. See LlamaHandlers.kt for the
220
+ // same fix.
221
+ val respBody = chatResp.body
222
+ if (chatResp.status != 200 || respBody !is JsonObject) {
223
+ chatResp
224
+ } else {
225
+ HandlerResponse.Json(200, chatToLegacyCompletion(respBody))
226
+ }
227
+ }
228
+ is HandlerResponse.Sse -> {
229
+ val model = (body["model"] as? JsonPrimitive)?.contentOrNull ?: modelId
230
+ HandlerResponse.Sse(
231
+ flow {
232
+ chatResp.flow.collect { chunk ->
233
+ emit(adaptChunkToLegacy(chunk, model))
234
+ }
235
+ },
236
+ )
237
+ }
238
+ is HandlerResponse.Error -> chatResp
239
+ }
240
+ }
241
+
242
+ override suspend fun handleEmbeddings(body: JsonObject, ctx: HandlerContext): HandlerResponse =
243
+ HandlerResponse.Error(
244
+ 400,
245
+ "Embeddings not supported on MediaPipe LLM. " +
246
+ "Use capacitorBackend: \"llama\" with nativeEmbeddingMode: true.",
247
+ )
248
+
249
+ override suspend fun handleModels(ctx: HandlerContext): HandlerResponse =
250
+ HandlerResponse.Json(
251
+ 200,
252
+ buildJsonObject {
253
+ put("object", "list")
254
+ putJsonArray("data") {
255
+ addJsonObject {
256
+ put("id", ctx.modelId)
257
+ put("object", "model")
258
+ put("owned_by", "google-mediapipe")
259
+ }
260
+ }
261
+ },
262
+ )
263
+
264
+ // ----- Streaming -----
265
+
266
+ /**
267
+ * Build the SSE envelope: role frame + N content frames (last carries
268
+ * `finish_reason: "stop"`) + `[DONE]` terminator. Frame count varies with
269
+ * token count — see the "Streaming envelope parity" note in this class's
270
+ * KDoc for the documented divergence from [LlamaHandlers].
271
+ *
272
+ * Acquires [bridgeMutex] for the lifetime of the stream and releases it
273
+ * in [awaitClose] — guarantees serialization with non-streaming requests
274
+ * AND mutex release on either successful completion or coroutine
275
+ * cancellation.
276
+ */
277
+ private fun buildChatStreamFrames(
278
+ id: String,
279
+ created: Long,
280
+ prompt: String,
281
+ images: List<ByteArray>,
282
+ ): Flow<String> = callbackFlow {
283
+ // Serialize against any other bridge use for the entire stream lifetime.
284
+ // Track ownership explicitly via a local flag so we never depend on the
285
+ // racy `Mutex.isLocked` snapshot read for unlock decisions.
286
+ var unlocked = false
287
+ bridgeMutex.lock()
288
+ fun safeUnlock() {
289
+ if (!unlocked) {
290
+ unlocked = true
291
+ bridgeMutex.unlock()
292
+ }
293
+ }
294
+
295
+ // Role chunk (first frame of the envelope).
296
+ val roleChunk = buildJsonObject {
297
+ put("id", id); put("object", "chat.completion.chunk")
298
+ put("created", created); put("model", modelId)
299
+ putJsonArray("choices") {
300
+ addJsonObject {
301
+ put("index", 0)
302
+ putJsonObject("delta") { put("role", "assistant") }
303
+ }
304
+ }
305
+ }
306
+ trySend("data: $roleChunk\n\n")
307
+
308
+ val handle: AutoCloseable = try {
309
+ bridge.completePromptAsync(prompt, images) { partial, done ->
310
+ // Content-delta chunk for every (partial, done) pair. When `done`
311
+ // is true the final frame carries finish_reason="stop"; otherwise
312
+ // finish_reason is null.
313
+ val chunk = buildJsonObject {
314
+ put("id", id); put("object", "chat.completion.chunk")
315
+ put("created", created); put("model", modelId)
316
+ putJsonArray("choices") {
317
+ addJsonObject {
318
+ put("index", 0)
319
+ putJsonObject("delta") { put("content", partial) }
320
+ if (done) {
321
+ put("finish_reason", "stop")
322
+ } else {
323
+ put("finish_reason", JsonNull)
324
+ }
325
+ }
326
+ }
327
+ }
328
+ trySend("data: $chunk\n\n")
329
+ if (done) {
330
+ trySend("data: [DONE]\n\n")
331
+ close()
332
+ }
333
+ }
334
+ } catch (e: Exception) {
335
+ // Generation failed to start. Emit an error chunk + [DONE] so the
336
+ // client sees a well-formed SSE close, then complete the flow with
337
+ // the exception (collector receives it).
338
+ //
339
+ // finish_reason uses the OpenAI-standard "stop" value; the failure
340
+ // signal is conveyed via the sibling `error` field. (OpenAI's spec
341
+ // restricts finish_reason to stop|length|tool_calls|content_filter|
342
+ // function_call|null, so we don't invent an "error" value.)
343
+ val errChunk = buildJsonObject {
344
+ put("id", id); put("object", "chat.completion.chunk")
345
+ put("created", created); put("model", modelId)
346
+ putJsonArray("choices") {
347
+ addJsonObject {
348
+ put("index", 0)
349
+ putJsonObject("delta") { /* empty */ }
350
+ put("finish_reason", "stop")
351
+ }
352
+ }
353
+ putJsonObject("error") { put("message", e.message ?: "Inference failed") }
354
+ }
355
+ trySend("data: $errChunk\n\n")
356
+ trySend("data: [DONE]\n\n")
357
+ close(e)
358
+ // Release the mutex synchronously — awaitClose still runs but
359
+ // there is no AutoCloseable handle to call close() on.
360
+ safeUnlock()
361
+ return@callbackFlow
362
+ }
363
+
364
+ awaitClose {
365
+ try {
366
+ handle.close()
367
+ } catch (_: Throwable) { /* best-effort */ }
368
+ safeUnlock()
369
+ }
370
+ }
371
+
372
+ // ----- Helpers -----
373
+
374
/**
 * Collapses an OpenAI `messages` array into a single prompt string made of
 * one `role: content` line per message, joined with `\n`.
 *
 * Rules applied per array entry:
 * - entries that are not JSON objects are skipped;
 * - a missing or non-primitive `role` falls back to `"user"`;
 * - primitive `content` is used as-is (JSON `null` becomes an empty string);
 * - array `content` keeps only the `text` of parts whose `type` is `"text"`,
 *   joined with single spaces; non-text parts are ignored here, and a
 *   message with no usable text part is dropped;
 * - any other `content` shape (absent, object) drops the message.
 */
private fun openAIMessagesToPrompt(messages: JsonArray): String {
    val promptLines = mutableListOf<String>()
    for (entry in messages) {
        val message = entry as? JsonObject ?: continue
        val speaker = (message["role"] as? JsonPrimitive)?.contentOrNull ?: "user"
        when (val body = message["content"]) {
            is JsonPrimitive -> {
                promptLines.add("$speaker: ${body.contentOrNull.orEmpty()}")
            }
            is JsonArray -> {
                // Keep only well-formed {"type":"text","text":...} parts.
                val fragments = body
                    .filterIsInstance<JsonObject>()
                    .filter { part -> (part["type"] as? JsonPrimitive)?.contentOrNull == "text" }
                    .mapNotNull { part -> (part["text"] as? JsonPrimitive)?.contentOrNull }
                if (fragments.isNotEmpty()) {
                    promptLines.add("$speaker: ${fragments.joinToString(" ")}")
                }
            }
            else -> Unit
        }
    }
    return promptLines.joinToString("\n")
}
400
+
401
/**
 * Mirrors `chatToLegacyCompletion()` from `packages/dvai-bridge-core`:
 * reshapes a `chat.completion` response object into the legacy
 * `text_completion` shape.
 *
 * Field mapping:
 * - `id`: `chatcmpl-` is rewritten to `cmpl-`; when the chat id is missing or
 *   empty a `cmpl-<epoch seconds>` id is synthesised.
 * - `created`: copied from [chat] when present and not JSON `null`, otherwise
 *   the current epoch seconds.
 * - `model`: copied from [chat], falling back to this handler's `modelId`.
 * - `choices[i]`: `message.content` becomes `text` (empty string if absent),
 *   `index` defaults to 0, `finish_reason` defaults to `"stop"`, and
 *   `logprobs` is always JSON `null`. Non-object entries are skipped.
 * - `usage`: copied when it is an object, otherwise an all-zero usage block.
 *
 * @param chat the chat-completion response to convert.
 * @return the equivalent legacy completion object.
 */
private fun chatToLegacyCompletion(chat: JsonObject): JsonObject = buildJsonObject {
    val chatId = (chat["id"] as? JsonPrimitive)?.contentOrNull.orEmpty()
    put(
        "id",
        if (chatId.isEmpty()) {
            "cmpl-${System.currentTimeMillis() / 1000L}"
        } else {
            chatId.replace("chatcmpl-", "cmpl-")
        },
    )
    put("object", "text_completion")

    // Deliberately NOT `chat["created"]?.let { put(...) } ?: put(...)`:
    // `put` returns the key's previous value, which is null on first insert,
    // so the elvis fallback would always fire and overwrite the upstream
    // timestamp with the current time.
    val created = chat["created"]
    if (created != null && created !is JsonNull) {
        put("created", created)
    } else {
        put("created", System.currentTimeMillis() / 1000L)
    }

    put("model", (chat["model"] as? JsonPrimitive)?.contentOrNull ?: modelId)

    putJsonArray("choices") {
        val chatChoices = (chat["choices"] as? JsonArray).orEmpty()
        for (choice in chatChoices.filterIsInstance<JsonObject>()) {
            addJsonObject {
                val message = choice["message"] as? JsonObject
                put("text", (message?.get("content") as? JsonPrimitive)?.contentOrNull ?: "")
                put("index", (choice["index"] as? JsonPrimitive)?.intOrNull ?: 0)
                put(
                    "finish_reason",
                    (choice["finish_reason"] as? JsonPrimitive)?.contentOrNull ?: "stop",
                )
                put("logprobs", JsonNull)
            }
        }
    }

    val usage = chat["usage"] as? JsonObject
    if (usage != null) {
        put("usage", usage)
    } else {
        putJsonObject("usage") {
            put("prompt_tokens", 0)
            put("completion_tokens", 0)
            put("total_tokens", 0)
        }
    }
}
441
+
442
/**
 * Adapts a single SSE frame from `chat.completion.chunk` to
 * `text_completion.chunk`.
 *
 * Pass-through cases (the input is returned unchanged):
 * - the trimmed frame does not start with `data:`;
 * - the payload is not parseable JSON, or is JSON but not an object.
 *
 * The `[DONE]` sentinel is normalised to `data: [DONE]\n\n`.
 *
 * For a JSON object payload the legacy frame carries:
 * - `id` with `chatcmpl-` rewritten to `cmpl-` (empty string if absent);
 * - `created` from the payload when present and not JSON `null`, otherwise
 *   the current epoch seconds;
 * - `model` from the payload, falling back to [model];
 * - per choice: `delta.content` as `text` (empty string if absent), `index`
 *   (default 0), `finish_reason` (string or JSON `null`), `logprobs` = `null`;
 * - the top-level `error` object, when the payload has one, so a failure
 *   reported by the chat stream stays visible to legacy clients.
 *
 * @param chunk one raw SSE frame, e.g. `data: {...}\n\n`.
 * @param model model id to use when the payload does not name one.
 * @return the adapted SSE frame, terminated by a blank line.
 */
private fun adaptChunkToLegacy(chunk: String, model: String): String {
    val trimmed = chunk.trim()
    if (!trimmed.startsWith("data:")) return chunk

    val payload = trimmed.removePrefix("data:").trim()
    if (payload == "[DONE]") return "data: [DONE]\n\n"

    val parsed = try {
        Json.parseToJsonElement(payload) as? JsonObject ?: return chunk
    } catch (_: Exception) {
        // Not JSON: forward the frame untouched rather than dropping it.
        return chunk
    }

    val chatId = (parsed["id"] as? JsonPrimitive)?.contentOrNull ?: ""
    val legacy = buildJsonObject {
        put("id", chatId.replace("chatcmpl-", "cmpl-"))
        put("object", "text_completion.chunk")

        // Deliberately NOT `parsed["created"]?.let { put(...) } ?: put(...)`:
        // `put` returns the key's previous value (null on first insert), so
        // the fallback would always run and clobber the upstream timestamp.
        val created = parsed["created"]
        if (created != null && created !is JsonNull) {
            put("created", created)
        } else {
            put("created", System.currentTimeMillis() / 1000L)
        }

        put("model", (parsed["model"] as? JsonPrimitive)?.contentOrNull ?: model)

        putJsonArray("choices") {
            val chatChoices = (parsed["choices"] as? JsonArray).orEmpty()
            for (choice in chatChoices.filterIsInstance<JsonObject>()) {
                addJsonObject {
                    val delta = choice["delta"] as? JsonObject
                    put("text", (delta?.get("content") as? JsonPrimitive)?.contentOrNull ?: "")
                    put("index", (choice["index"] as? JsonPrimitive)?.intOrNull ?: 0)
                    val finishReason = (choice["finish_reason"] as? JsonPrimitive)?.contentOrNull
                    if (finishReason != null) {
                        put("finish_reason", finishReason)
                    } else {
                        put("finish_reason", JsonNull)
                    }
                    put("logprobs", JsonNull)
                }
            }
        }

        // Error frames signal failure only through this sibling field
        // (finish_reason stays "stop"), so it must survive the conversion.
        val error = parsed["error"]
        if (error != null) {
            put("error", error)
        }
    }
    return "data: $legacy\n\n"
}
482
+ }