@dvai-bridge/android-mediapipe-core 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +51 -0
- package/README.md +199 -0
- package/android/build.gradle +134 -0
- package/android/gradle.properties +5 -0
- package/android/settings.gradle +1 -0
- package/android/src/main/AndroidManifest.xml +14 -0
- package/android/src/main/java/co/deepvoiceai/bridge/mediapipe/core/ImageDecoder.kt +115 -0
- package/android/src/main/java/co/deepvoiceai/bridge/mediapipe/core/MediaPipeBridge.kt +203 -0
- package/android/src/main/java/co/deepvoiceai/bridge/mediapipe/core/MediaPipeHandlers.kt +482 -0
- package/android/src/main/java/co/deepvoiceai/bridge/mediapipe/core/PluginState.kt +134 -0
- package/android/src/main/res/xml/dvai_network_security_config.xml +7 -0
- package/android/src/test/java/co/deepvoiceai/bridge/mediapipe/core/ImageDecoderTest.kt +114 -0
- package/android/src/test/java/co/deepvoiceai/bridge/mediapipe/core/MediaPipeHandlersTest.kt +529 -0
- package/android/src/test/java/co/deepvoiceai/bridge/mediapipe/core/PluginStateTest.kt +85 -0
- package/android/src/test/resources/images/tiny-test-base64.txt +1 -0
- package/android/src/test/resources/images/tiny-test.png +0 -0
- package/package.json +19 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
package co.deepvoiceai.bridge.mediapipe.core
|
|
2
|
+
|
|
3
|
+
import android.content.Context
|
|
4
|
+
import com.google.ai.edge.litertlm.Backend
|
|
5
|
+
import com.google.ai.edge.litertlm.Content
|
|
6
|
+
import com.google.ai.edge.litertlm.Contents
|
|
7
|
+
import com.google.ai.edge.litertlm.Conversation
|
|
8
|
+
import com.google.ai.edge.litertlm.Engine
|
|
9
|
+
import com.google.ai.edge.litertlm.EngineConfig
|
|
10
|
+
import com.google.ai.edge.litertlm.Message
|
|
11
|
+
import com.google.ai.edge.litertlm.MessageCallback
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Test seam over Google's LiteRT-LM Engine. Concrete [MediaPipeBridge]
|
|
15
|
+
* implements this; [MediaPipeHandlers] takes the interface so unit tests can
|
|
16
|
+
* substitute a canned-response fake without loading a real `.litertlm` model.
|
|
17
|
+
*
|
|
18
|
+
* Concurrency: implementations need NOT be thread-safe — [MediaPipeHandlers]
|
|
19
|
+
* serializes all calls behind its own mutex.
|
|
20
|
+
*/
|
|
21
|
+
interface MediaPipeBridgeApi {
    /**
     * Blocking, one-shot completion: sends [prompt] (plus any [images]) and
     * returns the whole generated text.
     *
     * @param prompt text prompt to complete.
     * @param images raw encoded image bytes (PNG/JPEG/etc.). When this list is
     *   non-empty the engine must have been built with `visionEnabled = true`;
     *   otherwise LiteRT-LM will throw at conversation creation or
     *   message-send time.
     * @return the generated response text.
     */
    fun completePrompt(
        prompt: String,
        images: List<ByteArray> = emptyList(),
    ): String

    /**
     * Streaming completion: [onPartial] is invoked once per partial chunk, and
     * its second argument is `true` on the final fragment.
     *
     * @param prompt text prompt to complete.
     * @param images raw encoded image bytes (PNG/JPEG/etc.).
     * @param onPartial receiver of `(partial, done)` pairs.
     * @return a handle the caller can [AutoCloseable.close] to release the
     *   per-call conversation once the stream finishes (or is cancelled).
     */
    fun completePromptAsync(
        prompt: String,
        images: List<ByteArray> = emptyList(),
        onPartial: (partial: String, done: Boolean) -> Unit,
    ): AutoCloseable
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Kotlin wrapper around the LiteRT-LM `litertlm-android:0.10.2` Engine API.
|
|
46
|
+
* Replaces the deprecated `com.google.mediapipe:tasks-genai` MediaPipe bridge
|
|
47
|
+
* (Phase 3B, Tasks 18-19).
|
|
48
|
+
*
|
|
49
|
+
* Architecture:
|
|
50
|
+
*
|
|
51
|
+
* - One long-lived [Engine] per bridge instance (lazy-initialized so JVM unit
|
|
52
|
+
* tests using the [MediaPipeBridgeApi] fake never trigger native loading).
|
|
53
|
+
* [engine.initialize()] is called inside the lazy block; this is the heavy
|
|
54
|
+
* model-load step (~10 s) and must be called off the main thread.
|
|
55
|
+
* - One [Conversation] per request — LiteRT-LM Conversations are stateful and
|
|
56
|
+
* multi-turn, so we create a fresh one per call and close it after to
|
|
57
|
+
* maintain the same stateless-request semantics as the old session model.
|
|
58
|
+
* - Vision is enabled at the engine level via [EngineConfig.visionBackend].
|
|
59
|
+
* There is no per-conversation vision flag (unlike the old
|
|
60
|
+
* `GraphOptions.setEnableVisionModality`).
|
|
61
|
+
*
|
|
62
|
+
* API deviations from the migration doc (§3) based on actual bytecode inspection:
|
|
63
|
+
* - [Message] has no `.text` property — text is accessed through
|
|
64
|
+
* `message.contents.contents`, which is a `List<Content>`. Text parts are
|
|
65
|
+
* `Content.Text` items; their text fields are joined to form the response.
|
|
66
|
+
* - [EngineConfig] DOES have `maxNumImages: Int?` and `maxNumTokens: Int?`
|
|
67
|
+
* fields in the actual 0.10.2 artifact — the migration doc §5 risk for
|
|
68
|
+
* setMaxNumImages is not applicable; the field exists and is used here.
|
|
69
|
+
* - [Engine] does not accept Android `Context` — per migration doc §4, Context
|
|
70
|
+
* is only needed for optional path derivation. The constructor keeps `context`
|
|
71
|
+
* for API compatibility and future use (e.g. `context.cacheDir.path`).
|
|
72
|
+
*
|
|
73
|
+
* Model file format: LiteRT-LM uses `.litertlm` bundles, not `.task`. Existing
|
|
74
|
+
* `.task` models must be re-converted; see the migration notes for details.
|
|
75
|
+
*/
|
|
76
|
+
class MediaPipeBridge(
    @Suppress("UNUSED_PARAMETER") private val context: Context,
    private val modelPath: String,
    private val maxTokens: Int = 2048,
    private val visionEnabled: Boolean = false,
    private val maxImages: Int = 1,
) : MediaPipeBridgeApi, AutoCloseable {

    /**
     * Lazily built and initialized engine. Kept as an explicit [Lazy] (rather
     * than a `by lazy` delegate plus a separate flag) so [close] can ask
     * whether initialization ever happened without triggering it and without
     * a window where the engine exists but the flag is not yet set.
     */
    private val engineHolder: Lazy<Engine> = lazy {
        val cfg = EngineConfig(
            modelPath = modelPath,
            // Vision is enabled at the engine level by supplying a visionBackend.
            // GPU() is the standard choice; null disables vision modality.
            visionBackend = if (visionEnabled) Backend.GPU() else null,
            // maxNumImages is only supplied when vision is enabled.
            maxNumImages = if (visionEnabled) maxImages else null,
            // maxNumTokens maps to the old setMaxTokens(int) option.
            maxNumTokens = maxTokens,
        )
        // initialize() is the heavy model-load step; it runs on whichever
        // thread first touches the engine.
        Engine(cfg).also { it.initialize() }
    }

    /** Returns the engine, constructing and initializing it on first use. */
    private fun engine(): Engine = engineHolder.value

    /** Creates a fresh conversation so every request starts without prior turns. */
    private fun newConversation(): Conversation = engine().createConversation()

    /** Closes the conversation, deliberately ignoring any failure (best-effort cleanup). */
    private fun Conversation.closeQuietly() {
        try {
            close()
        } catch (_: Throwable) { /* idempotent — best-effort cleanup */ }
    }

    /**
     * Builds a [Contents] value holding the text prompt followed by one
     * [Content.ImageBytes] part per entry of [images], in order.
     *
     * NOTE(review): an earlier comment stated that the vararg `Contents.of`
     * overload was used; the call below passes the list itself — confirm which
     * overload is intended.
     */
    private fun buildContents(prompt: String, images: List<ByteArray>): Contents {
        val parts = mutableListOf<Content>(Content.Text(prompt))
        images.mapTo(parts) { Content.ImageBytes(it) }
        return Contents.of(parts)
    }

    /**
     * Concatenates the text of every [Content.Text] part of this message.
     * Parts of any other type are skipped.
     */
    private fun Message.extractText(): String =
        contents.contents
            .filterIsInstance<Content.Text>()
            .joinToString("") { it.text }

    /**
     * Sends [prompt] and [images] on a fresh conversation and returns the
     * response text. The conversation is always closed afterwards; a failure
     * while closing is ignored.
     */
    override fun completePrompt(prompt: String, images: List<ByteArray>): String {
        val msgContents = buildContents(prompt, images)
        val conversation = newConversation()
        try {
            // sendMessage is the single-call replacement for the old
            // addQueryChunk + addImage + generateResponse triple.
            return conversation.sendMessage(msgContents).extractText()
        } finally {
            conversation.closeQuietly()
        }
    }

    /**
     * Starts a streaming completion on a fresh conversation.
     *
     * [onPartial] receives `(text, false)` for every message callback and
     * exactly one terminal `("", true)`. The terminal signal is sent both on
     * normal completion and on a streaming error: the callback contract has no
     * error channel, and a consumer that never sees `done = true` would wait
     * forever (and keep holding whatever it acquired for the stream).
     *
     * @return a handle whose `close()` closes the conversation, ignoring
     *   failures.
     * @throws Throwable whatever `sendMessageAsync` throws while starting; the
     *   conversation is closed before rethrowing.
     */
    override fun completePromptAsync(
        prompt: String,
        images: List<ByteArray>,
        onPartial: (String, Boolean) -> Unit,
    ): AutoCloseable {
        val msgContents = buildContents(prompt, images)
        val conversation = newConversation()

        // Guards the terminal callback so onDone/onError can never both emit it.
        val terminated = java.util.concurrent.atomic.AtomicBoolean(false)
        fun signalDone() {
            if (terminated.compareAndSet(false, true)) {
                onPartial("", true)
            }
        }

        try {
            // MessageCallback replaces the old ProgressListener<String> callback.
            conversation.sendMessageAsync(
                msgContents,
                object : MessageCallback {
                    override fun onMessage(message: Message) {
                        onPartial(message.extractText(), false)
                    }

                    override fun onDone() {
                        signalDone()
                    }

                    override fun onError(throwable: Throwable) {
                        val failure = RuntimeException("LiteRT-LM streaming error", throwable)
                        // Unblock the consumer first; then rethrow on the callback
                        // thread as before so the engine still sees the failure.
                        try {
                            signalDone()
                        } catch (t: Throwable) {
                            failure.addSuppressed(t)
                        }
                        throw failure
                    }
                },
            )
        } catch (t: Throwable) {
            conversation.closeQuietly()
            throw t
        }
        return AutoCloseable { conversation.closeQuietly() }
    }

    /**
     * Closes the engine if it was ever initialized; a bridge that never ran a
     * request does not load the engine just to close it. Failures are ignored.
     */
    override fun close() {
        if (engineHolder.isInitialized()) {
            try {
                engineHolder.value.close()
            } catch (_: Throwable) { /* idempotent */ }
        }
    }
}
|
|
@@ -0,0 +1,482 @@
|
|
|
1
|
+
package co.deepvoiceai.bridge.mediapipe.core
|
|
2
|
+
|
|
3
|
+
import co.deepvoiceai.bridge.shared.core.DvaiHandlers
import co.deepvoiceai.bridge.shared.core.HandlerContext
import co.deepvoiceai.bridge.shared.core.HandlerResponse
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.channels.awaitClose
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.buffer
import kotlinx.coroutines.flow.callbackFlow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.sync.Mutex
import kotlinx.coroutines.sync.withLock
import kotlinx.coroutines.withContext
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.JsonArray
import kotlinx.serialization.json.JsonNull
import kotlinx.serialization.json.JsonObject
import kotlinx.serialization.json.JsonPrimitive
import kotlinx.serialization.json.addJsonObject
import kotlinx.serialization.json.booleanOrNull
import kotlinx.serialization.json.buildJsonObject
import kotlinx.serialization.json.contentOrNull
import kotlinx.serialization.json.intOrNull
import kotlinx.serialization.json.put
import kotlinx.serialization.json.putJsonArray
import kotlinx.serialization.json.putJsonObject
import java.util.UUID
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* OpenAI-compatible handler set for the MediaPipe LLM backend on Android.
|
|
31
|
+
* Wires `openAIMessagesToPrompt` -> `bridge.completePrompt` (sync) or
|
|
32
|
+
* `bridge.completePromptAsync` (streaming) -> OpenAI response shape.
|
|
33
|
+
*
|
|
34
|
+
* Phase 1 scope (Task 46): text + optional image input on vision-capable
|
|
35
|
+
* Gemma tasks (e.g. Gemma 3n vision variants). Audio (`input_audio`) is
|
|
36
|
+
* permanently rejected — MediaPipe `tasks-genai` has no audio path. Image
|
|
37
|
+
* support is gated behind [visionCapable]: when `false`, `image_url` parts
|
|
38
|
+
* return a 400 pointing at the model's lack of vision capability rather than
|
|
39
|
+
* silently ignoring them. PluginState (Task 48) toggles the flag from the
|
|
40
|
+
* caller-supplied `visionEnabled` start option.
|
|
41
|
+
*
|
|
42
|
+
* ## Streaming envelope
|
|
43
|
+
*
|
|
44
|
+
* Emits one role-only delta frame, then one content delta frame per MediaPipe
|
|
45
|
+
* progress callback (with `finish_reason: "stop"` on the final frame), then a
|
|
46
|
+
* literal `[DONE]` terminator. Frame count therefore varies with the number of
|
|
47
|
+
* tokens generated — there is no fixed envelope size. Server-side buffering in
|
|
48
|
+
* [HandlerDispatch] still collects everything before flush in Phase 1, so
|
|
49
|
+
* clients see all frames together; per-token streaming lands when dispatch
|
|
50
|
+
* grows a flush-per-chunk path.
|
|
51
|
+
*
|
|
52
|
+
* ## Streaming envelope parity (with [LlamaHandlers])
|
|
53
|
+
*
|
|
54
|
+
* The two backends emit slightly different shapes — both valid per OpenAI's
|
|
55
|
+
* spec, but worth documenting so readers don't assume identical behavior:
|
|
56
|
+
*
|
|
57
|
+
* - [LlamaHandlers] emits: role / content / **separate empty-delta finish
|
|
58
|
+
* frame with `finish_reason: "stop"`** / `[DONE]` (fixed 4-frame shape).
|
|
59
|
+
* - [MediaPipeHandlers] emits: role / content₁ … content_N (last frame
|
|
60
|
+
* carries `finish_reason: "stop"` alongside its content) / `[DONE]`
|
|
61
|
+
* (variable frame count).
|
|
62
|
+
*
|
|
63
|
+
* Clients that accumulate `delta.content` see the full text in both cases.
|
|
64
|
+
* Clients that gate on `finish_reason` see it on the trailing chunk in both
|
|
65
|
+
* cases — just empty-delta in Llama, content-bearing in MediaPipe. The
|
|
66
|
+
* asymmetry is intentional: LlamaHandlers wraps a single completePrompt call
|
|
67
|
+
* with no intra-token signal, while MediaPipe's progress callback already
|
|
68
|
+
* surfaces the `done` flag inline with the final content chunk.
|
|
69
|
+
*
|
|
70
|
+
* All bridge-touching paths are serialized via [bridgeMutex] because
|
|
71
|
+
* `LlmInferenceSession` is not safe to use from multiple concurrent callers.
|
|
72
|
+
*/
|
|
73
|
+
class MediaPipeHandlers(
|
|
74
|
+
private val bridge: MediaPipeBridgeApi,
|
|
75
|
+
private val modelId: String,
|
|
76
|
+
/**
|
|
77
|
+
* `true` when the loaded MediaPipe `.task` bundle is a vision-capable
|
|
78
|
+
* Gemma variant AND PluginState has wired the bridge with
|
|
79
|
+
* `visionEnabled = true`. Defaults to `false` so non-vision deployments
|
|
80
|
+
* (the common case) reject image parts with a clear 400.
|
|
81
|
+
*/
|
|
82
|
+
private val visionCapable: Boolean = false,
|
|
83
|
+
) : DvaiHandlers {
|
|
84
|
+
private val bridgeMutex = Mutex()
|
|
85
|
+
|
|
86
|
+
/**
 * Serves an OpenAI-style chat completion, either as a single JSON body or as
 * an SSE stream when `stream` is `true`.
 *
 * Content parts are validated before any inference runs: `image_url` parts
 * are resolved to raw bytes (or rejected when the handler is not vision
 * capable) and `input_audio` parts are always rejected.
 *
 * Coroutine cancellation is rethrown rather than being converted into an
 * error response, so a cancelled request stops instead of reporting 502/500.
 *
 * @param body parsed request JSON; reads `messages` and `stream`.
 * @param ctx per-request handler context (not read by this method).
 * @return [HandlerResponse.Error] with 400 for missing/empty/unsupported
 *   input, 502 when an image cannot be resolved, 500 when inference fails;
 *   [HandlerResponse.Sse] for streaming requests; otherwise a 200
 *   [HandlerResponse.Json] in `chat.completion` shape.
 */
override suspend fun handleChatCompletion(body: JsonObject, ctx: HandlerContext): HandlerResponse {
    val messages = body["messages"] as? JsonArray
        ?: return HandlerResponse.Error(400, "Missing 'messages' field")

    // Walk content parts up-front so unsupported input fails fast, before
    // any model work. Images are collected as raw encoded bytes.
    val images = mutableListOf<ByteArray>()
    for (msg in messages) {
        val parts = (msg as? JsonObject)?.get("content") as? JsonArray ?: continue
        for (part in parts) {
            val partObj = part as? JsonObject ?: continue
            when ((partObj["type"] as? JsonPrimitive)?.contentOrNull) {
                "image_url" -> {
                    if (!visionCapable) {
                        return HandlerResponse.Error(
                            400,
                            "Image input requires a vision-capable MediaPipe model. " +
                                "Loaded model has no vision capability — pass " +
                                "`visionEnabled: true` to start() with a Gemma 3n " +
                                "vision-capable .task bundle to enable image input.",
                        )
                    }
                    val urlStr = ((partObj["image_url"] as? JsonObject)?.get("url") as? JsonPrimitive)
                        ?.contentOrNull
                    if (urlStr.isNullOrEmpty()) {
                        return HandlerResponse.Error(400, "image_url part missing 'url' field")
                    }
                    val bytes = try {
                        withContext(Dispatchers.IO) { ImageDecoder.resolve(urlStr) }
                    } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                        // Cancellation is not a fetch failure; let it propagate.
                        throw e
                    } catch (e: Exception) {
                        // Fetch / decode failure is reported as 502.
                        return HandlerResponse.Error(
                            502,
                            "Failed to fetch image: ${e.message ?: "unknown error"}",
                        )
                    }
                    images.add(bytes)
                }
                "input_audio" -> return HandlerResponse.Error(
                    400,
                    "Audio input not supported on MediaPipe LLM " +
                        "(no audio-capable tasks-genai task).",
                )
                else -> Unit
            }
        }
    }

    if (messages.isEmpty()) {
        return HandlerResponse.Error(400, "Empty 'messages' array")
    }

    val prompt = openAIMessagesToPrompt(messages)
    val isStream = (body["stream"] as? JsonPrimitive)?.booleanOrNull ?: false

    val id = "chatcmpl-mp-" + UUID.randomUUID().toString().take(20).lowercase()
    val created = System.currentTimeMillis() / 1000L

    if (isStream) {
        return HandlerResponse.Sse(
            buildChatStreamFrames(id = id, created = created, prompt = prompt, images = images),
        )
    }

    val text = try {
        // The mutex serializes bridge access; the blocking call runs on IO.
        bridgeMutex.withLock {
            withContext(Dispatchers.IO) { bridge.completePrompt(prompt, images) }
        }
    } catch (e: kotlin.coroutines.cancellation.CancellationException) {
        // Do not turn a cancelled request into a 500 response.
        throw e
    } catch (e: Exception) {
        return HandlerResponse.Error(500, e.message ?: "Inference failed")
    }

    val response = buildJsonObject {
        put("id", id)
        put("object", "chat.completion")
        put("created", created)
        put("model", modelId)
        putJsonArray("choices") {
            addJsonObject {
                put("index", 0)
                putJsonObject("message") {
                    put("role", "assistant")
                    put("content", text)
                }
                put("finish_reason", "stop")
            }
        }
        // Token counts are not computed here; usage is reported as zeros.
        putJsonObject("usage") {
            put("prompt_tokens", 0)
            put("completion_tokens", 0)
            put("total_tokens", 0)
        }
    }
    return HandlerResponse.Json(200, response)
}
|
|
190
|
+
|
|
191
|
+
/**
 * Serves the legacy completions endpoint by rewriting the request as a
 * single-user-message chat request, delegating to [handleChatCompletion], and
 * converting the chat response (JSON body or SSE frames) to the legacy shape.
 *
 * `prompt` may be absent/null (treated as empty), a primitive, or an array
 * whose primitive entries are joined with newlines; anything else is a 400.
 */
override suspend fun handleCompletion(body: JsonObject, ctx: HandlerContext): HandlerResponse {
    val rawPrompt = body["prompt"]
    val promptText: String =
        if (rawPrompt == null || rawPrompt is JsonNull) {
            ""
        } else if (rawPrompt is JsonPrimitive && rawPrompt.contentOrNull != null) {
            rawPrompt.content
        } else if (rawPrompt is JsonArray) {
            rawPrompt.joinToString("\n") { element ->
                (element as? JsonPrimitive)?.contentOrNull ?: ""
            }
        } else {
            return HandlerResponse.Error(400, "'prompt' must be a string or array of strings")
        }

    // Carry every request field except `prompt` over, then attach the prompt
    // as the sole user message.
    val chatRequest = buildJsonObject {
        body.filterKeys { it != "prompt" }.forEach { (key, value) -> put(key, value) }
        putJsonArray("messages") {
            addJsonObject {
                put("role", "user")
                put("content", promptText)
            }
        }
    }

    val chatResp = handleChatCompletion(chatRequest, ctx)
    return when (chatResp) {
        is HandlerResponse.Error -> chatResp
        is HandlerResponse.Sse -> {
            val legacyModel = (body["model"] as? JsonPrimitive)?.contentOrNull ?: modelId
            HandlerResponse.Sse(
                flow {
                    chatResp.flow.collect { frame ->
                        emit(adaptChunkToLegacy(frame, legacyModel))
                    }
                },
            )
        }
        is HandlerResponse.Json -> {
            // The body is bound to a local before the type check so the
            // compiler can smart-cast it.
            val payload = chatResp.body
            if (chatResp.status == 200 && payload is JsonObject) {
                HandlerResponse.Json(200, chatToLegacyCompletion(payload))
            } else {
                chatResp
            }
        }
    }
}
|
|
241
|
+
|
|
242
|
+
/** Embeddings are unavailable on this backend: always answers with a 400 error. */
override suspend fun handleEmbeddings(body: JsonObject, ctx: HandlerContext): HandlerResponse {
    val reason = "Embeddings not supported on MediaPipe LLM. " +
        "Use capacitorBackend: \"llama\" with nativeEmbeddingMode: true."
    return HandlerResponse.Error(400, reason)
}
|
|
248
|
+
|
|
249
|
+
/**
 * Returns a 200 model listing containing a single entry whose id is taken
 * from the request context.
 */
override suspend fun handleModels(ctx: HandlerContext): HandlerResponse {
    val entry = buildJsonObject {
        put("id", ctx.modelId)
        put("object", "model")
        put("owned_by", "google-mediapipe")
    }
    val listing = buildJsonObject {
        put("object", "list")
        putJsonArray("data") { add(entry) }
    }
    return HandlerResponse.Json(200, listing)
}
|
|
263
|
+
|
|
264
|
+
// ----- Streaming -----
|
|
265
|
+
|
|
266
|
+
/**
 * Builds the SSE envelope for a streaming chat completion: one role frame,
 * one content frame per bridge callback (the frame produced for
 * `done = true` carries `finish_reason: "stop"`), then a literal `[DONE]`
 * terminator. The number of frames depends on how many callbacks fire.
 *
 * [bridgeMutex] is locked for the lifetime of the stream. It is released in
 * [awaitClose] (normal completion or collector cancellation), or directly in
 * the start-failure path, which returns before [awaitClose] is reached.
 *
 * Frames are sent with the non-suspending `trySend` from the bridge's
 * callback. The flow is therefore given an unlimited buffer so a slow
 * collector cannot cause frames (including the final chunk and `[DONE]`) to
 * be dropped; the buffer is bounded in practice by the response length.
 *
 * @param id completion id stamped on every frame.
 * @param created creation timestamp (epoch seconds) stamped on every frame.
 * @param prompt flattened prompt passed to the bridge.
 * @param images raw encoded image bytes passed to the bridge.
 * @return a flow of ready-to-write `data: …\n\n` SSE frames. When generation
 *   fails to start, an error frame and `[DONE]` are emitted and the flow then
 *   completes with the exception.
 */
private fun buildChatStreamFrames(
    id: String,
    created: Long,
    prompt: String,
    images: List<ByteArray>,
): Flow<String> = callbackFlow {
    // Renders one `chat.completion.chunk` SSE frame. `finishReason == null`
    // omits the key entirely (role frame); JsonNull writes an explicit null.
    fun frame(delta: JsonObject, finishReason: JsonPrimitive?, errorMessage: String? = null): String {
        val payload = buildJsonObject {
            put("id", id)
            put("object", "chat.completion.chunk")
            put("created", created)
            put("model", modelId)
            putJsonArray("choices") {
                addJsonObject {
                    put("index", 0)
                    put("delta", delta)
                    if (finishReason != null) {
                        put("finish_reason", finishReason)
                    }
                }
            }
            if (errorMessage != null) {
                putJsonObject("error") { put("message", errorMessage) }
            }
        }
        return "data: $payload\n\n"
    }

    // Serialize against any other bridge use for the entire stream lifetime.
    // Ownership is tracked with a local flag so the unlock decision never
    // depends on a racy `Mutex.isLocked` snapshot.
    bridgeMutex.lock()
    var released = false
    fun releaseMutex() {
        if (!released) {
            released = true
            bridgeMutex.unlock()
        }
    }

    // Role chunk (first frame of the envelope).
    trySend(frame(buildJsonObject { put("role", "assistant") }, finishReason = null))

    val handle: AutoCloseable = try {
        bridge.completePromptAsync(prompt, images) { partial, done ->
            val finish: JsonPrimitive = if (done) JsonPrimitive("stop") else JsonNull
            trySend(frame(buildJsonObject { put("content", partial) }, finish))
            if (done) {
                trySend("data: [DONE]\n\n")
                close()
            }
        }
    } catch (e: Exception) {
        // Generation failed to start. Emit an error chunk + [DONE] so the
        // client sees a well-formed SSE close, then complete the flow with
        // the exception. finish_reason stays the OpenAI-standard "stop"; the
        // failure itself is conveyed by the sibling `error` object.
        trySend(frame(buildJsonObject { }, JsonPrimitive("stop"), e.message ?: "Inference failed"))
        trySend("data: [DONE]\n\n")
        close(e)
        // awaitClose is not reached on this path and there is no handle to
        // close, so the mutex is released here.
        releaseMutex()
        return@callbackFlow
    }

    awaitClose {
        try {
            handle.close()
        } catch (_: Throwable) { /* best-effort */ }
        releaseMutex()
    }
}.buffer(Channel.UNLIMITED)
|
|
371
|
+
|
|
372
|
+
// ----- Helpers -----
|
|
373
|
+
|
|
374
|
+
/**
 * Collapses OpenAI chat messages into one prompt string with a
 * `role: content` line per message, joined by newlines.
 *
 * - A missing or non-primitive `role` falls back to `user`.
 * - Primitive content is used as-is (a JSON null becomes an empty string).
 * - Array content keeps only its `text` parts, joined by single spaces; a
 *   message whose array has no text parts contributes no line.
 * - Entries that are not objects, and any other content shape, are skipped.
 */
private fun openAIMessagesToPrompt(messages: JsonArray): String {
    val lines = mutableListOf<String>()
    for (entry in messages) {
        val message = entry as? JsonObject ?: continue
        val role = (message["role"] as? JsonPrimitive)?.contentOrNull ?: "user"
        val content = message["content"]
        if (content is JsonPrimitive) {
            lines += "$role: ${content.contentOrNull ?: ""}"
        } else if (content is JsonArray) {
            val texts = content
                .filterIsInstance<JsonObject>()
                .filter { (it["type"] as? JsonPrimitive)?.contentOrNull == "text" }
                .mapNotNull { (it["text"] as? JsonPrimitive)?.contentOrNull }
            if (texts.isNotEmpty()) {
                lines += "$role: ${texts.joinToString(" ")}"
            }
        }
    }
    return lines.joinToString("\n")
}
|
|
400
|
+
|
|
401
|
+
/**
 * Converts a `chat.completion` response body into the legacy
 * `text_completion` shape. Mirrors `chatToLegacyCompletion()` from
 * `packages/dvai-bridge-core`.
 *
 * - `id`: the chat id with `chatcmpl-` replaced by `cmpl-`, or
 *   `cmpl-<epoch seconds>` when the chat id is missing or empty.
 * - `created`: copied from the chat response; the current time is used only
 *   when the field is absent or null.
 * - `choices[].text` comes from `message.content`; `finish_reason` defaults
 *   to `stop`; `logprobs` is always null.
 * - `usage` is copied when present, otherwise reported as zeros.
 */
private fun chatToLegacyCompletion(chat: JsonObject): JsonObject = buildJsonObject {
    val nowSeconds = System.currentTimeMillis() / 1000L
    val chatId = (chat["id"] as? JsonPrimitive)?.contentOrNull ?: ""
    val cmplId = if (chatId.isEmpty()) "cmpl-$nowSeconds" else chatId.replace("chatcmpl-", "cmpl-")

    put("id", cmplId)
    put("object", "text_completion")
    // `put` returns the PREVIOUS value for the key (null on first insert), so
    // it must not be used as the left side of `?:` — doing so always ran the
    // fallback and overwrote the upstream timestamp. Pick the value first.
    put("created", chat["created"]?.takeUnless { it is JsonNull } ?: JsonPrimitive(nowSeconds))
    put("model", (chat["model"] as? JsonPrimitive)?.contentOrNull ?: modelId)
    putJsonArray("choices") {
        val chatChoices = chat["choices"] as? JsonArray ?: JsonArray(emptyList())
        for (c in chatChoices) {
            val co = c as? JsonObject ?: continue
            addJsonObject {
                val msg = co["message"] as? JsonObject
                put("text", (msg?.get("content") as? JsonPrimitive)?.contentOrNull ?: "")
                put("index", (co["index"] as? JsonPrimitive)?.intOrNull ?: 0)
                put(
                    "finish_reason",
                    (co["finish_reason"] as? JsonPrimitive)?.contentOrNull ?: "stop",
                )
                put("logprobs", JsonNull)
            }
        }
    }
    val usage = chat["usage"] as? JsonObject
    if (usage != null) {
        put("usage", usage)
    } else {
        putJsonObject("usage") {
            put("prompt_tokens", 0)
            put("completion_tokens", 0)
            put("total_tokens", 0)
        }
    }
}
|
|
441
|
+
|
|
442
|
+
/**
 * Adapts one SSE frame from `chat.completion.chunk` to
 * `text_completion.chunk`.
 *
 * Frames that do not start with `data:`, or whose payload is not a JSON
 * object, are returned unchanged; the `[DONE]` terminator is re-emitted in
 * canonical form.
 *
 * `created` is copied from the source frame so every frame of a stream
 * carries the same timestamp; the current time is used only when the field is
 * absent or null. A top-level `error` object on the source frame is carried
 * over so legacy clients can still see a failure.
 *
 * @param chunk the SSE frame to adapt.
 * @param model model id used when the frame carries none.
 * @return the adapted frame, terminated by a blank line.
 */
private fun adaptChunkToLegacy(chunk: String, model: String): String {
    val trimmed = chunk.trim()
    if (!trimmed.startsWith("data:")) return chunk
    val payload = trimmed.removePrefix("data:").trim()
    if (payload == "[DONE]") return "data: [DONE]\n\n"
    val parsed = try {
        Json.parseToJsonElement(payload) as? JsonObject ?: return chunk
    } catch (_: Exception) {
        return chunk
    }

    val chatId = (parsed["id"] as? JsonPrimitive)?.contentOrNull ?: ""
    val legacy = buildJsonObject {
        put("id", chatId.replace("chatcmpl-", "cmpl-"))
        put("object", "text_completion.chunk")
        // `put` returns the PREVIOUS value for the key (null on first insert),
        // so it must not be the left side of `?:` — that always ran the
        // fallback and restamped each frame with the current time.
        put(
            "created",
            parsed["created"]?.takeUnless { it is JsonNull }
                ?: JsonPrimitive(System.currentTimeMillis() / 1000L),
        )
        put("model", (parsed["model"] as? JsonPrimitive)?.contentOrNull ?: model)
        putJsonArray("choices") {
            val chatChoices = parsed["choices"] as? JsonArray ?: JsonArray(emptyList())
            for (c in chatChoices) {
                val co = c as? JsonObject ?: continue
                addJsonObject {
                    val delta = co["delta"] as? JsonObject
                    put("text", (delta?.get("content") as? JsonPrimitive)?.contentOrNull ?: "")
                    put("index", (co["index"] as? JsonPrimitive)?.intOrNull ?: 0)
                    val fr = co["finish_reason"]
                    if (fr is JsonPrimitive && fr.contentOrNull != null) {
                        put("finish_reason", fr.content)
                    } else {
                        put("finish_reason", JsonNull)
                    }
                    put("logprobs", JsonNull)
                }
            }
        }
        // Preserve the failure signal emitted when generation fails to start.
        val error = parsed["error"]
        if (error != null) {
            put("error", error)
        }
    }
    return "data: $legacy\n\n"
}
|
|
482
|
+
}
|