@dvai-bridge/android-mediapipe-core 4.0.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,203 +1,203 @@
1
- package co.deepvoiceai.bridge.mediapipe.core
2
-
3
- import android.content.Context
4
- import com.google.ai.edge.litertlm.Backend
5
- import com.google.ai.edge.litertlm.Content
6
- import com.google.ai.edge.litertlm.Contents
7
- import com.google.ai.edge.litertlm.Conversation
8
- import com.google.ai.edge.litertlm.Engine
9
- import com.google.ai.edge.litertlm.EngineConfig
10
- import com.google.ai.edge.litertlm.Message
11
- import com.google.ai.edge.litertlm.MessageCallback
12
-
13
- /**
14
- * Test seam over Google's LiteRT-LM Engine. Concrete [MediaPipeBridge]
15
- * implements this; [MediaPipeHandlers] takes the interface so unit tests can
16
- * substitute a canned-response fake without loading a real `.litertlm` model.
17
- *
18
- * Concurrency: implementations need NOT be thread-safe — [MediaPipeHandlers]
19
- * serializes all calls behind its own mutex.
20
- */
21
- interface MediaPipeBridgeApi {
22
- /**
23
- * Synchronous prompt completion. If [images] is non-empty the engine must
24
- * have been built with `visionEnabled = true`; otherwise LiteRT-LM will
25
- * throw at conversation creation or message-send time. Images are supplied
26
- * as raw encoded bytes (PNG/JPEG/etc.).
27
- */
28
- fun completePrompt(prompt: String, images: List<ByteArray> = emptyList()): String
29
-
30
- /**
31
- * Asynchronous prompt completion. The supplied callback fires per partial
32
- * chunk; the second arg is `true` on the final fragment. Returns a handle
33
- * the caller can [AutoCloseable.close] to release the per-call conversation
34
- * once the stream finishes (or is cancelled). Images are supplied as raw
35
- * encoded bytes (PNG/JPEG/etc.).
36
- */
37
- fun completePromptAsync(
38
- prompt: String,
39
- images: List<ByteArray> = emptyList(),
40
- onPartial: (partial: String, done: Boolean) -> Unit,
41
- ): AutoCloseable
42
- }
43
-
44
- /**
45
- * Kotlin wrapper around the LiteRT-LM `litertlm-android:0.10.2` Engine API.
46
- * Replaces the deprecated `com.google.mediapipe:tasks-genai` MediaPipe bridge
47
- * (Phase 3B, Tasks 18-19).
48
- *
49
- * Architecture:
50
- *
51
- * - One long-lived [Engine] per bridge instance (lazy-initialized so JVM unit
52
- * tests using the [MediaPipeBridgeApi] fake never trigger native loading).
53
- * [Engine.initialize] is called inside the lazy block; this is the heavy
54
- * model-load step (~10 s) and must be called off the main thread.
55
- * - One [Conversation] per request — LiteRT-LM Conversations are stateful and
56
- * multi-turn, so we create a fresh one per call and close it after to
57
- * maintain the same stateless-request semantics as the old session model.
58
- * - Vision is enabled at the engine level via [EngineConfig.visionBackend].
59
- * There is no per-conversation vision flag (unlike the old
60
- * `GraphOptions.setEnableVisionModality`).
61
- *
62
- * API deviations from the migration doc (§3) based on actual bytecode inspection:
63
- * - [Message] has no `.text` property — text is accessed through
64
- * `message.contents.contents`, which is a `List<Content>`. Text parts are
65
- * `Content.Text` items; their text fields are joined to form the response.
66
- * - [EngineConfig] DOES have `maxNumImages: Int?` and `maxNumTokens: Int?`
67
- * fields in the actual 0.10.2 artifact — the migration doc §5 risk for
68
- * setMaxNumImages is not applicable; the field exists and is used here.
69
- * - [Engine] does not accept Android `Context` — per migration doc §4, Context
70
- * is only needed for optional path derivation. The constructor keeps `context`
71
- * for API compatibility and future use (e.g. `context.cacheDir.path`).
72
- *
73
- * Model file format: LiteRT-LM uses `.litertlm` bundles, not `.task`. Existing
74
- * `.task` models must be re-converted; see the migration notes for details.
75
- */
76
- class MediaPipeBridge(
77
- @Suppress("UNUSED_PARAMETER") private val context: Context,
78
- private val modelPath: String,
79
- private val maxTokens: Int = 2048,
80
- private val visionEnabled: Boolean = false,
81
- private val maxImages: Int = 1,
82
- ) : MediaPipeBridgeApi, AutoCloseable {
83
-
84
- private val engine: Engine by lazy {
85
- val cfg = EngineConfig(
86
- modelPath = modelPath,
87
- // Vision is enabled at the engine level by supplying a visionBackend.
88
- // GPU() is the standard choice; null disables vision modality.
89
- visionBackend = if (visionEnabled) Backend.GPU() else null,
90
- // maxNumImages: EngineConfig does have this field in 0.10.2
91
- // (migration doc §5 TBD is resolved — field exists in actual artifact).
92
- maxNumImages = if (visionEnabled) maxImages else null,
93
- // maxNumTokens maps to the old setMaxTokens(int) option.
94
- maxNumTokens = maxTokens,
95
- )
96
- val e = Engine(cfg)
97
- e.initialize()
98
- e
99
- }
100
-
101
- @Volatile private var engineInitialized: Boolean = false
102
-
103
- private fun engine(): Engine {
104
- val ref = engine
105
- engineInitialized = true
106
- return ref
107
- }
108
-
109
- private fun newConversation(): Conversation =
110
- engine().createConversation()
111
-
112
- /**
113
- * Build a [Contents] value combining the text prompt with any image bytes.
114
- * [Content.ImageBytes] accepts raw PNG/JPEG bytes directly — no MPImage
115
- * wrapping required (migration doc §2). The parts are collected into a
116
- * list and handed to [Contents.of] as a list, not via the vararg overload.
117
- */
118
- private fun buildContents(prompt: String, images: List<ByteArray>): Contents {
119
- val parts = mutableListOf<Content>(Content.Text(prompt))
120
- for (bytes in images) {
121
- parts.add(Content.ImageBytes(bytes))
122
- }
123
- return Contents.of(parts)
124
- }
125
-
126
- /**
127
- * Extract text from a [Message] response.
128
- *
129
- * [Message] has no `.text` shortcut in the 0.10.2 public API. Text is
130
- * accessed via `message.contents.contents` (a `List<Content>`). All
131
- * `Content.Text` items are joined; non-text parts (images, audio, tool
132
- * responses) are silently ignored, matching the expected LLM response shape.
133
- */
134
- private fun Message.extractText(): String =
135
- contents.contents
136
- .filterIsInstance<Content.Text>()
137
- .joinToString("") { it.text }
138
-
139
- override fun completePrompt(prompt: String, images: List<ByteArray>): String {
140
- val msgContents = buildContents(prompt, images)
141
- val conversation = newConversation()
142
- try {
143
- // sendMessage is the single-call replacement for the old
144
- // addQueryChunk + addImage + generateResponse triple (migration doc §3).
145
- val message = conversation.sendMessage(msgContents)
146
- return message.extractText()
147
- } finally {
148
- try {
149
- conversation.close()
150
- } catch (_: Throwable) { /* idempotent */ }
151
- }
152
- }
153
-
154
- override fun completePromptAsync(
155
- prompt: String,
156
- images: List<ByteArray>,
157
- onPartial: (String, Boolean) -> Unit,
158
- ): AutoCloseable {
159
- val msgContents = buildContents(prompt, images)
160
- val conversation = newConversation()
161
- try {
162
- // MessageCallback replaces the old ProgressListener<String> callback.
163
- // onMessage fires per partial token; onDone signals completion.
164
- // (migration doc §3 streaming: callback form maps 1:1 to our contract)
165
- conversation.sendMessageAsync(
166
- msgContents,
167
- object : MessageCallback {
168
- override fun onMessage(message: Message) {
169
- onPartial(message.extractText(), false)
170
- }
171
-
172
- override fun onDone() {
173
- onPartial("", true)
174
- }
175
-
176
- override fun onError(throwable: Throwable) {
177
- // Surface the error: re-throw on the callback thread so
178
- // that the engine's internal executor propagates it.
179
- throw RuntimeException("LiteRT-LM streaming error", throwable)
180
- }
181
- },
182
- )
183
- } catch (t: Throwable) {
184
- try {
185
- conversation.close()
186
- } catch (_: Throwable) { /* idempotent */ }
187
- throw t
188
- }
189
- return AutoCloseable {
190
- try {
191
- conversation.close()
192
- } catch (_: Throwable) { /* idempotent — best-effort cleanup */ }
193
- }
194
- }
195
-
196
- override fun close() {
197
- if (engineInitialized) {
198
- try {
199
- engine.close()
200
- } catch (_: Throwable) { /* idempotent */ }
201
- }
202
- }
203
- }
1
+ package co.deepvoiceai.bridge.mediapipe.core
2
+
3
+ import android.content.Context
4
+ import com.google.ai.edge.litertlm.Backend
5
+ import com.google.ai.edge.litertlm.Content
6
+ import com.google.ai.edge.litertlm.Contents
7
+ import com.google.ai.edge.litertlm.Conversation
8
+ import com.google.ai.edge.litertlm.Engine
9
+ import com.google.ai.edge.litertlm.EngineConfig
10
+ import com.google.ai.edge.litertlm.Message
11
+ import com.google.ai.edge.litertlm.MessageCallback
12
+
13
+ /**
14
+ * Test seam over Google's LiteRT-LM Engine. Concrete [MediaPipeBridge]
15
+ * implements this; [MediaPipeHandlers] takes the interface so unit tests can
16
+ * substitute a canned-response fake without loading a real `.litertlm` model.
17
+ *
18
+ * Concurrency: implementations need NOT be thread-safe — [MediaPipeHandlers]
19
+ * serializes all calls behind its own mutex.
20
+ */
21
+ interface MediaPipeBridgeApi {
22
+ /**
23
+ * Synchronous prompt completion. If [images] is non-empty the engine must
24
+ * have been built with `visionEnabled = true`; otherwise LiteRT-LM will
25
+ * throw at conversation creation or message-send time. Images are supplied
26
+ * as raw encoded bytes (PNG/JPEG/etc.).
27
+ */
28
+ fun completePrompt(prompt: String, images: List<ByteArray> = emptyList()): String
29
+
30
+ /**
31
+ * Asynchronous prompt completion. The supplied callback fires per partial
32
+ * chunk; the second arg is `true` on the final fragment. Returns a handle
33
+ * the caller can [AutoCloseable.close] to release the per-call conversation
34
+ * once the stream finishes (or is cancelled). Images are supplied as raw
35
+ * encoded bytes (PNG/JPEG/etc.).
36
+ */
37
+ fun completePromptAsync(
38
+ prompt: String,
39
+ images: List<ByteArray> = emptyList(),
40
+ onPartial: (partial: String, done: Boolean) -> Unit,
41
+ ): AutoCloseable
42
+ }
43
+
44
+ /**
45
+ * Kotlin wrapper around the LiteRT-LM `litertlm-android:0.10.2` Engine API.
46
+ * Replaces the deprecated `com.google.mediapipe:tasks-genai` MediaPipe bridge
47
+ * (Phase 3B, Tasks 18-19).
48
+ *
49
+ * Architecture:
50
+ *
51
+ * - One long-lived [Engine] per bridge instance (lazy-initialized so JVM unit
52
+ * tests using the [MediaPipeBridgeApi] fake never trigger native loading).
53
+ * [Engine.initialize] is called inside the lazy block; this is the heavy
54
+ * model-load step (~10 s) and must be called off the main thread.
55
+ * - One [Conversation] per request — LiteRT-LM Conversations are stateful and
56
+ * multi-turn, so we create a fresh one per call and close it after to
57
+ * maintain the same stateless-request semantics as the old session model.
58
+ * - Vision is enabled at the engine level via [EngineConfig.visionBackend].
59
+ * There is no per-conversation vision flag (unlike the old
60
+ * `GraphOptions.setEnableVisionModality`).
61
+ *
62
+ * API deviations from the migration doc (§3) based on actual bytecode inspection:
63
+ * - [Message] has no `.text` property — text is accessed through
64
+ * `message.contents.contents`, which is a `List<Content>`. Text parts are
65
+ * `Content.Text` items; their text fields are joined to form the response.
66
+ * - [EngineConfig] DOES have `maxNumImages: Int?` and `maxNumTokens: Int?`
67
+ * fields in the actual 0.10.2 artifact — the migration doc §5 risk for
68
+ * setMaxNumImages is not applicable; the field exists and is used here.
69
+ * - [Engine] does not accept Android `Context` — per migration doc §4, Context
70
+ * is only needed for optional path derivation. The constructor keeps `context`
71
+ * for API compatibility and future use (e.g. `context.cacheDir.path`).
72
+ *
73
+ * Model file format: LiteRT-LM uses `.litertlm` bundles, not `.task`. Existing
74
+ * `.task` models must be re-converted; see the migration notes for details.
75
+ */
76
+ class MediaPipeBridge(
77
+ @Suppress("UNUSED_PARAMETER") private val context: Context,
78
+ private val modelPath: String,
79
+ private val maxTokens: Int = 2048,
80
+ private val visionEnabled: Boolean = false,
81
+ private val maxImages: Int = 1,
82
+ ) : MediaPipeBridgeApi, AutoCloseable {
83
+
84
+ private val engine: Engine by lazy {
85
+ val cfg = EngineConfig(
86
+ modelPath = modelPath,
87
+ // Vision is enabled at the engine level by supplying a visionBackend.
88
+ // GPU() is the standard choice; null disables vision modality.
89
+ visionBackend = if (visionEnabled) Backend.GPU() else null,
90
+ // maxNumImages: EngineConfig does have this field in 0.10.2
91
+ // (migration doc §5 TBD is resolved — field exists in actual artifact).
92
+ maxNumImages = if (visionEnabled) maxImages else null,
93
+ // maxNumTokens maps to the old setMaxTokens(int) option.
94
+ maxNumTokens = maxTokens,
95
+ )
96
+ val e = Engine(cfg)
97
+ e.initialize()
98
+ e
99
+ }
100
+
101
+ @Volatile private var engineInitialized: Boolean = false
102
+
103
+ private fun engine(): Engine {
104
+ val ref = engine
105
+ engineInitialized = true
106
+ return ref
107
+ }
108
+
109
+ private fun newConversation(): Conversation =
110
+ engine().createConversation()
111
+
112
+ /**
113
+ * Build a [Contents] value combining the text prompt with any image bytes.
114
+ * [Content.ImageBytes] accepts raw PNG/JPEG bytes directly — no MPImage
115
+ * wrapping required (migration doc §2). The parts are collected into a
116
+ * list and handed to [Contents.of] as a list, not via the vararg overload.
117
+ */
118
+ private fun buildContents(prompt: String, images: List<ByteArray>): Contents {
119
+ val parts = mutableListOf<Content>(Content.Text(prompt))
120
+ for (bytes in images) {
121
+ parts.add(Content.ImageBytes(bytes))
122
+ }
123
+ return Contents.of(parts)
124
+ }
125
+
126
+ /**
127
+ * Extract text from a [Message] response.
128
+ *
129
+ * [Message] has no `.text` shortcut in the 0.10.2 public API. Text is
130
+ * accessed via `message.contents.contents` (a `List<Content>`). All
131
+ * `Content.Text` items are joined; non-text parts (images, audio, tool
132
+ * responses) are silently ignored, matching the expected LLM response shape.
133
+ */
134
+ private fun Message.extractText(): String =
135
+ contents.contents
136
+ .filterIsInstance<Content.Text>()
137
+ .joinToString("") { it.text }
138
+
139
+ override fun completePrompt(prompt: String, images: List<ByteArray>): String {
140
+ val msgContents = buildContents(prompt, images)
141
+ val conversation = newConversation()
142
+ try {
143
+ // sendMessage is the single-call replacement for the old
144
+ // addQueryChunk + addImage + generateResponse triple (migration doc §3).
145
+ val message = conversation.sendMessage(msgContents)
146
+ return message.extractText()
147
+ } finally {
148
+ try {
149
+ conversation.close()
150
+ } catch (_: Throwable) { /* idempotent */ }
151
+ }
152
+ }
153
+
154
+ override fun completePromptAsync(
155
+ prompt: String,
156
+ images: List<ByteArray>,
157
+ onPartial: (String, Boolean) -> Unit,
158
+ ): AutoCloseable {
159
+ val msgContents = buildContents(prompt, images)
160
+ val conversation = newConversation()
161
+ try {
162
+ // MessageCallback replaces the old ProgressListener<String> callback.
163
+ // onMessage fires per partial token; onDone signals completion.
164
+ // (migration doc §3 streaming: callback form maps 1:1 to our contract)
165
+ conversation.sendMessageAsync(
166
+ msgContents,
167
+ object : MessageCallback {
168
+ override fun onMessage(message: Message) {
169
+ onPartial(message.extractText(), false)
170
+ }
171
+
172
+ override fun onDone() {
173
+ onPartial("", true)
174
+ }
175
+
176
+ override fun onError(throwable: Throwable) {
177
+ // Surface the error: re-throw on the callback thread so
178
+ // that the engine's internal executor propagates it.
179
+ throw RuntimeException("LiteRT-LM streaming error", throwable)
180
+ }
181
+ },
182
+ )
183
+ } catch (t: Throwable) {
184
+ try {
185
+ conversation.close()
186
+ } catch (_: Throwable) { /* idempotent */ }
187
+ throw t
188
+ }
189
+ return AutoCloseable {
190
+ try {
191
+ conversation.close()
192
+ } catch (_: Throwable) { /* idempotent — best-effort cleanup */ }
193
+ }
194
+ }
195
+
196
+ override fun close() {
197
+ if (engineInitialized) {
198
+ try {
199
+ engine.close()
200
+ } catch (_: Throwable) { /* idempotent */ }
201
+ }
202
+ }
203
+ }