@extentos/mcp-server 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tools/data/capabilities.d.ts +49 -0
- package/dist/tools/data/capabilities.d.ts.map +1 -1
- package/dist/tools/data/capabilities.js +37 -0
- package/dist/tools/data/capabilities.js.map +1 -1
- package/dist/tools/data/capabilityPatterns.d.ts.map +1 -1
- package/dist/tools/data/capabilityPatterns.js +265 -19
- package/dist/tools/data/capabilityPatterns.js.map +1 -1
- package/dist/tools/data/codeExamples.d.ts.map +1 -1
- package/dist/tools/data/codeExamples.js +624 -45
- package/dist/tools/data/codeExamples.js.map +1 -1
- package/dist/tools/data/version.d.ts +12 -0
- package/dist/tools/data/version.d.ts.map +1 -1
- package/dist/tools/data/version.js +17 -0
- package/dist/tools/data/version.js.map +1 -1
- package/dist/tools/definitions.d.ts.map +1 -1
- package/dist/tools/definitions.js +11 -2
- package/dist/tools/definitions.js.map +1 -1
- package/dist/tools/docs/index.d.ts.map +1 -1
- package/dist/tools/docs/index.js +10 -9
- package/dist/tools/docs/index.js.map +1 -1
- package/dist/tools/handlers/assertToolCalled.d.ts.map +1 -1
- package/dist/tools/handlers/assertToolCalled.js +38 -1
- package/dist/tools/handlers/assertToolCalled.js.map +1 -1
- package/dist/tools/handlers/generateConnectionModule.js +21 -2
- package/dist/tools/handlers/generateConnectionModule.js.map +1 -1
- package/dist/tools/handlers/getCredentialGuide.d.ts.map +1 -1
- package/dist/tools/handlers/getCredentialGuide.js +33 -6
- package/dist/tools/handlers/getCredentialGuide.js.map +1 -1
- package/dist/tools/handlers/injectAssistantUtterance.d.ts.map +1 -1
- package/dist/tools/handlers/injectAssistantUtterance.js +241 -2
- package/dist/tools/handlers/injectAssistantUtterance.js.map +1 -1
- package/package.json +1 -1
|
@@ -1904,61 +1904,139 @@ browser side-by-side with this agent loop and watch the panel react.`,
|
|
|
1904
1904
|
const ASSISTANT_AGENT_LOOP = {
|
|
1905
1905
|
pattern: "assistant_agent_loop",
|
|
1906
1906
|
title: "Phase 4 assistant runtime + agent-driven E2E loop (canonical voice-AI for new apps)",
|
|
1907
|
-
description: "**Canonical Phase 4 voice-AI pattern. Start here for any new voice assistant work on Extentos.** The customer code is one block: `glasses.assistant.start(provider) { tool(name, description) { body -> ToolResult } }`. The model owns wake detection, turn taking, intent parsing, and confirmation speech — the customer only writes tool bodies that act on the app's own state (route data, app DB, camera, library). Provider abstraction wraps OpenAI Realtime in v1; Gemini Live follows.
|
|
1907
|
+
description: "**Canonical Phase 4 voice-AI pattern. Start here for any new voice assistant work on Extentos.** The customer code is one block: `glasses.assistant.start(provider) { tool(name, description) { body -> ToolResult } }`. The model owns wake detection, turn taking, intent parsing, and confirmation speech — the customer only writes tool bodies that act on the app's own state (route data, app DB, camera, library). Provider abstraction wraps OpenAI Realtime in v1; Gemini Live follows. **As of iter5.2 (2026-05-27) `injectAssistantUtterance(text)` drives BOTH the Mock provider AND the real OpenAi Realtime provider** — same MCP call, same code, real model reasoning when you want it. iter5.3 (2026-05-27) added autoWake so multi-step agent tests don't drop injects across silence-timeout sleep transitions. **For the dedicated agent-driven test workflow (4-channel verification: getEventLog + adb logcat + screencap + library-state), see `getCodeExample(pattern:\"agent_driven_e2e_full_loop\")`.** This pattern stays focused on the handler-side code. **Replaces `conversation_agent_loop` (Phase 3 cascaded VAD+STT+SmartTurn+TTS+LLM) which is deprecated in v1.4.0 and removed in v2.0.0.** Customer code typically drops ~60% LoC vs Phase 3.",
|
|
1908
1908
|
code: {
|
|
1909
|
-
kotlin: `// ──
|
|
1910
|
-
//
|
|
1909
|
+
kotlin: `// ── App bootstrap ───────────────────────────────────────────────────
|
|
1910
|
+
// Run once during app startup, after RECORD_AUDIO is granted. The
|
|
1911
|
+
// returned ExtentosGlasses instance is the single handle for everything;
|
|
1912
|
+
// keep it on your Application subclass (or a singleton container) so
|
|
1913
|
+
// the rest of your app can reach it.
|
|
1914
|
+
//
|
|
1915
|
+
// createGlasses is suspend — call it from a coroutine:
|
|
1916
|
+
//
|
|
1917
|
+
// class MyApp : Application() {
|
|
1918
|
+
// lateinit var glasses: ExtentosGlasses
|
|
1919
|
+
// override fun onCreate() {
|
|
1920
|
+
// super.onCreate()
|
|
1921
|
+
// CoroutineScope(Dispatchers.Main).launch {
|
|
1922
|
+
// glasses = createGlasses(this@MyApp)
|
|
1923
|
+
// // Wire your assistant handler here (see below)
|
|
1924
|
+
// }
|
|
1925
|
+
// }
|
|
1926
|
+
// }
|
|
1911
1927
|
//
|
|
1912
1928
|
// No ONNX models, no model paths, no cascaded options — Phase 4 ships
|
|
1913
1929
|
// end-to-end via the provider's WebSocket. glasses.assistant is
|
|
1914
1930
|
// always-on (no opt-in conversationOptions needed).
|
|
1931
|
+
import com.extentos.glasses.core.CaptureError
|
|
1915
1932
|
import com.extentos.glasses.core.ExtentosConfig
|
|
1916
1933
|
import com.extentos.glasses.core.ExtentosGlasses
|
|
1934
|
+
import com.extentos.glasses.core.ExtentosResult
|
|
1935
|
+
import com.extentos.glasses.core.RuntimeEvent
|
|
1936
|
+
import com.extentos.glasses.core.VideoClip
|
|
1937
|
+
import com.extentos.glasses.core.VideoConfig
|
|
1938
|
+
import com.extentos.glasses.core.assistant.AssistantEvent
|
|
1917
1939
|
import com.extentos.glasses.core.assistant.AssistantProvider
|
|
1940
|
+
import com.extentos.glasses.core.assistant.AssistantSession
|
|
1918
1941
|
import com.extentos.glasses.core.assistant.ToolResult
|
|
1919
1942
|
import com.extentos.glasses.core.assistant.tool
|
|
1943
|
+
import com.extentos.glasses.core.valueOrNull
|
|
1944
|
+
import kotlinx.coroutines.CoroutineScope
|
|
1945
|
+
import kotlinx.coroutines.Deferred
|
|
1946
|
+
import kotlinx.coroutines.Dispatchers
|
|
1947
|
+
import kotlinx.coroutines.SupervisorJob
|
|
1948
|
+
import kotlinx.coroutines.async
|
|
1949
|
+
import kotlinx.coroutines.flow.filterIsInstance
|
|
1950
|
+
import kotlinx.coroutines.flow.launchIn
|
|
1951
|
+
import kotlinx.coroutines.flow.onEach
|
|
1952
|
+
import kotlinx.coroutines.launch
|
|
1953
|
+
import kotlin.time.Duration.Companion.seconds
|
|
1920
1954
|
|
|
1921
1955
|
suspend fun createGlasses(context: android.content.Context): ExtentosGlasses {
|
|
1922
1956
|
return ExtentosGlasses.create(
|
|
1923
1957
|
ExtentosConfig(applicationContext = context)
|
|
1924
1958
|
).also { glasses ->
|
|
1925
1959
|
// BYOK OpenAI key — see getCredentialGuide(service:"openai") for
|
|
1926
|
-
// the local.properties +
|
|
1927
|
-
// device → api.openai.com via WS Authorization header;
|
|
1928
|
-
// backend never sees it.
|
|
1960
|
+
// the local.properties + BuildConfig plumbing. Key flows direct
|
|
1961
|
+
// from device → api.openai.com via WS Authorization header;
|
|
1962
|
+
// Extentos backend never sees it.
|
|
1963
|
+
//
|
|
1964
|
+
// **F-meta-2 warning:** do NOT have your AI agent write the
|
|
1965
|
+
// OPENAI_API_KEY value into local.properties — agent file edits
|
|
1966
|
+
// leak through the conversation transcript. Add the line yourself.
|
|
1929
1967
|
glasses.assistant.setOpenaiApiKey(BuildConfig.OPENAI_API_KEY)
|
|
1930
1968
|
}
|
|
1931
1969
|
}
|
|
1932
1970
|
|
|
1933
|
-
// ── Handler — Strava-style example
|
|
1971
|
+
// ── Handler — Strava-style example ─────────────────────────────────────
|
|
1972
|
+
//
|
|
1973
|
+
// Phase 4 wake/sleep state machine (F12): the session starts Dormant
|
|
1974
|
+
// (zero token spend), the developer picks the wake mechanism (voice
|
|
1975
|
+
// phrase via glasses.voice.onPhrase, button tap, gesture, MCP call),
|
|
1976
|
+
// and the model decides when to end the conversation via the built-in
|
|
1977
|
+
// end_conversation tool (endOnIntent default true) — no rigid
|
|
1978
|
+
// "goodbye <name>" phrase required. sleepAfterSilence is the
|
|
1979
|
+
// deterministic backup. onWake { say(...) } speaks in the assistant's
|
|
1980
|
+
// own voice (alloy etc.) so the greeting matches the AI's reply voice.
|
|
1934
1981
|
|
|
1935
1982
|
class StravaAssistantHandler(
|
|
1936
1983
|
private val glasses: ExtentosGlasses,
|
|
1937
1984
|
private val routeTracker: RouteTracker, // app-internal state
|
|
1938
1985
|
private val library: ClipLibrary, // app-internal state
|
|
1939
|
-
private val scope:
|
|
1986
|
+
private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
|
|
1940
1987
|
) {
|
|
1941
|
-
private var
|
|
1988
|
+
private var session: AssistantSession? = null
|
|
1989
|
+
|
|
1990
|
+
// Natural async + await + stop pattern. The library exposes
|
|
1991
|
+
// glasses.camera.stopVideo() so the customer never has to cancel
|
|
1992
|
+
// the wrapping Deferred — captureVideo() resumes naturally with
|
|
1993
|
+
// Ok(partial) when stop is signalled.
|
|
1994
|
+
private var activeVideo: Deferred<ExtentosResult<VideoClip, CaptureError>>? = null
|
|
1995
|
+
|
|
1996
|
+
@Volatile private var notesActive: Boolean = false
|
|
1997
|
+
private val notesBuffer = StringBuilder()
|
|
1942
1998
|
|
|
1943
1999
|
fun start() {
|
|
1944
|
-
//
|
|
1945
|
-
//
|
|
1946
|
-
//
|
|
1947
|
-
//
|
|
2000
|
+
// F3 (iteration-2 fix): subscribe to AssistantEvent.UserSpoke for
|
|
2001
|
+
// transcript capture — needed for notes / live-captions / journal
|
|
2002
|
+
// patterns. Fires once per user turn after the provider's STT
|
|
2003
|
+
// completes. PII boundary: text is verbatim, customer-owned.
|
|
2004
|
+
glasses.runtime.events
|
|
2005
|
+
.filterIsInstance<RuntimeEvent.Assistant>()
|
|
2006
|
+
.onEach { evt ->
|
|
2007
|
+
val userSpoke = evt.event as? AssistantEvent.UserSpoke ?: return@onEach
|
|
2008
|
+
if (notesActive) {
|
|
2009
|
+
synchronized(notesBuffer) {
|
|
2010
|
+
if (notesBuffer.isNotEmpty()) notesBuffer.append(" ")
|
|
2011
|
+
notesBuffer.append(userSpoke.transcript)
|
|
2012
|
+
}
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
.launchIn(scope)
|
|
2016
|
+
|
|
1948
2017
|
scope.launch {
|
|
1949
|
-
glasses.assistant.start(
|
|
1950
|
-
provider = AssistantProvider.OpenAi(
|
|
1951
|
-
model = "gpt-realtime",
|
|
1952
|
-
voice = "alloy",
|
|
1953
|
-
),
|
|
2018
|
+
session = glasses.assistant.start(
|
|
2019
|
+
provider = AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy"),
|
|
1954
2020
|
) {
|
|
1955
2021
|
instructions = """
|
|
1956
2022
|
You are a Strava companion. Help the runner with route
|
|
1957
2023
|
stats and capture moments. Speak briefly — they're
|
|
1958
2024
|
running. Don't narrate what you're doing — just do it
|
|
1959
|
-
and confirm.
|
|
2025
|
+
and confirm. When the user clearly indicates they want
|
|
2026
|
+
to stop talking, call end_conversation.
|
|
1960
2027
|
""".trimIndent()
|
|
1961
2028
|
|
|
2029
|
+
// Phase 4 v1.1 lifecycle:
|
|
2030
|
+
// - session lands in Dormant after start() (default)
|
|
2031
|
+
// - wake via the voice phrase wired below (any other
|
|
2032
|
+
// trigger — button, gesture, MCP — works the same)
|
|
2033
|
+
// - onWake speaks the greeting in the model's voice
|
|
2034
|
+
// - 30 s of user silence → auto-sleep (backup)
|
|
2035
|
+
// - end_conversation tool registered automatically by
|
|
2036
|
+
// the library because endOnIntent defaults true
|
|
2037
|
+
onWake { say("Hi, I'm here. What can I do for you?") }
|
|
2038
|
+
sleepAfterSilence(30.seconds)
|
|
2039
|
+
|
|
1962
2040
|
// Read-tools: instant data the AI reads aloud or weaves
|
|
1963
2041
|
// into the answer.
|
|
1964
2042
|
tool("get_route_remaining", "How much of the planned route is left, in km.") {
|
|
@@ -1972,33 +2050,112 @@ class StravaAssistantHandler(
|
|
|
1972
2050
|
}
|
|
1973
2051
|
|
|
1974
2052
|
// Action-tools: side effects on the app's own state. The
|
|
1975
|
-
// AI manages the take/stop pair
|
|
1976
|
-
// that stop_video pairs with take_video.
|
|
2053
|
+
// AI manages the take/stop pair from context.
|
|
1977
2054
|
tool("take_video", "Start recording a video clip of the runner's view.") {
|
|
2055
|
+
if (activeVideo?.isActive == true) {
|
|
2056
|
+
return@tool ToolResult.Err("a recording is already in progress")
|
|
2057
|
+
}
|
|
1978
2058
|
activeVideo = scope.async {
|
|
1979
|
-
glasses.camera.captureVideo(
|
|
1980
|
-
VideoConfig(maxDurationSeconds = 30),
|
|
1981
|
-
)
|
|
2059
|
+
glasses.camera.captureVideo(VideoConfig(maxDurationSeconds = 30))
|
|
1982
2060
|
}
|
|
1983
2061
|
ToolResult.Ok("recording started")
|
|
1984
2062
|
}
|
|
1985
2063
|
tool("stop_video", "Stop the current video recording.") {
|
|
1986
|
-
//
|
|
1987
|
-
//
|
|
1988
|
-
//
|
|
1989
|
-
//
|
|
1990
|
-
//
|
|
1991
|
-
//
|
|
1992
|
-
//
|
|
1993
|
-
|
|
2064
|
+
// Clean stop pattern: signal the library to gracefully
|
|
2065
|
+
// end the capture; the in-flight captureVideo() inside
|
|
2066
|
+
// the \`async\` block resumes naturally with
|
|
2067
|
+
// ExtentosResult.Ok(partialClip). No coroutine
|
|
2068
|
+
// cancellation, no sticky-Cancelled Deferred — just
|
|
2069
|
+
// a normal await on the result.
|
|
2070
|
+
//
|
|
2071
|
+
// (Don't be tempted to call activeVideo.cancel() to
|
|
2072
|
+
// stop — Kotlin Deferred state is sticky-Cancelled,
|
|
2073
|
+
// so await() throws CancellationException even if the
|
|
2074
|
+
// library produced a partial. Use stopVideo() instead.)
|
|
2075
|
+
val capture = activeVideo
|
|
1994
2076
|
activeVideo = null
|
|
1995
|
-
if (
|
|
1996
|
-
|
|
1997
|
-
val
|
|
1998
|
-
clip
|
|
1999
|
-
|
|
2077
|
+
if (capture == null) return@tool ToolResult.Err("nothing was recording")
|
|
2078
|
+
glasses.camera.stopVideo()
|
|
2079
|
+
val result = capture.await()
|
|
2080
|
+
val clip = result.valueOrNull() ?: return@tool ToolResult.Err("video capture failed")
|
|
2081
|
+
library.add(clip)
|
|
2082
|
+
ToolResult.Ok("video saved")
|
|
2083
|
+
}
|
|
2084
|
+
|
|
2085
|
+
tool("start_notes", "Start capturing what the runner says as a note.") {
|
|
2086
|
+
if (notesActive) return@tool ToolResult.Err("already taking notes")
|
|
2087
|
+
synchronized(notesBuffer) { notesBuffer.setLength(0) }
|
|
2088
|
+
notesActive = true
|
|
2089
|
+
ToolResult.Ok("ok, taking notes")
|
|
2090
|
+
}
|
|
2091
|
+
tool("stop_notes", "Stop note-taking and save the accumulated notes.") {
|
|
2092
|
+
if (!notesActive) return@tool ToolResult.Err("not currently taking notes")
|
|
2093
|
+
notesActive = false
|
|
2094
|
+
val text = synchronized(notesBuffer) {
|
|
2095
|
+
val s = notesBuffer.toString()
|
|
2096
|
+
notesBuffer.setLength(0)
|
|
2097
|
+
s
|
|
2098
|
+
}
|
|
2099
|
+
if (text.isBlank()) return@tool ToolResult.Err("nothing was captured")
|
|
2100
|
+
library.addNote(text)
|
|
2101
|
+
ToolResult.Ok("notes saved")
|
|
2102
|
+
}
|
|
2103
|
+
|
|
2104
|
+
// Vision tools — Phase 4 v1.4 includeImage + Photos.copyToFile.
|
|
2105
|
+
//
|
|
2106
|
+
// Two distinct tools, NOT one. The model picks based on
|
|
2107
|
+
// user intent:
|
|
2108
|
+
// describe_scene — "what / describe / tell me" → AI speaks
|
|
2109
|
+
// about the photo; photo NOT persisted.
|
|
2110
|
+
// save_photo — "save / capture / take a picture / remember"
|
|
2111
|
+
// → photo persisted to library; NOT described.
|
|
2112
|
+
//
|
|
2113
|
+
// Splitting lets the model call BOTH back-to-back when the
|
|
2114
|
+
// user wants both ("save this and tell me what it is") +
|
|
2115
|
+
// keeps each description tight enough for the Mock + real
|
|
2116
|
+
// matchers to disambiguate. See getCapabilityGuide(feature:"assistant_vision")
|
|
2117
|
+
// for the describe-vs-save gotcha.
|
|
2118
|
+
tool("describe_scene", "Describe what the runner is currently looking at without saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
|
|
2119
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2120
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2121
|
+
val uri = photo.uri
|
|
2122
|
+
?: return@tool ToolResult.Err("photo had no uri")
|
|
2123
|
+
// session is the AssistantSession returned by start();
|
|
2124
|
+
// captured here via the outer-class property. By the
|
|
2125
|
+
// time a tool dispatches, the session is Active so
|
|
2126
|
+
// includeImage won't throw NotReady.
|
|
2127
|
+
session?.includeImage(uri)
|
|
2128
|
+
ToolResult.Ok("looking")
|
|
2129
|
+
}
|
|
2130
|
+
tool("save_photo", "Save a photo to the runner's library WITHOUT describing it. Call for 'save this' / 'capture this' / 'take a picture' / 'remember this view'.") {
|
|
2131
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2132
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2133
|
+
val uri = photo.uri
|
|
2134
|
+
?: return@tool ToolResult.Err("photo had no uri")
|
|
2135
|
+
val ext = when (Photos.mediaTypeFromUri(uri)) {
|
|
2136
|
+
"image/png" -> "png"
|
|
2137
|
+
"image/webp" -> "webp"
|
|
2138
|
+
else -> "jpg"
|
|
2139
|
+
}
|
|
2140
|
+
val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.\$ext")
|
|
2141
|
+
// Photos.copyToFile mirrors Videos.copyToFile: stream-copy
|
|
2142
|
+
// across data: / file:// / absolute path, parent mkdirs,
|
|
2143
|
+
// overwrite, returns Boolean. Use it — don't hand-roll
|
|
2144
|
+
// loadBytes + writeBytes.
|
|
2145
|
+
if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("could not save photo")
|
|
2146
|
+
library.addPhoto(dst.absolutePath, photo.width, photo.height)
|
|
2147
|
+
ToolResult.Ok("photo saved")
|
|
2000
2148
|
}
|
|
2001
2149
|
}
|
|
2150
|
+
|
|
2151
|
+
// Wake mechanism — canonical pattern uses the existing
|
|
2152
|
+
// glasses.voice.onPhrase system. Defaults to firesWhen =
|
|
2153
|
+
// VoiceScope.WhenDormant so it won't double-fire during an
|
|
2154
|
+
// active conversation. Swap this line for a button onClick,
|
|
2155
|
+
// a gesture handler, or any other trigger that calls
|
|
2156
|
+
// session?.wake() — the library doesn't dictate the
|
|
2157
|
+
// mechanism, only the lifecycle.
|
|
2158
|
+
glasses.voice.onPhrase("hey strava") { session?.wake() }
|
|
2002
2159
|
}
|
|
2003
2160
|
}
|
|
2004
2161
|
}
|
|
@@ -2024,7 +2181,20 @@ suspend fun startRawForm(glasses: ExtentosGlasses) {
|
|
|
2024
2181
|
)
|
|
2025
2182
|
session.start()
|
|
2026
2183
|
}`,
|
|
2027
|
-
swift:
|
|
2184
|
+
swift: `// ⚠️ iOS PARITY IN FLIGHT
|
|
2185
|
+
// The Phase 4 wake/sleep state machine (session.wake() / sleep() / say() /
|
|
2186
|
+
// onWake / sleepAfterSilence / endOnIntent / VoiceScope.WhenDormant) is
|
|
2187
|
+
// LIVE in the Android library + verified in the sim. The Swift port is
|
|
2188
|
+
// pending — see shared-context/real-hardware-validation-backlog.md B1
|
|
2189
|
+
// for status. This Swift block reflects the Phase 4 ergonomics target
|
|
2190
|
+
// but DOES NOT compile against the current iOS library (which still
|
|
2191
|
+
// ships the always-on Phase 4 surface from earlier sprints).
|
|
2192
|
+
//
|
|
2193
|
+
// Until iOS parity ships, iOS apps should use the Phase 3
|
|
2194
|
+
// conversation_agent_loop pattern or wait. Track the Mac VPS handoff in
|
|
2195
|
+
// shared-context/.
|
|
2196
|
+
|
|
2197
|
+
import GlassesCore
|
|
2028
2198
|
|
|
2029
2199
|
// ── App bootstrap (do once, e.g. in App.init or DI container) ─────────
|
|
2030
2200
|
//
|
|
@@ -2134,19 +2304,50 @@ func startRawForm(_ glasses: ExtentosGlasses) async throws {
|
|
|
2134
2304
|
},
|
|
2135
2305
|
explanation: `AGENT-DRIVEN E2E TEST LOOP — RUN AFTER createGlasses + handler.start()
|
|
2136
2306
|
|
|
2307
|
+
→ For the dedicated agent-driven workflow with FOUR-channel verification
|
|
2308
|
+
(getEventLog + adb logcat + screencap + library-state inspection), the
|
|
2309
|
+
multi-tool sweep pattern, autoWake details, and headless-CI guidance,
|
|
2310
|
+
see \`getCodeExample(pattern: "agent_driven_e2e_full_loop")\`. The block
|
|
2311
|
+
below is the quickstart; the dedicated example is the production
|
|
2312
|
+
pattern.
|
|
2313
|
+
|
|
2137
2314
|
The headless verification: drive the assistant with synthetic utterances,
|
|
2138
2315
|
assert the expected tools fire, then read the event log. Two providers
|
|
2139
|
-
satisfy the same agent loop
|
|
2316
|
+
satisfy the same agent loop (since iter5.2, the SAME injectAssistantUtterance
|
|
2317
|
+
MCP call drives BOTH):
|
|
2140
2318
|
|
|
2141
2319
|
- AssistantProvider.Mock — deterministic, sub-millisecond, $0. Word-
|
|
2142
2320
|
overlap-matches the injected utterance against tool descriptions
|
|
2143
2321
|
and dispatches the first match. Use for CI + tight inner loop.
|
|
2144
2322
|
- AssistantProvider.OpenAi — real WebSocket to api.openai.com,
|
|
2145
|
-
real LLM picks the tool, real audio output. Use for
|
|
2146
|
-
|
|
2147
|
-
|
|
2148
|
-
|
|
2149
|
-
|
|
2323
|
+
real LLM picks the tool, real audio output. Use for real-provider
|
|
2324
|
+
verification — works headless against the real model via
|
|
2325
|
+
injectAssistantUtterance.text since iter5.2 (no mic / no human
|
|
2326
|
+
required). Sim browser tab still required for camera-using tools.
|
|
2327
|
+
|
|
2328
|
+
// Mock-provider path — DON'T bake a useMock Boolean into the handler
|
|
2329
|
+
// (F13: invisible to users; ships looking like OpenAi by default).
|
|
2330
|
+
// Instead, wire the provider choice via a BuildConfig field tied to
|
|
2331
|
+
// a build flavor:
|
|
2332
|
+
//
|
|
2333
|
+
// // app/build.gradle.kts
|
|
2334
|
+
// android {
|
|
2335
|
+
// flavorDimensions += "assistant"
|
|
2336
|
+
// productFlavors {
|
|
2337
|
+
// create("mock") { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "true") }
|
|
2338
|
+
// create("real") { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "false") }
|
|
2339
|
+
// }
|
|
2340
|
+
// }
|
|
2341
|
+
//
|
|
2342
|
+
// // In your handler:
|
|
2343
|
+
// val provider = if (BuildConfig.USE_MOCK_ASSISTANT)
|
|
2344
|
+
// AssistantProvider.Mock() else
|
|
2345
|
+
// AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy")
|
|
2346
|
+
//
|
|
2347
|
+
// CI builds the "mock" flavor for the agent-driven E2E loop below;
|
|
2348
|
+
// production builds the "real" flavor. The provider is visible in the
|
|
2349
|
+
// AssistantStatusBadge (glasses-ui) + the sim's waveform color so
|
|
2350
|
+
// developers never confuse which mode the running app is in.
|
|
2150
2351
|
const { sessionId } = await createSimulatorSession({
|
|
2151
2352
|
glasses: "meta_rayban", platform: "android",
|
|
2152
2353
|
});
|
|
@@ -2232,8 +2433,385 @@ side with this agent loop and watch the panel react.`,
|
|
|
2232
2433
|
"**Tool body runs on Dispatchers.IO (Kotlin) / Swift Task (Swift); suspending camera/storage/HTTP calls are fine.** Per-tool blocking opt-out via `tool(name, desc, blocking = true) { ... }` per synthesis #9 — when true, the model waits silently for the result before speaking. Default is non-blocking (model says \"let me check...\" while the tool runs). Use blocking=true for sub-100ms tools where the filler would be awkward (\"what time is it\" returning in 10 ms).",
|
|
2233
2434
|
"**BYOK key flows direct, not via Extentos.** synthesis §12. `glasses.assistant.setOpenaiApiKey(key)` stores the key in the AssistantClient. When start() opens a WebSocket, the key goes into `Authorization: Bearer ...` for the wss://api.openai.com/v1/realtime?model=gpt-realtime connection. Extentos backend never sees the key. Test endpoint override is on the AssistantProvider.OpenAi case if you need to point at a mock OpenAI proxy.",
|
|
2234
2435
|
"**`glasses.ai.complete` is deprecated in v1.4.0 and removed in v2.0.0.** Use the OpenAI SDK directly for non-voice LLM calls (image description, summarization, etc.). Migration: see `searchDocs(topic:'assistant_runtime')` → migration section walkthrough C.",
|
|
2436
|
+
"**Vision via `session.includeImage(uri, prompt = null)` (v1.4 addition).** Capture a photo, hand the URI to the assistant inside a tool body — the model sees it + speaks about it in its configured voice. URI accepts data: / http(s): / file:// / content://. The image persists in conversation history at the provider, so follow-up questions in the same session work without re-sending. Active-only (throws NotReady otherwise — safe inside a tool body where the session is always Active). Canonical pattern: `tool(\"describe_scene\", \"...\") { val photo = glasses.camera.capturePhoto().valueOrNull() ?: return@tool ToolResult.Err(\"camera failed\"); val uri = photo.uri ?: return@tool ToolResult.Err(\"photo had no uri\"); session?.includeImage(uri); ToolResult.Ok(\"looking\") }`. See getCapabilityGuide(feature:\"assistant_vision\") for the prompt-parameter pattern + URI-type gotchas.",
|
|
2437
|
+
"**Mid-session primitives (iter5 addition).** Four building blocks on `AssistantSession`: `setReasoningEffort(level)` for dynamic effort routing, `updateInstructions(text)` for persona/mode swaps, `cancelSpeak()` for tool-driven interrupts, `conversationHistory(limit)` for forwarding context to a stronger model. All composable with the existing tool surface — write tools that call them in their bodies. Canonical escalation pattern: a `ask_smart_model` tool body reads `session.conversationHistory()`, formats it as context, calls the customer's own GPT-5 (or Anthropic Claude, or Gemini) client, returns the response as `ToolResult.Ok(answer)` — the realtime model then speaks the answer in its configured voice. Active-only (except conversationHistory which is always safe). See getCapabilityGuide(feature:\"assistant_session_runtime\") for the canonical snippet of each.",
|
|
2438
|
+
],
|
|
2439
|
+
relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_vision", "assistant_session_runtime", "audio_chunks", "speak", "capture_photo"],
|
|
2440
|
+
};
|
|
2441
|
+
// ── Phase 4 agent-driven E2E loop (iter5.2 + iter5.3) ─────────────────
|
|
2442
|
+
//
|
|
2443
|
+
// Sister pattern to `assistant_agent_loop`. That one shows the customer-
|
|
2444
|
+
// side handler code; THIS one shows the AGENT-side workflow that drives
|
|
2445
|
+
// + verifies it. The split exists because the audiences differ — a dev
|
|
2446
|
+
// shipping a voice assistant cares about the handler; an AI coding agent
|
|
2447
|
+
// (or a customer's CI pipeline) cares about the test loop. Both reference
|
|
2448
|
+
// each other.
|
|
2449
|
+
//
|
|
2450
|
+
// Iter5.2 (2026-05-27) made `injectAssistantUtterance(text)` drive the
|
|
2451
|
+
// REAL OpenAI Realtime provider, not just Mock. Iter5.3 (2026-05-27)
|
|
2452
|
+
// wired autoWake so multi-step sessions don't silently drop injects
|
|
2453
|
+
// across silence-timeout sleep transitions. assertToolCalled's cursor
|
|
2454
|
+
// anchor (b87388d) eliminated the stale-match false-positive. Together
|
|
2455
|
+
// these turn the agent-driven E2E loop from "demo-quality" into a
|
|
2456
|
+
// production-grade verification pattern.
|
|
2457
|
+
const AGENT_DRIVEN_E2E_FULL_LOOP = {
|
|
2458
|
+
pattern: "agent_driven_e2e_full_loop",
|
|
2459
|
+
title: "Phase 4 assistant — agent-driven E2E loop with multi-channel verification (real OpenAi capable)",
|
|
2460
|
+
description: "How an AI coding agent verifies a Phase 4 voice-assistant handler end-to-end FROM THE SAME MCP SESSION that scaffolded it — no human, no mic. Iter5.2 unlocked driving the REAL OpenAI Realtime provider via `injectAssistantUtterance(text)` (same MCP call as Mock — the runtime decides based on `AssistantProvider.OpenAi` vs `.Mock()` at session creation). The full loop combines FOUR independent verification channels: `getEventLog` for the protocol-level assistant.* event trace, `adb logcat` for tool-body internals (file paths, byte counts, branch decisions), `adb screencap` for UI confirmation, and direct library-state inspection (Room DB / file system) for persisted side effects. When all four agree, the flow really works at every layer. Companion to `assistant_agent_loop` which covers the handler-side code — use both. Replaces the older `agent_test_loop` (Phase 3 cascaded VAD + recordDiscrete + AnthropicClient) for new voice-AI work.",
|
|
2461
|
+
code: {
|
|
2462
|
+
kotlin: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
|
|
2463
|
+
// For the full handler-side pattern (8+ tools, wake/sleep hooks,
|
|
2464
|
+
// build-flavor Mock/OpenAi switching, voice phrase wiring), see
|
|
2465
|
+
// getCodeExample(pattern: "assistant_agent_loop").
|
|
2466
|
+
|
|
2467
|
+
import com.extentos.glasses.core.ExtentosGlasses
|
|
2468
|
+
import com.extentos.glasses.core.Photos
|
|
2469
|
+
import com.extentos.glasses.core.assistant.AssistantProvider
|
|
2470
|
+
import com.extentos.glasses.core.assistant.AssistantSession
|
|
2471
|
+
import com.extentos.glasses.core.assistant.ToolResult
|
|
2472
|
+
import com.extentos.glasses.core.valueOrNull
|
|
2473
|
+
import android.util.Log
|
|
2474
|
+
import kotlinx.coroutines.CoroutineScope
|
|
2475
|
+
import kotlinx.coroutines.Dispatchers
|
|
2476
|
+
import kotlinx.coroutines.SupervisorJob
|
|
2477
|
+
import kotlinx.coroutines.launch
|
|
2478
|
+
import java.io.File
|
|
2479
|
+
import kotlin.time.Duration.Companion.seconds
|
|
2480
|
+
|
|
2481
|
+
class NotesAssistant(
|
|
2482
|
+
private val glasses: ExtentosGlasses,
|
|
2483
|
+
private val library: NotesLibrary, // app-internal: Room + photos/ dir
|
|
2484
|
+
private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
|
|
2485
|
+
) {
|
|
2486
|
+
private var session: AssistantSession? = null
|
|
2487
|
+
@Volatile private var notesActive = false
|
|
2488
|
+
private val notesBuffer = StringBuilder()
|
|
2489
|
+
|
|
2490
|
+
fun start() {
|
|
2491
|
+
scope.launch {
|
|
2492
|
+
session = glasses.assistant.start(
|
|
2493
|
+
// F13: pick provider via build flavor (USE_MOCK_ASSISTANT BuildConfig)
|
|
2494
|
+
// — never bake \`useMock\` into handler code. See assistant_agent_loop.
|
|
2495
|
+
provider = AssistantProvider.OpenAi(model = "gpt-realtime-2", voice = "alloy"),
|
|
2496
|
+
) {
|
|
2497
|
+
instructions = "You are a voice-notes assistant on glasses. Speak briefly."
|
|
2498
|
+
sleepAfterSilence(30.seconds)
|
|
2499
|
+
|
|
2500
|
+
tool("start_notes", "Start capturing the user's speech as a note. Call when the user says 'start notes', 'take notes', or similar.") {
|
|
2501
|
+
Log.i(TAG, "start_notes: notesActive=\$notesActive")
|
|
2502
|
+
if (notesActive) return@tool ToolResult.Err("already taking notes")
|
|
2503
|
+
notesActive = true
|
|
2504
|
+
notesBuffer.setLength(0)
|
|
2505
|
+
ToolResult.Ok("ok, taking notes")
|
|
2506
|
+
}
|
|
2507
|
+
|
|
2508
|
+
tool("save_notes", "Save the accumulated notes to the library. Call when the user says 'save notes', 'stop notes', or 'I'm done'.") {
|
|
2509
|
+
Log.i(TAG, "save_notes: buf=\${notesBuffer.length} chars")
|
|
2510
|
+
if (!notesActive) return@tool ToolResult.Err("not taking notes")
|
|
2511
|
+
notesActive = false
|
|
2512
|
+
val text = notesBuffer.toString()
|
|
2513
|
+
if (text.isBlank()) return@tool ToolResult.Err("nothing captured")
|
|
2514
|
+
library.addNote(text)
|
|
2515
|
+
ToolResult.Ok("notes saved")
|
|
2516
|
+
}
|
|
2517
|
+
|
|
2518
|
+
tool("save_photo", "Take a photo and save it to the user's library. Call when the user says 'save this' or 'take a picture'.") {
|
|
2519
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2520
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2521
|
+
val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
|
|
2522
|
+
Log.i(TAG, "save_photo: uri=\${uri.take(60)} \${photo.width}x\${photo.height}")
|
|
2523
|
+
val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.jpg")
|
|
2524
|
+
if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("save failed")
|
|
2525
|
+
Log.i(TAG, "save_photo: wrote \${dst.length()} bytes to \${dst.absolutePath}")
|
|
2526
|
+
library.addPhoto(dst.absolutePath, photo.width, photo.height)
|
|
2527
|
+
ToolResult.Ok("photo saved")
|
|
2528
|
+
}
|
|
2529
|
+
|
|
2530
|
+
tool("describe_scene", "Describe what the user is currently looking at WITHOUT saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
|
|
2531
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2532
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2533
|
+
val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
|
|
2534
|
+
Log.i(TAG, "describe_scene: handing uri to session.includeImage")
|
|
2535
|
+
session?.includeImage(uri)
|
|
2536
|
+
ToolResult.Ok("looking")
|
|
2537
|
+
}
|
|
2538
|
+
}
|
|
2539
|
+
// Wake phrase — the wake handler the autoWake pre-wake path matches.
|
|
2540
|
+
glasses.voice.onPhrase("hey notes") {
|
|
2541
|
+
Log.i(TAG, "wake phrase matched")
|
|
2542
|
+
session?.wake()
|
|
2543
|
+
}
|
|
2544
|
+
}
|
|
2545
|
+
}
|
|
2546
|
+
|
|
2547
|
+
private companion object {
|
|
2548
|
+
const val TAG = "NotesAssistant"
|
|
2549
|
+
}
|
|
2550
|
+
}`,
|
|
2551
|
+
swift: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
|
|
2552
|
+
// For the full handler-side pattern see getCodeExample(pattern: "assistant_agent_loop").
|
|
2553
|
+
//
|
|
2554
|
+
// ⚠️ iOS PARITY IN FLIGHT — the Phase 4 wake/sleep state machine + iter5
|
|
2555
|
+
// mid-session primitives are LIVE on Android. The Swift port is queued
|
|
2556
|
+
// (see shared-context/ios-pure-sdk-pivot-handoff.md). This Swift block
|
|
2557
|
+
// reflects the Phase 4 ergonomics target; current iOS apps using
|
|
2558
|
+
// glasses.assistant still ship the always-on surface from earlier sprints.
|
|
2559
|
+
// The TS agent-test loop below is platform-agnostic — it works against
|
|
2560
|
+
// either the Android handler above OR an iOS app running the same APIs
|
|
2561
|
+
// once the iOS port lands.
|
|
2562
|
+
|
|
2563
|
+
import GlassesCore
|
|
2564
|
+
|
|
2565
|
+
final class NotesAssistant: @unchecked Sendable {
|
|
2566
|
+
private let glasses: ExtentosGlasses
|
|
2567
|
+
private let library: NotesLibrary
|
|
2568
|
+
private var session: AssistantSession?
|
|
2569
|
+
private var notesActive = false
|
|
2570
|
+
private var notesBuffer = ""
|
|
2571
|
+
|
|
2572
|
+
init(glasses: ExtentosGlasses, library: NotesLibrary) {
|
|
2573
|
+
self.glasses = glasses
|
|
2574
|
+
self.library = library
|
|
2575
|
+
}
|
|
2576
|
+
|
|
2577
|
+
func start() async throws {
|
|
2578
|
+
session = try await glasses.assistant.start(
|
|
2579
|
+
provider: .openAI(model: "gpt-realtime-2", voice: "alloy")
|
|
2580
|
+
) { config in
|
|
2581
|
+
config.instructions = "You are a voice-notes assistant on glasses. Speak briefly."
|
|
2582
|
+
// config.sleepAfterSilence(.seconds(30)) // iOS API parity pending
|
|
2583
|
+
|
|
2584
|
+
config.tool("start_notes", description: "Start capturing the user's speech as a note. Call when the user says 'start notes' / 'take notes'.") {
|
|
2585
|
+
NSLog("NotesAssistant start_notes: notesActive=\\(self.notesActive)")
|
|
2586
|
+
if self.notesActive { return .err("already taking notes") }
|
|
2587
|
+
self.notesActive = true
|
|
2588
|
+
self.notesBuffer = ""
|
|
2589
|
+
return .ok("ok, taking notes")
|
|
2590
|
+
}
|
|
2591
|
+
|
|
2592
|
+
config.tool("save_notes", description: "Save the accumulated notes. Call when the user says 'save notes' / 'stop notes' / 'I'm done'.") {
|
|
2593
|
+
NSLog("NotesAssistant save_notes: buf=\\(self.notesBuffer.count) chars")
|
|
2594
|
+
if !self.notesActive { return .err("not taking notes") }
|
|
2595
|
+
self.notesActive = false
|
|
2596
|
+
if self.notesBuffer.isEmpty { return .err("nothing captured") }
|
|
2597
|
+
self.library.addNote(self.notesBuffer)
|
|
2598
|
+
return .ok("notes saved")
|
|
2599
|
+
}
|
|
2600
|
+
|
|
2601
|
+
config.tool("save_photo", description: "Take a photo and save it to the library. Call when the user says 'save this' / 'take a picture'.") {
|
|
2602
|
+
guard let photo = (await self.glasses.camera.capturePhoto()).success else {
|
|
2603
|
+
return .err("camera failed")
|
|
2604
|
+
}
|
|
2605
|
+
NSLog("NotesAssistant save_photo: width=\\(photo.width)")
|
|
2606
|
+
self.library.addPhoto(photo)
|
|
2607
|
+
return .ok("photo saved")
|
|
2608
|
+
}
|
|
2609
|
+
}
|
|
2610
|
+
// Wake phrase wiring (handler the autoWake pre-wake path matches).
|
|
2611
|
+
_ = glasses.voice.onPhrase("hey notes") { [weak self] in
|
|
2612
|
+
await self?.session?.wake()
|
|
2613
|
+
}
|
|
2614
|
+
}
|
|
2615
|
+
}`,
|
|
2616
|
+
},
|
|
2617
|
+
explanation: `THE AGENT-SIDE TEST LOOP — what runs in YOUR conversation, NOT in the customer's app
|
|
2618
|
+
|
|
2619
|
+
// 1. Mint or resume the sim. createSimulatorSession is get-or-create —
|
|
2620
|
+
// pass resetFresh:true only when you need a clean event-log slate.
|
|
2621
|
+
const { sessionId } = await createSimulatorSession({
|
|
2622
|
+
glasses: "meta_rayban", platform: "android",
|
|
2623
|
+
});
|
|
2624
|
+
|
|
2625
|
+
// 2. Sweep across N tools in one session. autoWake (default true)
|
|
2626
|
+
// handles the silence-timeout sleep transitions that bite multi-step
|
|
2627
|
+
// tests where the agent spends >30s between injects.
|
|
2628
|
+
const cases = [
|
|
2629
|
+
{ utter: "start taking notes", expect: "start_notes" },
|
|
2630
|
+
{ utter: "save this view", expect: "save_photo" },
|
|
2631
|
+
{ utter: "what am I looking at", expect: "describe_scene" },
|
|
2632
|
+
{ utter: "okay save the notes", expect: "save_notes" },
|
|
2633
|
+
];
|
|
2634
|
+
|
|
2635
|
+
for (const c of cases) {
|
|
2636
|
+
const inj = await injectAssistantUtterance({
|
|
2637
|
+
sessionId,
|
|
2638
|
+
text: c.utter,
|
|
2639
|
+
// autoWake defaults true; wakePhrase defaults "hey elizabeth" —
|
|
2640
|
+
// override for apps with a different wake phrase:
|
|
2641
|
+
wakePhrase: "hey notes",
|
|
2642
|
+
});
|
|
2643
|
+
// inj.autoWake describes what the pre-wake step did:
|
|
2644
|
+
// { action: "skipped_active" } session was up already
|
|
2645
|
+
// { action: "pre_waked", waitedMs: 1843 } re-waked from dormant
|
|
2646
|
+
// { action: "pre_wake_timeout", reason } wake phrase didn't match
|
|
2647
|
+
// — check glasses.voice.onPhrase
|
|
2648
|
+
// registration in the app
|
|
2649
|
+
|
|
2650
|
+
// 3. Assert the tool fired. The cursor anchor (auto since b87388d)
|
|
2651
|
+
// eliminates stale matches from prior runs.
|
|
2652
|
+
const call = await assertToolCalled({
|
|
2653
|
+
sessionId, name: c.expect, timeoutMs: 5000,
|
|
2654
|
+
});
|
|
2655
|
+
console.log(\`\${c.expect}: call_id=\${call.call_id} waited=\${call.waitedMs}ms\`);
|
|
2656
|
+
}
|
|
2657
|
+
|
|
2658
|
+
THE FOUR VERIFICATION CHANNELS — when does an assertion really mean it worked?
|
|
2659
|
+
|
|
2660
|
+
CHANNEL 1 — getEventLog (protocol layer)
|
|
2661
|
+
|
|
2662
|
+
The 8-event assistant.* family captures the full conversation. Anchor a cursor
|
|
2663
|
+
before injecting, then read forward:
|
|
2664
|
+
|
|
2665
|
+
const head = await getEventLog({ sessionId, filter: "voice", limit: 1 });
|
|
2666
|
+
let cur = head.cursor;
|
|
2667
|
+
await injectAssistantUtterance({ sessionId, text: "save this view" });
|
|
2668
|
+
await assertToolCalled({ sessionId, name: "save_photo", timeoutMs: 5000 });
|
|
2669
|
+
const trace = await getEventLog({ sessionId, cursor: cur, filter: "voice" });
|
|
2670
|
+
// Expected event sequence:
|
|
2671
|
+
// assistant.user_spoke transcript="save this view"
|
|
2672
|
+
// assistant.tool_called name="save_photo", call_id
|
|
2673
|
+
// assistant.tool_result call_id, output="photo saved", is_error:false,
|
|
2674
|
+
// duration_ms
|
|
2675
|
+
// assistant.assistant_spoke transcript="<the model's confirmation>"
|
|
2676
|
+
|
|
2677
|
+
CHANNEL 2 — adb logcat (tool-body internals)
|
|
2678
|
+
|
|
2679
|
+
assertToolCalled proves the tool FIRED. It does NOT prove the body finished
|
|
2680
|
+
cleanly. Tool bodies log internals via android.util.Log — file paths, byte
|
|
2681
|
+
counts, branch decisions. Tail logcat with a tag filter to see what the body
|
|
2682
|
+
actually did:
|
|
2683
|
+
|
|
2684
|
+
// Find the adb binary (Windows: Sdk\\platform-tools\\adb.exe; macOS/Linux:
|
|
2685
|
+
// /Users/<you>/Library/Android/sdk/platform-tools/adb on macOS, /home/<you>/Android/Sdk/platform-tools/adb on Linux).
|
|
2686
|
+
adb logcat -d -s "NotesAssistant:I" \\
|
|
2687
|
+
| tail -50
|
|
2688
|
+
// I/NotesAssistant: save_photo: uri=content://media/external/images/... 1920x1080
|
|
2689
|
+
// I/NotesAssistant: save_photo: wrote 487231 bytes to /data/data/<pkg>/files/photos/photo-1716908...
|
|
2690
|
+
|
|
2691
|
+
If Channel 1 fires but Channel 2 is silent past the tool entry log, the body
|
|
2692
|
+
hit a branch that returned early without logging. Often a missing else clause.
|
|
2693
|
+
|
|
2694
|
+
CHANNEL 3 — adb screencap (UI confirmation)
|
|
2695
|
+
|
|
2696
|
+
Run-state changes (badge color, count, list items) reflect the persisted
|
|
2697
|
+
state in the UI. screencap proves the binding observed the new state:
|
|
2698
|
+
|
|
2699
|
+
adb exec-out screencap -p > /tmp/post-save.png
|
|
2700
|
+
// Then Read /tmp/post-save.png — the agent SEES what the user would see.
|
|
2701
|
+
// The Notes app's count badge incremented; the latest photo appears in
|
|
2702
|
+
// the gallery row; the "taking notes" indicator turned off.
|
|
2703
|
+
|
|
2704
|
+
Channel 3 catches UI-binding bugs: the tool wrote to the DB but the UI didn't
|
|
2705
|
+
recompose. Often a missing StateFlow.emit / notifyDataSetChanged / SwiftUI
|
|
2706
|
+
@Published.
|
|
2707
|
+
|
|
2708
|
+
CHANNEL 4 — library-state inspection (persistence)
|
|
2709
|
+
|
|
2710
|
+
Tools writing to local storage leave a durable artifact. Pull + inspect:
|
|
2711
|
+
|
|
2712
|
+
// Photos directory
|
|
2713
|
+
adb exec-out run-as <pkg> ls -la files/photos/ | tail -5
|
|
2714
|
+
// -rw------- 1 u0_a234 u0_a234 487231 2026-05-27 21:14 photo-1716908143421.jpg
|
|
2715
|
+
|
|
2716
|
+
// Room DB (always pull .db + .db-wal + .db-shm together — WAL is mandatory)
|
|
2717
|
+
adb exec-out run-as <pkg> cat databases/notes.db > /tmp/local.db
|
|
2718
|
+
adb exec-out run-as <pkg> cat databases/notes.db-wal > /tmp/local.db-wal
|
|
2719
|
+
adb exec-out run-as <pkg> cat databases/notes.db-shm > /tmp/local.db-shm
|
|
2720
|
+
sqlite3 /tmp/local.db "SELECT id, text FROM notes ORDER BY id DESC LIMIT 3"
|
|
2721
|
+
|
|
2722
|
+
Channel 4 catches persistence bugs: the tool succeeded but the write never
|
|
2723
|
+
hit storage. Common when a developer comments out the DB insert while
|
|
2724
|
+
debugging the speak() side and forgets to restore it.
|
|
2725
|
+
|
|
2726
|
+
WHEN THE CHANNELS DISAGREE
|
|
2727
|
+
|
|
2728
|
+
Channels 1+2+3+4 agree → flow works end-to-end
|
|
2729
|
+
Channel 1 fires, Channel 2 silent → body returned early; missing branch log
|
|
2730
|
+
Channels 1+2 fire, Channel 3 unchanged → UI binding didn't fire; missing emit
|
|
2731
|
+
Channels 1-3 fire, Channel 4 missing → body wrote to transient state; never persisted
|
|
2732
|
+
|
|
2733
|
+
Each disagreement points at a specific layer. With only Channel 1 (which is
|
|
2734
|
+
what assertToolCalled gives you in isolation), half these failures look
|
|
2735
|
+
identical to a passing test.
|
|
2736
|
+
|
|
2737
|
+
REAL OPENAI VS MOCK — when to pick which
|
|
2738
|
+
|
|
2739
|
+
Mock sub-ms, deterministic, $0. Word-overlap-matches tool
|
|
2740
|
+
descriptions. Use for CI / tight inner loop / regression
|
|
2741
|
+
sweeps.
|
|
2742
|
+
OpenAi 500-2000ms per inject (2000-4000ms for vision tools), ~$0.005, real model reasoning, real
|
|
2743
|
+
tool routing under the model's actual instructions. Use for
|
|
2744
|
+
confidence-building before a release; for catching cases
|
|
2745
|
+
where the model picks the WRONG tool when descriptions
|
|
2746
|
+
overlap; for vision tests (includeImage); for confirming
|
|
2747
|
+
the model speaks a sensible confirmation back.
|
|
2748
|
+
|
|
2749
|
+
Same injectAssistantUtterance call works against both — \`AssistantProvider.OpenAi\`
|
|
2750
|
+
vs \`AssistantProvider.Mock()\` at session creation is the entire switch.
|
|
2751
|
+
F13: never bake a \`useMock\` Boolean into handler code; wire it via a build
|
|
2752
|
+
flavor so the choice is visible in the AssistantStatusBadge (glasses-ui) +
|
|
2753
|
+
the sim's waveform color so dev never confuses which mode is running.
|
|
2754
|
+
|
|
2755
|
+
WAKING THE SESSION — autoWake (iter5.3) replaced the old two-step dance
|
|
2756
|
+
|
|
2757
|
+
Pre-iter5.3, agents had to manually inject the wake phrase before each
|
|
2758
|
+
assistant inject if the session might be dormant:
|
|
2759
|
+
|
|
2760
|
+
// OLD pattern — no longer required
|
|
2761
|
+
await injectTranscript({ sessionId, text: "hey notes" });
|
|
2762
|
+
await sleep(3000);
|
|
2763
|
+
await injectAssistantUtterance({ sessionId, text: "save this view" });
|
|
2764
|
+
|
|
2765
|
+
Now autoWake handles it transparently — default true, only fires when the
|
|
2766
|
+
event-log scan shows the session isn't currently Active:
|
|
2767
|
+
|
|
2768
|
+
await injectAssistantUtterance({
|
|
2769
|
+
sessionId, text: "save this view", wakePhrase: "hey notes",
|
|
2770
|
+
});
|
|
2771
|
+
// If session was Active: skipped (one ~50ms events query, no wake roundtrip)
|
|
2772
|
+
// If session was Dormant: pre-injects "hey notes", waits ~1-2s for
|
|
2773
|
+
// assistant.session_started, then dispatches.
|
|
2774
|
+
|
|
2775
|
+
Set \`autoWake: false\` only when you want to ASSERT the dormant-drop behavior
|
|
2776
|
+
itself, or when you're driving wake manually with custom timing.
|
|
2777
|
+
|
|
2778
|
+
CAMERA-USING TOOLS NEED A LIVE SIM BROWSER TAB
|
|
2779
|
+
|
|
2780
|
+
Tools that call glasses.camera.capturePhoto / captureVideo need the SIM
|
|
2781
|
+
BROWSER TAB OPEN — that's where camera input streams live in the simulator.
|
|
2782
|
+
Without it, the tool body returns Err("camera failed") and the Channel 1
|
|
2783
|
+
event log shows tool_result.is_error=true.
|
|
2784
|
+
|
|
2785
|
+
Open the sim browser tab at the start of your session:
|
|
2786
|
+
|
|
2787
|
+
// Windows
|
|
2788
|
+
await sh(\`cmd /c start https://extentos.com/s/\${sessionId}\`);
|
|
2789
|
+
// macOS
|
|
2790
|
+
await sh(\`open https://extentos.com/s/\${sessionId}\`);
|
|
2791
|
+
// Then poll until ready
|
|
2792
|
+
while (true) {
|
|
2793
|
+
const s = await getSimulatorStatus({ sessionId });
|
|
2794
|
+
if (s.connectedRoles?.browser) break;
|
|
2795
|
+
await sleep(500);
|
|
2796
|
+
}
|
|
2797
|
+
|
|
2798
|
+
For headless CI: camera-tool verification is blocked today. Non-camera tools
|
|
2799
|
+
(state toggles, persistence, AI calls) work headless. A future
|
|
2800
|
+
\`ensureSimulatorBrowser\` MCP tool would lift this for cross-platform CI —
|
|
2801
|
+
see project_iter4_sim_agent_discovery in shared-context.`,
|
|
2802
|
+
gotchas: [
|
|
2803
|
+
"**autoWake's pre-wake adds ~1-2s on dormant sessions; ~50ms on active ones.** The handler now scans recent events to detect state; if assistant.went_dormant or assistant.session_ended is more recent than any other assistant.* signal, it pre-injects the wake phrase + waits for assistant.session_started before dispatching. Active sessions skip the round trip entirely. To opt out (e.g. when asserting the dormant-drop behavior itself), pass `autoWake: false`. The response's `autoWake` block reports exactly what happened — read it on the first few injects of a new session to confirm the wake phrase is correct.",
|
|
2804
|
+
"**wakePhrase defaults to \"hey elizabeth\" — pass `wakePhrase` explicitly for other apps.** The default matches the canonical dogfood pattern. When the phrase doesn't match what the customer registered via `glasses.voice.onPhrase(...) { session.wake() }`, autoWake reports `action: \"pre_wake_timeout\"` with a diagnostic message + still dispatches the inject (which then drops since the session is still dormant). Fix the phrase + retry.",
|
|
2805
|
+
"**assertToolCalled now cursor-anchors at \"now\" (b87388d).** Pre-fix, a no-cursor first poll returned the OLDEST 200 events — any matching assistant.tool_called from yesterday's testing satisfied immediately with a stale call_id + bogus waitedMs (e.g. 400ms reported on an OpenAi call that physically takes 1-2s). Now: a one-time limit=1 anchor before polling guarantees only events arriving AFTER the call match. No schema change for callers.",
|
|
2806
|
+
"**Camera-tool tests need the sim browser tab open. There's no automated workaround today.** save_photo / describe_scene / capture_video and any tool body that calls glasses.camera.* will return Err(\"camera failed\") without `connectedRoles.browser` being non-null. Open the tab once at the session start (commands above) and poll getSimulatorStatus until ready. Future: an `ensureSimulatorBrowser` MCP tool will collapse this into one call.",
|
|
2807
|
+
"**adb logcat needs the adb binary in your PATH or full path.** On Windows it's at `C:/Users/<you>/AppData/Local/Android/Sdk/platform-tools/adb.exe`; on macOS at `~/Library/Android/sdk/platform-tools/adb`. The MCP server runs on YOUR machine, so cloud-hosted agents can't reach the emulator — Channel 2/3/4 are local-only. Headless CI agents can still do Channel 1 (getEventLog is the only protocol-level channel).",
|
|
2808
|
+
"**WAL caveat (Android Room): always pull .db + .db-wal + .db-shm together.** Room writes in WAL mode by default. Pulling only the .db file gives a stale snapshot — recent writes live in .db-wal. The agent's first DB-read returning zero rows almost always means missing .db-wal. iOS / GRDB doesn't have this — GRDB ships with WAL enabled but the pulled DB self-checkpoints on close.",
|
|
2809
|
+
"**Don't mint a fresh sim per test.** createSimulatorSession is get-or-create — the same sim resumes across test runs. Use `resetFresh: true` ONLY when you need a clean event-log slate; otherwise rebuild + reinstall the app + the library reattaches automatically. The autoWake state-detection scans the latest 200 events, so a long-running sim with thousands of events still resolves state correctly (the lifecycle markers are usually well within the window).",
|
|
2810
|
+
"**OpenAi inject latency varies by tool body cost.** State-toggle tools (start_notes / save_notes) dispatch in 500-1000ms; camera+vision tools (describe_scene with includeImage) take 2000-4000ms (camera + image upload + model reasoning). Budget `timeoutMs: 5000` for the former + `timeoutMs: 10000` for vision tools. Mock dispatches in <50ms for everything.",
|
|
2811
|
+
"**The same `injectAssistantUtterance(text)` MCP call drives BOTH Mock and OpenAi.** The runtime picks based on `AssistantProvider.OpenAi(...)` vs `AssistantProvider.Mock()` at session creation. Wire the choice via a build flavor (USE_MOCK_ASSISTANT BuildConfig) per F13 — never bake `useMock` into handler code. CI runs the Mock flavor for the inner loop; pre-release smoke runs the OpenAi flavor for the real-model confidence pass.",
|
|
2812
|
+
"**Pull the JS-side test driver into your CI as a script, not inside the agent.** The agent-driven workflow above is great for inner-loop scaffolding, but for repeated CI runs commit the test code as a separate `.test.ts` / `.test.kt` file in the host app's repo. Use `vitest` (TS) or `kotlinx.coroutines.test.runTest` (Kotlin instrumented test) to run it. The MCP tool calls are still issued (the test driver calls Extentos backend APIs directly using @extentos/mcp-server's exported helpers, or via the MCP HTTP transport).",
|
|
2235
2813
|
],
|
|
2236
|
-
relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "
|
|
2814
|
+
relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_session_runtime", "assistant_vision"],
|
|
2237
2815
|
};
|
|
2238
2816
|
export const CODE_EXAMPLES = {
|
|
2239
2817
|
voice_qa_assistant: VOICE_QA_ASSISTANT,
|
|
@@ -2245,6 +2823,7 @@ export const CODE_EXAMPLES = {
|
|
|
2245
2823
|
byok_anthropic: BYOK_ANTHROPIC,
|
|
2246
2824
|
agent_test_loop: AGENT_TEST_LOOP,
|
|
2247
2825
|
assistant_agent_loop: ASSISTANT_AGENT_LOOP,
|
|
2826
|
+
agent_driven_e2e_full_loop: AGENT_DRIVEN_E2E_FULL_LOOP,
|
|
2248
2827
|
conversation_agent_loop: CONVERSATION_AGENT_LOOP,
|
|
2249
2828
|
};
|
|
2250
2829
|
export const CODE_EXAMPLE_PATTERNS = Object.keys(CODE_EXAMPLES).sort();
|