@extentos/mcp-server 0.0.94 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/tools/data/capabilities.d.ts +49 -0
- package/dist/tools/data/capabilities.d.ts.map +1 -1
- package/dist/tools/data/capabilities.js +37 -0
- package/dist/tools/data/capabilities.js.map +1 -1
- package/dist/tools/data/capabilityPatterns.d.ts.map +1 -1
- package/dist/tools/data/capabilityPatterns.js +499 -6
- package/dist/tools/data/capabilityPatterns.js.map +1 -1
- package/dist/tools/data/codeExamples.d.ts.map +1 -1
- package/dist/tools/data/codeExamples.js +916 -2
- package/dist/tools/data/codeExamples.js.map +1 -1
- package/dist/tools/data/version.d.ts +12 -0
- package/dist/tools/data/version.d.ts.map +1 -1
- package/dist/tools/data/version.js +17 -0
- package/dist/tools/data/version.js.map +1 -1
- package/dist/tools/definitions.d.ts.map +1 -1
- package/dist/tools/definitions.js +63 -3
- package/dist/tools/definitions.js.map +1 -1
- package/dist/tools/docs/index.d.ts.map +1 -1
- package/dist/tools/docs/index.js +115 -4
- package/dist/tools/docs/index.js.map +1 -1
- package/dist/tools/handlers/assertToolCalled.d.ts +3 -0
- package/dist/tools/handlers/assertToolCalled.d.ts.map +1 -0
- package/dist/tools/handlers/assertToolCalled.js +208 -0
- package/dist/tools/handlers/assertToolCalled.js.map +1 -0
- package/dist/tools/handlers/generateConnectionModule.js +21 -2
- package/dist/tools/handlers/generateConnectionModule.js.map +1 -1
- package/dist/tools/handlers/getCredentialGuide.d.ts.map +1 -1
- package/dist/tools/handlers/getCredentialGuide.js +33 -6
- package/dist/tools/handlers/getCredentialGuide.js.map +1 -1
- package/dist/tools/handlers/getProductionChecklist.d.ts.map +1 -1
- package/dist/tools/handlers/getProductionChecklist.js +52 -1
- package/dist/tools/handlers/getProductionChecklist.js.map +1 -1
- package/dist/tools/handlers/injectAssistantUtterance.d.ts +3 -0
- package/dist/tools/handlers/injectAssistantUtterance.d.ts.map +1 -0
- package/dist/tools/handlers/injectAssistantUtterance.js +378 -0
- package/dist/tools/handlers/injectAssistantUtterance.js.map +1 -0
- package/dist/tools/handlers/validateIntegration.d.ts.map +1 -1
- package/dist/tools/handlers/validateIntegration.js +67 -0
- package/dist/tools/handlers/validateIntegration.js.map +1 -1
- package/dist/tools/registry.d.ts.map +1 -1
- package/dist/tools/registry.js +4 -0
- package/dist/tools/registry.js.map +1 -1
- package/package.json +1 -1
|
@@ -1604,8 +1604,8 @@ The agent reads each \`speak\` event's \`details.text\` — that is what the app
|
|
|
1604
1604
|
};
|
|
1605
1605
|
const CONVERSATION_AGENT_LOOP = {
|
|
1606
1606
|
pattern: "conversation_agent_loop",
|
|
1607
|
-
title: "Phase 3 conversation runtime
|
|
1608
|
-
description: "
|
|
1607
|
+
title: "LEGACY Phase 3 conversation runtime (DEPRECATED in v1.4.0; removed in v2.0.0 — use assistant_agent_loop for new apps)",
|
|
1608
|
+
description: "**LEGACY — DEPRECATED in v1.4.0; removed entirely in v2.0.0.** For new voice-assistant work use `assistant_agent_loop` (Phase 4 `glasses.assistant.start { tool(...) { ... } }`). The Phase 4 customer surface drops ~60% LoC vs this pattern, ships zero on-device ML (no ONNX models to bundle), and lets the model decide which tool to call instead of the customer hand-writing `when (turn.text)` keyword routing. This Phase 3 pattern stays in the catalog for the v1.4.0 → v2.0.0 migration window (3-month deprecation period) — existing apps continue to work unchanged. See `searchDocs(topic:'assistant_runtime')` migration section for the side-by-side Phase 3 → Phase 4 walkthrough. The Phase 3 `glasses.conversation.onWake { listen() / speak() / ai.complete() }` API composes VAD + STT + Smart Turn + TTS + BYOK LLM in the shared Rust core; the agent loop here drives wake + follow-up utterances via `injectTranscript` and asserts symmetric `conversation.*` event pairs.",
|
|
1609
1609
|
code: {
|
|
1610
1610
|
kotlin: `// ── Application bootstrap (do once, typically in Application.onCreate
|
|
1611
1611
|
// after RECORD_AUDIO is granted) ────────────────────────────────────
|
|
@@ -1901,6 +1901,918 @@ browser side-by-side with this agent loop and watch the panel react.`,
|
|
|
1901
1901
|
],
|
|
1902
1902
|
relatedFeatures: ["voice_command", "speak", "transcription_incremental", "ai_call", "smart_turn_eou"],
|
|
1903
1903
|
};
|
|
1904
|
+
const ASSISTANT_AGENT_LOOP = {
|
|
1905
|
+
pattern: "assistant_agent_loop",
|
|
1906
|
+
title: "Phase 4 assistant runtime + agent-driven E2E loop (canonical voice-AI for new apps)",
|
|
1907
|
+
description: "**Canonical Phase 4 voice-AI pattern. Start here for any new voice assistant work on Extentos.** The customer code is one block: `glasses.assistant.start(provider) { tool(name, description) { body -> ToolResult } }`. The model owns wake detection, turn taking, intent parsing, and confirmation speech — the customer only writes tool bodies that act on the app's own state (route data, app DB, camera, library). Provider abstraction wraps OpenAI Realtime in v1; Gemini Live follows. **As of iter5.2 (2026-05-27) `injectAssistantUtterance(text)` drives BOTH the Mock provider AND the real OpenAi Realtime provider** — same MCP call, same code, real model reasoning when you want it. iter5.3 (2026-05-27) added autoWake so multi-step agent tests don't drop injects across silence-timeout sleep transitions. **For the dedicated agent-driven test workflow (4-channel verification: getEventLog + adb logcat + screencap + library-state), see `getCodeExample(pattern:\"agent_driven_e2e_full_loop\")`.** This pattern stays focused on the handler-side code. **Replaces `conversation_agent_loop` (Phase 3 cascaded VAD+STT+SmartTurn+TTS+LLM) which is deprecated in v1.4.0 and removed in v2.0.0.** Customer code typically drops ~60% LoC vs Phase 3.",
|
|
1908
|
+
code: {
|
|
1909
|
+
kotlin: `// ── App bootstrap ───────────────────────────────────────────────────
|
|
1910
|
+
// Run once during app startup, after RECORD_AUDIO is granted. The
|
|
1911
|
+
// returned ExtentosGlasses instance is the single handle for everything;
|
|
1912
|
+
// keep it on your Application subclass (or a singleton container) so
|
|
1913
|
+
// the rest of your app can reach it.
|
|
1914
|
+
//
|
|
1915
|
+
// createGlasses is suspend — call it from a coroutine:
|
|
1916
|
+
//
|
|
1917
|
+
// class MyApp : Application() {
|
|
1918
|
+
// lateinit var glasses: ExtentosGlasses
|
|
1919
|
+
// override fun onCreate() {
|
|
1920
|
+
// super.onCreate()
|
|
1921
|
+
// CoroutineScope(Dispatchers.Main).launch {
|
|
1922
|
+
// glasses = createGlasses(this@MyApp)
|
|
1923
|
+
// // Wire your assistant handler here (see below)
|
|
1924
|
+
// }
|
|
1925
|
+
// }
|
|
1926
|
+
// }
|
|
1927
|
+
//
|
|
1928
|
+
// No ONNX models, no model paths, no cascaded options — Phase 4 ships
|
|
1929
|
+
// end-to-end via the provider's WebSocket. glasses.assistant is
|
|
1930
|
+
// always-on (no opt-in conversationOptions needed).
|
|
1931
|
+
import com.extentos.glasses.core.CaptureError
|
|
1932
|
+
import com.extentos.glasses.core.ExtentosConfig
|
|
1933
|
+
import com.extentos.glasses.core.ExtentosGlasses
|
|
1934
|
+
import com.extentos.glasses.core.ExtentosResult
|
|
1935
|
+
import com.extentos.glasses.core.RuntimeEvent
|
|
1936
|
+
import com.extentos.glasses.core.VideoClip
|
|
1937
|
+
import com.extentos.glasses.core.VideoConfig
|
|
1938
|
+
import com.extentos.glasses.core.assistant.AssistantEvent
|
|
1939
|
+
import com.extentos.glasses.core.assistant.AssistantProvider
|
|
1940
|
+
import com.extentos.glasses.core.assistant.AssistantSession
|
|
1941
|
+
import com.extentos.glasses.core.assistant.ToolResult
|
|
1942
|
+
import com.extentos.glasses.core.assistant.tool
|
|
1943
|
+
import com.extentos.glasses.core.valueOrNull
|
|
1944
|
+
import kotlinx.coroutines.CoroutineScope
|
|
1945
|
+
import kotlinx.coroutines.Deferred
|
|
1946
|
+
import kotlinx.coroutines.Dispatchers
|
|
1947
|
+
import kotlinx.coroutines.SupervisorJob
|
|
1948
|
+
import kotlinx.coroutines.async
|
|
1949
|
+
import kotlinx.coroutines.flow.filterIsInstance
|
|
1950
|
+
import kotlinx.coroutines.flow.launchIn
|
|
1951
|
+
import kotlinx.coroutines.flow.onEach
|
|
1952
|
+
import kotlinx.coroutines.launch
|
|
1953
|
+
import kotlin.time.Duration.Companion.seconds
|
|
1954
|
+
|
|
1955
|
+
suspend fun createGlasses(context: android.content.Context): ExtentosGlasses {
|
|
1956
|
+
return ExtentosGlasses.create(
|
|
1957
|
+
ExtentosConfig(applicationContext = context)
|
|
1958
|
+
).also { glasses ->
|
|
1959
|
+
// BYOK OpenAI key — see getCredentialGuide(service:"openai") for
|
|
1960
|
+
// the local.properties + BuildConfig plumbing. Key flows direct
|
|
1961
|
+
// from device → api.openai.com via WS Authorization header;
|
|
1962
|
+
// Extentos backend never sees it.
|
|
1963
|
+
//
|
|
1964
|
+
// **F-meta-2 warning:** do NOT have your AI agent write the
|
|
1965
|
+
// OPENAI_API_KEY value into local.properties — agent file edits
|
|
1966
|
+
// leak through the conversation transcript. Add the line yourself.
|
|
1967
|
+
glasses.assistant.setOpenaiApiKey(BuildConfig.OPENAI_API_KEY)
|
|
1968
|
+
}
|
|
1969
|
+
}
|
|
1970
|
+
|
|
1971
|
+
// ── Handler — Strava-style example ─────────────────────────────────────
|
|
1972
|
+
//
|
|
1973
|
+
// Phase 4 wake/sleep state machine (F12): the session starts Dormant
|
|
1974
|
+
// (zero token spend), the developer picks the wake mechanism (voice
|
|
1975
|
+
// phrase via glasses.voice.onPhrase, button tap, gesture, MCP call),
|
|
1976
|
+
// and the model decides when to end the conversation via the built-in
|
|
1977
|
+
// end_conversation tool (endOnIntent default true) — no rigid
|
|
1978
|
+
// "goodbye <name>" phrase required. sleepAfterSilence is the
|
|
1979
|
+
// deterministic backup. onWake { say(...) } speaks in the assistant's
|
|
1980
|
+
// own voice (alloy etc.) so the greeting matches the AI's reply voice.
|
|
1981
|
+
|
|
1982
|
+
class StravaAssistantHandler(
|
|
1983
|
+
private val glasses: ExtentosGlasses,
|
|
1984
|
+
private val routeTracker: RouteTracker, // app-internal state
|
|
1985
|
+
private val library: ClipLibrary, // app-internal state
|
|
1986
|
+
private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
|
|
1987
|
+
) {
|
|
1988
|
+
private var session: AssistantSession? = null
|
|
1989
|
+
|
|
1990
|
+
// Natural async + await + stop pattern. The library exposes
|
|
1991
|
+
// glasses.camera.stopVideo() so the customer never has to cancel
|
|
1992
|
+
// the wrapping Deferred — captureVideo() resumes naturally with
|
|
1993
|
+
// Ok(partial) when stop is signalled.
|
|
1994
|
+
private var activeVideo: Deferred<ExtentosResult<VideoClip, CaptureError>>? = null
|
|
1995
|
+
|
|
1996
|
+
@Volatile private var notesActive: Boolean = false
|
|
1997
|
+
private val notesBuffer = StringBuilder()
|
|
1998
|
+
|
|
1999
|
+
fun start() {
|
|
2000
|
+
// F3 (iteration-2 fix): subscribe to AssistantEvent.UserSpoke for
|
|
2001
|
+
// transcript capture — needed for notes / live-captions / journal
|
|
2002
|
+
// patterns. Fires once per user turn after the provider's STT
|
|
2003
|
+
// completes. PII boundary: text is verbatim, customer-owned.
|
|
2004
|
+
glasses.runtime.events
|
|
2005
|
+
.filterIsInstance<RuntimeEvent.Assistant>()
|
|
2006
|
+
.onEach { evt ->
|
|
2007
|
+
val userSpoke = evt.event as? AssistantEvent.UserSpoke ?: return@onEach
|
|
2008
|
+
if (notesActive) {
|
|
2009
|
+
synchronized(notesBuffer) {
|
|
2010
|
+
if (notesBuffer.isNotEmpty()) notesBuffer.append(" ")
|
|
2011
|
+
notesBuffer.append(userSpoke.transcript)
|
|
2012
|
+
}
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
.launchIn(scope)
|
|
2016
|
+
|
|
2017
|
+
scope.launch {
|
|
2018
|
+
session = glasses.assistant.start(
|
|
2019
|
+
provider = AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy"),
|
|
2020
|
+
) {
|
|
2021
|
+
instructions = """
|
|
2022
|
+
You are a Strava companion. Help the runner with route
|
|
2023
|
+
stats and capture moments. Speak briefly — they're
|
|
2024
|
+
running. Don't narrate what you're doing — just do it
|
|
2025
|
+
and confirm. When the user clearly indicates they want
|
|
2026
|
+
to stop talking, call end_conversation.
|
|
2027
|
+
""".trimIndent()
|
|
2028
|
+
|
|
2029
|
+
// Phase 4 v1.1 lifecycle:
|
|
2030
|
+
// - session lands in Dormant after start() (default)
|
|
2031
|
+
// - wake via the voice phrase wired below (any other
|
|
2032
|
+
// trigger — button, gesture, MCP — works the same)
|
|
2033
|
+
// - onWake speaks the greeting in the model's voice
|
|
2034
|
+
// - 30 s of user silence → auto-sleep (backup)
|
|
2035
|
+
// - end_conversation tool registered automatically by
|
|
2036
|
+
// the library because endOnIntent defaults true
|
|
2037
|
+
onWake { say("Hi, I'm here. What can I do for you?") }
|
|
2038
|
+
sleepAfterSilence(30.seconds)
|
|
2039
|
+
|
|
2040
|
+
// Read-tools: instant data the AI reads aloud or weaves
|
|
2041
|
+
// into the answer.
|
|
2042
|
+
tool("get_route_remaining", "How much of the planned route is left, in km.") {
|
|
2043
|
+
ToolResult.Ok("\${routeTracker.kmRemaining()} km remaining")
|
|
2044
|
+
}
|
|
2045
|
+
tool("get_average_pace", "Current average pace in minutes per km.") {
|
|
2046
|
+
ToolResult.Ok("\${routeTracker.avgPaceMinKm()} min per km")
|
|
2047
|
+
}
|
|
2048
|
+
tool("get_route_length", "Total planned route length in km.") {
|
|
2049
|
+
ToolResult.Ok("\${routeTracker.totalKm} km")
|
|
2050
|
+
}
|
|
2051
|
+
|
|
2052
|
+
// Action-tools: side effects on the app's own state. The
|
|
2053
|
+
// AI manages the take/stop pair from context.
|
|
2054
|
+
tool("take_video", "Start recording a video clip of the runner's view.") {
|
|
2055
|
+
if (activeVideo?.isActive == true) {
|
|
2056
|
+
return@tool ToolResult.Err("a recording is already in progress")
|
|
2057
|
+
}
|
|
2058
|
+
activeVideo = scope.async {
|
|
2059
|
+
glasses.camera.captureVideo(VideoConfig(maxDurationSeconds = 30))
|
|
2060
|
+
}
|
|
2061
|
+
ToolResult.Ok("recording started")
|
|
2062
|
+
}
|
|
2063
|
+
tool("stop_video", "Stop the current video recording.") {
|
|
2064
|
+
// Clean stop pattern: signal the library to gracefully
|
|
2065
|
+
// end the capture; the in-flight captureVideo() inside
|
|
2066
|
+
// the \`async\` block resumes naturally with
|
|
2067
|
+
// ExtentosResult.Ok(partialClip). No coroutine
|
|
2068
|
+
// cancellation, no sticky-Cancelled Deferred — just
|
|
2069
|
+
// a normal await on the result.
|
|
2070
|
+
//
|
|
2071
|
+
// (Don't be tempted to call activeVideo.cancel() to
|
|
2072
|
+
// stop — Kotlin Deferred state is sticky-Cancelled,
|
|
2073
|
+
// so await() throws CancellationException even if the
|
|
2074
|
+
// library produced a partial. Use stopVideo() instead.)
|
|
2075
|
+
val capture = activeVideo
|
|
2076
|
+
activeVideo = null
|
|
2077
|
+
if (capture == null) return@tool ToolResult.Err("nothing was recording")
|
|
2078
|
+
glasses.camera.stopVideo()
|
|
2079
|
+
val result = capture.await()
|
|
2080
|
+
val clip = result.valueOrNull() ?: return@tool ToolResult.Err("video capture failed")
|
|
2081
|
+
library.add(clip)
|
|
2082
|
+
ToolResult.Ok("video saved")
|
|
2083
|
+
}
|
|
2084
|
+
|
|
2085
|
+
tool("start_notes", "Start capturing what the runner says as a note.") {
|
|
2086
|
+
if (notesActive) return@tool ToolResult.Err("already taking notes")
|
|
2087
|
+
synchronized(notesBuffer) { notesBuffer.setLength(0) }
|
|
2088
|
+
notesActive = true
|
|
2089
|
+
ToolResult.Ok("ok, taking notes")
|
|
2090
|
+
}
|
|
2091
|
+
tool("stop_notes", "Stop note-taking and save the accumulated notes.") {
|
|
2092
|
+
if (!notesActive) return@tool ToolResult.Err("not currently taking notes")
|
|
2093
|
+
notesActive = false
|
|
2094
|
+
val text = synchronized(notesBuffer) {
|
|
2095
|
+
val s = notesBuffer.toString()
|
|
2096
|
+
notesBuffer.setLength(0)
|
|
2097
|
+
s
|
|
2098
|
+
}
|
|
2099
|
+
if (text.isBlank()) return@tool ToolResult.Err("nothing was captured")
|
|
2100
|
+
library.addNote(text)
|
|
2101
|
+
ToolResult.Ok("notes saved")
|
|
2102
|
+
}
|
|
2103
|
+
|
|
2104
|
+
// Vision tools — Phase 4 v1.4 includeImage + Photos.copyToFile.
|
|
2105
|
+
//
|
|
2106
|
+
// Two distinct tools, NOT one. The model picks based on
|
|
2107
|
+
// user intent:
|
|
2108
|
+
// describe_scene — "what / describe / tell me" → AI speaks
|
|
2109
|
+
// about the photo; photo NOT persisted.
|
|
2110
|
+
// save_photo — "save / capture / take a picture / remember"
|
|
2111
|
+
// → photo persisted to library; NOT described.
|
|
2112
|
+
//
|
|
2113
|
+
// Splitting lets the model call BOTH back-to-back when the
|
|
2114
|
+
// user wants both ("save this and tell me what it is"), and
|
|
2115
|
+
// keeps each description tight enough for the Mock + real
|
|
2116
|
+
// matchers to disambiguate. See getCapabilityGuide(feature:"assistant_vision")
|
|
2117
|
+
// gotcha about describe-vs-save.
|
|
2118
|
+
tool("describe_scene", "Describe what the runner is currently looking at without saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
|
|
2119
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2120
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2121
|
+
val uri = photo.uri
|
|
2122
|
+
?: return@tool ToolResult.Err("photo had no uri")
|
|
2123
|
+
// session is the AssistantSession returned by start();
|
|
2124
|
+
// captured here via the outer-class property. By the
|
|
2125
|
+
// time a tool dispatches, the session is Active so
|
|
2126
|
+
// includeImage won't throw NotReady.
|
|
2127
|
+
session?.includeImage(uri)
|
|
2128
|
+
ToolResult.Ok("looking")
|
|
2129
|
+
}
|
|
2130
|
+
tool("save_photo", "Save a photo to the runner's library WITHOUT describing it. Call for 'save this' / 'capture this' / 'take a picture' / 'remember this view'.") {
|
|
2131
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2132
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2133
|
+
val uri = photo.uri
|
|
2134
|
+
?: return@tool ToolResult.Err("photo had no uri")
|
|
2135
|
+
val ext = when (Photos.mediaTypeFromUri(uri)) {
|
|
2136
|
+
"image/png" -> "png"
|
|
2137
|
+
"image/webp" -> "webp"
|
|
2138
|
+
else -> "jpg"
|
|
2139
|
+
}
|
|
2140
|
+
val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.\$ext")
|
|
2141
|
+
// Photos.copyToFile mirrors Videos.copyToFile: stream-copy
|
|
2142
|
+
// across data: / file:// / absolute path, parent mkdirs,
|
|
2143
|
+
// overwrite, returns Boolean. Use it — don't hand-roll
|
|
2144
|
+
// loadBytes + writeBytes.
|
|
2145
|
+
if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("could not save photo")
|
|
2146
|
+
library.addPhoto(dst.absolutePath, photo.width, photo.height)
|
|
2147
|
+
ToolResult.Ok("photo saved")
|
|
2148
|
+
}
|
|
2149
|
+
}
|
|
2150
|
+
|
|
2151
|
+
// Wake mechanism — canonical pattern uses the existing
|
|
2152
|
+
// glasses.voice.onPhrase system. Defaults to firesWhen =
|
|
2153
|
+
// VoiceScope.WhenDormant so it won't double-fire during an
|
|
2154
|
+
// active conversation. Swap this line for a button onClick,
|
|
2155
|
+
// a gesture handler, or any other trigger that calls
|
|
2156
|
+
// session?.wake() — the library doesn't dictate the
|
|
2157
|
+
// mechanism, only the lifecycle.
|
|
2158
|
+
glasses.voice.onPhrase("hey strava") { session?.wake() }
|
|
2159
|
+
}
|
|
2160
|
+
}
|
|
2161
|
+
}
|
|
2162
|
+
|
|
2163
|
+
// ── Raw form — equivalent, for programmatic construction (tools loaded
|
|
2164
|
+
// from config, conditional registration). Customer-can-skip-it: the
|
|
2165
|
+
// trailing-lambda builder above reduces to this. ────────────────────
|
|
2166
|
+
|
|
2167
|
+
import com.extentos.glasses.core.assistant.AssistantConfig
|
|
2168
|
+
import com.extentos.glasses.core.assistant.ToolDefinition
|
|
2169
|
+
|
|
2170
|
+
suspend fun startRawForm(glasses: ExtentosGlasses) {
|
|
2171
|
+
val session = glasses.assistant.createSession(
|
|
2172
|
+
AssistantConfig(
|
|
2173
|
+
provider = AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy"),
|
|
2174
|
+
instructions = "You are a helpful assistant.",
|
|
2175
|
+
tools = listOf(
|
|
2176
|
+
ToolDefinition("take_picture", "Take a photo when asked.") { _ ->
|
|
2177
|
+
ToolResult.Ok("photo saved")
|
|
2178
|
+
},
|
|
2179
|
+
),
|
|
2180
|
+
),
|
|
2181
|
+
)
|
|
2182
|
+
session.start()
|
|
2183
|
+
}`,
|
|
2184
|
+
swift: `// ⚠️ iOS PARITY IN FLIGHT
|
|
2185
|
+
// The Phase 4 wake/sleep state machine (session.wake() / sleep() / say() /
|
|
2186
|
+
// onWake / sleepAfterSilence / endOnIntent / VoiceScope.WhenDormant) is
|
|
2187
|
+
// LIVE in the Android library + verified in the sim. The Swift port is
|
|
2188
|
+
// pending — see shared-context/real-hardware-validation-backlog.md B1
|
|
2189
|
+
// for status. This Swift block reflects the Phase 4 ergonomics target
|
|
2190
|
+
// but DOES NOT compile against the current iOS library (which still
|
|
2191
|
+
// ships the always-on Phase 4 surface from earlier sprints).
|
|
2192
|
+
//
|
|
2193
|
+
// Until iOS parity ships, iOS apps should use the Phase 3
|
|
2194
|
+
// conversation_agent_loop pattern or wait. Track the Mac VPS handoff in
|
|
2195
|
+
// shared-context/.
|
|
2196
|
+
|
|
2197
|
+
import GlassesCore
|
|
2198
|
+
|
|
2199
|
+
// ── App bootstrap (do once, e.g. in App.init or DI container) ─────────
|
|
2200
|
+
//
|
|
2201
|
+
// Phase 4 is end-to-end via the provider's WebSocket — no ONNX models
|
|
2202
|
+
// to bundle, no cascaded options. glasses.assistant is always-on.
|
|
2203
|
+
// \`Extentos.create(config:)\` is synchronous + non-throwing on iOS.
|
|
2204
|
+
func makeGlasses() -> ExtentosGlasses {
|
|
2205
|
+
let glasses = Extentos.create(config: ExtentosConfig())
|
|
2206
|
+
// BYOK OpenAI key — see getCredentialGuide(service:"openai") for the
|
|
2207
|
+
// Info.plist plumbing. Key flows direct from device → api.openai.com
|
|
2208
|
+
// via WS Authorization header; Extentos backend never sees it.
|
|
2209
|
+
glasses.assistant.setOpenAiApiKey(Secrets.openAiKey)
|
|
2210
|
+
return glasses
|
|
2211
|
+
}
|
|
2212
|
+
|
|
2213
|
+
// ── Handler — Strava-style example ────────────────────────────────────
|
|
2214
|
+
//
|
|
2215
|
+
// Plain final class (not @MainActor) so the @Sendable tool-body
|
|
2216
|
+
// closures can call into the handler without forcing every access
|
|
2217
|
+
// through an actor hop. State mutation (\`activeVideo\`) is single-
|
|
2218
|
+
// dispatch in practice — OpenAi + Mock providers serialize per-call_id
|
|
2219
|
+
// tool dispatch in v1.
|
|
2220
|
+
|
|
2221
|
+
final class StravaAssistantHandler: @unchecked Sendable {
|
|
2222
|
+
private let glasses: ExtentosGlasses
|
|
2223
|
+
private let routeTracker: RouteTracker // app-internal
|
|
2224
|
+
private let library: ClipLibrary // app-internal
|
|
2225
|
+
private var activeVideo: Task<VideoClip?, Never>?
|
|
2226
|
+
|
|
2227
|
+
init(glasses: ExtentosGlasses, routeTracker: RouteTracker, library: ClipLibrary) {
|
|
2228
|
+
self.glasses = glasses
|
|
2229
|
+
self.routeTracker = routeTracker
|
|
2230
|
+
self.library = library
|
|
2231
|
+
}
|
|
2232
|
+
|
|
2233
|
+
func start() async throws {
|
|
2234
|
+
// Sugar form (trailing-closure builder). Returns once the session
|
|
2235
|
+
// reaches Active. Q3 asymmetry: Swift's typed-args overload
|
|
2236
|
+
// requires an explicit \`schema:\` parameter (no Mirror walk on
|
|
2237
|
+
// types — see assistant_tool capability guide).
|
|
2238
|
+
_ = try await glasses.assistant.start(
|
|
2239
|
+
provider: .openAI(model: "gpt-realtime", voice: "alloy")
|
|
2240
|
+
) { config in
|
|
2241
|
+
config.instructions = """
|
|
2242
|
+
You are a Strava companion. Help the runner with route
|
|
2243
|
+
stats and capture moments. Speak briefly — they're
|
|
2244
|
+
running. Don't narrate what you're doing — just do it.
|
|
2245
|
+
"""
|
|
2246
|
+
|
|
2247
|
+
// Read-tools
|
|
2248
|
+
config.tool("get_route_remaining", description: "How much of the planned route is left, in km.") {
|
|
2249
|
+
.ok("\\(self.routeTracker.kmRemaining()) km remaining")
|
|
2250
|
+
}
|
|
2251
|
+
config.tool("get_average_pace", description: "Current average pace in minutes per km.") {
|
|
2252
|
+
.ok("\\(self.routeTracker.avgPaceMinKm()) min per km")
|
|
2253
|
+
}
|
|
2254
|
+
config.tool("get_route_length", description: "Total planned route length in km.") {
|
|
2255
|
+
.ok("\\(self.routeTracker.totalKm) km")
|
|
2256
|
+
}
|
|
2257
|
+
|
|
2258
|
+
// Action-tools — fire-and-forget via Task; stop_video cancels
|
|
2259
|
+
// the activeVideo task + reads its (possibly partial) result.
|
|
2260
|
+
// Camera's captureVideo returns \`ExtentosResult<VideoClip,
|
|
2261
|
+
// CaptureError>\` async — we extract the success branch into
|
|
2262
|
+
// an optional clip via the \`.success\` accessor.
|
|
2263
|
+
config.tool("take_video", description: "Start recording a video clip of the runner's view.") {
|
|
2264
|
+
self.activeVideo = Task {
|
|
2265
|
+
let result = await self.glasses.camera.captureVideo(
|
|
2266
|
+
VideoConfig(maxDurationSeconds: 30)
|
|
2267
|
+
)
|
|
2268
|
+
return result.success
|
|
2269
|
+
}
|
|
2270
|
+
return .ok("recording started")
|
|
2271
|
+
}
|
|
2272
|
+
config.tool("stop_video", description: "Stop the current video recording.") {
|
|
2273
|
+
// Cooperative cancellation: the camera capture finishes
|
|
2274
|
+
// its current chunk and surfaces a partial clip (or
|
|
2275
|
+
// nothing if the cancel landed before any frames).
|
|
2276
|
+
self.activeVideo?.cancel()
|
|
2277
|
+
let clip = await self.activeVideo?.value ?? nil
|
|
2278
|
+
self.activeVideo = nil
|
|
2279
|
+
if let clip {
|
|
2280
|
+
self.library.add(clip)
|
|
2281
|
+
return .ok("video saved")
|
|
2282
|
+
} else {
|
|
2283
|
+
return .err("nothing was recording")
|
|
2284
|
+
}
|
|
2285
|
+
}
|
|
2286
|
+
}
|
|
2287
|
+
}
|
|
2288
|
+
}
|
|
2289
|
+
|
|
2290
|
+
// ── Raw form — equivalent, for programmatic construction ──────────────
|
|
2291
|
+
|
|
2292
|
+
func startRawForm(_ glasses: ExtentosGlasses) async throws {
|
|
2293
|
+
let session = glasses.assistant.createSession(config: AssistantConfig(
|
|
2294
|
+
provider: .openAI(model: "gpt-realtime", voice: "alloy"),
|
|
2295
|
+
instructions: "You are a helpful assistant.",
|
|
2296
|
+
tools: [
|
|
2297
|
+
ToolDefinition(name: "take_picture", description: "Take a photo when asked.") { _ in
|
|
2298
|
+
.ok("photo saved")
|
|
2299
|
+
},
|
|
2300
|
+
]
|
|
2301
|
+
))
|
|
2302
|
+
try await session.start()
|
|
2303
|
+
}`,
|
|
2304
|
+
},
|
|
2305
|
+
explanation: `AGENT-DRIVEN E2E TEST LOOP — RUN AFTER createGlasses + handler.start()
|
|
2306
|
+
|
|
2307
|
+
→ For the dedicated agent-driven workflow with FOUR-channel verification
|
|
2308
|
+
(getEventLog + adb logcat + screencap + library-state inspection), the
|
|
2309
|
+
multi-tool sweep pattern, autoWake details, and headless-CI guidance,
|
|
2310
|
+
see \`getCodeExample(pattern: "agent_driven_e2e_full_loop")\`. The block
|
|
2311
|
+
below is the quickstart; the dedicated example is the production
|
|
2312
|
+
pattern.
|
|
2313
|
+
|
|
2314
|
+
The headless verification: drive the assistant with synthetic utterances,
|
|
2315
|
+
assert the expected tools fire, then read the event log. Two providers
|
|
2316
|
+
satisfy the same agent loop (since iter5.2, the SAME injectAssistantUtterance
|
|
2317
|
+
MCP call drives BOTH):
|
|
2318
|
+
|
|
2319
|
+
- AssistantProvider.Mock — deterministic, sub-millisecond, $0. Word-
|
|
2320
|
+
overlap-matches the injected utterance against tool descriptions
|
|
2321
|
+
and dispatches the first match. Use for CI + tight inner loop.
|
|
2322
|
+
- AssistantProvider.OpenAi — real WebSocket to api.openai.com,
|
|
2323
|
+
real LLM picks the tool, real audio output. Use for real-provider
|
|
2324
|
+
verification — works headless against the real model via
|
|
2325
|
+
injectAssistantUtterance.text since iter5.2 (no mic / no human
|
|
2326
|
+
required). Sim browser tab still required for camera-using tools.
|
|
2327
|
+
|
|
2328
|
+
// Mock-provider path — DON'T bake a useMock Boolean into the handler
|
|
2329
|
+
// (F13: invisible to users; ships looking like OpenAi by default).
|
|
2330
|
+
// Instead, wire the provider choice via a BuildConfig field tied to
|
|
2331
|
+
// a build flavor:
|
|
2332
|
+
//
|
|
2333
|
+
// // app/build.gradle.kts
|
|
2334
|
+
// android {
|
|
2335
|
+
// flavorDimensions += "assistant"
|
|
2336
|
+
// productFlavors {
|
|
2337
|
+
// create("mock") { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "true") }
|
|
2338
|
+
// create("real") { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "false") }
|
|
2339
|
+
// }
|
|
2340
|
+
// }
|
|
2341
|
+
//
|
|
2342
|
+
// // In your handler:
|
|
2343
|
+
// val provider = if (BuildConfig.USE_MOCK_ASSISTANT)
|
|
2344
|
+
// AssistantProvider.Mock() else
|
|
2345
|
+
// AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy")
|
|
2346
|
+
//
|
|
2347
|
+
// CI builds the "mock" flavor for the agent-driven E2E loop below;
|
|
2348
|
+
// production builds the "real" flavor. The provider is visible in the
|
|
2349
|
+
// AssistantStatusBadge (glasses-ui) + the sim's waveform color so
|
|
2350
|
+
// developers never confuse which mode the running app is in.
|
|
2351
|
+
const { sessionId } = await createSimulatorSession({
|
|
2352
|
+
glasses: "meta_rayban", platform: "android",
|
|
2353
|
+
});
|
|
2354
|
+
// Anchor cursor BEFORE inject so a fresh log read starts at 'now'.
|
|
2355
|
+
const head = await getEventLog({ sessionId, filter: "voice", limit: 1 });
|
|
2356
|
+
let cur = head.cursor;
|
|
2357
|
+
|
|
2358
|
+
// 1. Inject the utterance. The text: argument routes to MockAssistantProvider via
|
|
2359
|
+
// the BrowserSim raw-frame observer (S2.W.0 wiring).
|
|
2360
|
+
await injectAssistantUtterance({ sessionId, text: "take a picture" });
|
|
2361
|
+
|
|
2362
|
+
// 2. Assert the expected tool was called within the timeout.
|
|
2363
|
+
const call = await assertToolCalled({
|
|
2364
|
+
sessionId, name: "take_picture", timeoutMs: 5000,
|
|
2365
|
+
});
|
|
2366
|
+
// call.args is the parsed JsonObject from the tool's args schema
|
|
2367
|
+
// (or {} for no-arg tools); call.call_id pairs with the result event.
|
|
2368
|
+
|
|
2369
|
+
// 3. (Optional) Read the full event-log trace for assertions on the
|
|
2370
|
+
// speak transcript, the tool result, etc.
|
|
2371
|
+
const log = await getEventLog({
|
|
2372
|
+
sessionId, cursor: cur, filter: "voice",
|
|
2373
|
+
});
|
|
2374
|
+
// Expected events (in order):
|
|
2375
|
+
// assistant.user_spoke transcript="take a picture"
|
|
2376
|
+
// assistant.tool_called name="take_picture", call_id
|
|
2377
|
+
// assistant.tool_result name="take_picture", output, is_error,
|
|
2378
|
+
// duration_ms
|
|
2379
|
+
// assistant.assistant_spoke transcript="ok, take_picture" (Mock
|
|
2380
|
+
// synthesizes; OpenAi has the model
|
|
2381
|
+
// speak its own confirmation)
|
|
2382
|
+
|
|
2383
|
+
// 4. To drive a multi-utterance flow, just repeat steps 1-3 with the
|
|
2384
|
+
// next text. The session stays Active across injects until you
|
|
2385
|
+
// explicitly stop it.
|
|
2386
|
+
|
|
2387
|
+
WHAT GETS LOGGED — assistant.* event family (per synthesis #15)
|
|
2388
|
+
|
|
2389
|
+
assistant.session_started provider, model, voice
|
|
2390
|
+
assistant.session_ended reason (user / error / ceiling), message
|
|
2391
|
+
assistant.user_spoke transcript (verbatim — see PII note below)
|
|
2392
|
+
assistant.assistant_spoke transcript (verbatim)
|
|
2393
|
+
assistant.tool_called name, args (JsonObject), call_id
|
|
2394
|
+
assistant.tool_result call_id, name, output, is_error, duration_ms
|
|
2395
|
+
assistant.reconnected reason, downtime_ms (synthesis #23
|
|
2396
|
+
transparent 60-min ceiling + onFailure
|
|
2397
|
+
recovery — library hides this from
|
|
2398
|
+
customers; observable here for debugging)
|
|
2399
|
+
assistant.error kind, message (non-fatal; fatal errors
|
|
2400
|
+
emit session_ended(error))
|
|
2401
|
+
|
|
2402
|
+
PII BOUNDARY — Phase 4 differs from Phase 3 conversation.* events
|
|
2403
|
+
|
|
2404
|
+
Phase 4 assistant events DO carry verbatim transcripts in
|
|
2405
|
+
user_spoke / assistant_spoke. Phase 3 conversation.* events strip
|
|
2406
|
+
text (only text_len) because the cascaded path routed transcripts
|
|
2407
|
+
through Extentos backend. Phase 4's BYOK contract sends transcripts
|
|
2408
|
+
device → openai.com directly without touching Extentos backend, so
|
|
2409
|
+
the platform-side PII boundary that Phase 3 enforced doesn't apply.
|
|
2410
|
+
Customer apps own their data retention story.
|
|
2411
|
+
|
|
2412
|
+
FILTER QUICK REFERENCE
|
|
2413
|
+
|
|
2414
|
+
- filter: "voice" → all assistant.* events + legacy stt_/speak/
|
|
2415
|
+
audio_/tts_audio_chunk frames coexist here
|
|
2416
|
+
- filter: "errors" → assistant.error AUTOMATICALLY (the backend
|
|
2417
|
+
bumps severity to warn for this type)
|
|
2418
|
+
|
|
2419
|
+
SIMULATOR UI
|
|
2420
|
+
|
|
2421
|
+
The sim's right rail renders an AssistantPanel showing the active
|
|
2422
|
+
session's provider + model + voice + current state. Recent turns
|
|
2423
|
+
appear as colored chips (UserSpoke=sky, AssistantSpoke=emerald,
|
|
2424
|
+
ToolCalled→Result=indigo, Error=red). Open the sim browser side-by-
|
|
2425
|
+
side with this agent loop and watch the panel react.`,
|
|
2426
|
+
gotchas: [
|
|
2427
|
+
"**Anchor a cursor BEFORE the first inject.** A no-cursor getEventLog returns the tail of the log + a cursor positioned after it — that's your 'now' bookmark. Without it, a resumed sim's stale assistant.tool_called from a prior run can satisfy your assertion and you'd never know the new inject silently failed.",
|
|
2428
|
+
"**Mock provider matches utterances by word overlap (≥3 chars) against tool descriptions.** Tool descriptions whose keywords appear in your test utterances get matched; ones without don't. Write descriptions like \"Take a photo when the user asks to capture a moment\" rather than \"Captures imagery via the camera SDK\" — both the model AND the Mock matcher work better when descriptions describe WHEN to call, not what they do internally. For deterministic tests with ambiguous text, use `argsMatch` on assertToolCalled to constrain the matched call further.",
|
|
2429
|
+
"**Mock provider only attaches when transport is BrowserSim.** RealMeta + LocalSim sessions silently no-op the inject subscription (Mock is sim-only by design). To test against the OpenAi provider in sim, open the sim browser tab — its mic input flows through audio_chunks → OpenAI WebSocket → tool dispatch. injectAssistantUtterance.text: still works against Mock; for real-provider audio injection use the sim browser mic or wait for v1.1's audioWavBase64 path.",
|
|
2430
|
+
"**No ONNX models, no model paths, no conversationOptions.** Phase 4 ships zero on-device ML — the provider's WebSocket carries everything. If you're carrying old conversationOptions / ConversationModels code from Phase 3, delete it during migration (see `searchDocs(topic:'assistant_runtime')` → migration section). The deprecation warning fires on first `glasses.conversation.*` call in v1.4.0; the API is removed in v2.0.0.",
|
|
2431
|
+
"**Singleton-active sessions — synthesis #13.** At most one assistant.start at a time per ExtentosGlasses instance. A second start while one is active throws AssistantException(AlreadyActive). Call assistant.stop() (or session.stop()) before starting a new one. This restriction exists for v1 simplicity and can be relaxed in v1.x if customer demand surfaces.",
|
|
2432
|
+
"**Reconnection is library-owned + transparent — synthesis #23.** OpenAI Realtime hard-caps sessions at 60 min; the library proactively reconnects every ~50 min (configurable in v1.1+). assistant.reconnected fires for observability; customer code doesn't see the swap. Conversation history is replayed (recent 40 turns) on each reconnect. Tool dispatches in flight at reconnect time are preserved.",
|
|
2433
|
+
"**Tool body runs on Dispatchers.IO (Kotlin) / Swift Task (Swift); suspending camera/storage/HTTP calls are fine.** Per-tool blocking opt-out via `tool(name, desc, blocking = true) { ... }` per synthesis #9 — when true, the model waits silently for the result before speaking. Default is non-blocking (model says \"let me check...\" while the tool runs). Use blocking=true for sub-100ms tools where the filler would be awkward (\"what time is it\" returning in 10 ms).",
|
|
2434
|
+
"**BYOK key flows direct, not via Extentos.** synthesis §12. `glasses.assistant.setOpenaiApiKey(key)` stores the key in the AssistantClient. When start() opens a WebSocket, the key goes into `Authorization: Bearer ...` for the wss://api.openai.com/v1/realtime?model=gpt-realtime connection. Extentos backend never sees the key. Test endpoint override is on the AssistantProvider.OpenAi case if you need to point at a mock OpenAI proxy.",
|
|
2435
|
+
"**`glasses.ai.complete` is deprecated in v1.4.0 and removed in v2.0.0.** Use the OpenAI SDK directly for non-voice LLM calls (image description, summarization, etc.). Migration: see `searchDocs(topic:'assistant_runtime')` → migration section walkthrough C.",
|
|
2436
|
+
"**Vision via `session.includeImage(uri, prompt = null)` (v1.4 addition).** Capture a photo, hand the URI to the assistant inside a tool body — the model sees it + speaks about it in its configured voice. URI accepts data: / http(s): / file:// / content://. The image persists in conversation history at the provider, so follow-up questions in the same session work without re-sending. Active-only (throws NotReady otherwise — safe inside a tool body where the session is always Active). Canonical pattern: `tool(\"describe_scene\", \"...\") { val photo = glasses.camera.capturePhoto().valueOrNull(); session.includeImage(photo.uri); ToolResult.Ok(\"looking\") }`. See getCapabilityGuide(feature:\"assistant_vision\") for the prompt-parameter pattern + URI-type gotchas.",
|
|
2437
|
+
"**Mid-session primitives (iter5 addition).** Four building blocks on `AssistantSession`: `setReasoningEffort(level)` for dynamic effort routing, `updateInstructions(text)` for persona/mode swaps, `cancelSpeak()` for tool-driven interrupts, `conversationHistory(limit)` for forwarding context to a stronger model. All composable with the existing tool surface — write tools that call them in their bodies. Canonical escalation pattern: an `ask_smart_model` tool body reads `session.conversationHistory()`, formats it as context, calls the customer's own GPT-5 (or Anthropic Claude, or Gemini) client, returns the response as `ToolResult.Ok(answer)` — the realtime model then speaks the answer in its configured voice. Active-only (except conversationHistory which is always safe). See getCapabilityGuide(feature:\"assistant_session_runtime\") for the canonical snippet of each.",
|
|
2438
|
+
],
|
|
2439
|
+
relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_vision", "assistant_session_runtime", "audio_chunks", "speak", "capture_photo"],
|
|
2440
|
+
};
|
|
2441
|
+
// ── Phase 4 agent-driven E2E loop (iter5.2 + iter5.3) ─────────────────
|
|
2442
|
+
//
|
|
2443
|
+
// Sister pattern to `assistant_agent_loop`. That one shows the customer-
|
|
2444
|
+
// side handler code; THIS one shows the AGENT-side workflow that drives
|
|
2445
|
+
// + verifies it. The split exists because the audiences differ — a dev
|
|
2446
|
+
// shipping a voice assistant cares about the handler; an AI coding agent
|
|
2447
|
+
// (or a customer's CI pipeline) cares about the test loop. Both reference
|
|
2448
|
+
// each other.
|
|
2449
|
+
//
|
|
2450
|
+
// Iter5.2 (2026-05-27) made `injectAssistantUtterance(text)` drive the
|
|
2451
|
+
// REAL OpenAI Realtime provider, not just Mock. Iter5.3 (2026-05-27)
|
|
2452
|
+
// wired autoWake so multi-step sessions don't silently drop injects
|
|
2453
|
+
// across silence-timeout sleep transitions. assertToolCalled's cursor
|
|
2454
|
+
// anchor (b87388d) eliminated the stale-match false-positive. Together
|
|
2455
|
+
// these turn the agent-driven E2E loop from "demo-quality" into a
|
|
2456
|
+
// production-grade verification pattern.
|
|
2457
|
+
const AGENT_DRIVEN_E2E_FULL_LOOP = {
|
|
2458
|
+
pattern: "agent_driven_e2e_full_loop",
|
|
2459
|
+
title: "Phase 4 assistant — agent-driven E2E loop with multi-channel verification (real OpenAi capable)",
|
|
2460
|
+
description: "How an AI coding agent verifies a Phase 4 voice-assistant handler end-to-end FROM THE SAME MCP SESSION that scaffolded it — no human, no mic. Iter5.2 unlocked driving the REAL OpenAI Realtime provider via `injectAssistantUtterance(text)` (same MCP call as Mock — the runtime decides based on `AssistantProvider.OpenAi` vs `.Mock()` at session creation). The full loop combines FOUR independent verification channels: `getEventLog` for the protocol-level assistant.* event trace, `adb logcat` for tool-body internals (file paths, byte counts, branch decisions), `adb screencap` for UI confirmation, and direct library-state inspection (Room DB / file system) for persisted side effects. When all four agree, the flow really works at every layer. Companion to `assistant_agent_loop` which covers the handler-side code — use both. Replaces the older `agent_test_loop` (Phase 3 cascaded VAD + recordDiscrete + AnthropicClient) for new voice-AI work.",
|
|
2461
|
+
code: {
|
|
2462
|
+
kotlin: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
|
|
2463
|
+
// For the full handler-side pattern (8+ tools, wake/sleep hooks,
|
|
2464
|
+
// build-flavor Mock/OpenAi switching, voice phrase wiring), see
|
|
2465
|
+
// getCodeExample(pattern: "assistant_agent_loop").
|
|
2466
|
+
|
|
2467
|
+
import com.extentos.glasses.core.ExtentosGlasses
|
|
2468
|
+
import com.extentos.glasses.core.Photos
|
|
2469
|
+
import com.extentos.glasses.core.assistant.AssistantProvider
|
|
2470
|
+
import com.extentos.glasses.core.assistant.AssistantSession
|
|
2471
|
+
import com.extentos.glasses.core.assistant.ToolResult
|
|
2472
|
+
import com.extentos.glasses.core.valueOrNull
|
|
2473
|
+
import android.util.Log
|
|
2474
|
+
import kotlinx.coroutines.CoroutineScope
|
|
2475
|
+
import kotlinx.coroutines.Dispatchers
|
|
2476
|
+
import kotlinx.coroutines.SupervisorJob
|
|
2477
|
+
import kotlinx.coroutines.launch
|
|
2478
|
+
import java.io.File
|
|
2479
|
+
import kotlin.time.Duration.Companion.seconds
|
|
2480
|
+
|
|
2481
|
+
class NotesAssistant(
|
|
2482
|
+
private val glasses: ExtentosGlasses,
|
|
2483
|
+
private val library: NotesLibrary, // app-internal: Room + photos/ dir
|
|
2484
|
+
private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
|
|
2485
|
+
) {
|
|
2486
|
+
private var session: AssistantSession? = null
|
|
2487
|
+
@Volatile private var notesActive = false
|
|
2488
|
+
private val notesBuffer = StringBuilder()
|
|
2489
|
+
|
|
2490
|
+
fun start() {
|
|
2491
|
+
scope.launch {
|
|
2492
|
+
session = glasses.assistant.start(
|
|
2493
|
+
// F13: pick provider via build flavor (USE_MOCK_ASSISTANT BuildConfig)
|
|
2494
|
+
// — never bake \`useMock\` into handler code. See assistant_agent_loop.
|
|
2495
|
+
provider = AssistantProvider.OpenAi(model = "gpt-realtime-2", voice = "alloy"),
|
|
2496
|
+
) {
|
|
2497
|
+
instructions = "You are a voice-notes assistant on glasses. Speak briefly."
|
|
2498
|
+
sleepAfterSilence(30.seconds)
|
|
2499
|
+
|
|
2500
|
+
tool("start_notes", "Start capturing the user's speech as a note. Call when the user says 'start notes', 'take notes', or similar.") {
|
|
2501
|
+
Log.i(TAG, "start_notes: notesActive=\$notesActive")
|
|
2502
|
+
if (notesActive) return@tool ToolResult.Err("already taking notes")
|
|
2503
|
+
notesActive = true
|
|
2504
|
+
notesBuffer.setLength(0)
|
|
2505
|
+
ToolResult.Ok("ok, taking notes")
|
|
2506
|
+
}
|
|
2507
|
+
|
|
2508
|
+
tool("save_notes", "Save the accumulated notes to the library. Call when the user says 'save notes', 'stop notes', or 'I'm done'.") {
|
|
2509
|
+
Log.i(TAG, "save_notes: buf=\${notesBuffer.length} chars")
|
|
2510
|
+
if (!notesActive) return@tool ToolResult.Err("not taking notes")
|
|
2511
|
+
notesActive = false
|
|
2512
|
+
val text = notesBuffer.toString()
|
|
2513
|
+
if (text.isBlank()) return@tool ToolResult.Err("nothing captured")
|
|
2514
|
+
library.addNote(text)
|
|
2515
|
+
ToolResult.Ok("notes saved")
|
|
2516
|
+
}
|
|
2517
|
+
|
|
2518
|
+
tool("save_photo", "Take a photo and save it to the user's library. Call when the user says 'save this' or 'take a picture'.") {
|
|
2519
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2520
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2521
|
+
val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
|
|
2522
|
+
Log.i(TAG, "save_photo: uri=\${uri.take(60)} \${photo.width}x\${photo.height}")
|
|
2523
|
+
val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.jpg")
|
|
2524
|
+
if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("save failed")
|
|
2525
|
+
Log.i(TAG, "save_photo: wrote \${dst.length()} bytes to \${dst.absolutePath}")
|
|
2526
|
+
library.addPhoto(dst.absolutePath, photo.width, photo.height)
|
|
2527
|
+
ToolResult.Ok("photo saved")
|
|
2528
|
+
}
|
|
2529
|
+
|
|
2530
|
+
tool("describe_scene", "Describe what the user is currently looking at WITHOUT saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
|
|
2531
|
+
val photo = glasses.camera.capturePhoto().valueOrNull()
|
|
2532
|
+
?: return@tool ToolResult.Err("camera failed")
|
|
2533
|
+
val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
|
|
2534
|
+
Log.i(TAG, "describe_scene: handing uri to session.includeImage")
|
|
2535
|
+
session?.includeImage(uri)
|
|
2536
|
+
ToolResult.Ok("looking")
|
|
2537
|
+
}
|
|
2538
|
+
}
|
|
2539
|
+
// Wake phrase — the wake handler the autoWake pre-wake path matches.
|
|
2540
|
+
glasses.voice.onPhrase("hey notes") {
|
|
2541
|
+
Log.i(TAG, "wake phrase matched")
|
|
2542
|
+
session?.wake()
|
|
2543
|
+
}
|
|
2544
|
+
}
|
|
2545
|
+
}
|
|
2546
|
+
|
|
2547
|
+
private companion object {
|
|
2548
|
+
const val TAG = "NotesAssistant"
|
|
2549
|
+
}
|
|
2550
|
+
}`,
|
|
2551
|
+
swift: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
|
|
2552
|
+
// For the full handler-side pattern see getCodeExample(pattern: "assistant_agent_loop").
|
|
2553
|
+
//
|
|
2554
|
+
// ⚠️ iOS PARITY IN FLIGHT — the Phase 4 wake/sleep state machine + iter5
|
|
2555
|
+
// mid-session primitives are LIVE on Android. The Swift port is queued
|
|
2556
|
+
// (see shared-context/ios-pure-sdk-pivot-handoff.md). This Swift block
|
|
2557
|
+
// reflects the Phase 4 ergonomics target; current iOS apps using
|
|
2558
|
+
// glasses.assistant still ship the always-on surface from earlier sprints.
|
|
2559
|
+
// The TS agent-test loop below is platform-agnostic — it works against
|
|
2560
|
+
// either the Android handler above OR an iOS app running the same APIs
|
|
2561
|
+
// once the iOS port lands.
|
|
2562
|
+
|
|
2563
|
+
import GlassesCore
|
|
2564
|
+
|
|
2565
|
+
final class NotesAssistant: @unchecked Sendable {
|
|
2566
|
+
private let glasses: ExtentosGlasses
|
|
2567
|
+
private let library: NotesLibrary
|
|
2568
|
+
private var session: AssistantSession?
|
|
2569
|
+
private var notesActive = false
|
|
2570
|
+
private var notesBuffer = ""
|
|
2571
|
+
|
|
2572
|
+
init(glasses: ExtentosGlasses, library: NotesLibrary) {
|
|
2573
|
+
self.glasses = glasses
|
|
2574
|
+
self.library = library
|
|
2575
|
+
}
|
|
2576
|
+
|
|
2577
|
+
func start() async throws {
|
|
2578
|
+
session = try await glasses.assistant.start(
|
|
2579
|
+
provider: .openAI(model: "gpt-realtime-2", voice: "alloy")
|
|
2580
|
+
) { config in
|
|
2581
|
+
config.instructions = "You are a voice-notes assistant on glasses. Speak briefly."
|
|
2582
|
+
// config.sleepAfterSilence(.seconds(30)) // iOS API parity pending
|
|
2583
|
+
|
|
2584
|
+
config.tool("start_notes", description: "Start capturing the user's speech as a note. Call when the user says 'start notes' / 'take notes'.") {
|
|
2585
|
+
NSLog("NotesAssistant start_notes: notesActive=\\(self.notesActive)")
|
|
2586
|
+
if self.notesActive { return .err("already taking notes") }
|
|
2587
|
+
self.notesActive = true
|
|
2588
|
+
self.notesBuffer = ""
|
|
2589
|
+
return .ok("ok, taking notes")
|
|
2590
|
+
}
|
|
2591
|
+
|
|
2592
|
+
config.tool("save_notes", description: "Save the accumulated notes. Call when the user says 'save notes' / 'stop notes' / 'I'm done'.") {
|
|
2593
|
+
NSLog("NotesAssistant save_notes: buf=\\(self.notesBuffer.count) chars")
|
|
2594
|
+
if !self.notesActive { return .err("not taking notes") }
|
|
2595
|
+
self.notesActive = false
|
|
2596
|
+
if self.notesBuffer.isEmpty { return .err("nothing captured") }
|
|
2597
|
+
self.library.addNote(self.notesBuffer)
|
|
2598
|
+
return .ok("notes saved")
|
|
2599
|
+
}
|
|
2600
|
+
|
|
2601
|
+
config.tool("save_photo", description: "Take a photo and save it to the library. Call when the user says 'save this' / 'take a picture'.") {
|
|
2602
|
+
guard let photo = (await self.glasses.camera.capturePhoto()).success else {
|
|
2603
|
+
return .err("camera failed")
|
|
2604
|
+
}
|
|
2605
|
+
NSLog("NotesAssistant save_photo: width=\\(photo.width)")
|
|
2606
|
+
self.library.addPhoto(photo)
|
|
2607
|
+
return .ok("photo saved")
|
|
2608
|
+
}
|
|
2609
|
+
}
|
|
2610
|
+
// Wake phrase wiring (handler the autoWake pre-wake path matches).
|
|
2611
|
+
_ = glasses.voice.onPhrase("hey notes") { [weak self] in
|
|
2612
|
+
await self?.session?.wake()
|
|
2613
|
+
}
|
|
2614
|
+
}
|
|
2615
|
+
}`,
|
|
2616
|
+
},
|
|
2617
|
+
explanation: `THE AGENT-SIDE TEST LOOP — what runs in YOUR conversation, NOT in the customer's app
|
|
2618
|
+
|
|
2619
|
+
// 1. Mint or resume the sim. createSimulatorSession is get-or-create —
|
|
2620
|
+
// pass resetFresh:true only when you need a clean event-log slate.
|
|
2621
|
+
const { sessionId } = await createSimulatorSession({
|
|
2622
|
+
glasses: "meta_rayban", platform: "android",
|
|
2623
|
+
});
|
|
2624
|
+
|
|
2625
|
+
// 2. Sweep across N tools in one session. autoWake (default true)
|
|
2626
|
+
// handles the silence-timeout sleep transitions that bite multi-step
|
|
2627
|
+
// tests where the agent spends >30s between injects.
|
|
2628
|
+
const cases = [
|
|
2629
|
+
{ utter: "start taking notes", expect: "start_notes" },
|
|
2630
|
+
{ utter: "save this view", expect: "save_photo" },
|
|
2631
|
+
{ utter: "what am I looking at", expect: "describe_scene" },
|
|
2632
|
+
{ utter: "okay save the notes", expect: "save_notes" },
|
|
2633
|
+
];
|
|
2634
|
+
|
|
2635
|
+
for (const c of cases) {
|
|
2636
|
+
const inj = await injectAssistantUtterance({
|
|
2637
|
+
sessionId,
|
|
2638
|
+
text: c.utter,
|
|
2639
|
+
// autoWake defaults true; wakePhrase defaults "hey elizabeth" —
|
|
2640
|
+
// override for apps with a different wake phrase:
|
|
2641
|
+
wakePhrase: "hey notes",
|
|
2642
|
+
});
|
|
2643
|
+
// inj.autoWake describes what the pre-wake step did:
|
|
2644
|
+
// { action: "skipped_active" } session was up already
|
|
2645
|
+
// { action: "pre_waked", waitedMs: 1843 } re-waked from dormant
|
|
2646
|
+
// { action: "pre_wake_timeout", reason } wake phrase didn't match
|
|
2647
|
+
// — check glasses.voice.onPhrase
|
|
2648
|
+
// registration in the app
|
|
2649
|
+
|
|
2650
|
+
// 3. Assert the tool fired. The cursor anchor (auto since b87388d)
|
|
2651
|
+
// eliminates stale matches from prior runs.
|
|
2652
|
+
const call = await assertToolCalled({
|
|
2653
|
+
sessionId, name: c.expect, timeoutMs: 5000,
|
|
2654
|
+
});
|
|
2655
|
+
console.log(\`\${c.expect}: call_id=\${call.call_id} waited=\${call.waitedMs}ms\`);
|
|
2656
|
+
}
|
|
2657
|
+
|
|
2658
|
+
THE FOUR VERIFICATION CHANNELS — when does an assertion really mean it worked?
|
|
2659
|
+
|
|
2660
|
+
CHANNEL 1 — getEventLog (protocol layer)
|
|
2661
|
+
|
|
2662
|
+
The 8-event assistant.* family captures the full conversation. Anchor a cursor
|
|
2663
|
+
before injecting, then read forward:
|
|
2664
|
+
|
|
2665
|
+
const head = await getEventLog({ sessionId, filter: "voice", limit: 1 });
|
|
2666
|
+
let cur = head.cursor;
|
|
2667
|
+
await injectAssistantUtterance({ sessionId, text: "save this view" });
|
|
2668
|
+
await assertToolCalled({ sessionId, name: "save_photo", timeoutMs: 5000 });
|
|
2669
|
+
const trace = await getEventLog({ sessionId, cursor: cur, filter: "voice" });
|
|
2670
|
+
// Expected event sequence:
|
|
2671
|
+
// assistant.user_spoke transcript="save this view"
|
|
2672
|
+
// assistant.tool_called name="save_photo", call_id
|
|
2673
|
+
// assistant.tool_result call_id, output="photo saved", is_error:false,
|
|
2674
|
+
// duration_ms
|
|
2675
|
+
// assistant.assistant_spoke transcript="<the model's confirmation>"
|
|
2676
|
+
|
|
2677
|
+
CHANNEL 2 — adb logcat (tool-body internals)
|
|
2678
|
+
|
|
2679
|
+
assertToolCalled proves the tool FIRED. It does NOT prove the body finished
|
|
2680
|
+
cleanly. Tool bodies log internals via android.util.Log — file paths, byte
|
|
2681
|
+
counts, branch decisions. Tail logcat with a tag filter to see what the body
|
|
2682
|
+
actually did:
|
|
2683
|
+
|
|
2684
|
+
// Find the adb binary (Windows: Sdk\\platform-tools\\adb.exe; macOS/Linux:
|
|
2685
|
+
// /Users/<you>/Library/Android/sdk/platform-tools/adb; on Linux the default is ~/Android/Sdk/platform-tools/adb).
|
|
2686
|
+
adb logcat -d -s "NotesAssistant:I" \\
|
|
2687
|
+
| tail -50
|
|
2688
|
+
// I/NotesAssistant: save_photo: uri=content://media/external/images/... 1920x1080
|
|
2689
|
+
// I/NotesAssistant: save_photo: wrote 487231 bytes to /data/data/<pkg>/files/photos/photo-1716908...
|
|
2690
|
+
|
|
2691
|
+
If Channel 1 fires but Channel 2 is silent past the tool entry log, the body
|
|
2692
|
+
hit a branch that returned early without logging. Often a missing else clause.
|
|
2693
|
+
|
|
2694
|
+
CHANNEL 3 — adb screencap (UI confirmation)
|
|
2695
|
+
|
|
2696
|
+
Run-state changes (badge color, count, list items) reflect the persisted
|
|
2697
|
+
state in the UI. screencap proves the binding observed the new state:
|
|
2698
|
+
|
|
2699
|
+
adb exec-out screencap -p > /tmp/post-save.png
|
|
2700
|
+
// Then Read /tmp/post-save.png — the agent SEES what the user would see.
|
|
2701
|
+
// The Notes app's count badge incremented; the latest photo appears in
|
|
2702
|
+
// the gallery row; the "taking notes" indicator turned off.
|
|
2703
|
+
|
|
2704
|
+
Channel 3 catches UI-binding bugs: the tool wrote to the DB but the UI didn't
|
|
2705
|
+
recompose. Often a missing StateFlow.emit / notifyDataSetChanged / SwiftUI
|
|
2706
|
+
@Published.
|
|
2707
|
+
|
|
2708
|
+
CHANNEL 4 — library-state inspection (persistence)
|
|
2709
|
+
|
|
2710
|
+
Tools writing to local storage leave a durable artifact. Pull + inspect:
|
|
2711
|
+
|
|
2712
|
+
// Photos directory
|
|
2713
|
+
adb exec-out run-as <pkg> ls -la files/photos/ | tail -5
|
|
2714
|
+
// -rw------- 1 u0_a234 u0_a234 487231 2026-05-27 21:14 photo-1716908143421.jpg
|
|
2715
|
+
|
|
2716
|
+
// Room DB (always pull .db + .db-wal + .db-shm together — WAL is mandatory)
|
|
2717
|
+
adb exec-out run-as <pkg> cat databases/notes.db > /tmp/local.db
|
|
2718
|
+
adb exec-out run-as <pkg> cat databases/notes.db-wal > /tmp/local.db-wal
|
|
2719
|
+
adb exec-out run-as <pkg> cat databases/notes.db-shm > /tmp/local.db-shm
|
|
2720
|
+
sqlite3 /tmp/local.db "SELECT id, text FROM notes ORDER BY id DESC LIMIT 3"
|
|
2721
|
+
|
|
2722
|
+
Channel 4 catches persistence bugs: the tool succeeded but the write never
|
|
2723
|
+
hit storage. Common when a developer comments out the DB insert while
|
|
2724
|
+
debugging the speak()-side and forgets to uncomment.
|
|
2725
|
+
|
|
2726
|
+
WHEN THE CHANNELS DISAGREE
|
|
2727
|
+
|
|
2728
|
+
Channels 1+2+3+4 agree flow works end-to-end
|
|
2729
|
+
Channel 1 fires, Channel 2 silent body returned early; missing branch log
|
|
2730
|
+
Channels 1+2 fire, Channel 3 unchanged UI binding didn't fire; missing emit
|
|
2731
|
+
Channels 1-3 fire, Channel 4 missing body wrote to a transient; never persisted
|
|
2732
|
+
|
|
2733
|
+
Each disagreement points at a specific layer. With only Channel 1 (which is
|
|
2734
|
+
what assertToolCalled gives you in isolation), half these failures look
|
|
2735
|
+
identical to a passing test.
|
|
2736
|
+
|
|
2737
|
+
REAL OPENAI VS MOCK — when to pick which
|
|
2738
|
+
|
|
2739
|
+
Mock sub-ms, deterministic, $0. Word-overlap-matches tool
|
|
2740
|
+
descriptions. Use for CI / tight inner loop / regression
|
|
2741
|
+
sweeps.
|
|
2742
|
+
OpenAi 500-2000ms per inject, ~$0.005, real model reasoning, real
|
|
2743
|
+
tool routing under the model's actual instructions. Use for
|
|
2744
|
+
confidence-building before a release; for catching cases
|
|
2745
|
+
where the model picks the WRONG tool when descriptions
|
|
2746
|
+
overlap; for vision tests (includeImage); for confirming
|
|
2747
|
+
the model speaks a sensible confirmation back.
|
|
2748
|
+
|
|
2749
|
+
Same injectAssistantUtterance call works against both — \`AssistantProvider.OpenAi\`
|
|
2750
|
+
vs \`AssistantProvider.Mock()\` at session creation is the entire switch.
|
|
2751
|
+
F13: never bake a \`useMock\` Boolean into handler code; wire it via a build
|
|
2752
|
+
flavor so the choice is visible in the AssistantStatusBadge (glasses-ui) +
|
|
2753
|
+
the sim's waveform color so dev never confuses which mode is running.
|
|
2754
|
+
|
|
2755
|
+
WAKING THE SESSION — autoWake (iter5.3) replaced the old two-step dance
|
|
2756
|
+
|
|
2757
|
+
Pre-iter5.3, agents had to manually inject the wake phrase before each
|
|
2758
|
+
assistant inject if the session might be dormant:
|
|
2759
|
+
|
|
2760
|
+
// OLD pattern — no longer required
|
|
2761
|
+
await injectTranscript({ sessionId, text: "hey notes" });
|
|
2762
|
+
await sleep(3000);
|
|
2763
|
+
await injectAssistantUtterance({ sessionId, text: "save this view" });
|
|
2764
|
+
|
|
2765
|
+
Now autoWake handles it transparently — default true, only fires when the
|
|
2766
|
+
event-log scan shows the session isn't currently Active:
|
|
2767
|
+
|
|
2768
|
+
await injectAssistantUtterance({
|
|
2769
|
+
sessionId, text: "save this view", wakePhrase: "hey notes",
|
|
2770
|
+
});
|
|
2771
|
+
// If session was Active: skipped (one ~50ms events query, no wake roundtrip)
|
|
2772
|
+
// If session was Dormant: pre-injects "hey notes", waits ~1-2s for
|
|
2773
|
+
// assistant.session_started, then dispatches.
|
|
2774
|
+
|
|
2775
|
+
Set \`autoWake: false\` only when you want to ASSERT the dormant-drop behavior
|
|
2776
|
+
itself, or when you're driving wake manually with custom timing.
|
|
2777
|
+
|
|
2778
|
+
CAMERA-USING TOOLS NEED A LIVE SIM BROWSER TAB
|
|
2779
|
+
|
|
2780
|
+
Tools that call glasses.camera.capturePhoto / captureVideo need the SIM
|
|
2781
|
+
BROWSER TAB OPEN — that's where camera input streams live in the simulator.
|
|
2782
|
+
Without it, the tool body returns Err("camera failed") and the Channel 1
|
|
2783
|
+
event log shows tool_result.is_error=true.
|
|
2784
|
+
|
|
2785
|
+
Open the sim browser tab at the start of your session:
|
|
2786
|
+
|
|
2787
|
+
// Windows
|
|
2788
|
+
await sh(\`cmd /c start https://extentos.com/s/\${sessionId}\`);
|
|
2789
|
+
// macOS
|
|
2790
|
+
await sh(\`open https://extentos.com/s/\${sessionId}\`);
|
|
2791
|
+
// Then poll until ready (bound this with a deadline in real code so a tab that never connects can't hang the run)
|
|
2792
|
+
while (true) {
|
|
2793
|
+
const s = await getSimulatorStatus({ sessionId });
|
|
2794
|
+
if (s.connectedRoles?.browser) break;
|
|
2795
|
+
await sleep(500);
|
|
2796
|
+
}
|
|
2797
|
+
|
|
2798
|
+
For headless CI: camera-tool verification is blocked today. Non-camera tools
|
|
2799
|
+
(state toggles, persistence, AI calls) work headless. A future
|
|
2800
|
+
\`ensureSimulatorBrowser\` MCP tool would lift this for cross-platform CI —
|
|
2801
|
+
see project_iter4_sim_agent_discovery in shared-context.`,
|
|
2802
|
+
gotchas: [
|
|
2803
|
+
"**autoWake's pre-wake adds ~1-2s on dormant sessions; ~50ms on active ones.** The handler now scans recent events to detect state; if assistant.went_dormant or assistant.session_ended is more recent than any other assistant.* signal, it pre-injects the wake phrase + waits for assistant.session_started before dispatching. Active sessions skip the round trip entirely. To opt out (e.g. when asserting the dormant-drop behavior itself), pass `autoWake: false`. The response's `autoWake` block reports exactly what happened — read it on the first few injects of a new session to confirm the wake phrase is correct.",
|
|
2804
|
+
"**wakePhrase defaults to \"hey elizabeth\" — pass `wakePhrase` explicitly for other apps.** The default matches the canonical dogfood pattern. When the phrase doesn't match what the customer registered via `glasses.voice.onPhrase(...) { session.wake() }`, autoWake reports `action: \"pre_wake_timeout\"` with a diagnostic message + still dispatches the inject (which then drops since the session is still dormant). Fix the phrase + retry.",
|
|
2805
|
+
"**assertToolCalled now cursor-anchors at \"now\" (b87388d).** Pre-fix, a no-cursor first poll returned the OLDEST 200 events — any matching assistant.tool_called from yesterday's testing satisfied immediately with a stale call_id + bogus waitedMs (e.g. 400ms reported on an OpenAi call that physically takes 1-2s). Now: a one-time limit=1 anchor before polling guarantees only events arriving AFTER the call match. No schema change for callers.",
|
|
2806
|
+
"**Camera-tool tests need the sim browser tab open. There's no automated workaround today.** save_photo / describe_scene / capture_video and any tool body that calls glasses.camera.* will return Err(\"camera failed\") unless `connectedRoles.browser` is non-null. Open the tab once at the session start (commands above) and poll getSimulatorStatus until ready. Future: an `ensureSimulatorBrowser` MCP tool will collapse this into one call.",
|
|
2807
|
+
"**adb logcat needs the adb binary in your PATH or full path.** On Windows it's at `C:/Users/<you>/AppData/Local/Android/Sdk/platform-tools/adb.exe`; on macOS at `~/Library/Android/sdk/platform-tools/adb`. The MCP server runs on YOUR machine, so cloud-hosted agents can't reach the emulator — Channel 2/3/4 are local-only. Headless CI agents can still do Channel 1 (getEventLog is the only protocol-level channel).",
|
|
2808
|
+
"**WAL caveat (Android Room): always pull .db + .db-wal + .db-shm together.** Room writes in WAL mode by default. Pulling only the .db file gives a stale snapshot — recent writes live in .db-wal. The agent's first DB-read returning zero rows almost always means missing .db-wal. iOS / GRDB doesn't have this — GRDB ships with WAL enabled but the pulled DB self-checkpoints on close.",
|
|
2809
|
+
"**Don't mint a fresh sim per test.** createSimulatorSession is get-or-create — the same sim resumes across test runs. Use `resetFresh: true` ONLY when you need a clean event-log slate; otherwise rebuild + reinstall the app + the library reattaches automatically. The autoWake state-detection scans the latest 200 events, so a long-running sim with thousands of events still resolves state correctly (the lifecycle markers are usually well within the window).",
|
|
2810
|
+
"**OpenAi inject latency varies by tool body cost.** State-toggle tools (start_notes / save_notes) dispatch in 500-1000ms; camera+vision tools (describe_scene with includeImage) take 2000-4000ms (camera + image upload + model reasoning). Budget `timeoutMs: 5000` for the former + `timeoutMs: 10000` for vision tools. Mock dispatches in <50ms for everything.",
|
|
2811
|
+
"**The same `injectAssistantUtterance(text)` MCP call drives BOTH Mock and OpenAi.** The runtime picks based on `AssistantProvider.OpenAi(...)` vs `AssistantProvider.Mock()` at session creation. Wire the choice via a build flavor (USE_MOCK_ASSISTANT BuildConfig) per F13 — never bake `useMock` into handler code. CI runs the Mock flavor for the inner loop; pre-release smoke runs the OpenAi flavor for the real-model confidence pass.",
|
|
2812
|
+
"**Pull the JS-side test driver into your CI as a script, not inside the agent.** The agent-driven workflow above is great for inner-loop scaffolding, but for repeated CI runs commit the test code as a separate `.test.ts` / `.test.kt` file in the host app's repo. Use `vitest` (TS) or `kotlinx.coroutines.test.runTest` (Kotlin instrumented test) to run it. The MCP tool calls are still issued (the test driver calls Extentos backend APIs directly using @extentos/mcp-server's exported helpers, or via the MCP HTTP transport).",
|
|
2813
|
+
],
|
|
2814
|
+
relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_session_runtime", "assistant_vision"],
|
|
2815
|
+
};
|
|
1904
2816
|
export const CODE_EXAMPLES = {
|
|
1905
2817
|
voice_qa_assistant: VOICE_QA_ASSISTANT,
|
|
1906
2818
|
barge_in_speak: BARGE_IN_SPEAK,
|
|
@@ -1910,6 +2822,8 @@ export const CODE_EXAMPLES = {
|
|
|
1910
2822
|
connection_page_setup: CONNECTION_PAGE_SETUP,
|
|
1911
2823
|
byok_anthropic: BYOK_ANTHROPIC,
|
|
1912
2824
|
agent_test_loop: AGENT_TEST_LOOP,
|
|
2825
|
+
assistant_agent_loop: ASSISTANT_AGENT_LOOP,
|
|
2826
|
+
agent_driven_e2e_full_loop: AGENT_DRIVEN_E2E_FULL_LOOP,
|
|
1913
2827
|
conversation_agent_loop: CONVERSATION_AGENT_LOOP,
|
|
1914
2828
|
};
|
|
1915
2829
|
// Every key registered in CODE_EXAMPLES, ordered by Array.prototype.sort()'s default string comparison.
export const CODE_EXAMPLE_PATTERNS = Object.keys(CODE_EXAMPLES).sort();
|