@extentos/mcp-server 0.0.94 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/dist/tools/data/capabilities.d.ts +49 -0
  2. package/dist/tools/data/capabilities.d.ts.map +1 -1
  3. package/dist/tools/data/capabilities.js +37 -0
  4. package/dist/tools/data/capabilities.js.map +1 -1
  5. package/dist/tools/data/capabilityPatterns.d.ts.map +1 -1
  6. package/dist/tools/data/capabilityPatterns.js +499 -6
  7. package/dist/tools/data/capabilityPatterns.js.map +1 -1
  8. package/dist/tools/data/codeExamples.d.ts.map +1 -1
  9. package/dist/tools/data/codeExamples.js +916 -2
  10. package/dist/tools/data/codeExamples.js.map +1 -1
  11. package/dist/tools/data/version.d.ts +12 -0
  12. package/dist/tools/data/version.d.ts.map +1 -1
  13. package/dist/tools/data/version.js +17 -0
  14. package/dist/tools/data/version.js.map +1 -1
  15. package/dist/tools/definitions.d.ts.map +1 -1
  16. package/dist/tools/definitions.js +63 -3
  17. package/dist/tools/definitions.js.map +1 -1
  18. package/dist/tools/docs/index.d.ts.map +1 -1
  19. package/dist/tools/docs/index.js +115 -4
  20. package/dist/tools/docs/index.js.map +1 -1
  21. package/dist/tools/handlers/assertToolCalled.d.ts +3 -0
  22. package/dist/tools/handlers/assertToolCalled.d.ts.map +1 -0
  23. package/dist/tools/handlers/assertToolCalled.js +208 -0
  24. package/dist/tools/handlers/assertToolCalled.js.map +1 -0
  25. package/dist/tools/handlers/generateConnectionModule.js +21 -2
  26. package/dist/tools/handlers/generateConnectionModule.js.map +1 -1
  27. package/dist/tools/handlers/getCredentialGuide.d.ts.map +1 -1
  28. package/dist/tools/handlers/getCredentialGuide.js +33 -6
  29. package/dist/tools/handlers/getCredentialGuide.js.map +1 -1
  30. package/dist/tools/handlers/getProductionChecklist.d.ts.map +1 -1
  31. package/dist/tools/handlers/getProductionChecklist.js +52 -1
  32. package/dist/tools/handlers/getProductionChecklist.js.map +1 -1
  33. package/dist/tools/handlers/injectAssistantUtterance.d.ts +3 -0
  34. package/dist/tools/handlers/injectAssistantUtterance.d.ts.map +1 -0
  35. package/dist/tools/handlers/injectAssistantUtterance.js +378 -0
  36. package/dist/tools/handlers/injectAssistantUtterance.js.map +1 -0
  37. package/dist/tools/handlers/validateIntegration.d.ts.map +1 -1
  38. package/dist/tools/handlers/validateIntegration.js +67 -0
  39. package/dist/tools/handlers/validateIntegration.js.map +1 -1
  40. package/dist/tools/registry.d.ts.map +1 -1
  41. package/dist/tools/registry.js +4 -0
  42. package/dist/tools/registry.js.map +1 -1
  43. package/package.json +1 -1
@@ -1604,8 +1604,8 @@ The agent reads each \`speak\` event's \`details.text\` — that is what the app
1604
1604
  };
1605
1605
  const CONVERSATION_AGENT_LOOP = {
1606
1606
  pattern: "conversation_agent_loop",
1607
- title: "Phase 3 conversation runtime + agent-driven E2E loop",
1608
- description: "The Phase 3 `glasses.conversation.onWake { listen() / speak() / ai.complete() }` API in one place, plus the headless agent loop that verifies it end-to-end via `injectTranscript` + `getEventLog`. The runtime composes VAD + STT + Smart Turn + TTS + BYOK LLM in the shared Rust core the customer code is one block of structured-concurrency Kotlin/Swift. The agent loop drives wake + follow-up utterances via MCP, then asserts symmetric `conversation.handler_started`/`turn_started`/`turn_ended`/`handler_finished` pairs in the event log. Pair this with `voice_qa_assistant` (which uses the legacy `recordDiscrete` API) when comparing the two paths.",
1607
+ title: "LEGACY Phase 3 conversation runtime (DEPRECATED in v1.4.0; removed in v2.0.0 — use assistant_agent_loop for new apps)",
1608
+ description: "**LEGACY — DEPRECATED in v1.4.0; removed entirely in v2.0.0.** For new voice-assistant work use `assistant_agent_loop` (Phase 4 `glasses.assistant.start { tool(...) { ... } }`). The Phase 4 customer surface drops ~60% LoC vs this pattern, ships zero on-device ML (no ONNX models to bundle), and lets the model decide which tool to call instead of the customer hand-writing `when (turn.text)` keyword routing. This Phase 3 pattern stays in the catalog for the v1.4.0 → v2.0.0 migration window (3-month deprecation period) — existing apps continue to work unchanged. See `searchDocs(topic:'assistant_runtime')` migration section for the side-by-side Phase 3 → Phase 4 walkthrough. The Phase 3 `glasses.conversation.onWake { listen() / speak() / ai.complete() }` API composes VAD + STT + Smart Turn + TTS + BYOK LLM in the shared Rust core; the agent loop here drives wake + follow-up utterances via `injectTranscript` and asserts symmetric `conversation.*` event pairs.",
1609
1609
  code: {
1610
1610
  kotlin: `// ── Application bootstrap (do once, typically in Application.onCreate
1611
1611
  // after RECORD_AUDIO is granted) ────────────────────────────────────
@@ -1901,6 +1901,918 @@ browser side-by-side with this agent loop and watch the panel react.`,
1901
1901
  ],
1902
1902
  relatedFeatures: ["voice_command", "speak", "transcription_incremental", "ai_call", "smart_turn_eou"],
1903
1903
  };
1904
+ const ASSISTANT_AGENT_LOOP = {
1905
+ pattern: "assistant_agent_loop",
1906
+ title: "Phase 4 assistant runtime + agent-driven E2E loop (canonical voice-AI for new apps)",
1907
+ description: "**Canonical Phase 4 voice-AI pattern. Start here for any new voice assistant work on Extentos.** The customer code is one block: `glasses.assistant.start(provider) { tool(name, description) { body -> ToolResult } }`. The model owns wake detection, turn taking, intent parsing, and confirmation speech — the customer only writes tool bodies that act on the app's own state (route data, app DB, camera, library). Provider abstraction wraps OpenAI Realtime in v1; Gemini Live follows. **As of iter5.2 (2026-05-27) `injectAssistantUtterance(text)` drives BOTH the Mock provider AND the real OpenAi Realtime provider** — same MCP call, same code, real model reasoning when you want it. iter5.3 (2026-05-27) added autoWake so multi-step agent tests don't drop injects across silence-timeout sleep transitions. **For the dedicated agent-driven test workflow (4-channel verification: getEventLog + adb logcat + screencap + library-state), see `getCodeExample(pattern:\"agent_driven_e2e_full_loop\")`.** This pattern stays focused on the handler-side code. **Replaces `conversation_agent_loop` (Phase 3 cascaded VAD+STT+SmartTurn+TTS+LLM) which is deprecated in v1.4.0 and removed in v2.0.0.** Customer code typically drops ~60% LoC vs Phase 3.",
1908
+ code: {
1909
+ kotlin: `// ── App bootstrap ───────────────────────────────────────────────────
1910
+ // Run once during app startup, after RECORD_AUDIO is granted. The
1911
+ // returned ExtentosGlasses instance is the single handle for everything;
1912
+ // keep it on your Application subclass (or a singleton container) so
1913
+ // the rest of your app can reach it.
1914
+ //
1915
+ // createGlasses is suspend — call it from a coroutine:
1916
+ //
1917
+ // class MyApp : Application() {
1918
+ // lateinit var glasses: ExtentosGlasses
1919
+ // override fun onCreate() {
1920
+ // super.onCreate()
1921
+ // CoroutineScope(Dispatchers.Main).launch {
1922
+ // glasses = createGlasses(this@MyApp)
1923
+ // // Wire your assistant handler here (see below)
1924
+ // }
1925
+ // }
1926
+ // }
1927
+ //
1928
+ // No ONNX models, no model paths, no cascaded options — Phase 4 ships
1929
+ // end-to-end via the provider's WebSocket. glasses.assistant is
1930
+ // always-on (no opt-in conversationOptions needed).
1931
+ import com.extentos.glasses.core.CaptureError
1932
+ import com.extentos.glasses.core.ExtentosConfig
1933
+ import com.extentos.glasses.core.ExtentosGlasses
1934
+ import com.extentos.glasses.core.ExtentosResult
1935
+ import com.extentos.glasses.core.RuntimeEvent
1936
+ import com.extentos.glasses.core.VideoClip
1937
+ import com.extentos.glasses.core.VideoConfig
1938
+ import com.extentos.glasses.core.assistant.AssistantEvent
1939
+ import com.extentos.glasses.core.assistant.AssistantProvider
1940
+ import com.extentos.glasses.core.assistant.AssistantSession
1941
+ import com.extentos.glasses.core.assistant.ToolResult
1942
+ import com.extentos.glasses.core.assistant.tool
1943
+ import com.extentos.glasses.core.valueOrNull
1944
+ import kotlinx.coroutines.CoroutineScope
1945
+ import kotlinx.coroutines.Deferred
1946
+ import kotlinx.coroutines.Dispatchers
1947
+ import kotlinx.coroutines.SupervisorJob
1948
+ import kotlinx.coroutines.async
1949
+ import kotlinx.coroutines.flow.filterIsInstance
1950
+ import kotlinx.coroutines.flow.launchIn
1951
+ import kotlinx.coroutines.flow.onEach
1952
+ import kotlinx.coroutines.launch
1953
+ import kotlin.time.Duration.Companion.seconds
1954
+
1955
+ suspend fun createGlasses(context: android.content.Context): ExtentosGlasses {
1956
+ return ExtentosGlasses.create(
1957
+ ExtentosConfig(applicationContext = context)
1958
+ ).also { glasses ->
1959
+ // BYOK OpenAI key — see getCredentialGuide(service:"openai") for
1960
+ // the local.properties + BuildConfig plumbing. Key flows direct
1961
+ // from device → api.openai.com via WS Authorization header;
1962
+ // Extentos backend never sees it.
1963
+ //
1964
+ // **F-meta-2 warning:** do NOT have your AI agent write the
1965
+ // OPENAI_API_KEY value into local.properties — agent file edits
1966
+ // leak through the conversation transcript. Add the line yourself.
1967
+ glasses.assistant.setOpenaiApiKey(BuildConfig.OPENAI_API_KEY)
1968
+ }
1969
+ }
1970
+
1971
+ // ── Handler — Strava-style example ─────────────────────────────────────
1972
+ //
1973
+ // Phase 4 wake/sleep state machine (F12): the session starts Dormant
1974
+ // (zero token spend), the developer picks the wake mechanism (voice
1975
+ // phrase via glasses.voice.onPhrase, button tap, gesture, MCP call),
1976
+ // and the model decides when to end the conversation via the built-in
1977
+ // end_conversation tool (endOnIntent default true) — no rigid
1978
+ // "goodbye <name>" phrase required. sleepAfterSilence is the
1979
+ // deterministic backup. onWake { say(...) } speaks in the assistant's
1980
+ // own voice (alloy etc.) so the greeting matches the AI's reply voice.
1981
+
1982
+ class StravaAssistantHandler(
1983
+ private val glasses: ExtentosGlasses,
1984
+ private val routeTracker: RouteTracker, // app-internal state
1985
+ private val library: ClipLibrary, // app-internal state
1986
+ private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
1987
+ ) {
1988
+ private var session: AssistantSession? = null
1989
+
1990
+ // Natural async + await + stop pattern. The library exposes
1991
+ // glasses.camera.stopVideo() so the customer never has to cancel
1992
+ // the wrapping Deferred — captureVideo() resumes naturally with
1993
+ // Ok(partial) when stop is signalled.
1994
+ private var activeVideo: Deferred<ExtentosResult<VideoClip, CaptureError>>? = null
1995
+
1996
+ @Volatile private var notesActive: Boolean = false
1997
+ private val notesBuffer = StringBuilder()
1998
+
1999
+ fun start() {
2000
+ // F3 (iteration-2 fix): subscribe to AssistantEvent.UserSpoke for
2001
+ // transcript capture — needed for notes / live-captions / journal
2002
+ // patterns. Fires once per user turn after the provider's STT
2003
+ // completes. PII boundary: text is verbatim, customer-owned.
2004
+ glasses.runtime.events
2005
+ .filterIsInstance<RuntimeEvent.Assistant>()
2006
+ .onEach { evt ->
2007
+ val userSpoke = evt.event as? AssistantEvent.UserSpoke ?: return@onEach
2008
+ if (notesActive) {
2009
+ synchronized(notesBuffer) {
2010
+ if (notesBuffer.isNotEmpty()) notesBuffer.append(" ")
2011
+ notesBuffer.append(userSpoke.transcript)
2012
+ }
2013
+ }
2014
+ }
2015
+ .launchIn(scope)
2016
+
2017
+ scope.launch {
2018
+ session = glasses.assistant.start(
2019
+ provider = AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy"),
2020
+ ) {
2021
+ instructions = """
2022
+ You are a Strava companion. Help the runner with route
2023
+ stats and capture moments. Speak briefly — they're
2024
+ running. Don't narrate what you're doing — just do it
2025
+ and confirm. When the user clearly indicates they want
2026
+ to stop talking, call end_conversation.
2027
+ """.trimIndent()
2028
+
2029
+ // Phase 4 v1.1 lifecycle:
2030
+ // - session lands in Dormant after start() (default)
2031
+ // - wake via the voice phrase wired below (any other
2032
+ // trigger — button, gesture, MCP — works the same)
2033
+ // - onWake speaks the greeting in the model's voice
2034
+ // - 30 s of user silence → auto-sleep (backup)
2035
+ // - end_conversation tool registered automatically by
2036
+ // the library because endOnIntent defaults true
2037
+ onWake { say("Hi, I'm here. What can I do for you?") }
2038
+ sleepAfterSilence(30.seconds)
2039
+
2040
+ // Read-tools: instant data the AI reads aloud or weaves
2041
+ // into the answer.
2042
+ tool("get_route_remaining", "How much of the planned route is left, in km.") {
2043
+ ToolResult.Ok("\${routeTracker.kmRemaining()} km remaining")
2044
+ }
2045
+ tool("get_average_pace", "Current average pace in minutes per km.") {
2046
+ ToolResult.Ok("\${routeTracker.avgPaceMinKm()} min per km")
2047
+ }
2048
+ tool("get_route_length", "Total planned route length in km.") {
2049
+ ToolResult.Ok("\${routeTracker.totalKm} km")
2050
+ }
2051
+
2052
+ // Action-tools: side effects on the app's own state. The
2053
+ // AI manages the take/stop pair from context.
2054
+ tool("take_video", "Start recording a video clip of the runner's view.") {
2055
+ if (activeVideo?.isActive == true) {
2056
+ return@tool ToolResult.Err("a recording is already in progress")
2057
+ }
2058
+ activeVideo = scope.async {
2059
+ glasses.camera.captureVideo(VideoConfig(maxDurationSeconds = 30))
2060
+ }
2061
+ ToolResult.Ok("recording started")
2062
+ }
2063
+ tool("stop_video", "Stop the current video recording.") {
2064
+ // Clean stop pattern: signal the library to gracefully
2065
+ // end the capture; the in-flight captureVideo() inside
2066
+ // the \`async\` block resumes naturally with
2067
+ // ExtentosResult.Ok(partialClip). No coroutine
2068
+ // cancellation, no sticky-Cancelled Deferred — just
2069
+ // a normal await on the result.
2070
+ //
2071
+ // (Don't be tempted to call activeVideo.cancel() to
2072
+ // stop — Kotlin Deferred state is sticky-Cancelled,
2073
+ // so await() throws CancellationException even if the
2074
+ // library produced a partial. Use stopVideo() instead.)
2075
+ val capture = activeVideo
2076
+ activeVideo = null
2077
+ if (capture == null) return@tool ToolResult.Err("nothing was recording")
2078
+ glasses.camera.stopVideo()
2079
+ val result = capture.await()
2080
+ val clip = result.valueOrNull() ?: return@tool ToolResult.Err("video capture failed")
2081
+ library.add(clip)
2082
+ ToolResult.Ok("video saved")
2083
+ }
2084
+
2085
+ tool("start_notes", "Start capturing what the runner says as a note.") {
2086
+ if (notesActive) return@tool ToolResult.Err("already taking notes")
2087
+ synchronized(notesBuffer) { notesBuffer.setLength(0) }
2088
+ notesActive = true
2089
+ ToolResult.Ok("ok, taking notes")
2090
+ }
2091
+ tool("stop_notes", "Stop note-taking and save the accumulated notes.") {
2092
+ if (!notesActive) return@tool ToolResult.Err("not currently taking notes")
2093
+ notesActive = false
2094
+ val text = synchronized(notesBuffer) {
2095
+ val s = notesBuffer.toString()
2096
+ notesBuffer.setLength(0)
2097
+ s
2098
+ }
2099
+ if (text.isBlank()) return@tool ToolResult.Err("nothing was captured")
2100
+ library.addNote(text)
2101
+ ToolResult.Ok("notes saved")
2102
+ }
2103
+
2104
+ // Vision tools — Phase 4 v1.4 includeImage + Photos.copyToFile.
2105
+ //
2106
+ // Two distinct tools, NOT one. The model picks based on
2107
+ // user intent:
2108
+ // describe_scene — "what / describe / tell me" → AI speaks
2109
+ // about the photo; photo NOT persisted.
2110
+ // save_photo — "save / capture / take a picture / remember"
2111
+ // → photo persisted to library; NOT described.
2112
+ //
2113
+ // Splitting lets the model call BOTH back-to-back when the
2114
+ // user wants both ("save this and tell me what it is") +
2115
+ // keeps each description tight enough for the Mock + real
2116
+ // matchers to disambiguate. See getCapabilityGuide(feature:"assistant_vision")
2117
+ // gotcha about describe-vs-save.
2118
+ tool("describe_scene", "Describe what the runner is currently looking at without saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
2119
+ val photo = glasses.camera.capturePhoto().valueOrNull()
2120
+ ?: return@tool ToolResult.Err("camera failed")
2121
+ val uri = photo.uri
2122
+ ?: return@tool ToolResult.Err("photo had no uri")
2123
+ // session is the AssistantSession returned by start();
2124
+ // captured here via the outer-class property. By the
2125
+ // time a tool dispatches, the session is Active so
2126
+ // includeImage won't throw NotReady.
2127
+ session?.includeImage(uri)
2128
+ ToolResult.Ok("looking")
2129
+ }
2130
+ tool("save_photo", "Save a photo to the runner's library WITHOUT describing it. Call for 'save this' / 'capture this' / 'take a picture' / 'remember this view'.") {
2131
+ val photo = glasses.camera.capturePhoto().valueOrNull()
2132
+ ?: return@tool ToolResult.Err("camera failed")
2133
+ val uri = photo.uri
2134
+ ?: return@tool ToolResult.Err("photo had no uri")
2135
+ val ext = when (Photos.mediaTypeFromUri(uri)) {
2136
+ "image/png" -> "png"
2137
+ "image/webp" -> "webp"
2138
+ else -> "jpg"
2139
+ }
2140
+ val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.\$ext")
2141
+ // Photos.copyToFile mirrors Videos.copyToFile: stream-copy
2142
+ // across data: / file:// / absolute path, parent mkdirs,
2143
+ // overwrite, returns Boolean. Use it — don't hand-roll
2144
+ // loadBytes + writeBytes.
2145
+ if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("could not save photo")
2146
+ library.addPhoto(dst.absolutePath, photo.width, photo.height)
2147
+ ToolResult.Ok("photo saved")
2148
+ }
2149
+ }
2150
+
2151
+ // Wake mechanism — canonical pattern uses the existing
2152
+ // glasses.voice.onPhrase system. Defaults to firesWhen =
2153
+ // VoiceScope.WhenDormant so it won't double-fire during an
2154
+ // active conversation. Swap this line for a button onClick,
2155
+ // a gesture handler, or any other trigger that calls
2156
+ // session?.wake() — the library doesn't dictate the
2157
+ // mechanism, only the lifecycle.
2158
+ glasses.voice.onPhrase("hey strava") { session?.wake() }
2159
+ }
2160
+ }
2161
+ }
2162
+
2163
+ // ── Raw form — equivalent, for programmatic construction (tools loaded
2164
+ // from config, conditional registration). Customer-can-skip-it: the
2165
+ // trailing-lambda builder above reduces to this. ────────────────────
2166
+
2167
+ import com.extentos.glasses.core.assistant.AssistantConfig
2168
+ import com.extentos.glasses.core.assistant.ToolDefinition
2169
+
2170
+ suspend fun startRawForm(glasses: ExtentosGlasses) {
2171
+ val session = glasses.assistant.createSession(
2172
+ AssistantConfig(
2173
+ provider = AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy"),
2174
+ instructions = "You are a helpful assistant.",
2175
+ tools = listOf(
2176
+ ToolDefinition("take_picture", "Take a photo when asked.") { _ ->
2177
+ ToolResult.Ok("photo saved")
2178
+ },
2179
+ ),
2180
+ ),
2181
+ )
2182
+ session.start()
2183
+ }`,
2184
+ swift: `// ⚠️ iOS PARITY IN FLIGHT
2185
+ // The Phase 4 wake/sleep state machine (session.wake() / sleep() / say() /
2186
+ // onWake / sleepAfterSilence / endOnIntent / VoiceScope.WhenDormant) is
2187
+ // LIVE in the Android library + verified in the sim. The Swift port is
2188
+ // pending — see shared-context/real-hardware-validation-backlog.md B1
2189
+ // for status. This Swift block reflects the Phase 4 ergonomics target
2190
+ // but DOES NOT compile against the current iOS library (which still
2191
+ // ships the always-on Phase 4 surface from earlier sprints).
2192
+ //
2193
+ // Until iOS parity ships, iOS apps should use the Phase 3
2194
+ // conversation_agent_loop pattern or wait. Track the Mac VPS handoff in
2195
+ // shared-context/.
2196
+
2197
+ import GlassesCore
2198
+
2199
+ // ── App bootstrap (do once, e.g. in App.init or DI container) ─────────
2200
+ //
2201
+ // Phase 4 is end-to-end via the provider's WebSocket — no ONNX models
2202
+ // to bundle, no cascaded options. glasses.assistant is always-on.
2203
+ // \`Extentos.create(config:)\` is synchronous + non-throwing on iOS.
2204
+ func makeGlasses() -> ExtentosGlasses {
2205
+ let glasses = Extentos.create(config: ExtentosConfig())
2206
+ // BYOK OpenAI key — see getCredentialGuide(service:"openai") for the
2207
+ // Info.plist plumbing. Key flows direct from device → api.openai.com
2208
+ // via WS Authorization header; Extentos backend never sees it.
2209
+ glasses.assistant.setOpenAiApiKey(Secrets.openAiKey)
2210
+ return glasses
2211
+ }
2212
+
2213
+ // ── Handler — Strava-style example ────────────────────────────────────
2214
+ //
2215
+ // Plain final class (not @MainActor) so the @Sendable tool-body
2216
+ // closures can call into the handler without forcing every access
2217
+ // through an actor hop. State mutation (\`activeVideo\`) is single-
2218
+ // dispatch in practice — OpenAi + Mock providers serialize per-call_id
2219
+ // tool dispatch in v1.
2220
+
2221
+ final class StravaAssistantHandler: @unchecked Sendable {
2222
+ private let glasses: ExtentosGlasses
2223
+ private let routeTracker: RouteTracker // app-internal
2224
+ private let library: ClipLibrary // app-internal
2225
+ private var activeVideo: Task<VideoClip?, Never>?
2226
+
2227
+ init(glasses: ExtentosGlasses, routeTracker: RouteTracker, library: ClipLibrary) {
2228
+ self.glasses = glasses
2229
+ self.routeTracker = routeTracker
2230
+ self.library = library
2231
+ }
2232
+
2233
+ func start() async throws {
2234
+ // Sugar form (trailing-closure builder). Returns once the session
2235
+ // reaches Active. Q3 asymmetry: Swift's typed-args overload
2236
+ // requires an explicit \`schema:\` parameter (no Mirror walk on
2237
+ // types — see assistant_tool capability guide).
2238
+ _ = try await glasses.assistant.start(
2239
+ provider: .openAI(model: "gpt-realtime", voice: "alloy")
2240
+ ) { config in
2241
+ config.instructions = """
2242
+ You are a Strava companion. Help the runner with route
2243
+ stats and capture moments. Speak briefly — they're
2244
+ running. Don't narrate what you're doing — just do it.
2245
+ """
2246
+
2247
+ // Read-tools
2248
+ config.tool("get_route_remaining", description: "How much of the planned route is left, in km.") {
2249
+ .ok("\\(self.routeTracker.kmRemaining()) km remaining")
2250
+ }
2251
+ config.tool("get_average_pace", description: "Current average pace in minutes per km.") {
2252
+ .ok("\\(self.routeTracker.avgPaceMinKm()) min per km")
2253
+ }
2254
+ config.tool("get_route_length", description: "Total planned route length in km.") {
2255
+ .ok("\\(self.routeTracker.totalKm) km")
2256
+ }
2257
+
2258
+ // Action-tools — fire-and-forget via Task; stop_video cancels
2259
+ // the activeVideo task + reads its (possibly partial) result.
2260
+ // Camera's captureVideo returns \`ExtentosResult<VideoClip,
2261
+ // CaptureError>\` async — we extract the success branch into
2262
+ // an optional clip via the \`.success\` accessor.
2263
+ config.tool("take_video", description: "Start recording a video clip of the runner's view.") {
2264
+ self.activeVideo = Task {
2265
+ let result = await self.glasses.camera.captureVideo(
2266
+ VideoConfig(maxDurationSeconds: 30)
2267
+ )
2268
+ return result.success
2269
+ }
2270
+ return .ok("recording started")
2271
+ }
2272
+ config.tool("stop_video", description: "Stop the current video recording.") {
2273
+ // Cooperative cancellation: the camera capture finishes
2274
+ // its current chunk and surfaces a partial clip (or
2275
+ // nothing if the cancel landed before any frames).
2276
+ self.activeVideo?.cancel()
2277
+ let clip = await self.activeVideo?.value ?? nil
2278
+ self.activeVideo = nil
2279
+ if let clip {
2280
+ self.library.add(clip)
2281
+ return .ok("video saved")
2282
+ } else {
2283
+ return .err("nothing was recording")
2284
+ }
2285
+ }
2286
+ }
2287
+ }
2288
+ }
2289
+
2290
+ // ── Raw form — equivalent, for programmatic construction ──────────────
2291
+
2292
+ func startRawForm(_ glasses: ExtentosGlasses) async throws {
2293
+ let session = glasses.assistant.createSession(config: AssistantConfig(
2294
+ provider: .openAI(model: "gpt-realtime", voice: "alloy"),
2295
+ instructions: "You are a helpful assistant.",
2296
+ tools: [
2297
+ ToolDefinition(name: "take_picture", description: "Take a photo when asked.") { _ in
2298
+ .ok("photo saved")
2299
+ },
2300
+ ]
2301
+ ))
2302
+ try await session.start()
2303
+ }`,
2304
+ },
2305
+ explanation: `AGENT-DRIVEN E2E TEST LOOP — RUN AFTER createGlasses + handler.start()
2306
+
2307
+ → For the dedicated agent-driven workflow with FOUR-channel verification
2308
+ (getEventLog + adb logcat + screencap + library-state inspection), the
2309
+ multi-tool sweep pattern, autoWake details, and headless-CI guidance,
2310
+ see \`getCodeExample(pattern: "agent_driven_e2e_full_loop")\`. The block
2311
+ below is the quickstart; the dedicated example is the production
2312
+ pattern.
2313
+
2314
+ The headless verification: drive the assistant with synthetic utterances,
2315
+ assert the expected tools fire, then read the event log. Two providers
2316
+ satisfy the same agent loop (since iter5.2, the SAME injectAssistantUtterance
2317
+ MCP call drives BOTH):
2318
+
2319
+ - AssistantProvider.Mock — deterministic, sub-millisecond, $0. Word-
2320
+ overlap-matches the injected utterance against tool descriptions
2321
+ and dispatches the first match. Use for CI + tight inner loop.
2322
+ - AssistantProvider.OpenAi — real WebSocket to api.openai.com,
2323
+ real LLM picks the tool, real audio output. Use for real-provider
2324
+ verification — works headless against the real model via
2325
+ injectAssistantUtterance.text since iter5.2 (no mic / no human
2326
+ required). Sim browser tab still required for camera-using tools.
2327
+
2328
+ // Mock-provider path — DON'T bake a useMock Boolean into the handler
2329
+ // (F13: invisible to users; ships looking like OpenAi by default).
2330
+ // Instead, wire the provider choice via a BuildConfig field tied to
2331
+ // a build flavor:
2332
+ //
2333
+ // // app/build.gradle.kts
2334
+ // android {
2335
+ // flavorDimensions += "assistant"
2336
+ // productFlavors {
2337
+ // create("mock") { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "true") }
2338
+ // create("real") { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "false") }
2339
+ // }
2340
+ // }
2341
+ //
2342
+ // // In your handler:
2343
+ // val provider = if (BuildConfig.USE_MOCK_ASSISTANT)
2344
+ // AssistantProvider.Mock() else
2345
+ // AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy")
2346
+ //
2347
+ // CI builds the "mock" flavor for the agent-driven E2E loop below;
2348
+ // production builds the "real" flavor. The provider is visible in the
2349
+ // AssistantStatusBadge (glasses-ui) + the sim's waveform color so
2350
+ // dev never confuses which mode the running app is in.
2351
+ const { sessionId } = await createSimulatorSession({
2352
+ glasses: "meta_rayban", platform: "android",
2353
+ });
2354
+ // Anchor cursor BEFORE inject so a fresh log read starts at 'now'.
2355
+ const head = await getEventLog({ sessionId, filter: "voice", limit: 1 });
2356
+ let cur = head.cursor;
2357
+
2358
+ // 1. Inject the utterance. text: routes to MockAssistantProvider via
2359
+ // the BrowserSim raw-frame observer (S2.W.0 wiring).
2360
+ await injectAssistantUtterance({ sessionId, text: "take a picture" });
2361
+
2362
+ // 2. Assert the expected tool was called within the timeout.
2363
+ const call = await assertToolCalled({
2364
+ sessionId, name: "take_picture", timeoutMs: 5000,
2365
+ });
2366
+ // call.args is the parsed JsonObject from the tool's args schema
2367
+ // (or {} for no-arg tools); call.call_id pairs with the result event.
2368
+
2369
+ // 3. (Optional) Read the full event-log trace for assertions on the
2370
+ // speak transcript, the tool result, etc.
2371
+ const log = await getEventLog({
2372
+ sessionId, cursor: cur, filter: "voice",
2373
+ });
2374
+ // Expected events (in order):
2375
+ // assistant.user_spoke transcript="take a picture"
2376
+ // assistant.tool_called name="take_picture", call_id
2377
+ // assistant.tool_result name="take_picture", output, is_error,
2378
+ // duration_ms
2379
+ // assistant.assistant_spoke transcript="ok, take_picture" (Mock
2380
+ // synthesizes; OpenAi has the model
2381
+ // speak its own confirmation)
2382
+
2383
+ // 4. To drive a multi-utterance flow, just repeat steps 1-3 with the
2384
+ // next text. The session stays Active across injects until you
2385
+ // explicitly stop it.
2386
+
2387
+ WHAT GETS LOGGED — assistant.* event family (per synthesis #15)
2388
+
2389
+ assistant.session_started provider, model, voice
2390
+ assistant.session_ended reason (user / error / ceiling), message
2391
+ assistant.user_spoke transcript (verbatim — see PII note below)
2392
+ assistant.assistant_spoke transcript (verbatim)
2393
+ assistant.tool_called name, args (JsonObject), call_id
2394
+ assistant.tool_result call_id, name, output, is_error, duration_ms
2395
+ assistant.reconnected reason, downtime_ms (synthesis #23
2396
+ transparent 60-min ceiling + onFailure
2397
+ recovery — library hides this from
2398
+ customers; observable here for debugging)
2399
+ assistant.error kind, message (non-fatal; fatal errors
2400
+ emit session_ended(error))
2401
+
2402
+ PII BOUNDARY — Phase 4 differs from Phase 3 conversation.* events
2403
+
2404
+ Phase 4 assistant events DO carry verbatim transcripts in
2405
+ user_spoke / assistant_spoke. Phase 3 conversation.* events strip
2406
+ text (only text_len) because the cascaded path routed transcripts
2407
+ through Extentos backend. Phase 4's BYOK contract sends transcripts
2408
+ device → api.openai.com directly without touching Extentos backend, so
2409
+ the platform-side PII boundary that Phase 3 enforced doesn't apply.
2410
+ Customer apps own their data retention story.
2411
+
2412
+ FILTER QUICK REFERENCE
2413
+
2414
+ - filter: "voice" → all assistant.* events + legacy stt_/speak/
2415
+ audio_/tts_audio_chunk frames coexist here
2416
+ - filter: "errors" → assistant.error AUTOMATICALLY (the backend
2417
+ bumps severity to warn for this type)
2418
+
2419
+ SIMULATOR UI
2420
+
2421
+ The sim's right rail renders an AssistantPanel showing the active
2422
+ session's provider + model + voice + current state. Recent turns
2423
+ appear as colored chips (UserSpoke=sky, AssistantSpoke=emerald,
2424
+ ToolCalled→Result=indigo, Error=red). Open the sim browser side-by-
2425
+ side with this agent loop and watch the panel react.`,
2426
+ gotchas: [
2427
+ "**Anchor a cursor BEFORE the first inject.** A no-cursor getEventLog returns the tail of the log + a cursor positioned after it — that's your 'now' bookmark. Without it, a resumed sim's stale assistant.tool_called from a prior run can satisfy your assertion and you'd never know the new inject silently failed.",
2428
+ "**Mock provider matches utterances by word overlap (≥3 chars) against tool descriptions.** Tool descriptions whose keywords appear in your test utterances get matched; ones without don't. Write descriptions like \"Take a photo when the user asks to capture a moment\" rather than \"Captures imagery via the camera SDK\" — both the model AND the Mock matcher work better when descriptions describe WHEN to call, not what they do internally. For deterministic tests with ambiguous text, use `argsMatch` on assertToolCalled to constrain the matched call further.",
2429
+ "**Mock provider only attaches when transport is BrowserSim.** RealMeta + LocalSim sessions silently no-op the inject subscription (Mock is sim-only by design). To test against the OpenAi provider in sim, open the sim browser tab — its mic input flows through audio_chunks → OpenAI WebSocket → tool dispatch. The `text` parameter of injectAssistantUtterance still works against Mock; for real-provider audio injection use the sim browser mic or wait for v1.1's audioWavBase64 path.",
2430
+ "**No ONNX models, no model paths, no conversationOptions.** Phase 4 ships zero on-device ML — the provider's WebSocket carries everything. If you're carrying old conversationOptions / ConversationModels code from Phase 3, delete it during migration (see `searchDocs(topic:'assistant_runtime')` → migration section). The deprecation warning fires on first `glasses.conversation.*` call in v1.4.0; the API is removed in v2.0.0.",
2431
+ "**Singleton-active sessions — synthesis #13.** At most one assistant.start at a time per ExtentosGlasses instance. A second start while one is active throws AssistantException(AlreadyActive). Call assistant.stop() (or session.stop()) before starting a new one. This restriction keeps v1 simple; it can be relaxed in v1.x if customer demand surfaces.",
2432
+ "**Reconnection is library-owned + transparent — synthesis #23.** OpenAI Realtime hard-caps sessions at 60 min; the library proactively reconnects every ~50 min (configurable in v1.1+). assistant.reconnected fires for observability; customer code doesn't see the swap. Conversation history is replayed (recent 40 turns) on each reconnect. Tool dispatches in flight at reconnect time are preserved.",
2433
+ "**Tool body runs on Dispatchers.IO (Kotlin) / Swift Task (Swift); suspending camera/storage/HTTP calls are fine.** Per-tool blocking opt-out via `tool(name, desc, blocking = true) { ... }` per synthesis #9 — when true, the model waits silently for the result before speaking. Default is non-blocking (model says \"let me check...\" while the tool runs). Use blocking=true for sub-100ms tools where the filler would be awkward (\"what time is it\" returning in 10 ms).",
2434
+ "**BYOK key flows direct, not via Extentos.** synthesis §12. `glasses.assistant.setOpenaiApiKey(key)` stores the key in the AssistantClient. When start() opens a WebSocket, the key goes into `Authorization: Bearer ...` for the wss://api.openai.com/v1/realtime?model=gpt-realtime connection. Extentos backend never sees the key. Test endpoint override is on the AssistantProvider.OpenAi case if you need to point at a mock OpenAI proxy.",
2435
+ "**`glasses.ai.complete` is deprecated in v1.4.0 and removed in v2.0.0.** Use the OpenAI SDK directly for non-voice LLM calls (image description, summarization, etc.). Migration: see `searchDocs(topic:'assistant_runtime')` → migration section walkthrough C.",
2436
+ "**Vision via `session.includeImage(uri, prompt = null)` (v1.4 addition).** Capture a photo, hand the URI to the assistant inside a tool body — the model sees it + speaks about it in its configured voice. URI accepts data: / http(s): / file:// / content://. The image persists in conversation history at the provider, so follow-up questions in the same session work without re-sending. Active-only (throws NotReady otherwise — safe inside a tool body where the session is always Active). Canonical pattern: `tool(\"describe_scene\", \"...\") { val photo = glasses.camera.capturePhoto().valueOrNull(); session.includeImage(photo.uri); ToolResult.Ok(\"looking\") }`. See getCapabilityGuide(feature:\"assistant_vision\") for the prompt-parameter pattern + URI-type gotchas.",
2437
+ "**Mid-session primitives (iter5 addition).** Four building blocks on `AssistantSession`: `setReasoningEffort(level)` for dynamic effort routing, `updateInstructions(text)` for persona/mode swaps, `cancelSpeak()` for tool-driven interrupts, `conversationHistory(limit)` for forwarding context to a stronger model. All composable with the existing tool surface — write tools that call them in their bodies. Canonical escalation pattern: a `ask_smart_model` tool body reads `session.conversationHistory()`, formats it as context, calls the customer's own GPT-5 (or Anthropic Claude, or Gemini) client, returns the response as `ToolResult.Ok(answer)` — the realtime model then speaks the answer in its configured voice. Active-only (except conversationHistory which is always safe). See getCapabilityGuide(feature:\"assistant_session_runtime\") for the canonical snippet of each.",
2438
+ ],
2439
+ relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_vision", "assistant_session_runtime", "audio_chunks", "speak", "capture_photo"],
2440
+ };
2441
+ // ── Phase 4 agent-driven E2E loop (iter5.2 + iter5.3) ─────────────────
2442
+ //
2443
+ // Sister pattern to `assistant_agent_loop`. That one shows the customer-
2444
+ // side handler code; THIS one shows the AGENT-side workflow that drives
2445
+ // + verifies it. The split exists because the audiences differ — a dev
2446
+ // shipping a voice assistant cares about the handler; an AI coding agent
2447
+ // (or a customer's CI pipeline) cares about the test loop. Both reference
2448
+ // each other.
2449
+ //
2450
+ // Iter5.2 (2026-05-27) made `injectAssistantUtterance(text)` drive the
2451
+ // REAL OpenAI Realtime provider, not just Mock. Iter5.3 (2026-05-27)
2452
+ // wired autoWake so multi-step sessions don't silently drop injects
2453
+ // across silence-timeout sleep transitions. assertToolCalled's cursor
2454
+ // anchor (b87388d) eliminated the stale-match false-positive. Together
2455
+ // these turn the agent-driven E2E loop from "demo-quality" into a
2456
+ // production-grade verification pattern.
2457
+ const AGENT_DRIVEN_E2E_FULL_LOOP = {
2458
+ pattern: "agent_driven_e2e_full_loop",
2459
+ title: "Phase 4 assistant — agent-driven E2E loop with multi-channel verification (real OpenAi capable)",
2460
+ description: "How an AI coding agent verifies a Phase 4 voice-assistant handler end-to-end FROM THE SAME MCP SESSION that scaffolded it — no human, no mic. Iter5.2 unlocked driving the REAL OpenAI Realtime provider via `injectAssistantUtterance(text)` (same MCP call as Mock — the runtime decides based on `AssistantProvider.OpenAi` vs `.Mock()` at session creation). The full loop combines FOUR independent verification channels: `getEventLog` for the protocol-level assistant.* event trace, `adb logcat` for tool-body internals (file paths, byte counts, branch decisions), `adb screencap` for UI confirmation, and direct library-state inspection (Room DB / file system) for persisted side effects. When all four agree, the flow really works at every layer. Companion to `assistant_agent_loop` which covers the handler-side code — use both. Replaces the older `agent_test_loop` (Phase 3 cascaded VAD + recordDiscrete + AnthropicClient) for new voice-AI work.",
2461
+ code: {
2462
+ kotlin: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
2463
+ // For the full handler-side pattern (8+ tools, wake/sleep hooks,
2464
+ // build-flavor Mock/OpenAi switching, voice phrase wiring), see
2465
+ // getCodeExample(pattern: "assistant_agent_loop").
2466
+
2467
+ import com.extentos.glasses.core.ExtentosGlasses
2468
+ import com.extentos.glasses.core.Photos
2469
+ import com.extentos.glasses.core.assistant.AssistantProvider
2470
+ import com.extentos.glasses.core.assistant.AssistantSession
2471
+ import com.extentos.glasses.core.assistant.ToolResult
2472
+ import com.extentos.glasses.core.valueOrNull
2473
+ import android.util.Log
2474
+ import kotlinx.coroutines.CoroutineScope
2475
+ import kotlinx.coroutines.Dispatchers
2476
+ import kotlinx.coroutines.SupervisorJob
2477
+ import kotlinx.coroutines.launch
2478
+ import java.io.File
2479
+ import kotlin.time.Duration.Companion.seconds
2480
+
2481
+ class NotesAssistant(
2482
+ private val glasses: ExtentosGlasses,
2483
+ private val library: NotesLibrary, // app-internal: Room + photos/ dir
2484
+ private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
2485
+ ) {
2486
+ private var session: AssistantSession? = null
2487
+ @Volatile private var notesActive = false
2488
+ private val notesBuffer = StringBuilder()
2489
+
2490
+ fun start() {
2491
+ scope.launch {
2492
+ session = glasses.assistant.start(
2493
+ // F13: pick provider via build flavor (USE_MOCK_ASSISTANT BuildConfig)
2494
+ // — never bake \`useMock\` into handler code. See assistant_agent_loop.
2495
+ provider = AssistantProvider.OpenAi(model = "gpt-realtime-2", voice = "alloy"),
2496
+ ) {
2497
+ instructions = "You are a voice-notes assistant on glasses. Speak briefly."
2498
+ sleepAfterSilence(30.seconds)
2499
+
2500
+ tool("start_notes", "Start capturing the user's speech as a note. Call when the user says 'start notes', 'take notes', or similar.") {
2501
+ Log.i(TAG, "start_notes: notesActive=\$notesActive")
2502
+ if (notesActive) return@tool ToolResult.Err("already taking notes")
2503
+ notesActive = true
2504
+ notesBuffer.setLength(0)
2505
+ ToolResult.Ok("ok, taking notes")
2506
+ }
2507
+
2508
+ tool("save_notes", "Save the accumulated notes to the library. Call when the user says 'save notes', 'stop notes', or 'I'm done'.") {
2509
+ Log.i(TAG, "save_notes: buf=\${notesBuffer.length} chars")
2510
+ if (!notesActive) return@tool ToolResult.Err("not taking notes")
2511
+ notesActive = false
2512
+ val text = notesBuffer.toString()
2513
+ if (text.isBlank()) return@tool ToolResult.Err("nothing captured")
2514
+ library.addNote(text)
2515
+ ToolResult.Ok("notes saved")
2516
+ }
2517
+
2518
+ tool("save_photo", "Take a photo and save it to the user's library. Call when the user says 'save this' or 'take a picture'.") {
2519
+ val photo = glasses.camera.capturePhoto().valueOrNull()
2520
+ ?: return@tool ToolResult.Err("camera failed")
2521
+ val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
2522
+ Log.i(TAG, "save_photo: uri=\${uri.take(60)} \${photo.width}x\${photo.height}")
2523
+ val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.jpg")
2524
+ if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("save failed")
2525
+ Log.i(TAG, "save_photo: wrote \${dst.length()} bytes to \${dst.absolutePath}")
2526
+ library.addPhoto(dst.absolutePath, photo.width, photo.height)
2527
+ ToolResult.Ok("photo saved")
2528
+ }
2529
+
2530
+ tool("describe_scene", "Describe what the user is currently looking at WITHOUT saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
2531
+ val photo = glasses.camera.capturePhoto().valueOrNull()
2532
+ ?: return@tool ToolResult.Err("camera failed")
2533
+ val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
2534
+ Log.i(TAG, "describe_scene: handing uri to session.includeImage")
2535
+ session?.includeImage(uri)
2536
+ ToolResult.Ok("looking")
2537
+ }
2538
+ }
2539
+ // Wake phrase — the wake handler the autoWake pre-wake path matches.
2540
+ glasses.voice.onPhrase("hey notes") {
2541
+ Log.i(TAG, "wake phrase matched")
2542
+ session?.wake()
2543
+ }
2544
+ }
2545
+ }
2546
+
2547
+ private companion object {
2548
+ const val TAG = "NotesAssistant"
2549
+ }
2550
+ }`,
2551
+ swift: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
2552
+ // For the full handler-side pattern see getCodeExample(pattern: "assistant_agent_loop").
2553
+ //
2554
+ // ⚠️ iOS PARITY IN FLIGHT — the Phase 4 wake/sleep state machine + iter5
2555
+ // mid-session primitives are LIVE on Android. The Swift port is queued
2556
+ // (see shared-context/ios-pure-sdk-pivot-handoff.md). This Swift block
2557
+ // reflects the Phase 4 ergonomics target; current iOS apps using
2558
+ // glasses.assistant still ship the always-on surface from earlier sprints.
2559
+ // The TS agent-test loop below is platform-agnostic — it works against
2560
+ // either the Android handler above OR an iOS app running the same APIs
2561
+ // once the iOS port lands.
2562
+
2563
+ import GlassesCore
2564
+
2565
+ final class NotesAssistant: @unchecked Sendable {
2566
+ private let glasses: ExtentosGlasses
2567
+ private let library: NotesLibrary
2568
+ private var session: AssistantSession?
2569
+ private var notesActive = false
2570
+ private var notesBuffer = ""
2571
+
2572
+ init(glasses: ExtentosGlasses, library: NotesLibrary) {
2573
+ self.glasses = glasses
2574
+ self.library = library
2575
+ }
2576
+
2577
+ func start() async throws {
2578
+ session = try await glasses.assistant.start(
2579
+ provider: .openAI(model: "gpt-realtime-2", voice: "alloy")
2580
+ ) { config in
2581
+ config.instructions = "You are a voice-notes assistant on glasses. Speak briefly."
2582
+ // config.sleepAfterSilence(.seconds(30)) // iOS API parity pending
2583
+
2584
+ config.tool("start_notes", description: "Start capturing the user's speech as a note. Call when the user says 'start notes' / 'take notes'.") {
2585
+ NSLog("NotesAssistant start_notes: notesActive=\\(self.notesActive)")
2586
+ if self.notesActive { return .err("already taking notes") }
2587
+ self.notesActive = true
2588
+ self.notesBuffer = ""
2589
+ return .ok("ok, taking notes")
2590
+ }
2591
+
2592
+ config.tool("save_notes", description: "Save the accumulated notes. Call when the user says 'save notes' / 'stop notes' / 'I'm done'.") {
2593
+ NSLog("NotesAssistant save_notes: buf=\\(self.notesBuffer.count) chars")
2594
+ if !self.notesActive { return .err("not taking notes") }
2595
+ self.notesActive = false
2596
+ if self.notesBuffer.isEmpty { return .err("nothing captured") }
2597
+ self.library.addNote(self.notesBuffer)
2598
+ return .ok("notes saved")
2599
+ }
2600
+
2601
+ config.tool("save_photo", description: "Take a photo and save it to the library. Call when the user says 'save this' / 'take a picture'.") {
2602
+ guard let photo = (await self.glasses.camera.capturePhoto()).success else {
2603
+ return .err("camera failed")
2604
+ }
2605
+ NSLog("NotesAssistant save_photo: width=\\(photo.width)")
2606
+ self.library.addPhoto(photo)
2607
+ return .ok("photo saved")
2608
+ }
2609
+ }
2610
+ // Wake phrase wiring (handler the autoWake pre-wake path matches).
2611
+ _ = glasses.voice.onPhrase("hey notes") { [weak self] in
2612
+ await self?.session?.wake()
2613
+ }
2614
+ }
2615
+ }`,
2616
+ },
2617
+ explanation: `THE AGENT-SIDE TEST LOOP — what runs in YOUR conversation, NOT in the customer's app
2618
+
2619
+ // 1. Mint or resume the sim. createSimulatorSession is get-or-create —
2620
+ // pass resetFresh:true only when you need a clean event-log slate.
2621
+ const { sessionId } = await createSimulatorSession({
2622
+ glasses: "meta_rayban", platform: "android",
2623
+ });
2624
+
2625
+ // 2. Sweep across N tools in one session. autoWake (default true)
2626
+ // handles the silence-timeout sleep transitions that bite multi-step
2627
+ // tests where the agent spends >30s between injects.
2628
+ const cases = [
2629
+ { utter: "start taking notes", expect: "start_notes" },
2630
+ { utter: "save this view", expect: "save_photo" },
2631
+ { utter: "what am I looking at", expect: "describe_scene" },
2632
+ { utter: "okay save the notes", expect: "save_notes" },
2633
+ ];
2634
+
2635
+ for (const c of cases) {
2636
+ const inj = await injectAssistantUtterance({
2637
+ sessionId,
2638
+ text: c.utter,
2639
+ // autoWake defaults true; wakePhrase defaults "hey elizabeth" —
2640
+ // override for apps with a different wake phrase:
2641
+ wakePhrase: "hey notes",
2642
+ });
2643
+ // inj.autoWake describes what the pre-wake step did:
2644
+ // { action: "skipped_active" } session was up already
2645
+ // { action: "pre_waked", waitedMs: 1843 } re-waked from dormant
2646
+ // { action: "pre_wake_timeout", reason } wake phrase didn't match
2647
+ // — check glasses.voice.onPhrase
2648
+ // registration in the app
2649
+
2650
+ // 3. Assert the tool fired. The cursor anchor (auto since b87388d)
2651
+ // eliminates stale matches from prior runs.
2652
+ const call = await assertToolCalled({
2653
+ sessionId, name: c.expect, timeoutMs: 5000,
2654
+ });
2655
+ console.log(\`\${c.expect}: call_id=\${call.call_id} waited=\${call.waitedMs}ms\`);
2656
+ }
2657
+
2658
+ THE FOUR VERIFICATION CHANNELS — when does an assertion really mean it worked?
2659
+
2660
+ CHANNEL 1 — getEventLog (protocol layer)
2661
+
2662
+ The 8-event assistant.* family captures the full conversation. Anchor a cursor
2663
+ before injecting, then read forward:
2664
+
2665
+ const head = await getEventLog({ sessionId, filter: "voice", limit: 1 });
2666
+ let cur = head.cursor;
2667
+ await injectAssistantUtterance({ sessionId, text: "save this view" });
2668
+ await assertToolCalled({ sessionId, name: "save_photo", timeoutMs: 5000 });
2669
+ const trace = await getEventLog({ sessionId, cursor: cur, filter: "voice" });
2670
+ // Expected event sequence:
2671
+ // assistant.user_spoke transcript="save this view"
2672
+ // assistant.tool_called name="save_photo", call_id
2673
+ // assistant.tool_result call_id, output="photo saved", is_error:false,
2674
+ // duration_ms
2675
+ // assistant.assistant_spoke transcript="<the model's confirmation>"
2676
+
2677
+ CHANNEL 2 — adb logcat (tool-body internals)
2678
+
2679
+ assertToolCalled proves the tool FIRED. It does NOT prove the body finished
2680
+ cleanly. Tool bodies log internals via android.util.Log — file paths, byte
2681
+ counts, branch decisions. Tail logcat with a tag filter to see what the body
2682
+ actually did:
2683
+
2684
+ // Find the adb binary (Windows: Sdk\\platform-tools\\adb.exe; macOS:
2685
+ // ~/Library/Android/sdk/platform-tools/adb; Linux: ~/Android/Sdk/platform-tools/adb).
2686
+ adb logcat -d -s "NotesAssistant:I" \\
2687
+ | tail -50
2688
+ // I/NotesAssistant: save_photo: uri=content://media/external/images/... 1920x1080
2689
+ // I/NotesAssistant: save_photo: wrote 487231 bytes to /data/data/<pkg>/files/photos/photo-1716908...
2690
+
2691
+ If Channel 1 fires but Channel 2 is silent past the tool entry log, the body
2692
+ hit a branch that returned early without logging. Often a missing else clause.
2693
+
2694
+ CHANNEL 3 — adb screencap (UI confirmation)
2695
+
2696
+ Run-state changes (badge color, count, list items) reflect the persisted
2697
+ state in the UI. screencap proves the binding observed the new state:
2698
+
2699
+ adb exec-out screencap -p > /tmp/post-save.png
2700
+ // Then Read /tmp/post-save.png — the agent SEES what the user would see.
2701
+ // The Notes app's count badge incremented; the latest photo appears in
2702
+ // the gallery row; the "taking notes" indicator turned off.
2703
+
2704
+ Channel 3 catches UI-binding bugs: the tool wrote to the DB but the UI didn't
2705
+ recompose. Often a missing StateFlow.emit / notifyDataSetChanged / SwiftUI
2706
+ @Published.
2707
+
2708
+ CHANNEL 4 — library-state inspection (persistence)
2709
+
2710
+ Tools writing to local storage leave a durable artifact. Pull + inspect:
2711
+
2712
+ // Photos directory
2713
+ adb exec-out run-as <pkg> ls -la files/photos/ | tail -5
2714
+ // -rw------- 1 u0_a234 u0_a234 487231 2026-05-27 21:14 photo-1716908143421.jpg
2715
+
2716
+ // Room DB (always pull .db + .db-wal + .db-shm together — WAL is mandatory)
2717
+ adb exec-out run-as <pkg> cat databases/notes.db > /tmp/local.db
2718
+ adb exec-out run-as <pkg> cat databases/notes.db-wal > /tmp/local.db-wal
2719
+ adb exec-out run-as <pkg> cat databases/notes.db-shm > /tmp/local.db-shm
2720
+ sqlite3 /tmp/local.db "SELECT id, text FROM notes ORDER BY id DESC LIMIT 3"
2721
+
2722
+ Channel 4 catches persistence bugs: the tool succeeded but the write never
2723
+ hit storage. Common when a developer comments out the DB insert while
2724
+ debugging the speak()-side and forgets to uncomment.
2725
+
2726
+ WHEN THE CHANNELS DISAGREE
2727
+
2728
+ Channels 1+2+3+4 agree flow works end-to-end
2729
+ Channel 1 fires, Channel 2 silent body returned early; missing branch log
2730
+ Channels 1+2 fire, Channel 3 unchanged UI binding didn't fire; missing emit
2731
+ Channels 1-3 fire, Channel 4 missing body wrote to a transient; never persisted
2732
+
2733
+ Each disagreement points at a specific layer. With only Channel 1 (which is
2734
+ what assertToolCalled gives you in isolation), half these failures look
2735
+ identical to a passing test.
2736
+
2737
+ REAL OPENAI VS MOCK — when to pick which
2738
+
2739
+ Mock sub-ms, deterministic, $0. Word-overlap-matches tool
2740
+ descriptions. Use for CI / tight inner loop / regression
2741
+ sweeps.
2742
+ OpenAi 500-2000ms per inject, ~$0.005, real model reasoning, real
2743
+ tool routing under the model's actual instructions. Use for
2744
+ confidence-building before a release; for catching cases
2745
+ where the model picks the WRONG tool when descriptions
2746
+ overlap; for vision tests (includeImage); for confirming
2747
+ the model speaks a sensible confirmation back.
2748
+
2749
+ Same injectAssistantUtterance call works against both — \`AssistantProvider.OpenAi\`
2750
+ vs \`AssistantProvider.Mock()\` at session creation is the entire switch.
2751
+ F13: never bake a \`useMock\` Boolean into handler code; wire it via a build
2752
+ flavor so the choice is visible in the AssistantStatusBadge (glasses-ui) +
2753
+ the sim's waveform color, so a developer never confuses which mode is running.
2754
+
2755
+ WAKING THE SESSION — autoWake (iter5.3) replaced the old two-step dance
2756
+
2757
+ Pre-iter5.3, agents had to manually inject the wake phrase before each
2758
+ assistant inject if the session might be dormant:
2759
+
2760
+ // OLD pattern — no longer required
2761
+ await injectTranscript({ sessionId, text: "hey notes" });
2762
+ await sleep(3000);
2763
+ await injectAssistantUtterance({ sessionId, text: "save this view" });
2764
+
2765
+ Now autoWake handles it transparently — default true, only fires when the
2766
+ event-log scan shows the session isn't currently Active:
2767
+
2768
+ await injectAssistantUtterance({
2769
+ sessionId, text: "save this view", wakePhrase: "hey notes",
2770
+ });
2771
+ // If session was Active: skipped (one ~50ms events query, no wake roundtrip)
2772
+ // If session was Dormant: pre-injects "hey notes", waits ~1-2s for
2773
+ // assistant.session_started, then dispatches.
2774
+
2775
+ Set \`autoWake: false\` only when you want to ASSERT the dormant-drop behavior
2776
+ itself, or when you're driving wake manually with custom timing.
2777
+
2778
+ CAMERA-USING TOOLS NEED A LIVE SIM BROWSER TAB
2779
+
2780
+ Tools that call glasses.camera.capturePhoto / captureVideo need the SIM
2781
+ BROWSER TAB OPEN — that's where camera input streams live in the simulator.
2782
+ Without it, the tool body returns Err("camera failed") and the Channel 1
2783
+ event log shows tool_result.is_error=true.
2784
+
2785
+ Open the sim browser tab at the start of your session:
2786
+
2787
+ // Windows
2788
+ await sh(\`cmd /c start https://extentos.com/s/\${sessionId}\`);
2789
+ // macOS
2790
+ await sh(\`open https://extentos.com/s/\${sessionId}\`);
2791
+ // Then poll until ready
2792
+ while (true) {
2793
+ const s = await getSimulatorStatus({ sessionId });
2794
+ if (s.connectedRoles?.browser) break;
2795
+ await sleep(500);
2796
+ }
2797
+
2798
+ For headless CI: camera-tool verification is blocked today. Non-camera tools
2799
+ (state toggles, persistence, AI calls) work headless. A future
2800
+ \`ensureSimulatorBrowser\` MCP tool would lift this for cross-platform CI —
2801
+ see project_iter4_sim_agent_discovery in shared-context.`,
2802
+ gotchas: [
2803
+ "**autoWake's pre-wake adds ~1-2s on dormant sessions; ~50ms on active ones.** The handler now scans recent events to detect state; if assistant.went_dormant or assistant.session_ended is more recent than any other assistant.* signal, it pre-injects the wake phrase + waits for assistant.session_started before dispatching. Active sessions skip the round trip entirely. To opt out (e.g. when asserting the dormant-drop behavior itself), pass `autoWake: false`. The response's `autoWake` block reports exactly what happened — read it on the first few injects of a new session to confirm the wake phrase is correct.",
2804
+ "**wakePhrase defaults to \"hey elizabeth\" — pass `wakePhrase` explicitly for other apps.** The default matches the canonical dogfood pattern. When the phrase doesn't match what the customer registered via `glasses.voice.onPhrase(...) { session.wake() }`, autoWake reports `action: \"pre_wake_timeout\"` with a diagnostic message + still dispatches the inject (which then drops since the session is still dormant). Fix the phrase + retry.",
2805
+ "**assertToolCalled now cursor-anchors at \"now\" (b87388d).** Pre-fix, a no-cursor first poll returned the OLDEST 200 events — any matching assistant.tool_called from yesterday's testing satisfied immediately with a stale call_id + bogus waitedMs (e.g. 400ms reported on an OpenAi call that physically takes 1-2s). Now: a one-time limit=1 anchor before polling guarantees only events arriving AFTER the call match. No schema change for callers.",
2806
+ "**Camera-tool tests need the sim browser tab open. There's no automated workaround today.** save_photo / describe_scene / capture_video and any tool body that calls glasses.camera.* will return Err(\"camera failed\") without `connectedRoles.browser` being non-null. Open the tab once at the session start (commands above) and poll getSimulatorStatus until ready. Future: an `ensureSimulatorBrowser` MCP tool will collapse this into one call.",
2807
+ "**adb logcat needs the adb binary in your PATH or full path.** On Windows it's at `C:/Users/<you>/AppData/Local/Android/Sdk/platform-tools/adb.exe`; on macOS at `~/Library/Android/sdk/platform-tools/adb`. The MCP server runs on YOUR machine, so cloud-hosted agents can't reach the emulator — Channel 2/3/4 are local-only. Headless CI agents can still do Channel 1 (getEventLog is the only protocol-level channel).",
2808
+ "**WAL caveat (Android Room): always pull .db + .db-wal + .db-shm together.** Room writes in WAL mode by default. Pulling only the .db file gives a stale snapshot — recent writes live in .db-wal. The agent's first DB-read returning zero rows almost always means missing .db-wal. iOS / GRDB doesn't have this — GRDB ships with WAL enabled but the pulled DB self-checkpoints on close.",
2809
+ "**Don't mint a fresh sim per test.** createSimulatorSession is get-or-create — the same sim resumes across test runs. Use `resetFresh: true` ONLY when you need a clean event-log slate; otherwise rebuild and reinstall the app, and the library reattaches automatically. The autoWake state-detection scans the latest 200 events, so a long-running sim with thousands of events still resolves state correctly (the lifecycle markers are usually well within the window).",
2810
+ "**OpenAi inject latency varies by tool body cost.** State-toggle tools (start_notes / save_notes) dispatch in 500-1000ms; camera+vision tools (describe_scene with includeImage) take 2000-4000ms (camera + image upload + model reasoning). Budget `timeoutMs: 5000` for the former + `timeoutMs: 10000` for vision tools. Mock dispatches in <50ms for everything.",
2811
+ "**The same `injectAssistantUtterance(text)` MCP call drives BOTH Mock and OpenAi.** The runtime picks based on `AssistantProvider.OpenAi(...)` vs `AssistantProvider.Mock()` at session creation. Wire the choice via a build flavor (USE_MOCK_ASSISTANT BuildConfig) per F13 — never bake `useMock` into handler code. CI runs the Mock flavor for the inner loop; pre-release smoke runs the OpenAi flavor for the real-model confidence pass.",
2812
+ "**Pull the JS-side test driver into your CI as a script, not inside the agent.** The agent-driven workflow above is great for inner-loop scaffolding, but for repeated CI runs commit the test code as a separate `.test.ts` / `.test.kt` file in the host app's repo. Use `vitest` (TS) or `kotlinx.coroutines.test.runTest` (Kotlin instrumented test) to run it. The MCP tool calls are still issued (the test driver calls Extentos backend APIs directly using @extentos/mcp-server's exported helpers, or via the MCP HTTP transport).",
2813
+ ],
2814
+ relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_session_runtime", "assistant_vision"],
2815
+ };
1904
2816
  export const CODE_EXAMPLES = {
1905
2817
  voice_qa_assistant: VOICE_QA_ASSISTANT,
1906
2818
  barge_in_speak: BARGE_IN_SPEAK,
@@ -1910,6 +2822,8 @@ export const CODE_EXAMPLES = {
1910
2822
  connection_page_setup: CONNECTION_PAGE_SETUP,
1911
2823
  byok_anthropic: BYOK_ANTHROPIC,
1912
2824
  agent_test_loop: AGENT_TEST_LOOP,
2825
+ assistant_agent_loop: ASSISTANT_AGENT_LOOP,
2826
+ agent_driven_e2e_full_loop: AGENT_DRIVEN_E2E_FULL_LOOP,
1913
2827
  conversation_agent_loop: CONVERSATION_AGENT_LOOP,
1914
2828
  };
1915
2829
  export const CODE_EXAMPLE_PATTERNS = Object.keys(CODE_EXAMPLES).sort();