npm - @extentos/mcp-server - Versions diffs - 0.1.0 → 0.1.2 - Mend

@extentos/mcp-server 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

package/dist/tools/data/capabilities.d.ts +49 -0
package/dist/tools/data/capabilities.d.ts.map +1 -1
package/dist/tools/data/capabilities.js +37 -0
package/dist/tools/data/capabilities.js.map +1 -1
package/dist/tools/data/capabilityPatterns.d.ts.map +1 -1
package/dist/tools/data/capabilityPatterns.js +265 -19
package/dist/tools/data/capabilityPatterns.js.map +1 -1
package/dist/tools/data/codeExamples.d.ts.map +1 -1
package/dist/tools/data/codeExamples.js +624 -45
package/dist/tools/data/codeExamples.js.map +1 -1
package/dist/tools/data/version.d.ts +12 -0
package/dist/tools/data/version.d.ts.map +1 -1
package/dist/tools/data/version.js +17 -0
package/dist/tools/data/version.js.map +1 -1
package/dist/tools/definitions.d.ts.map +1 -1
package/dist/tools/definitions.js +11 -2
package/dist/tools/definitions.js.map +1 -1
package/dist/tools/docs/index.d.ts.map +1 -1
package/dist/tools/docs/index.js +10 -9
package/dist/tools/docs/index.js.map +1 -1
package/dist/tools/handlers/assertToolCalled.d.ts.map +1 -1
package/dist/tools/handlers/assertToolCalled.js +38 -1
package/dist/tools/handlers/assertToolCalled.js.map +1 -1
package/dist/tools/handlers/generateConnectionModule.js +21 -2
package/dist/tools/handlers/generateConnectionModule.js.map +1 -1
package/dist/tools/handlers/getCredentialGuide.d.ts.map +1 -1
package/dist/tools/handlers/getCredentialGuide.js +33 -6
package/dist/tools/handlers/getCredentialGuide.js.map +1 -1
package/dist/tools/handlers/injectAssistantUtterance.d.ts.map +1 -1
package/dist/tools/handlers/injectAssistantUtterance.js +241 -2
package/dist/tools/handlers/injectAssistantUtterance.js.map +1 -1
package/package.json +1 -1

package/dist/tools/data/codeExamples.js (changed)

@@ -1904,61 +1904,139 @@ browser side-by-side with this agent loop and watch the panel react.`,
 const ASSISTANT_AGENT_LOOP = {
     pattern: "assistant_agent_loop",
     title: "Phase 4 assistant runtime + agent-driven E2E loop (canonical voice-AI for new apps)",
-    description: "**Canonical Phase 4 voice-AI pattern. Start here for any new voice assistant work on Extentos.** The customer code is one block: `glasses.assistant.start(provider) { tool(name, description) { body -> ToolResult } }`. The model owns wake detection, turn taking, intent parsing, and confirmation speech — the customer only writes tool bodies that act on the app's own state (route data, app DB, camera, library). Provider abstraction wraps OpenAI Realtime in v1; Gemini Live follows. The agent-test loop drives utterances via `injectAssistantUtterance` (Mock provider — deterministic, $0) and asserts via `assertToolCalled`, closing E2E verification without humans. **This replaces `conversation_agent_loop` (Phase 3 cascaded VAD+STT+SmartTurn+TTS+LLM) which is deprecated in v1.4.0 and removed in v2.0.0.** Customer code typically drops ~60% LoC vs Phase 3.",
+    description: "**Canonical Phase 4 voice-AI pattern. Start here for any new voice assistant work on Extentos.** The customer code is one block: `glasses.assistant.start(provider) { tool(name, description) { body -> ToolResult } }`. The model owns wake detection, turn taking, intent parsing, and confirmation speech — the customer only writes tool bodies that act on the app's own state (route data, app DB, camera, library). Provider abstraction wraps OpenAI Realtime in v1; Gemini Live follows. **As of iter5.2 (2026-05-27) `injectAssistantUtterance(text)` drives BOTH the Mock provider AND the real OpenAi Realtime provider** — same MCP call, same code, real model reasoning when you want it. iter5.3 (2026-05-27) added autoWake so multi-step agent tests don't drop injects across silence-timeout sleep transitions. **For the dedicated agent-driven test workflow (4-channel verification: getEventLog + adb logcat + screencap + library-state), see `getCodeExample(pattern:\"agent_driven_e2e_full_loop\")`.** This pattern stays focused on the handler-side code. **Replaces `conversation_agent_loop` (Phase 3 cascaded VAD+STT+SmartTurn+TTS+LLM) which is deprecated in v1.4.0 and removed in v2.0.0.** Customer code typically drops ~60% LoC vs Phase 3.",
     code: {
-        kotlin: `// ── Application bootstrap (do once in Application.onCreate after
-//     RECORD_AUDIO is granted) ──────────────────────────────────────────
+        kotlin: `// ── App bootstrap ───────────────────────────────────────────────────
+// Run once during app startup, after RECORD_AUDIO is granted. The
+// returned ExtentosGlasses instance is the single handle for everything;
+// keep it on your Application subclass (or a singleton container) so
+// the rest of your app can reach it.
+//
+// createGlasses is suspend — call it from a coroutine:
+//
+//   class MyApp : Application() {
+//       lateinit var glasses: ExtentosGlasses
+//       override fun onCreate() {
+//           super.onCreate()
+//           CoroutineScope(Dispatchers.Main).launch {
+//               glasses = createGlasses(this@MyApp)
+//               // Wire your assistant handler here (see below)
+//           }
+//       }
+//   }
 //
 // No ONNX models, no model paths, no cascaded options — Phase 4 ships
 // end-to-end via the provider's WebSocket. glasses.assistant is
 // always-on (no opt-in conversationOptions needed).
+import com.extentos.glasses.core.CaptureError
 import com.extentos.glasses.core.ExtentosConfig
 import com.extentos.glasses.core.ExtentosGlasses
+import com.extentos.glasses.core.ExtentosResult
+import com.extentos.glasses.core.RuntimeEvent
+import com.extentos.glasses.core.VideoClip
+import com.extentos.glasses.core.VideoConfig
+import com.extentos.glasses.core.assistant.AssistantEvent
 import com.extentos.glasses.core.assistant.AssistantProvider
+import com.extentos.glasses.core.assistant.AssistantSession
 import com.extentos.glasses.core.assistant.ToolResult
 import com.extentos.glasses.core.assistant.tool
+import com.extentos.glasses.core.valueOrNull
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Deferred
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.SupervisorJob
+import kotlinx.coroutines.async
+import kotlinx.coroutines.flow.filterIsInstance
+import kotlinx.coroutines.flow.launchIn
+import kotlinx.coroutines.flow.onEach
+import kotlinx.coroutines.launch
+import kotlin.time.Duration.Companion.seconds
 suspend fun createGlasses(context: android.content.Context): ExtentosGlasses {
     return ExtentosGlasses.create(
         ExtentosConfig(applicationContext = context)
     ).also { glasses ->
         // BYOK OpenAI key — see getCredentialGuide(service:"openai") for
-        // the local.properties + resValue plumbing. Key flows direct from
-        // device → api.openai.com via WS Authorization header; Extentos
-        // backend never sees it.
+        // the local.properties + BuildConfig plumbing. Key flows direct
+        // from device → api.openai.com via WS Authorization header;
+        // Extentos backend never sees it.
+        //
+        // **F-meta-2 warning:** do NOT have your AI agent write the
+        // OPENAI_API_KEY value into local.properties — agent file edits
+        // leak through the conversation transcript. Add the line yourself.
         glasses.assistant.setOpenaiApiKey(BuildConfig.OPENAI_API_KEY)
     }
 }
-// ── Handler — Strava-style example. Wire after createGlasses returns. ──
+// ── Handler — Strava-style example ─────────────────────────────────────
+//
+// Phase 4 wake/sleep state machine (F12): the session starts Dormant
+// (zero token spend), the developer picks the wake mechanism (voice
+// phrase via glasses.voice.onPhrase, button tap, gesture, MCP call),
+// and the model decides when to end the conversation via the built-in
+// end_conversation tool (endOnIntent default true) — no rigid
+// "goodbye <name>" phrase required. sleepAfterSilence is the
+// deterministic backup. onWake { say(...) } speaks in the assistant's
+// own voice (alloy etc.) so the greeting matches the AI's reply voice.
 class StravaAssistantHandler(
     private val glasses: ExtentosGlasses,
     private val routeTracker: RouteTracker,   // app-internal state
     private val library: ClipLibrary,         // app-internal state
-    private val scope: kotlinx.coroutines.CoroutineScope,
+    private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
 ) {
-    private var activeVideo: kotlinx.coroutines.Deferred<*>? = null
+    private var session: AssistantSession? = null
+    // Natural async + await + stop pattern. The library exposes
+    // glasses.camera.stopVideo() so the customer never has to cancel
+    // the wrapping Deferred — captureVideo() resumes naturally with
+    // Ok(partial) when stop is signalled.
+    private var activeVideo: Deferred<ExtentosResult<VideoClip, CaptureError>>? = null
+    @Volatile private var notesActive: Boolean = false
+    private val notesBuffer = StringBuilder()
     fun start() {
-        // Sugar form (the trailing-lambda builder). The block runs once
-        // at start to register tools + instructions; \`session.update\` is
-        // sent to OpenAI and the audio pump engages. Returns when the
-        // session reaches Active state.
+        // F3 (iteration-2 fix): subscribe to AssistantEvent.UserSpoke for
+        // transcript capture — needed for notes / live-captions / journal
+        // patterns. Fires once per user turn after the provider's STT
+        // completes. PII boundary: text is verbatim, customer-owned.
+        glasses.runtime.events
+            .filterIsInstance<RuntimeEvent.Assistant>()
+            .onEach { evt ->
+                val userSpoke = evt.event as? AssistantEvent.UserSpoke ?: return@onEach
+                if (notesActive) {
+                    synchronized(notesBuffer) {
+                        if (notesBuffer.isNotEmpty()) notesBuffer.append(" ")
+                        notesBuffer.append(userSpoke.transcript)
+                    }
+                }
+            }
+            .launchIn(scope)
         scope.launch {
-            glasses.assistant.start(
-                provider = AssistantProvider.OpenAi(
-                    model = "gpt-realtime",
-                    voice = "alloy",
-                ),
+            session = glasses.assistant.start(
+                provider = AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy"),
             ) {
                 instructions = """
                     You are a Strava companion. Help the runner with route
                     stats and capture moments. Speak briefly — they're
                     running. Don't narrate what you're doing — just do it
-                    and confirm.
+                    and confirm. When the user clearly indicates they want
+                    to stop talking, call end_conversation.
                 """.trimIndent()
+                // Phase 4 v1.1 lifecycle:
+                //   - session lands in Dormant after start() (default)
+                //   - wake via the voice phrase wired below (any other
+                //     trigger — button, gesture, MCP — works the same)
+                //   - onWake speaks the greeting in the model's voice
+                //   - 30 s of user silence → auto-sleep (backup)
+                //   - end_conversation tool registered automatically by
+                //     the library because endOnIntent defaults true
+                onWake { say("Hi, I'm here. What can I do for you?") }
+                sleepAfterSilence(30.seconds)
                 // Read-tools: instant data the AI reads aloud or weaves
                 // into the answer.
                 tool("get_route_remaining", "How much of the planned route is left, in km.") {
@@ -1972,33 +2050,112 @@ class StravaAssistantHandler(
                 }
                 // Action-tools: side effects on the app's own state. The
-                // AI manages the take/stop pair — it knows from context
-                // that stop_video pairs with take_video.
+                // AI manages the take/stop pair from context.
                 tool("take_video", "Start recording a video clip of the runner's view.") {
+                    if (activeVideo?.isActive == true) {
+                        return@tool ToolResult.Err("a recording is already in progress")
+                    }
                     activeVideo = scope.async {
-                        glasses.camera.captureVideo(
-                            VideoConfig(maxDurationSeconds = 30),
-                        )
+                        glasses.camera.captureVideo(VideoConfig(maxDurationSeconds = 30))
                     }
                     ToolResult.Ok("recording started")
                 }
                 tool("stop_video", "Stop the current video recording.") {
-                    // Cooperative cancellation: cancel the Deferred awaiting
-                    // captureVideo. The library's transport layer catches the
-                    // cancellation, fires core.abortCaptureVideo() internally,
-                    // and surfaces a partial clip (or nothing if cancel landed
-                    // before any frames). There is no glasses.camera.stopVideo()
-                    // — Deferred cancellation IS the customer-facing stop
-                    // mechanism. Matches the iOS Task.cancel() pattern.
-                    val pending = activeVideo
+                    // Clean stop pattern: signal the library to gracefully
+                    // end the capture; the in-flight captureVideo() inside
+                    // the \`async\` block resumes naturally with
+                    // ExtentosResult.Ok(partialClip). No coroutine
+                    // cancellation, no sticky-Cancelled Deferred — just
+                    // a normal await on the result.
+                    //
+                    // (Don't be tempted to call activeVideo.cancel() to
+                    // stop — Kotlin Deferred state is sticky-Cancelled,
+                    // so await() throws CancellationException even if the
+                    // library produced a partial. Use stopVideo() instead.)
+                    val capture = activeVideo
                     activeVideo = null
-                    if (pending == null) return@tool ToolResult.Err("nothing was recording")
-                    pending.cancel()
-                    val clip = runCatching { pending.await().valueOrNull() }.getOrNull()
-                    clip?.let { library.add(it); ToolResult.Ok("video saved") }
-                        ?: ToolResult.Err("nothing was recording")
+                    if (capture == null) return@tool ToolResult.Err("nothing was recording")
+                    glasses.camera.stopVideo()
+                    val result = capture.await()
+                    val clip = result.valueOrNull() ?: return@tool ToolResult.Err("video capture failed")
+                    library.add(clip)
+                    ToolResult.Ok("video saved")
+                }
+                tool("start_notes", "Start capturing what the runner says as a note.") {
+                    if (notesActive) return@tool ToolResult.Err("already taking notes")
+                    synchronized(notesBuffer) { notesBuffer.setLength(0) }
+                    notesActive = true
+                    ToolResult.Ok("ok, taking notes")
+                }
+                tool("stop_notes", "Stop note-taking and save the accumulated notes.") {
+                    if (!notesActive) return@tool ToolResult.Err("not currently taking notes")
+                    notesActive = false
+                    val text = synchronized(notesBuffer) {
+                        val s = notesBuffer.toString()
+                        notesBuffer.setLength(0)
+                        s
+                    }
+                    if (text.isBlank()) return@tool ToolResult.Err("nothing was captured")
+                    library.addNote(text)
+                    ToolResult.Ok("notes saved")
+                }
+                // Vision tools — Phase 4 v1.4 includeImage + Photos.copyToFile.
+                //
+                // Two distinct tools, NOT one. The model picks based on
+                // user intent:
+                //   describe_scene — "what / describe / tell me" → AI speaks
+                //                    about the photo; photo NOT persisted.
+                //   save_photo     — "save / capture / take a picture / remember"
+                //                    → photo persisted to library; NOT described.
+                //
+                // Splitting lets the model call BOTH back-to-back when the
+                // user wants both ("save this and tell me what it is") +
+                // keeps each description tight enough for the Mock + real
+                // matchers to disambiguate. See getCapabilityGuide(feature:"assistant_vision")
+                // gotcha about describe-vs-save.
+                tool("describe_scene", "Describe what the runner is currently looking at without saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
+                    val photo = glasses.camera.capturePhoto().valueOrNull()
+                        ?: return@tool ToolResult.Err("camera failed")
+                    val uri = photo.uri
+                        ?: return@tool ToolResult.Err("photo had no uri")
+                    // session is the AssistantSession returned by start();
+                    // captured here via the outer-class property. By the
+                    // time a tool dispatches, the session is Active so
+                    // includeImage won't throw NotReady.
+                    session?.includeImage(uri)
+                    ToolResult.Ok("looking")
+                }
+                tool("save_photo", "Save a photo to the runner's library WITHOUT describing it. Call for 'save this' / 'capture this' / 'take a picture' / 'remember this view'.") {
+                    val photo = glasses.camera.capturePhoto().valueOrNull()
+                        ?: return@tool ToolResult.Err("camera failed")
+                    val uri = photo.uri
+                        ?: return@tool ToolResult.Err("photo had no uri")
+                    val ext = when (Photos.mediaTypeFromUri(uri)) {
+                        "image/png" -> "png"
+                        "image/webp" -> "webp"
+                        else -> "jpg"
+                    }
+                    val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.\$ext")
+                    // Photos.copyToFile mirrors Videos.copyToFile: stream-copy
+                    // across data: / file:// / absolute path, parent mkdirs,
+                    // overwrite, returns Boolean. Use it — don't hand-roll
+                    // loadBytes + writeBytes.
+                    if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("could not save photo")
+                    library.addPhoto(dst.absolutePath, photo.width, photo.height)
+                    ToolResult.Ok("photo saved")
                 }
             }
+            // Wake mechanism — canonical pattern uses the existing
+            // glasses.voice.onPhrase system. Defaults to firesWhen =
+            // VoiceScope.WhenDormant so it won't double-fire during an
+            // active conversation. Swap this line for a button onClick,
+            // a gesture handler, or any other trigger that calls
+            // session?.wake() — the library doesn't dictate the
+            // mechanism, only the lifecycle.
+            glasses.voice.onPhrase("hey strava") { session?.wake() }
         }
     }
 }
@@ -2024,7 +2181,20 @@ suspend fun startRawForm(glasses: ExtentosGlasses) {
     )
     session.start()
 }`,
-        swift: `import GlassesCore
+        swift: `// ⚠️  iOS PARITY IN FLIGHT
+// The Phase 4 wake/sleep state machine (session.wake() / sleep() / say() /
+// onWake / sleepAfterSilence / endOnIntent / VoiceScope.WhenDormant) is
+// LIVE in the Android library + verified in the sim. The Swift port is
+// pending — see shared-context/real-hardware-validation-backlog.md B1
+// for status. This Swift block reflects the Phase 4 ergonomics target
+// but DOES NOT compile against the current iOS library (which still
+// ships the always-on Phase 4 surface from earlier sprints).
+//
+// Until iOS parity ships, iOS apps should use the Phase 3
+// conversation_agent_loop pattern or wait. Track the Mac VPS handoff in
+// shared-context/.
+import GlassesCore
 // ── App bootstrap (do once, e.g. in App.init or DI container) ─────────
 //
@@ -2134,19 +2304,50 @@ func startRawForm(_ glasses: ExtentosGlasses) async throws {
     },
     explanation: `AGENT-DRIVEN E2E TEST LOOP — RUN AFTER createGlasses + handler.start()
+→ For the dedicated agent-driven workflow with FOUR-channel verification
+  (getEventLog + adb logcat + screencap + library-state inspection), the
+  multi-tool sweep pattern, autoWake details, and headless-CI guidance,
+  see \`getCodeExample(pattern: "agent_driven_e2e_full_loop")\`. The block
+  below is the quickstart; the dedicated example is the production
+  pattern.
 The headless verification: drive the assistant with synthetic utterances,
 assert the expected tools fire, then read the event log. Two providers
-satisfy the same agent loop:
+satisfy the same agent loop (since iter5.2, the SAME injectAssistantUtterance
+MCP call drives BOTH):
   - AssistantProvider.Mock — deterministic, sub-millisecond, $0. Word-
     overlap-matches the injected utterance against tool descriptions
     and dispatches the first match. Use for CI + tight inner loop.
   - AssistantProvider.OpenAi — real WebSocket to api.openai.com,
-    real LLM picks the tool, real audio output. Use for end-to-end
-    smoke when the sim browser tab has mic input.
-  // Mock-provider path — set useMockProvider when constructing the
-  // handler in your dogfood / test build. Production stays on OpenAi.
+    real LLM picks the tool, real audio output. Use for real-provider
+    verification — works headless against the real model via
+    injectAssistantUtterance.text since iter5.2 (no mic / no human
+    required). Sim browser tab still required for camera-using tools.
+  // Mock-provider path — DON'T bake a useMock Boolean into the handler
+  // (F13: invisible to users; ships looking like OpenAi by default).
+  // Instead, wire the provider choice via a BuildConfig field tied to
+  // a build flavor:
+  //
+  //   // app/build.gradle.kts
+  //   android {
+  //     flavorDimensions += "assistant"
+  //     productFlavors {
+  //       create("mock")   { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "true") }
+  //       create("real")   { dimension = "assistant"; buildConfigField("Boolean", "USE_MOCK_ASSISTANT", "false") }
+  //     }
+  //   }
+  //
+  //   // In your handler:
+  //   val provider = if (BuildConfig.USE_MOCK_ASSISTANT)
+  //       AssistantProvider.Mock() else
+  //       AssistantProvider.OpenAi(model = "gpt-realtime", voice = "alloy")
+  //
+  // CI builds the "mock" flavor for the agent-driven E2E loop below;
+  // production builds the "real" flavor. The provider is visible in the
+  // AssistantStatusBadge (glasses-ui) + the sim's waveform color so
+  // dev never confuses which mode the running app is in.
   const { sessionId } = await createSimulatorSession({
     glasses: "meta_rayban", platform: "android",
   });
@@ -2232,8 +2433,385 @@ side with this agent loop and watch the panel react.`,
         "**Tool body runs on Dispatchers.IO (Kotlin) / Swift Task (Swift); suspending camera/storage/HTTP calls are fine.** Per-tool blocking opt-out via `tool(name, desc, blocking = true) { ... }` per synthesis #9 — when true, the model waits silently for the result before speaking. Default is non-blocking (model says \"let me check...\" while the tool runs). Use blocking=true for sub-100ms tools where the filler would be awkward (\"what time is it\" returning in 10 ms).",
         "**BYOK key flows direct, not via Extentos.** synthesis §12. `glasses.assistant.setOpenaiApiKey(key)` stores the key in the AssistantClient. When start() opens a WebSocket, the key goes into `Authorization: Bearer ...` for the wss://api.openai.com/v1/realtime?model=gpt-realtime connection. Extentos backend never sees the key. Test endpoint override is on the AssistantProvider.OpenAi case if you need to point at a mock OpenAI proxy.",
         "**`glasses.ai.complete` is deprecated in v1.4.0 and removed in v2.0.0.** Use the OpenAI SDK directly for non-voice LLM calls (image description, summarization, etc.). Migration: see `searchDocs(topic:'assistant_runtime')` → migration section walkthrough C.",
+        "**Vision via `session.includeImage(uri, prompt = null)` (v1.4 addition).** Capture a photo, hand the URI to the assistant inside a tool body — the model sees it + speaks about it in its configured voice. URI accepts data: / http(s): / file:// / content://. The image persists in conversation history at the provider, so follow-up questions in the same session work without re-sending. Active-only (throws NotReady otherwise — safe inside a tool body where the session is always Active). Canonical pattern: `tool(\"describe_scene\", \"...\") { val photo = glasses.camera.capturePhoto().valueOrNull(); session.includeImage(photo.uri); ToolResult.Ok(\"looking\") }`. See getCapabilityGuide(feature:\"assistant_vision\") for the prompt-parameter pattern + URI-type gotchas.",
+        "**Mid-session primitives (iter5 addition).** Four building blocks on `AssistantSession`: `setReasoningEffort(level)` for dynamic effort routing, `updateInstructions(text)` for persona/mode swaps, `cancelSpeak()` for tool-driven interrupts, `conversationHistory(limit)` for forwarding context to a stronger model. All composable with the existing tool surface — write tools that call them in their bodies. Canonical escalation pattern: an `ask_smart_model` tool body reads `session.conversationHistory()`, formats it as context, calls the customer's own GPT-5 (or Anthropic Claude, or Gemini) client, and returns the response as `ToolResult.Ok(answer)` — the realtime model then speaks the answer in its configured voice. Active-only (except conversationHistory, which is always safe). See getCapabilityGuide(feature:\"assistant_session_runtime\") for the canonical snippet of each.",
+    ],
+    relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_vision", "assistant_session_runtime", "audio_chunks", "speak", "capture_photo"],
+};
+// ── Phase 4 agent-driven E2E loop (iter5.2 + iter5.3) ─────────────────
+//
+// Sister pattern to `assistant_agent_loop`. That one shows the customer-
+// side handler code; THIS one shows the AGENT-side workflow that drives
+// + verifies it. The split exists because the audiences differ — a dev
+// shipping a voice assistant cares about the handler; an AI coding agent
+// (or a customer's CI pipeline) cares about the test loop. Both reference
+// each other.
+//
+// Iter5.2 (2026-05-27) made `injectAssistantUtterance(text)` drive the
+// REAL OpenAI Realtime provider, not just Mock. Iter5.3 (2026-05-27)
+// wired autoWake so multi-step sessions don't silently drop injects
+// across silence-timeout sleep transitions. assertToolCalled's cursor
+// anchor (b87388d) eliminated the stale-match false-positive. Together
+// these turn the agent-driven E2E loop from "demo-quality" into a
+// production-grade verification pattern.
+const AGENT_DRIVEN_E2E_FULL_LOOP = {
+    pattern: "agent_driven_e2e_full_loop",
+    title: "Phase 4 assistant — agent-driven E2E loop with multi-channel verification (real OpenAi capable)",
+    description: "How an AI coding agent verifies a Phase 4 voice-assistant handler end-to-end FROM THE SAME MCP SESSION that scaffolded it — no human, no mic. Iter5.2 unlocked driving the REAL OpenAI Realtime provider via `injectAssistantUtterance(text)` (same MCP call as Mock — the runtime decides based on `AssistantProvider.OpenAi` vs `.Mock()` at session creation). The full loop combines FOUR independent verification channels: `getEventLog` for the protocol-level assistant.* event trace, `adb logcat` for tool-body internals (file paths, byte counts, branch decisions), `adb screencap` for UI confirmation, and direct library-state inspection (Room DB / file system) for persisted side effects. When all four agree, the flow really works at every layer. Companion to `assistant_agent_loop` which covers the handler-side code — use both. Replaces the older `agent_test_loop` (Phase 3 cascaded VAD + recordDiscrete + AnthropicClient) for new voice-AI work.",
+    code: {
+        kotlin: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
+// For the full handler-side pattern (8+ tools, wake/sleep hooks,
+// build-flavor Mock/OpenAi switching, voice phrase wiring), see
+// getCodeExample(pattern: "assistant_agent_loop").
+import com.extentos.glasses.core.ExtentosGlasses
+import com.extentos.glasses.core.Photos
+import com.extentos.glasses.core.assistant.AssistantProvider
+import com.extentos.glasses.core.assistant.AssistantSession
+import com.extentos.glasses.core.assistant.ToolResult
+import com.extentos.glasses.core.valueOrNull
+import android.util.Log
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.SupervisorJob
+import kotlinx.coroutines.launch
+import java.io.File
+import kotlin.time.Duration.Companion.seconds
+class NotesAssistant(
+    private val glasses: ExtentosGlasses,
+    private val library: NotesLibrary,  // app-internal: Room + photos/ dir
+    private val scope: CoroutineScope = CoroutineScope(SupervisorJob() + Dispatchers.IO),
+) {
+    private var session: AssistantSession? = null
+    @Volatile private var notesActive = false
+    private val notesBuffer = StringBuilder()
+    fun start() {
+        scope.launch {
+            session = glasses.assistant.start(
+                // F13: pick provider via build flavor (USE_MOCK_ASSISTANT BuildConfig)
+                // — never bake \`useMock\` into handler code. See assistant_agent_loop.
+                provider = AssistantProvider.OpenAi(model = "gpt-realtime-2", voice = "alloy"),
+            ) {
+                instructions = "You are a voice-notes assistant on glasses. Speak briefly."
+                sleepAfterSilence(30.seconds)
+                tool("start_notes", "Start capturing the user's speech as a note. Call when the user says 'start notes', 'take notes', or similar.") {
+                    Log.i(TAG, "start_notes: notesActive=\$notesActive")
+                    if (notesActive) return@tool ToolResult.Err("already taking notes")
+                    notesActive = true
+                    notesBuffer.setLength(0)
+                    ToolResult.Ok("ok, taking notes")
+                }
+                tool("save_notes", "Save the accumulated notes to the library. Call when the user says 'save notes', 'stop notes', or 'I'm done'.") {
+                    Log.i(TAG, "save_notes: buf=\${notesBuffer.length} chars")
+                    if (!notesActive) return@tool ToolResult.Err("not taking notes")
+                    notesActive = false
+                    val text = notesBuffer.toString()
+                    if (text.isBlank()) return@tool ToolResult.Err("nothing captured")
+                    library.addNote(text)
+                    ToolResult.Ok("notes saved")
+                }
+                tool("save_photo", "Take a photo and save it to the user's library. Call when the user says 'save this' or 'take a picture'.") {
+                    val photo = glasses.camera.capturePhoto().valueOrNull()
+                        ?: return@tool ToolResult.Err("camera failed")
+                    val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
+                    Log.i(TAG, "save_photo: uri=\${uri.take(60)} \${photo.width}x\${photo.height}")
+                    val dst = File(library.photosDir, "photo-\${System.currentTimeMillis()}.jpg")
+                    if (!Photos.copyToFile(uri, dst)) return@tool ToolResult.Err("save failed")
+                    Log.i(TAG, "save_photo: wrote \${dst.length()} bytes to \${dst.absolutePath}")
+                    library.addPhoto(dst.absolutePath, photo.width, photo.height)
+                    ToolResult.Ok("photo saved")
+                }
+                tool("describe_scene", "Describe what the user is currently looking at WITHOUT saving the photo. Call for 'what do you see' / 'describe this' / 'tell me about this'.") {
+                    val photo = glasses.camera.capturePhoto().valueOrNull()
+                        ?: return@tool ToolResult.Err("camera failed")
+                    val uri = photo.uri ?: return@tool ToolResult.Err("no uri")
+                    Log.i(TAG, "describe_scene: handing uri to session.includeImage")
+                    session?.includeImage(uri)
+                    ToolResult.Ok("looking")
+                }
+            }
+            // Wake phrase — the wake handler the autoWake pre-wake path matches.
+            glasses.voice.onPhrase("hey notes") {
+                Log.i(TAG, "wake phrase matched")
+                session?.wake()
+            }
+        }
+    }
+    private companion object {
+        const val TAG = "NotesAssistant"
+    }
+}`,
+        swift: `// HANDLER UNDER TEST — minimal Phase 4 assistant the agent loop drives.
+// For the full handler-side pattern see getCodeExample(pattern: "assistant_agent_loop").
+//
+// ⚠️  iOS PARITY IN FLIGHT — the Phase 4 wake/sleep state machine + iter5
+// mid-session primitives are LIVE on Android. The Swift port is queued
+// (see shared-context/ios-pure-sdk-pivot-handoff.md). This Swift block
+// reflects the Phase 4 ergonomics target; current iOS apps using
+// glasses.assistant still ship the always-on surface from earlier sprints.
+// The TS agent-test loop below is platform-agnostic — it works against
+// either the Android handler above OR an iOS app running the same APIs
+// once the iOS port lands.
+import GlassesCore
+final class NotesAssistant: @unchecked Sendable {
+    private let glasses: ExtentosGlasses
+    private let library: NotesLibrary
+    private var session: AssistantSession?
+    private var notesActive = false
+    private var notesBuffer = ""
+    init(glasses: ExtentosGlasses, library: NotesLibrary) {
+        self.glasses = glasses
+        self.library = library
+    }
+    func start() async throws {
+        session = try await glasses.assistant.start(
+            provider: .openAI(model: "gpt-realtime-2", voice: "alloy")
+        ) { config in
+            config.instructions = "You are a voice-notes assistant on glasses. Speak briefly."
+            // config.sleepAfterSilence(.seconds(30))   // iOS API parity pending
+            config.tool("start_notes", description: "Start capturing the user's speech as a note. Call when the user says 'start notes' / 'take notes'.") {
+                NSLog("NotesAssistant start_notes: notesActive=\\(self.notesActive)")
+                if self.notesActive { return .err("already taking notes") }
+                self.notesActive = true
+                self.notesBuffer = ""
+                return .ok("ok, taking notes")
+            }
+            config.tool("save_notes", description: "Save the accumulated notes. Call when the user says 'save notes' / 'stop notes' / 'I'm done'.") {
+                NSLog("NotesAssistant save_notes: buf=\\(self.notesBuffer.count) chars")
+                if !self.notesActive { return .err("not taking notes") }
+                self.notesActive = false
+                if self.notesBuffer.isEmpty { return .err("nothing captured") }
+                self.library.addNote(self.notesBuffer)
+                return .ok("notes saved")
+            }
+            config.tool("save_photo", description: "Take a photo and save it to the library. Call when the user says 'save this' / 'take a picture'.") {
+                guard let photo = (await self.glasses.camera.capturePhoto()).success else {
+                    return .err("camera failed")
+                }
+                NSLog("NotesAssistant save_photo: width=\\(photo.width)")
+                self.library.addPhoto(photo)
+                return .ok("photo saved")
+            }
+        }
+        // Wake phrase wiring (handler the autoWake pre-wake path matches).
+        _ = glasses.voice.onPhrase("hey notes") { [weak self] in
+            await self?.session?.wake()
+        }
+    }
+}`,
+    },
+    explanation: `THE AGENT-SIDE TEST LOOP — what runs in YOUR conversation, NOT in the customer's app
+  // 1. Mint or resume the sim. createSimulatorSession is get-or-create —
+  //    pass resetFresh:true only when you need a clean event-log slate.
+  const { sessionId } = await createSimulatorSession({
+    glasses: "meta_rayban", platform: "android",
+  });
+  // 2. Sweep across N tools in one session. autoWake (default true)
+  //    handles the silence-timeout sleep transitions that bite multi-step
+  //    tests where the agent spends >30s between injects.
+  const cases = [
+    { utter: "start taking notes",          expect: "start_notes" },
+    { utter: "save this view",              expect: "save_photo"  },
+    { utter: "what am I looking at",        expect: "describe_scene" },
+    { utter: "okay save the notes",         expect: "save_notes" },
+  ];
+  for (const c of cases) {
+    const inj = await injectAssistantUtterance({
+      sessionId,
+      text: c.utter,
+      // autoWake defaults true; wakePhrase defaults "hey elizabeth" —
+      // override for apps with a different wake phrase:
+      wakePhrase: "hey notes",
+    });
+    // inj.autoWake describes what the pre-wake step did:
+    //   { action: "skipped_active" }              session was up already
+    //   { action: "pre_waked", waitedMs: 1843 }   re-waked from dormant
+    //   { action: "pre_wake_timeout", reason }    wake phrase didn't match
+    //                                              — check glasses.voice.onPhrase
+    //                                              registration in the app
+    // 3. Assert the tool fired. The cursor anchor (auto since b87388d)
+    //    eliminates stale matches from prior runs.
+    const call = await assertToolCalled({
+      sessionId, name: c.expect, timeoutMs: 5000,
+    });
+    console.log(\`\${c.expect}: call_id=\${call.call_id} waited=\${call.waitedMs}ms\`);
+  }
+THE FOUR VERIFICATION CHANNELS — when does an assertion really mean it worked?
+CHANNEL 1 — getEventLog (protocol layer)
+The 8-event assistant.* family captures the full conversation. Anchor a cursor
+before injecting, then read forward:
+  const head = await getEventLog({ sessionId, filter: "voice", limit: 1 });
+  let cur = head.cursor;
+  await injectAssistantUtterance({ sessionId, text: "save this view" });
+  await assertToolCalled({ sessionId, name: "save_photo", timeoutMs: 5000 });
+  const trace = await getEventLog({ sessionId, cursor: cur, filter: "voice" });
+  // Expected event sequence:
+  //   assistant.user_spoke      transcript="save this view"
+  //   assistant.tool_called     name="save_photo", call_id
+  //   assistant.tool_result     call_id, output="photo saved", is_error:false,
+  //                             duration_ms
+  //   assistant.assistant_spoke transcript="<the model's confirmation>"
+CHANNEL 2 — adb logcat (tool-body internals)
+assertToolCalled proves the tool FIRED. It does NOT prove the body finished
+cleanly. Tool bodies log internals via android.util.Log — file paths, byte
+counts, branch decisions. Tail logcat with a tag filter to see what the body
+actually did:
+  // Find the adb binary (Windows: Sdk\\platform-tools\\adb.exe;
+  // macOS: ~/Library/Android/sdk/platform-tools/adb;
+  // Linux: ~/Android/Sdk/platform-tools/adb).
+  adb logcat -d -s "NotesAssistant:I" \\
+    | tail -50
+  // I/NotesAssistant: save_photo: uri=content://media/external/images/... 1920x1080
+  // I/NotesAssistant: save_photo: wrote 487231 bytes to /data/data/<pkg>/files/photos/photo-1716908...
+If Channel 1 fires but Channel 2 is silent past the tool entry log, the body
+hit a branch that returned early without logging. Often a missing else clause.
+CHANNEL 3 — adb screencap (UI confirmation)
+Run-state changes (badge color, count, list items) reflect the persisted
+state in the UI. screencap proves the binding observed the new state:
+  adb exec-out screencap -p > /tmp/post-save.png
+  // Then Read /tmp/post-save.png — the agent SEES what the user would see.
+  // The Notes app's count badge incremented; the latest photo appears in
+  // the gallery row; the "taking notes" indicator turned off.
+Channel 3 catches UI-binding bugs: the tool wrote to the DB but the UI didn't
+recompose. Often a missing StateFlow.emit / notifyDataSetChanged / SwiftUI
+@Published.
+CHANNEL 4 — library-state inspection (persistence)
+Tools writing to local storage leave a durable artifact. Pull + inspect:
+  // Photos directory
+  adb exec-out run-as <pkg> ls -la files/photos/ | tail -5
+  // -rw------- 1 u0_a234 u0_a234 487231 2026-05-27 21:14 photo-1716908143421.jpg
+  // Room DB (always pull .db + .db-wal + .db-shm together — WAL is mandatory)
+  adb exec-out run-as <pkg> cat databases/notes.db     > /tmp/local.db
+  adb exec-out run-as <pkg> cat databases/notes.db-wal > /tmp/local.db-wal
+  adb exec-out run-as <pkg> cat databases/notes.db-shm > /tmp/local.db-shm
+  sqlite3 /tmp/local.db "SELECT id, text FROM notes ORDER BY id DESC LIMIT 3"
+Channel 4 catches persistence bugs: the tool succeeded but the write never
+hit storage. Common when a developer comments out the DB insert while
+debugging the speak()-side and forgets to uncomment.
+WHEN THE CHANNELS DISAGREE
+  Channels 1+2+3+4 agree                  flow works end-to-end
+  Channel 1 fires, Channel 2 silent       body returned early; missing branch log
+  Channels 1+2 fire, Channel 3 unchanged  UI binding didn't fire; missing emit
+  Channels 1-3 fire, Channel 4 missing    body wrote to a transient; never persisted
+Each disagreement points at a specific layer. With only Channel 1 (which is
+what assertToolCalled gives you in isolation), half these failures look
+identical to a passing test.
+REAL OPENAI VS MOCK — when to pick which
+  Mock           sub-ms, deterministic, $0. Word-overlap-matches tool
+                 descriptions. Use for CI / tight inner loop / regression
+                 sweeps.
+  OpenAi         500-1000ms per inject for state-toggle tools, 2000-4000ms
+                 for camera+vision tools, ~$0.005, real model reasoning, real
+                 tool routing under the model's actual instructions. Use for
+                 confidence-building before a release; for catching cases
+                 where the model picks the WRONG tool when descriptions
+                 overlap; for vision tests (includeImage); for confirming
+                 the model speaks a sensible confirmation back.
+Same injectAssistantUtterance call works against both — \`AssistantProvider.OpenAi\`
+vs \`AssistantProvider.Mock()\` at session creation is the entire switch.
+F13: never bake a \`useMock\` Boolean into handler code; wire it via a build
+flavor so the choice is visible in the AssistantStatusBadge (glasses-ui) and
+the sim's waveform color, and developers never confuse which mode is running.
+WAKING THE SESSION — autoWake (iter5.3) replaced the old two-step dance
+Pre-iter5.3, agents had to manually inject the wake phrase before each
+assistant inject if the session might be dormant:
+  // OLD pattern — no longer required
+  await injectTranscript({ sessionId, text: "hey notes" });
+  await sleep(3000);
+  await injectAssistantUtterance({ sessionId, text: "save this view" });
+Now autoWake handles it transparently — default true, only fires when the
+event-log scan shows the session isn't currently Active:
+  await injectAssistantUtterance({
+    sessionId, text: "save this view", wakePhrase: "hey notes",
+  });
+  // If session was Active: skipped (one ~50ms events query, no wake roundtrip)
+  // If session was Dormant: pre-injects "hey notes", waits ~1-2s for
+  //                         assistant.session_started, then dispatches.
+Set \`autoWake: false\` only when you want to ASSERT the dormant-drop behavior
+itself, or when you're driving wake manually with custom timing.
+CAMERA-USING TOOLS NEED A LIVE SIM BROWSER TAB
+Tools that call glasses.camera.capturePhoto / captureVideo need the SIM
+BROWSER TAB OPEN — that's where camera input streams live in the simulator.
+Without it, the tool body returns Err("camera failed") and the Channel 1
+event log shows tool_result.is_error=true.
+Open the sim browser tab at the start of your session:
+  // Windows
+  await sh(\`cmd /c start https://extentos.com/s/\${sessionId}\`);
+  // macOS
+  await sh(\`open https://extentos.com/s/\${sessionId}\`);
+  // Then poll until ready
+  while (true) {
+    const s = await getSimulatorStatus({ sessionId });
+    if (s.connectedRoles?.browser) break;
+    await sleep(500);
+  }
+For headless CI: camera-tool verification is blocked today. Non-camera tools
+(state toggles, persistence, AI calls) work headless. A future
+\`ensureSimulatorBrowser\` MCP tool would lift this for cross-platform CI —
+see project_iter4_sim_agent_discovery in shared-context.`,
+    gotchas: [
+        "**autoWake's pre-wake adds ~1-2s on dormant sessions; ~50ms on active ones.** The handler now scans recent events to detect state; if assistant.went_dormant or assistant.session_ended is more recent than any other assistant.* signal, it pre-injects the wake phrase + waits for assistant.session_started before dispatching. Active sessions skip the round trip entirely. To opt out (e.g. when asserting the dormant-drop behavior itself), pass `autoWake: false`. The response's `autoWake` block reports exactly what happened — read it on the first few injects of a new session to confirm the wake phrase is correct.",
+        "**wakePhrase defaults to \"hey elizabeth\" — pass `wakePhrase` explicitly for other apps.** The default matches the canonical dogfood pattern. When the phrase doesn't match what the customer registered via `glasses.voice.onPhrase(...) { session.wake() }`, autoWake reports `action: \"pre_wake_timeout\"` with a diagnostic message + still dispatches the inject (which then drops since the session is still dormant). Fix the phrase + retry.",
+        "**assertToolCalled now cursor-anchors at \"now\" (b87388d).** Pre-fix, a no-cursor first poll returned the OLDEST 200 events — any matching assistant.tool_called from yesterday's testing satisfied immediately with a stale call_id + bogus waitedMs (e.g. 400ms reported on an OpenAi call that physically takes 1-2s). Now: a one-time limit=1 anchor before polling guarantees only events arriving AFTER the call match. No schema change for callers.",
+        "**Camera-tool tests need the sim browser tab open. There's no automated workaround today.** save_photo / describe_scene / capture_video and any tool body that calls glasses.camera.* will return Err(\"camera failed\") without `connectedRoles.browser` being non-null. Open the tab once at the session start (commands above) and poll getSimulatorStatus until ready. Future: an `ensureSimulatorBrowser` MCP tool will collapse this into one call.",
+        "**adb logcat needs the adb binary on your PATH, or invoked by its full path.** On Windows it's at `C:/Users/<you>/AppData/Local/Android/Sdk/platform-tools/adb.exe`; on macOS at `~/Library/Android/sdk/platform-tools/adb`; on Linux at `~/Android/Sdk/platform-tools/adb`. The MCP server runs on YOUR machine, so cloud-hosted agents can't reach the emulator — Channels 2/3/4 are local-only. Headless CI agents can still do Channel 1 (getEventLog is the only protocol-level channel).",
+        "**WAL caveat (Android Room): always pull .db + .db-wal + .db-shm together.** Room writes in WAL mode by default. Pulling only the .db file gives a stale snapshot — recent writes live in .db-wal. The agent's first DB-read returning zero rows almost always means missing .db-wal. iOS / GRDB doesn't have this — GRDB ships with WAL enabled but the pulled DB self-checkpoints on close.",
+        "**Don't mint a fresh sim per test.** createSimulatorSession is get-or-create — the same sim resumes across test runs. Use `resetFresh: true` ONLY when you need a clean event-log slate; otherwise rebuild + reinstall the app + the library reattaches automatically. The autoWake state-detection scans the latest 200 events, so a long-running sim with thousands of events still resolves state correctly (the lifecycle markers are usually well within the window).",
+        "**OpenAi inject latency varies by tool body cost.** State-toggle tools (start_notes / save_notes) dispatch in 500-1000ms; camera+vision tools (describe_scene with includeImage) take 2000-4000ms (camera + image upload + model reasoning). Budget `timeoutMs: 5000` for the former + `timeoutMs: 10000` for vision tools. Mock dispatches in <50ms for everything.",
+        "**The same `injectAssistantUtterance(text)` MCP call drives BOTH Mock and OpenAi.** The runtime picks based on `AssistantProvider.OpenAi(...)` vs `AssistantProvider.Mock()` at session creation. Wire the choice via a build flavor (USE_MOCK_ASSISTANT BuildConfig) per F13 — never bake `useMock` into handler code. CI runs the Mock flavor for the inner loop; pre-release smoke runs the OpenAi flavor for the real-model confidence pass.",
+        "**Pull the JS-side test driver into your CI as a script, not inside the agent.** The agent-driven workflow above is great for inner-loop scaffolding, but for repeated CI runs commit the test code as a separate `.test.ts` / `.test.kt` file in the host app's repo. Use `vitest` (TS) or `kotlinx.coroutines.test.runTest` (Kotlin instrumented test) to run it. The MCP tool calls are still issued (the test driver calls Extentos backend APIs directly using @extentos/mcp-server's exported helpers, or via the MCP HTTP transport).",
     ],
-    relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "audio_chunks", "speak"],
+    relatedFeatures: ["assistant_runtime", "assistant_start", "assistant_tool", "assistant_provider_openai", "assistant_session_runtime", "assistant_vision"],
 };
 export const CODE_EXAMPLES = {
     voice_qa_assistant: VOICE_QA_ASSISTANT,
@@ -2245,6 +2823,7 @@ export const CODE_EXAMPLES = {
     byok_anthropic: BYOK_ANTHROPIC,
     agent_test_loop: AGENT_TEST_LOOP,
     assistant_agent_loop: ASSISTANT_AGENT_LOOP,
+    agent_driven_e2e_full_loop: AGENT_DRIVEN_E2E_FULL_LOOP,
     conversation_agent_loop: CONVERSATION_AGENT_LOOP,
 };
 export const CODE_EXAMPLE_PATTERNS = Object.keys(CODE_EXAMPLES).sort();