npm - agent-office - Versions diffs - 0.4.7 → 0.4.8 - Mend

agent-office 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md +2 -5
package/dist/cli.js +1 -0
package/dist/commands/communicator.d.ts +1 -0
package/dist/commands/communicator.js +803 -2
package/package.json +1 -1

package/README.md CHANGED Viewed

@@ -40,10 +40,6 @@ An office for your AI agents. Manage multiple [OpenCode](https://opencode.ai) co
                     +-----------------------+
 ```
-## Breaking Changes in v0.4.7
-**Cron Job Approval Workflow**: Workers can no longer create cron jobs directly. Instead, they must use `agent-office worker cron request` to submit requests that require human approval. The old `agent-office worker cron create` command has been renamed to `cron request`. This change ensures all automated tasks have human oversight.
 ## Installation
 ```bash
@@ -206,9 +202,10 @@ Options:
   --password <password>    API password (env: AGENT_OFFICE_PASSWORD)
   --host <host>            Communicator bind host (default: 127.0.0.1)
   --port <port>            Communicator bind port (default: 7655)
+  --xai-key <key>          xAI API key for voice chat (enables voice button)
 ```
-Features: dark theme, iMessage-style chat bubbles, auto-scroll, Enter to send (Shift+Enter for newline), live message polling (5s), unread indicators, status display, and a reset button to revert the agent's session.
+Features: dark theme, iMessage-style chat bubbles, auto-scroll, Enter to send (Shift+Enter for newline), live message polling (5s), unread indicators, status display, and a reset button to revert the agent's session. **Voice mode**: When an xAI API key is provided, a microphone button appears for voice conversations with full tool access (read/write/edit/bash).
 ### `agent-office worker` (for AI agents)

package/dist/cli.js CHANGED Viewed

@@ -129,6 +129,7 @@ appCmd
     .option("--password <password>", "API password for the agent-office server", process.env.AGENT_OFFICE_PASSWORD ?? "secret")
     .option("--host <host>", "Host to bind the web server to", "127.0.0.1")
     .option("--port <port>", "Port to run the web server on", "7655")
+    .option("--xai-key <key>", "xAI API key for voice chat (enables voice button)", process.env.XAI_API_KEY)
     .action(async (options) => {
     const { appCoworkerChatWeb } = await import("./commands/communicator.js");
     await appCoworkerChatWeb(options);

package/dist/commands/communicator.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@ interface CommunicatorOptions {
     password: string;
     host: string;
     port: string;
+    xaiKey?: string;
 }
 export declare function appCoworkerChatWeb(options: CommunicatorOptions): Promise<void>;
 export {};

package/dist/commands/communicator.js CHANGED Viewed

@@ -1,4 +1,7 @@
 import express from "express";
+import { exec } from "child_process";
+import { readFile, writeFile, mkdir } from "fs/promises";
+import { dirname } from "path";
 // ── API helpers ───────────────────────────────────────────────────────────────
 async function apiFetch(agentUrl, password, path, init = {}) {
     const res = await fetch(`${agentUrl}${path}`, {
@@ -489,6 +492,122 @@ function renderPage(coworker, coworkers, msgs, humanName) {
     /* ── HTMX request indicator ── */
     .htmx-request .send-btn { background: var(--accent-dim); }
+    /* ── Voice button ── */
+    .voice-btn {
+      width: 36px; height: 36px;
+      border-radius: 50%;
+      background: var(--surface2);
+      border: 1px solid var(--border);
+      color: var(--text-dim);
+      cursor: pointer;
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      flex-shrink: 0;
+      transition: all 0.2s;
+      padding: 0;
+    }
+    .voice-btn:hover { border-color: var(--accent); color: var(--accent); }
+    .voice-btn:disabled { opacity: 0.4; cursor: not-allowed; }
+    .voice-btn svg { width: 18px; height: 18px; }
+    .voice-btn.active {
+      background: var(--red);
+      border-color: var(--red);
+      color: #fff;
+      animation: voice-pulse 1.5s ease-in-out infinite;
+    }
+    .voice-btn.connecting {
+      background: var(--accent-dim);
+      border-color: var(--accent);
+      color: var(--accent);
+      animation: voice-pulse 0.8s ease-in-out infinite;
+    }
+    @keyframes voice-pulse {
+      0%, 100% { box-shadow: 0 0 0 0 rgba(255, 107, 107, 0.4); }
+      50% { box-shadow: 0 0 0 8px rgba(255, 107, 107, 0); }
+    }
+    /* ── Voice overlay ── */
+    .voice-overlay {
+      position: absolute;
+      top: var(--header-h);
+      left: 0; right: 0; bottom: 0;
+      background: rgba(15, 17, 23, 0.95);
+      z-index: 50;
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      backdrop-filter: blur(8px);
+    }
+    .voice-overlay-content {
+      display: flex;
+      flex-direction: column;
+      align-items: center;
+      gap: 24px;
+      padding: 32px;
+    }
+    .voice-visualizer {
+      position: relative;
+      width: 120px; height: 120px;
+      display: flex;
+      align-items: center;
+      justify-content: center;
+    }
+    .voice-ring {
+      position: absolute;
+      width: 100%; height: 100%;
+      border-radius: 50%;
+      border: 2px solid var(--accent);
+      opacity: 0.3;
+      animation: voice-ring-pulse 2s ease-in-out infinite;
+    }
+    .voice-ring-2 { animation-delay: 0.4s; width: 140%; height: 140%; top: -20%; left: -20%; opacity: 0.15; }
+    .voice-ring-3 { animation-delay: 0.8s; width: 180%; height: 180%; top: -40%; left: -40%; opacity: 0.08; }
+    .voice-overlay.speaking .voice-ring { border-color: var(--green); }
+    .voice-overlay.listening .voice-ring { border-color: var(--accent); }
+    @keyframes voice-ring-pulse {
+      0%, 100% { transform: scale(1); opacity: 0.3; }
+      50% { transform: scale(1.1); opacity: 0.1; }
+    }
+    .voice-avatar {
+      width: 64px; height: 64px;
+      border-radius: 50%;
+      background: var(--accent-dim);
+      color: var(--accent);
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      font-weight: 700;
+      font-size: 24px;
+      z-index: 1;
+    }
+    .voice-status {
+      font-size: 16px;
+      color: var(--text);
+      font-weight: 500;
+    }
+    .voice-transcript {
+      font-size: 14px;
+      color: var(--text-dim);
+      text-align: center;
+      max-width: 400px;
+      min-height: 40px;
+      line-height: 1.4;
+    }
+    .voice-end-btn {
+      background: var(--red);
+      border: none;
+      border-radius: 22px;
+      color: #fff;
+      cursor: pointer;
+      font-size: 14px;
+      font-weight: 600;
+      padding: 10px 24px;
+      transition: background 0.15s, transform 0.1s;
+    }
+    .voice-end-btn:hover { background: #ff8888; }
+    .voice-end-btn:active { transform: scale(0.95); }
   </style>
 </head>
 <body>
@@ -511,6 +630,23 @@ function renderPage(coworker, coworkers, msgs, humanName) {
            hx-swap="innerHTML"></div>
     </div>
     <a href="/cron-requests" class="header-link" title="Manage cron job requests">⚙️</a>
+    <button class="voice-btn" id="voice-btn"
+            onclick="toggleVoice()"
+            title="Voice chat"
+            style="display:none"
+            ${!selected ? 'disabled' : ''}>
+      <svg class="voice-icon-mic" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
+        <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
+        <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
+        <line x1="12" y1="19" x2="12" y2="23"></line>
+        <line x1="8" y1="23" x2="16" y2="23"></line>
+      </svg>
+      <div class="voice-icon-stop" style="display:none">
+        <svg viewBox="0 0 24 24" fill="currentColor">
+          <rect x="6" y="6" width="12" height="12" rx="2"></rect>
+        </svg>
+      </div>
+    </button>
     <button class="reset-btn"
             hx-post="/reset?coworker=${encodeURIComponent(selected)}"
             hx-target="#reset-status"
@@ -531,6 +667,21 @@ function renderPage(coworker, coworkers, msgs, humanName) {
   <div id="reset-status"></div>
+  <!-- Voice overlay -->
+  <div id="voice-overlay" class="voice-overlay" style="display:none">
+    <div class="voice-overlay-content">
+      <div class="voice-visualizer" id="voice-visualizer">
+        <div class="voice-ring"></div>
+        <div class="voice-ring voice-ring-2"></div>
+        <div class="voice-ring voice-ring-3"></div>
+        <div class="voice-avatar" id="voice-avatar">?</div>
+      </div>
+      <div class="voice-status" id="voice-status">Connecting...</div>
+      <div class="voice-transcript" id="voice-transcript"></div>
+      <button class="voice-end-btn" onclick="toggleVoice()">End Voice Chat</button>
+    </div>
+  </div>
   <!-- Messages -->
   <div class="messages-outer" id="messages-outer">
     <div id="messages"
@@ -640,9 +791,10 @@ function renderPage(coworker, coworkers, msgs, humanName) {
   // Initial scroll
   scrollToBottom()
-  // Switch to a different coworker
+  // Switch to a different coworker — stop any active voice session first
   function switchCoworker(name) {
     if (!name) return
+    if (voiceState.active) stopVoice()
     const url = new URL(window.location.href)
     url.searchParams.set('coworker', name)
     window.location.href = url.toString()
@@ -679,6 +831,459 @@ function renderPage(coworker, coworkers, msgs, humanName) {
   document.addEventListener('htmx:afterSwap', () => {
     renderMarkdown()
   })
+  // ── Voice Chat ─────────────────────────────────────────────────────────────
+  const SAMPLE_RATE = 24000
+  const voiceState = {
+    active: false,
+    ws: null,
+    audioCtx: null,
+    micStream: null,
+    scriptProcessor: null,
+    playbackQueue: [],
+    isPlaying: false,
+    nextPlayTime: 0,
+  }
+  // Check if voice is enabled and show button
+  fetch('/voice/config').then(r => r.json()).then(cfg => {
+    if (cfg.enabled) {
+      const btn = document.getElementById('voice-btn')
+      if (btn) btn.style.display = 'flex'
+    }
+  }).catch(() => {})
+  function toggleVoice() {
+    if (voiceState.active) {
+      stopVoice()
+    } else {
+      startVoice()
+    }
+  }
+  async function startVoice() {
+    const select = document.getElementById('coworker-select')
+    const coworker = select ? select.value : ''
+    if (!coworker) return
+    const btn = document.getElementById('voice-btn')
+    const overlay = document.getElementById('voice-overlay')
+    const statusEl = document.getElementById('voice-status')
+    const transcriptEl = document.getElementById('voice-transcript')
+    const avatarEl = document.getElementById('voice-avatar')
+    // Update UI to connecting state
+    btn.classList.add('connecting')
+    btn.querySelector('.voice-icon-mic').style.display = 'none'
+    btn.querySelector('.voice-icon-stop').style.display = 'flex'
+    overlay.style.display = 'flex'
+    statusEl.textContent = 'Connecting...'
+    transcriptEl.textContent = ''
+    avatarEl.textContent = coworker.charAt(0).toUpperCase()
+    try {
+      // Request ephemeral token from our backend
+      const sessRes = await fetch('/voice/session', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ coworker }),
+      })
+      if (!sessRes.ok) {
+        const err = await sessRes.json().catch(() => ({}))
+        throw new Error(err.error || 'Failed to create voice session')
+      }
+      const sessData = await sessRes.json()
+      const token = sessData.token
+      const instructions = sessData.instructions
+      if (!token) throw new Error('No ephemeral token received')
+      // Request microphone access
+      const micStream = await navigator.mediaDevices.getUserMedia({ audio: {
+        sampleRate: SAMPLE_RATE,
+        channelCount: 1,
+        echoCancellation: true,
+        noiseSuppression: true,
+        autoGainControl: true,
+      }})
+      voiceState.micStream = micStream
+      // Create audio context for playback
+      const audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE })
+      voiceState.audioCtx = audioCtx
+      voiceState.nextPlayTime = 0
+      // Connect WebSocket to xAI realtime API using subprotocol for auth
+      const ws = new WebSocket('wss://api.x.ai/v1/realtime', [
+        'xai-client-secret.' + token
+      ])
+      voiceState.ws = ws
+      ws.onopen = () => {
+        voiceState.active = true
+        btn.classList.remove('connecting')
+        btn.classList.add('active')
+        statusEl.textContent = 'Listening...'
+        overlay.classList.add('listening')
+        overlay.classList.remove('speaking')
+        // Configure session with tools
+        ws.send(JSON.stringify({
+          type: 'session.update',
+          session: {
+            voice: 'Ara',
+            instructions: instructions,
+            turn_detection: { type: 'server_vad' },
+            audio: {
+              input:  { format: { type: 'audio/pcm', rate: SAMPLE_RATE } },
+              output: { format: { type: 'audio/pcm', rate: SAMPLE_RATE } },
+            },
+            tools: [
+              {
+                type: 'function',
+                name: 'read',
+                description: 'Read a file from the filesystem. Returns the file contents. Use this to examine source code, config files, or any text file.',
+                parameters: {
+                  type: 'object',
+                  properties: {
+                    path: { type: 'string', description: 'Absolute or relative file path to read' },
+                    offset: { type: 'number', description: 'Line number to start reading from (1-indexed). Optional.' },
+                    limit: { type: 'number', description: 'Maximum number of lines to read. Optional, defaults to 200.' },
+                  },
+                  required: ['path'],
+                },
+              },
+              {
+                type: 'function',
+                name: 'write',
+                description: 'Write content to a file, creating it if it does not exist or overwriting if it does. Use this to create new files.',
+                parameters: {
+                  type: 'object',
+                  properties: {
+                    path: { type: 'string', description: 'Absolute or relative file path to write' },
+                    content: { type: 'string', description: 'The full content to write to the file' },
+                  },
+                  required: ['path', 'content'],
+                },
+              },
+              {
+                type: 'function',
+                name: 'edit',
+                description: 'Edit a file by replacing an exact string match with new content. The oldString must match exactly (including whitespace and indentation).',
+                parameters: {
+                  type: 'object',
+                  properties: {
+                    path: { type: 'string', description: 'Absolute or relative file path to edit' },
+                    oldText: { type: 'string', description: 'The exact text to find and replace' },
+                    newText: { type: 'string', description: 'The replacement text' },
+                  },
+                  required: ['path', 'oldText', 'newText'],
+                },
+              },
+              {
+                type: 'function',
+                name: 'bash',
+                description: 'Execute a bash command and return its output. Use for running scripts, git commands, build tools, listing files, searching, etc.',
+                parameters: {
+                  type: 'object',
+                  properties: {
+                    command: { type: 'string', description: 'The bash command to execute' },
+                    timeout: { type: 'number', description: 'Timeout in seconds. Optional, defaults to 30.' },
+                  },
+                  required: ['command'],
+                },
+              },
+            ],
+          },
+        }))
+        // Start streaming microphone audio
+        startMicStreaming(ws, micStream, audioCtx)
+      }
+      ws.onmessage = (event) => {
+        const data = JSON.parse(event.data)
+        handleVoiceEvent(data)
+      }
+      ws.onerror = () => {
+        statusEl.textContent = 'Connection error'
+        setTimeout(() => stopVoice(), 2000)
+      }
+      ws.onclose = () => {
+        if (voiceState.active) {
+          stopVoice()
+        }
+      }
+    } catch (err) {
+      console.error('Voice start error:', err)
+      const statusEl = document.getElementById('voice-status')
+      if (statusEl) statusEl.textContent = 'Error: ' + (err.message || 'Unknown error')
+      setTimeout(() => stopVoice(), 2500)
+    }
+  }
+  function startMicStreaming(ws, micStream, audioCtx) {
+    const source = audioCtx.createMediaStreamSource(micStream)
+    // Use ScriptProcessorNode for broad compatibility (including mobile)
+    const bufSize = 4096
+    const processor = audioCtx.createScriptProcessor(bufSize, 1, 1)
+    voiceState.scriptProcessor = processor
+    processor.onaudioprocess = (e) => {
+      if (!voiceState.active || ws.readyState !== WebSocket.OPEN) return
+      const inputData = e.inputBuffer.getChannelData(0)
+      // Resample if audioCtx sample rate differs from target
+      let pcmFloat
+      if (audioCtx.sampleRate !== SAMPLE_RATE) {
+        const ratio = SAMPLE_RATE / audioCtx.sampleRate
+        const newLen = Math.round(inputData.length * ratio)
+        pcmFloat = new Float32Array(newLen)
+        for (let i = 0; i < newLen; i++) {
+          const srcIdx = i / ratio
+          const lo = Math.floor(srcIdx)
+          const hi = Math.min(lo + 1, inputData.length - 1)
+          const frac = srcIdx - lo
+          pcmFloat[i] = inputData[lo] * (1 - frac) + inputData[hi] * frac
+        }
+      } else {
+        pcmFloat = inputData
+      }
+      // Convert Float32 to Int16 PCM
+      const pcm16 = new Int16Array(pcmFloat.length)
+      for (let i = 0; i < pcmFloat.length; i++) {
+        const s = Math.max(-1, Math.min(1, pcmFloat[i]))
+        pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF
+      }
+      // Base64 encode
+      const bytes = new Uint8Array(pcm16.buffer)
+      let binary = ''
+      for (let i = 0; i < bytes.length; i++) {
+        binary += String.fromCharCode(bytes[i])
+      }
+      const b64 = btoa(binary)
+      ws.send(JSON.stringify({
+        type: 'input_audio_buffer.append',
+        audio: b64,
+      }))
+    }
+    source.connect(processor)
+    processor.connect(audioCtx.destination)
+  }
+  function handleVoiceEvent(data) {
+    const overlay = document.getElementById('voice-overlay')
+    const statusEl = document.getElementById('voice-status')
+    const transcriptEl = document.getElementById('voice-transcript')
+    switch (data.type) {
+      case 'input_audio_buffer.speech_started':
+        if (overlay) { overlay.classList.add('listening'); overlay.classList.remove('speaking') }
+        if (statusEl) statusEl.textContent = 'Listening...'
+        break
+      case 'input_audio_buffer.speech_stopped':
+        if (statusEl) statusEl.textContent = 'Processing...'
+        break
+      case 'conversation.item.input_audio_transcription.completed':
+        if (transcriptEl && data.transcript) {
+          transcriptEl.textContent = 'You: ' + data.transcript
+        }
+        break
+      case 'response.function_call_arguments.done':
+        handleToolCall(data)
+        break
+      case 'response.output_audio_transcript.delta':
+        if (overlay) { overlay.classList.remove('listening'); overlay.classList.add('speaking') }
+        if (statusEl) statusEl.textContent = 'Speaking...'
+        if (transcriptEl) {
+          const current = transcriptEl.textContent
+          if (current.startsWith('You:') || current.startsWith('[Tool')) {
+            transcriptEl.textContent = data.delta
+          } else {
+            transcriptEl.textContent += data.delta
+          }
+        }
+        break
+      case 'response.output_audio.delta':
+        if (data.delta) {
+          playAudioChunk(data.delta)
+        }
+        break
+      case 'response.done':
+        if (overlay) { overlay.classList.add('listening'); overlay.classList.remove('speaking') }
+        if (statusEl) statusEl.textContent = 'Listening...'
+        break
+      case 'error':
+        console.error('Voice API error:', data)
+        if (statusEl) statusEl.textContent = 'Error: ' + (data.error?.message || 'Unknown')
+        break
+    }
+  }
+  async function handleToolCall(event) {
+    const ws = voiceState.ws
+    if (!ws || ws.readyState !== WebSocket.OPEN) return
+    const toolName = event.name
+    const callId = event.call_id
+    const args = event.arguments
+    const statusEl = document.getElementById('voice-status')
+    const transcriptEl = document.getElementById('voice-transcript')
+    const overlay = document.getElementById('voice-overlay')
+    // Show tool execution in UI
+    if (overlay) { overlay.classList.remove('listening', 'speaking') }
+    if (statusEl) statusEl.textContent = 'Running tool: ' + toolName + '...'
+    // Parse args for display
+    let argsObj = {}
+    try { argsObj = JSON.parse(args) } catch {}
+    const brief = toolName === 'bash' ? (argsObj.command || '').slice(0, 80) :
+                  toolName === 'read' ? argsObj.path || '' :
+                  toolName === 'write' ? argsObj.path || '' :
+                  toolName === 'edit' ? argsObj.path || '' : ''
+    if (transcriptEl) transcriptEl.textContent = '[Tool: ' + toolName + '] ' + brief
+    try {
+      // Execute tool via our backend
+      const toolRes = await fetch('/voice/tool', {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ name: toolName, arguments: args }),
+      })
+      const toolData = await toolRes.json()
+      const output = toolData.output || '(no output)'
+      // Show brief result
+      const shortOutput = output.length > 120 ? output.slice(0, 120) + '...' : output
+      if (transcriptEl) transcriptEl.textContent = '[Tool: ' + toolName + '] ' + shortOutput
+      // Send result back to voice agent
+      ws.send(JSON.stringify({
+        type: 'conversation.item.create',
+        item: {
+          type: 'function_call_output',
+          call_id: callId,
+          output: output,
+        },
+      }))
+      // Request the agent to continue
+      ws.send(JSON.stringify({ type: 'response.create' }))
+      if (statusEl) statusEl.textContent = 'Processing...'
+    } catch (err) {
+      console.error('Tool execution error:', err)
+      const errMsg = err.message || 'Tool execution failed'
+      // Send error back as tool output so the agent can handle it
+      ws.send(JSON.stringify({
+        type: 'conversation.item.create',
+        item: {
+          type: 'function_call_output',
+          call_id: callId,
+          output: 'Error: ' + errMsg,
+        },
+      }))
+      ws.send(JSON.stringify({ type: 'response.create' }))
+      if (statusEl) statusEl.textContent = 'Listening...'
+      if (overlay) overlay.classList.add('listening')
+    }
+  }
+  function playAudioChunk(base64Audio) {
+    if (!voiceState.audioCtx) return
+    const ctx = voiceState.audioCtx
+    // Decode base64 to Int16 PCM
+    const binaryStr = atob(base64Audio)
+    const bytes = new Uint8Array(binaryStr.length)
+    for (let i = 0; i < binaryStr.length; i++) {
+      bytes[i] = binaryStr.charCodeAt(i)
+    }
+    const pcm16 = new Int16Array(bytes.buffer)
+    // Convert to Float32 for Web Audio
+    const float32 = new Float32Array(pcm16.length)
+    for (let i = 0; i < pcm16.length; i++) {
+      float32[i] = pcm16[i] / 32768.0
+    }
+    // Create audio buffer and schedule playback
+    const buffer = ctx.createBuffer(1, float32.length, SAMPLE_RATE)
+    buffer.getChannelData(0).set(float32)
+    const source = ctx.createBufferSource()
+    source.buffer = buffer
+    source.connect(ctx.destination)
+    // Schedule seamless playback
+    const now = ctx.currentTime
+    const startTime = Math.max(now, voiceState.nextPlayTime)
+    source.start(startTime)
+    voiceState.nextPlayTime = startTime + buffer.duration
+  }
+  function stopVoice() {
+    voiceState.active = false
+    // Close WebSocket
+    if (voiceState.ws) {
+      try { voiceState.ws.close() } catch {}
+      voiceState.ws = null
+    }
+    // Stop microphone
+    if (voiceState.micStream) {
+      voiceState.micStream.getTracks().forEach(t => t.stop())
+      voiceState.micStream = null
+    }
+    // Disconnect audio processor
+    if (voiceState.scriptProcessor) {
+      try { voiceState.scriptProcessor.disconnect() } catch {}
+      voiceState.scriptProcessor = null
+    }
+    // Close audio context
+    if (voiceState.audioCtx) {
+      try { voiceState.audioCtx.close() } catch {}
+      voiceState.audioCtx = null
+    }
+    voiceState.nextPlayTime = 0
+    // Reset UI
+    const btn = document.getElementById('voice-btn')
+    if (btn) {
+      btn.classList.remove('active', 'connecting')
+      btn.querySelector('.voice-icon-mic').style.display = 'block'
+      btn.querySelector('.voice-icon-stop').style.display = 'none'
+    }
+    const overlay = document.getElementById('voice-overlay')
+    if (overlay) {
+      overlay.style.display = 'none'
+      overlay.classList.remove('listening', 'speaking')
+    }
+  }
 </script>
 </body>
 </html>`;
@@ -1180,7 +1785,8 @@ function renderCronRequestsPage(requests) {
 }
 // ── Express app ───────────────────────────────────────────────────────────────
 export async function appCoworkerChatWeb(options) {
-    const { url: agentUrl, password, host, port: portStr } = options;
+    const { url: agentUrl, password, host, port: portStr, xaiKey } = options;
+    const voiceEnabled = !!xaiKey;
     const port = parseInt(portStr, 10);
     if (isNaN(port) || port < 1 || port > 65535) {
         console.error(`Error: invalid port "${portStr}"`);
@@ -1203,6 +1809,9 @@ export async function appCoworkerChatWeb(options) {
         console.error("Check that agent-office serve is running and --password is correct.");
     }
     console.log(`Communicator: chatting as "${humanName}"`);
+    if (voiceEnabled) {
+        console.log(`Voice chat enabled (xAI API key configured)`);
+    }
     const app = express();
     app.use(express.urlencoded({ extended: false }));
     app.use(express.json());
@@ -1294,6 +1903,198 @@ export async function appCoworkerChatWeb(options) {
             res.send(`<span style="color:var(--red)">✗ Rejection failed: ${escapeHtml(msg)}</span>`);
         }
     });
+    // ── GET /voice/config — whether voice is enabled ──────────────────────────
+    app.get("/voice/config", (_req, res) => {
+        res.json({ enabled: voiceEnabled });
+    });
+    // ── POST /voice/session — fetch ephemeral token from xAI ─────────────────
+    app.post("/voice/session", async (req, res) => {
+        if (!voiceEnabled || !xaiKey) {
+            res.status(403).json({ error: "Voice is not enabled" });
+            return;
+        }
+        const { coworker } = req.body;
+        if (!coworker) {
+            res.status(400).json({ error: "coworker is required" });
+            return;
+        }
+        try {
+            // Fetch the coworker's status for context
+            const status = await fetchCoworkerStatus(agentUrl, password, coworker);
+            // Get ephemeral token from xAI
+            const tokenRes = await fetch("https://api.x.ai/v1/realtime/client_secrets", {
+                method: "POST",
+                headers: {
+                    "Authorization": `Bearer ${xaiKey}`,
+                    "Content-Type": "application/json",
+                },
+                body: JSON.stringify({ expires_after: { seconds: 300 } }),
+            });
+            if (!tokenRes.ok) {
+                const errBody = await tokenRes.json().catch(() => ({}));
+                res.status(502).json({ error: `xAI API error: ${errBody.error ?? `HTTP ${tokenRes.status}`}` });
+                return;
+            }
+            // xAI returns { value: string, expires_at: number } at the top level
+            const tokenData = await tokenRes.json();
+            const token = tokenData.value;
+            if (!token) {
+                console.error("Voice session: unexpected xAI response shape:", JSON.stringify(tokenData));
+                res.status(502).json({ error: "No ephemeral token in xAI response" });
+                return;
+            }
+            // Build voice instructions based on the coworker
+            const instructions = [
+                `You are ${escapeHtml(coworker)}, an AI coworker in the agent office.`,
+                status ? `Your current status is: "${status}".` : "",
+                `You are having a voice conversation with your human manager ${humanName}.`,
+                `Be helpful, collaborative, and keep your responses concise since this is a voice conversation.`,
+                `You can discuss work, answer questions, and collaborate on tasks.`,
+                ``,
+                `You have access to the agent-office CLI tool which can:`,
+                `- Create and manage AI coworker sessions`,
+                `- Send messages between coworkers`,
+                `- Set status messages for visibility`,
+                `- Schedule cron jobs for recurring tasks`,
+                `- Run a web chat interface for human interaction`,
+                `- Manage task boards with kanban-style workflows`,
+                `- Send email notifications for unread messages`,
+                ``,
+                `You have access to coding tools that you can use when the human asks you to look at, create, or modify files, or run commands:`,
+                `- read: Read a file from the filesystem. Use this to examine source code, config files, etc.`,
+                `- write: Write content to a file, creating or overwriting it.`,
+                `- edit: Edit a file by finding and replacing an exact string.`,
+                `- bash: Execute a shell command and get the output.`,
+                ``,
+                `When using tools, briefly tell the human what you're doing before calling the tool.`,
+                `After getting tool results, summarize the key information verbally rather than reading everything.`,
+                `The working directory is: ${process.cwd()}`,
+            ].filter(Boolean).join("\n");
+            res.json({
+                token,
+                instructions,
+                coworker,
+            });
+        }
+        catch (err) {
+            const msg = err instanceof Error ? err.message : String(err);
+            res.status(502).json({ error: `Failed to create voice session: ${msg}` });
+        }
+    });
+    // ── POST /voice/tool — execute a tool call server-side ────────────────────
+    app.post("/voice/tool", async (req, res) => {
+        const { name, arguments: argsStr } = req.body;
+        if (!name || typeof name !== "string") {
+            res.status(400).json({ error: "name is required" });
+            return;
+        }
+        let args;
+        try {
+            args = typeof argsStr === "string" ? JSON.parse(argsStr) : (argsStr ?? {});
+        }
+        catch {
+            res.status(400).json({ error: "Invalid arguments JSON" });
+            return;
+        }
+        try {
+            let result;
+            switch (name) {
+                case "read": {
+                    const filePath = String(args.path ?? "");
+                    if (!filePath) {
+                        res.json({ output: "Error: path is required" });
+                        return;
+                    }
+                    const content = await readFile(filePath, "utf-8");
+                    const lines = content.split("\n");
+                    const offset = Math.max(1, Number(args.offset) || 1);
+                    const limit = Math.min(2000, Number(args.limit) || 200);
+                    const sliced = lines.slice(offset - 1, offset - 1 + limit);
+                    result = sliced.map((line, i) => `${offset + i}: ${line}`).join("\n");
+                    if (lines.length > offset - 1 + limit) {
+                        result += `\n... (${lines.length} total lines)`;
+                    }
+                    break;
+                }
+                case "write": {
+                    const filePath = String(args.path ?? "");
+                    const content = String(args.content ?? "");
+                    if (!filePath) {
+                        res.json({ output: "Error: path is required" });
+                        return;
+                    }
+                    await mkdir(dirname(filePath), { recursive: true });
+                    await writeFile(filePath, content, "utf-8");
+                    result = `Written ${content.length} bytes to ${filePath}`;
+                    break;
+                }
+                case "edit": {
+                    const filePath = String(args.path ?? "");
+                    const oldStr = String(args.oldText ?? "");
+                    const newStr = String(args.newText ?? "");
+                    if (!filePath) {
+                        res.json({ output: "Error: path is required" });
+                        return;
+                    }
+                    if (!oldStr) {
+                        res.json({ output: "Error: oldText is required" });
+                        return;
+                    }
+                    const fileContent = await readFile(filePath, "utf-8");
+                    const idx = fileContent.indexOf(oldStr);
+                    if (idx === -1) {
+                        result = "Error: oldText not found in file";
+                    }
+                    else if (fileContent.indexOf(oldStr, idx + 1) !== -1) {
+                        result = "Error: oldText found multiple times. Provide more context to make it unique.";
+                    }
+                    else {
+                        const edited = fileContent.slice(0, idx) + newStr + fileContent.slice(idx + oldStr.length);
+                        await writeFile(filePath, edited, "utf-8");
+                        result = `Edit applied to ${filePath}`;
+                    }
+                    break;
+                }
+                case "bash": {
+                    const command = String(args.command ?? "");
+                    if (!command) {
+                        res.json({ output: "Error: command is required" });
+                        return;
+                    }
+                    const timeoutSec = Math.min(120, Number(args.timeout) || 30);
+                    const timeout = timeoutSec * 1000;
+                    result = await new Promise((resolve) => {
+                        exec(command, { timeout, maxBuffer: 1024 * 1024, cwd: process.cwd() }, (err, stdout, stderr) => {
+                            const out = (stdout || "").trim();
+                            const errOut = (stderr || "").trim();
+                            if (err && err.killed) {
+                                resolve(`Command timed out after ${timeout}ms`);
+                            }
+                            else if (err) {
+                                resolve(`Exit code ${err.code ?? 1}\n${errOut}\n${out}`.trim());
+                            }
+                            else {
+                                const combined = errOut ? `${out}\n${errOut}` : out;
+                                resolve(combined || "(no output)");
+                            }
+                        });
+                    });
+                    // Truncate very long output for the voice context
+                    if (result.length > 4000) {
+                        result = result.slice(0, 4000) + "\n... (output truncated)";
+                    }
+                    break;
+                }
+                default:
+                    result = `Unknown tool: ${name}`;
+            }
+            res.json({ output: result });
+        }
+        catch (err) {
+            const msg = err instanceof Error ? err.message : String(err);
+            res.json({ output: `Error: ${msg}` });
+        }
+    });
     // ── GET / — full page ────────────────────────────────────────────────────
     app.get("/", async (req, res) => {
         try {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "agent-office",
-  "version": "0.4.7",
+  "version": "0.4.8",
   "description": "An office for your AI agents",
   "type": "module",
   "license": "MIT",