agent-office 0.4.7 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -40,10 +40,6 @@ An office for your AI agents. Manage multiple [OpenCode](https://opencode.ai) co
40
40
  +-----------------------+
41
41
  ```
42
42
 
43
- ## Breaking Changes in v0.4.7
44
-
45
- **Cron Job Approval Workflow**: Workers can no longer create cron jobs directly. Instead, they must use `agent-office worker cron request` to submit requests that require human approval. The old `agent-office worker cron create` command has been renamed to `cron request`. This change ensures all automated tasks have human oversight.
46
-
47
43
  ## Installation
48
44
 
49
45
  ```bash
@@ -206,9 +202,10 @@ Options:
206
202
  --password <password> API password (env: AGENT_OFFICE_PASSWORD)
207
203
  --host <host> Communicator bind host (default: 127.0.0.1)
208
204
  --port <port> Communicator bind port (default: 7655)
205
+ --xai-key <key> xAI API key for voice chat (enables voice button; env: XAI_API_KEY)
209
206
  ```
210
207
 
211
- Features: dark theme, iMessage-style chat bubbles, auto-scroll, Enter to send (Shift+Enter for newline), live message polling (5s), unread indicators, status display, and a reset button to revert the agent's session.
208
+ Features: dark theme, iMessage-style chat bubbles, auto-scroll, Enter to send (Shift+Enter for newline), live message polling (5s), unread indicators, status display, and a reset button to revert the agent's session. **Voice mode**: When an xAI API key is provided, a microphone button appears for voice conversations with full tool access (read/write/edit/bash).
212
209
 
213
210
  ### `agent-office worker` (for AI agents)
214
211
 
package/dist/cli.js CHANGED
@@ -129,6 +129,7 @@ appCmd
129
129
  .option("--password <password>", "API password for the agent-office server", process.env.AGENT_OFFICE_PASSWORD ?? "secret")
130
130
  .option("--host <host>", "Host to bind the web server to", "127.0.0.1")
131
131
  .option("--port <port>", "Port to run the web server on", "7655")
132
+ .option("--xai-key <key>", "xAI API key for voice chat (enables voice button)", process.env.XAI_API_KEY)
132
133
  .action(async (options) => {
133
134
  const { appCoworkerChatWeb } = await import("./commands/communicator.js");
134
135
  await appCoworkerChatWeb(options);
@@ -3,6 +3,7 @@ interface CommunicatorOptions {
3
3
  password: string;
4
4
  host: string;
5
5
  port: string;
6
+ xaiKey?: string;
6
7
  }
7
8
  export declare function appCoworkerChatWeb(options: CommunicatorOptions): Promise<void>;
8
9
  export {};
@@ -1,4 +1,7 @@
1
1
  import express from "express";
2
+ import { exec } from "child_process";
3
+ import { readFile, writeFile, mkdir } from "fs/promises";
4
+ import { dirname } from "path";
2
5
  // ── API helpers ───────────────────────────────────────────────────────────────
3
6
  async function apiFetch(agentUrl, password, path, init = {}) {
4
7
  const res = await fetch(`${agentUrl}${path}`, {
@@ -489,6 +492,122 @@ function renderPage(coworker, coworkers, msgs, humanName) {
489
492
 
490
493
  /* ── HTMX request indicator ── */
491
494
  .htmx-request .send-btn { background: var(--accent-dim); }
495
+
496
+ /* ── Voice button ── */
497
+ .voice-btn {
498
+ width: 36px; height: 36px;
499
+ border-radius: 50%;
500
+ background: var(--surface2);
501
+ border: 1px solid var(--border);
502
+ color: var(--text-dim);
503
+ cursor: pointer;
504
+ display: flex;
505
+ align-items: center;
506
+ justify-content: center;
507
+ flex-shrink: 0;
508
+ transition: all 0.2s;
509
+ padding: 0;
510
+ }
511
+ .voice-btn:hover { border-color: var(--accent); color: var(--accent); }
512
+ .voice-btn:disabled { opacity: 0.4; cursor: not-allowed; }
513
+ .voice-btn svg { width: 18px; height: 18px; }
514
+ .voice-btn.active {
515
+ background: var(--red);
516
+ border-color: var(--red);
517
+ color: #fff;
518
+ animation: voice-pulse 1.5s ease-in-out infinite;
519
+ }
520
+ .voice-btn.connecting {
521
+ background: var(--accent-dim);
522
+ border-color: var(--accent);
523
+ color: var(--accent);
524
+ animation: voice-pulse 0.8s ease-in-out infinite;
525
+ }
526
+ @keyframes voice-pulse {
527
+ 0%, 100% { box-shadow: 0 0 0 0 rgba(255, 107, 107, 0.4); }
528
+ 50% { box-shadow: 0 0 0 8px rgba(255, 107, 107, 0); }
529
+ }
530
+
531
+ /* ── Voice overlay ── */
532
+ .voice-overlay {
533
+ position: absolute;
534
+ top: var(--header-h);
535
+ left: 0; right: 0; bottom: 0;
536
+ background: rgba(15, 17, 23, 0.95);
537
+ z-index: 50;
538
+ display: flex;
539
+ align-items: center;
540
+ justify-content: center;
541
+ backdrop-filter: blur(8px);
542
+ }
543
+ .voice-overlay-content {
544
+ display: flex;
545
+ flex-direction: column;
546
+ align-items: center;
547
+ gap: 24px;
548
+ padding: 32px;
549
+ }
550
+ .voice-visualizer {
551
+ position: relative;
552
+ width: 120px; height: 120px;
553
+ display: flex;
554
+ align-items: center;
555
+ justify-content: center;
556
+ }
557
+ .voice-ring {
558
+ position: absolute;
559
+ width: 100%; height: 100%;
560
+ border-radius: 50%;
561
+ border: 2px solid var(--accent);
562
+ opacity: 0.3;
563
+ animation: voice-ring-pulse 2s ease-in-out infinite;
564
+ }
565
+ .voice-ring-2 { animation-delay: 0.4s; width: 140%; height: 140%; top: -20%; left: -20%; opacity: 0.15; }
566
+ .voice-ring-3 { animation-delay: 0.8s; width: 180%; height: 180%; top: -40%; left: -40%; opacity: 0.08; }
567
+ .voice-overlay.speaking .voice-ring { border-color: var(--green); }
568
+ .voice-overlay.listening .voice-ring { border-color: var(--accent); }
569
+ @keyframes voice-ring-pulse {
570
+ 0%, 100% { transform: scale(1); opacity: 0.3; }
571
+ 50% { transform: scale(1.1); opacity: 0.1; }
572
+ }
573
+ .voice-avatar {
574
+ width: 64px; height: 64px;
575
+ border-radius: 50%;
576
+ background: var(--accent-dim);
577
+ color: var(--accent);
578
+ display: flex;
579
+ align-items: center;
580
+ justify-content: center;
581
+ font-weight: 700;
582
+ font-size: 24px;
583
+ z-index: 1;
584
+ }
585
+ .voice-status {
586
+ font-size: 16px;
587
+ color: var(--text);
588
+ font-weight: 500;
589
+ }
590
+ .voice-transcript {
591
+ font-size: 14px;
592
+ color: var(--text-dim);
593
+ text-align: center;
594
+ max-width: 400px;
595
+ min-height: 40px;
596
+ line-height: 1.4;
597
+ }
598
+ .voice-end-btn {
599
+ background: var(--red);
600
+ border: none;
601
+ border-radius: 22px;
602
+ color: #fff;
603
+ cursor: pointer;
604
+ font-size: 14px;
605
+ font-weight: 600;
606
+ padding: 10px 24px;
607
+ transition: background 0.15s, transform 0.1s;
608
+ }
609
+ .voice-end-btn:hover { background: #ff8888; }
610
+ .voice-end-btn:active { transform: scale(0.95); }
492
611
  </style>
493
612
  </head>
494
613
  <body>
@@ -511,6 +630,23 @@ function renderPage(coworker, coworkers, msgs, humanName) {
511
630
  hx-swap="innerHTML"></div>
512
631
  </div>
513
632
  <a href="/cron-requests" class="header-link" title="Manage cron job requests">⚙️</a>
633
+ <button class="voice-btn" id="voice-btn"
634
+ onclick="toggleVoice()"
635
+ title="Voice chat"
636
+ style="display:none"
637
+ ${!selected ? 'disabled' : ''}>
638
+ <svg class="voice-icon-mic" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
639
+ <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
640
+ <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
641
+ <line x1="12" y1="19" x2="12" y2="23"></line>
642
+ <line x1="8" y1="23" x2="16" y2="23"></line>
643
+ </svg>
644
+ <div class="voice-icon-stop" style="display:none">
645
+ <svg viewBox="0 0 24 24" fill="currentColor">
646
+ <rect x="6" y="6" width="12" height="12" rx="2"></rect>
647
+ </svg>
648
+ </div>
649
+ </button>
514
650
  <button class="reset-btn"
515
651
  hx-post="/reset?coworker=${encodeURIComponent(selected)}"
516
652
  hx-target="#reset-status"
@@ -531,6 +667,21 @@ function renderPage(coworker, coworkers, msgs, humanName) {
531
667
 
532
668
  <div id="reset-status"></div>
533
669
 
670
+ <!-- Voice overlay -->
671
+ <div id="voice-overlay" class="voice-overlay" style="display:none">
672
+ <div class="voice-overlay-content">
673
+ <div class="voice-visualizer" id="voice-visualizer">
674
+ <div class="voice-ring"></div>
675
+ <div class="voice-ring voice-ring-2"></div>
676
+ <div class="voice-ring voice-ring-3"></div>
677
+ <div class="voice-avatar" id="voice-avatar">?</div>
678
+ </div>
679
+ <div class="voice-status" id="voice-status">Connecting...</div>
680
+ <div class="voice-transcript" id="voice-transcript"></div>
681
+ <button class="voice-end-btn" onclick="toggleVoice()">End Voice Chat</button>
682
+ </div>
683
+ </div>
684
+
534
685
  <!-- Messages -->
535
686
  <div class="messages-outer" id="messages-outer">
536
687
  <div id="messages"
@@ -640,9 +791,10 @@ function renderPage(coworker, coworkers, msgs, humanName) {
640
791
  // Initial scroll
641
792
  scrollToBottom()
642
793
 
643
- // Switch to a different coworker
794
+ // Switch to a different coworker — stop any active voice session first
644
795
  function switchCoworker(name) {
645
796
  if (!name) return
797
+ if (voiceState.active) stopVoice()
646
798
  const url = new URL(window.location.href)
647
799
  url.searchParams.set('coworker', name)
648
800
  window.location.href = url.toString()
@@ -679,6 +831,459 @@ function renderPage(coworker, coworkers, msgs, humanName) {
679
831
  document.addEventListener('htmx:afterSwap', () => {
680
832
  renderMarkdown()
681
833
  })
834
+
835
+ // ── Voice Chat ─────────────────────────────────────────────────────────────
836
// Sample rate in Hz shared by microphone capture, the AudioContext and the
// PCM format declared to the realtime API in session.update.
const SAMPLE_RATE = 24000

// Mutable state of the one voice session this page can hold at a time.
const voiceState = {
  active: false,          // set true when the WebSocket opens, false in stopVoice()
  ws: null,               // WebSocket to the realtime API
  audioCtx: null,         // AudioContext used for capture and playback
  micStream: null,        // MediaStream returned by getUserMedia
  scriptProcessor: null,  // ScriptProcessorNode that feeds microphone audio to the socket
  playbackQueue: [],      // not read by the voice functions in this section
  isPlaying: false,       // not read by the voice functions in this section
  nextPlayTime: 0,        // AudioContext time at which the next chunk should start
}

// Ask the server whether voice is enabled; if so, reveal the voice button
// (it is rendered with display:none). Any failure leaves the button hidden.
;(async () => {
  try {
    const response = await fetch('/voice/config')
    const config = await response.json()
    if (!config.enabled) return
    const voiceButton = document.getElementById('voice-btn')
    if (voiceButton) voiceButton.style.display = 'flex'
  } catch {
    // best effort: voice simply stays unavailable
  }
})()
856
+
857
+ function toggleVoice() {
858
+ if (voiceState.active) {
859
+ stopVoice()
860
+ } else {
861
+ startVoice()
862
+ }
863
+ }
864
+
865
+ async function startVoice() {
866
+ const select = document.getElementById('coworker-select')
867
+ const coworker = select ? select.value : ''
868
+ if (!coworker) return
869
+
870
+ const btn = document.getElementById('voice-btn')
871
+ const overlay = document.getElementById('voice-overlay')
872
+ const statusEl = document.getElementById('voice-status')
873
+ const transcriptEl = document.getElementById('voice-transcript')
874
+ const avatarEl = document.getElementById('voice-avatar')
875
+
876
+ // Update UI to connecting state
877
+ btn.classList.add('connecting')
878
+ btn.querySelector('.voice-icon-mic').style.display = 'none'
879
+ btn.querySelector('.voice-icon-stop').style.display = 'flex'
880
+ overlay.style.display = 'flex'
881
+ statusEl.textContent = 'Connecting...'
882
+ transcriptEl.textContent = ''
883
+ avatarEl.textContent = coworker.charAt(0).toUpperCase()
884
+
885
+ try {
886
+ // Request ephemeral token from our backend
887
+ const sessRes = await fetch('/voice/session', {
888
+ method: 'POST',
889
+ headers: { 'Content-Type': 'application/json' },
890
+ body: JSON.stringify({ coworker }),
891
+ })
892
+ if (!sessRes.ok) {
893
+ const err = await sessRes.json().catch(() => ({}))
894
+ throw new Error(err.error || 'Failed to create voice session')
895
+ }
896
+ const sessData = await sessRes.json()
897
+ const token = sessData.token
898
+ const instructions = sessData.instructions
899
+
900
+ if (!token) throw new Error('No ephemeral token received')
901
+
902
+ // Request microphone access
903
+ const micStream = await navigator.mediaDevices.getUserMedia({ audio: {
904
+ sampleRate: SAMPLE_RATE,
905
+ channelCount: 1,
906
+ echoCancellation: true,
907
+ noiseSuppression: true,
908
+ autoGainControl: true,
909
+ }})
910
+ voiceState.micStream = micStream
911
+
912
+ // Create audio context for playback
913
+ const audioCtx = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: SAMPLE_RATE })
914
+ voiceState.audioCtx = audioCtx
915
+ voiceState.nextPlayTime = 0
916
+
917
+ // Connect WebSocket to xAI realtime API using subprotocol for auth
918
+ const ws = new WebSocket('wss://api.x.ai/v1/realtime', [
919
+ 'xai-client-secret.' + token
920
+ ])
921
+ voiceState.ws = ws
922
+
923
+ ws.onopen = () => {
924
+ voiceState.active = true
925
+ btn.classList.remove('connecting')
926
+ btn.classList.add('active')
927
+ statusEl.textContent = 'Listening...'
928
+ overlay.classList.add('listening')
929
+ overlay.classList.remove('speaking')
930
+
931
+ // Configure session with tools
932
+ ws.send(JSON.stringify({
933
+ type: 'session.update',
934
+ session: {
935
+ voice: 'Ara',
936
+ instructions: instructions,
937
+ turn_detection: { type: 'server_vad' },
938
+ audio: {
939
+ input: { format: { type: 'audio/pcm', rate: SAMPLE_RATE } },
940
+ output: { format: { type: 'audio/pcm', rate: SAMPLE_RATE } },
941
+ },
942
+ tools: [
943
+ {
944
+ type: 'function',
945
+ name: 'read',
946
+ description: 'Read a file from the filesystem. Returns the file contents. Use this to examine source code, config files, or any text file.',
947
+ parameters: {
948
+ type: 'object',
949
+ properties: {
950
+ path: { type: 'string', description: 'Absolute or relative file path to read' },
951
+ offset: { type: 'number', description: 'Line number to start reading from (1-indexed). Optional.' },
952
+ limit: { type: 'number', description: 'Maximum number of lines to read. Optional, defaults to 200.' },
953
+ },
954
+ required: ['path'],
955
+ },
956
+ },
957
+ {
958
+ type: 'function',
959
+ name: 'write',
960
+ description: 'Write content to a file, creating it if it does not exist or overwriting if it does. Use this to create new files.',
961
+ parameters: {
962
+ type: 'object',
963
+ properties: {
964
+ path: { type: 'string', description: 'Absolute or relative file path to write' },
965
+ content: { type: 'string', description: 'The full content to write to the file' },
966
+ },
967
+ required: ['path', 'content'],
968
+ },
969
+ },
970
+ {
971
+ type: 'function',
972
+ name: 'edit',
973
+ description: 'Edit a file by replacing an exact string match with new content. The oldString must match exactly (including whitespace and indentation).',
974
+ parameters: {
975
+ type: 'object',
976
+ properties: {
977
+ path: { type: 'string', description: 'Absolute or relative file path to edit' },
978
+ oldText: { type: 'string', description: 'The exact text to find and replace' },
979
+ newText: { type: 'string', description: 'The replacement text' },
980
+ },
981
+ required: ['path', 'oldText', 'newText'],
982
+ },
983
+ },
984
+ {
985
+ type: 'function',
986
+ name: 'bash',
987
+ description: 'Execute a bash command and return its output. Use for running scripts, git commands, build tools, listing files, searching, etc.',
988
+ parameters: {
989
+ type: 'object',
990
+ properties: {
991
+ command: { type: 'string', description: 'The bash command to execute' },
992
+ timeout: { type: 'number', description: 'Timeout in seconds. Optional, defaults to 30.' },
993
+ },
994
+ required: ['command'],
995
+ },
996
+ },
997
+ ],
998
+ },
999
+ }))
1000
+
1001
+ // Start streaming microphone audio
1002
+ startMicStreaming(ws, micStream, audioCtx)
1003
+ }
1004
+
1005
+ ws.onmessage = (event) => {
1006
+ const data = JSON.parse(event.data)
1007
+ handleVoiceEvent(data)
1008
+ }
1009
+
1010
+ ws.onerror = () => {
1011
+ statusEl.textContent = 'Connection error'
1012
+ setTimeout(() => stopVoice(), 2000)
1013
+ }
1014
+
1015
+ ws.onclose = () => {
1016
+ if (voiceState.active) {
1017
+ stopVoice()
1018
+ }
1019
+ }
1020
+
1021
+ } catch (err) {
1022
+ console.error('Voice start error:', err)
1023
+ const statusEl = document.getElementById('voice-status')
1024
+ if (statusEl) statusEl.textContent = 'Error: ' + (err.message || 'Unknown error')
1025
+ setTimeout(() => stopVoice(), 2500)
1026
+ }
1027
+ }
1028
+
1029
+ function startMicStreaming(ws, micStream, audioCtx) {
1030
+ const source = audioCtx.createMediaStreamSource(micStream)
1031
+ // Use ScriptProcessorNode for broad compatibility (including mobile)
1032
+ const bufSize = 4096
1033
+ const processor = audioCtx.createScriptProcessor(bufSize, 1, 1)
1034
+ voiceState.scriptProcessor = processor
1035
+
1036
+ processor.onaudioprocess = (e) => {
1037
+ if (!voiceState.active || ws.readyState !== WebSocket.OPEN) return
1038
+ const inputData = e.inputBuffer.getChannelData(0)
1039
+
1040
+ // Resample if audioCtx sample rate differs from target
1041
+ let pcmFloat
1042
+ if (audioCtx.sampleRate !== SAMPLE_RATE) {
1043
+ const ratio = SAMPLE_RATE / audioCtx.sampleRate
1044
+ const newLen = Math.round(inputData.length * ratio)
1045
+ pcmFloat = new Float32Array(newLen)
1046
+ for (let i = 0; i < newLen; i++) {
1047
+ const srcIdx = i / ratio
1048
+ const lo = Math.floor(srcIdx)
1049
+ const hi = Math.min(lo + 1, inputData.length - 1)
1050
+ const frac = srcIdx - lo
1051
+ pcmFloat[i] = inputData[lo] * (1 - frac) + inputData[hi] * frac
1052
+ }
1053
+ } else {
1054
+ pcmFloat = inputData
1055
+ }
1056
+
1057
+ // Convert Float32 to Int16 PCM
1058
+ const pcm16 = new Int16Array(pcmFloat.length)
1059
+ for (let i = 0; i < pcmFloat.length; i++) {
1060
+ const s = Math.max(-1, Math.min(1, pcmFloat[i]))
1061
+ pcm16[i] = s < 0 ? s * 0x8000 : s * 0x7FFF
1062
+ }
1063
+
1064
+ // Base64 encode
1065
+ const bytes = new Uint8Array(pcm16.buffer)
1066
+ let binary = ''
1067
+ for (let i = 0; i < bytes.length; i++) {
1068
+ binary += String.fromCharCode(bytes[i])
1069
+ }
1070
+ const b64 = btoa(binary)
1071
+
1072
+ ws.send(JSON.stringify({
1073
+ type: 'input_audio_buffer.append',
1074
+ audio: b64,
1075
+ }))
1076
+ }
1077
+
1078
+ source.connect(processor)
1079
+ processor.connect(audioCtx.destination)
1080
+ }
1081
+
1082
+ function handleVoiceEvent(data) {
1083
+ const overlay = document.getElementById('voice-overlay')
1084
+ const statusEl = document.getElementById('voice-status')
1085
+ const transcriptEl = document.getElementById('voice-transcript')
1086
+
1087
+ switch (data.type) {
1088
+ case 'input_audio_buffer.speech_started':
1089
+ if (overlay) { overlay.classList.add('listening'); overlay.classList.remove('speaking') }
1090
+ if (statusEl) statusEl.textContent = 'Listening...'
1091
+ break
1092
+
1093
+ case 'input_audio_buffer.speech_stopped':
1094
+ if (statusEl) statusEl.textContent = 'Processing...'
1095
+ break
1096
+
1097
+ case 'conversation.item.input_audio_transcription.completed':
1098
+ if (transcriptEl && data.transcript) {
1099
+ transcriptEl.textContent = 'You: ' + data.transcript
1100
+ }
1101
+ break
1102
+
1103
+ case 'response.function_call_arguments.done':
1104
+ handleToolCall(data)
1105
+ break
1106
+
1107
+ case 'response.output_audio_transcript.delta':
1108
+ if (overlay) { overlay.classList.remove('listening'); overlay.classList.add('speaking') }
1109
+ if (statusEl) statusEl.textContent = 'Speaking...'
1110
+ if (transcriptEl) {
1111
+ const current = transcriptEl.textContent
1112
+ if (current.startsWith('You:') || current.startsWith('[Tool')) {
1113
+ transcriptEl.textContent = data.delta
1114
+ } else {
1115
+ transcriptEl.textContent += data.delta
1116
+ }
1117
+ }
1118
+ break
1119
+
1120
+ case 'response.output_audio.delta':
1121
+ if (data.delta) {
1122
+ playAudioChunk(data.delta)
1123
+ }
1124
+ break
1125
+
1126
+ case 'response.done':
1127
+ if (overlay) { overlay.classList.add('listening'); overlay.classList.remove('speaking') }
1128
+ if (statusEl) statusEl.textContent = 'Listening...'
1129
+ break
1130
+
1131
+ case 'error':
1132
+ console.error('Voice API error:', data)
1133
+ if (statusEl) statusEl.textContent = 'Error: ' + (data.error?.message || 'Unknown')
1134
+ break
1135
+ }
1136
+ }
1137
+
1138
+ async function handleToolCall(event) {
1139
+ const ws = voiceState.ws
1140
+ if (!ws || ws.readyState !== WebSocket.OPEN) return
1141
+
1142
+ const toolName = event.name
1143
+ const callId = event.call_id
1144
+ const args = event.arguments
1145
+
1146
+ const statusEl = document.getElementById('voice-status')
1147
+ const transcriptEl = document.getElementById('voice-transcript')
1148
+ const overlay = document.getElementById('voice-overlay')
1149
+
1150
+ // Show tool execution in UI
1151
+ if (overlay) { overlay.classList.remove('listening', 'speaking') }
1152
+ if (statusEl) statusEl.textContent = 'Running tool: ' + toolName + '...'
1153
+
1154
+ // Parse args for display
1155
+ let argsObj = {}
1156
+ try { argsObj = JSON.parse(args) } catch {}
1157
+ const brief = toolName === 'bash' ? (argsObj.command || '').slice(0, 80) :
1158
+ toolName === 'read' ? argsObj.path || '' :
1159
+ toolName === 'write' ? argsObj.path || '' :
1160
+ toolName === 'edit' ? argsObj.path || '' : ''
1161
+ if (transcriptEl) transcriptEl.textContent = '[Tool: ' + toolName + '] ' + brief
1162
+
1163
+ try {
1164
+ // Execute tool via our backend
1165
+ const toolRes = await fetch('/voice/tool', {
1166
+ method: 'POST',
1167
+ headers: { 'Content-Type': 'application/json' },
1168
+ body: JSON.stringify({ name: toolName, arguments: args }),
1169
+ })
1170
+ const toolData = await toolRes.json()
1171
+ const output = toolData.output || '(no output)'
1172
+
1173
+ // Show brief result
1174
+ const shortOutput = output.length > 120 ? output.slice(0, 120) + '...' : output
1175
+ if (transcriptEl) transcriptEl.textContent = '[Tool: ' + toolName + '] ' + shortOutput
1176
+
1177
+ // Send result back to voice agent
1178
+ ws.send(JSON.stringify({
1179
+ type: 'conversation.item.create',
1180
+ item: {
1181
+ type: 'function_call_output',
1182
+ call_id: callId,
1183
+ output: output,
1184
+ },
1185
+ }))
1186
+
1187
+ // Request the agent to continue
1188
+ ws.send(JSON.stringify({ type: 'response.create' }))
1189
+
1190
+ if (statusEl) statusEl.textContent = 'Processing...'
1191
+ } catch (err) {
1192
+ console.error('Tool execution error:', err)
1193
+ const errMsg = err.message || 'Tool execution failed'
1194
+
1195
+ // Send error back as tool output so the agent can handle it
1196
+ ws.send(JSON.stringify({
1197
+ type: 'conversation.item.create',
1198
+ item: {
1199
+ type: 'function_call_output',
1200
+ call_id: callId,
1201
+ output: 'Error: ' + errMsg,
1202
+ },
1203
+ }))
1204
+ ws.send(JSON.stringify({ type: 'response.create' }))
1205
+
1206
+ if (statusEl) statusEl.textContent = 'Listening...'
1207
+ if (overlay) overlay.classList.add('listening')
1208
+ }
1209
+ }
1210
+
1211
+ function playAudioChunk(base64Audio) {
1212
+ if (!voiceState.audioCtx) return
1213
+ const ctx = voiceState.audioCtx
1214
+
1215
+ // Decode base64 to Int16 PCM
1216
+ const binaryStr = atob(base64Audio)
1217
+ const bytes = new Uint8Array(binaryStr.length)
1218
+ for (let i = 0; i < binaryStr.length; i++) {
1219
+ bytes[i] = binaryStr.charCodeAt(i)
1220
+ }
1221
+ const pcm16 = new Int16Array(bytes.buffer)
1222
+
1223
+ // Convert to Float32 for Web Audio
1224
+ const float32 = new Float32Array(pcm16.length)
1225
+ for (let i = 0; i < pcm16.length; i++) {
1226
+ float32[i] = pcm16[i] / 32768.0
1227
+ }
1228
+
1229
+ // Create audio buffer and schedule playback
1230
+ const buffer = ctx.createBuffer(1, float32.length, SAMPLE_RATE)
1231
+ buffer.getChannelData(0).set(float32)
1232
+
1233
+ const source = ctx.createBufferSource()
1234
+ source.buffer = buffer
1235
+ source.connect(ctx.destination)
1236
+
1237
+ // Schedule seamless playback
1238
+ const now = ctx.currentTime
1239
+ const startTime = Math.max(now, voiceState.nextPlayTime)
1240
+ source.start(startTime)
1241
+ voiceState.nextPlayTime = startTime + buffer.duration
1242
+ }
1243
+
1244
+ function stopVoice() {
1245
+ voiceState.active = false
1246
+
1247
+ // Close WebSocket
1248
+ if (voiceState.ws) {
1249
+ try { voiceState.ws.close() } catch {}
1250
+ voiceState.ws = null
1251
+ }
1252
+
1253
+ // Stop microphone
1254
+ if (voiceState.micStream) {
1255
+ voiceState.micStream.getTracks().forEach(t => t.stop())
1256
+ voiceState.micStream = null
1257
+ }
1258
+
1259
+ // Disconnect audio processor
1260
+ if (voiceState.scriptProcessor) {
1261
+ try { voiceState.scriptProcessor.disconnect() } catch {}
1262
+ voiceState.scriptProcessor = null
1263
+ }
1264
+
1265
+ // Close audio context
1266
+ if (voiceState.audioCtx) {
1267
+ try { voiceState.audioCtx.close() } catch {}
1268
+ voiceState.audioCtx = null
1269
+ }
1270
+
1271
+ voiceState.nextPlayTime = 0
1272
+
1273
+ // Reset UI
1274
+ const btn = document.getElementById('voice-btn')
1275
+ if (btn) {
1276
+ btn.classList.remove('active', 'connecting')
1277
+ btn.querySelector('.voice-icon-mic').style.display = 'block'
1278
+ btn.querySelector('.voice-icon-stop').style.display = 'none'
1279
+ }
1280
+
1281
+ const overlay = document.getElementById('voice-overlay')
1282
+ if (overlay) {
1283
+ overlay.style.display = 'none'
1284
+ overlay.classList.remove('listening', 'speaking')
1285
+ }
1286
+ }
682
1287
  </script>
683
1288
  </body>
684
1289
  </html>`;
@@ -1180,7 +1785,8 @@ function renderCronRequestsPage(requests) {
1180
1785
  }
1181
1786
  // ── Express app ───────────────────────────────────────────────────────────────
1182
1787
  export async function appCoworkerChatWeb(options) {
1183
- const { url: agentUrl, password, host, port: portStr } = options;
1788
+ const { url: agentUrl, password, host, port: portStr, xaiKey } = options;
1789
+ const voiceEnabled = !!xaiKey;
1184
1790
  const port = parseInt(portStr, 10);
1185
1791
  if (isNaN(port) || port < 1 || port > 65535) {
1186
1792
  console.error(`Error: invalid port "${portStr}"`);
@@ -1203,6 +1809,9 @@ export async function appCoworkerChatWeb(options) {
1203
1809
  console.error("Check that agent-office serve is running and --password is correct.");
1204
1810
  }
1205
1811
  console.log(`Communicator: chatting as "${humanName}"`);
1812
+ if (voiceEnabled) {
1813
+ console.log(`Voice chat enabled (xAI API key configured)`);
1814
+ }
1206
1815
  const app = express();
1207
1816
  app.use(express.urlencoded({ extended: false }));
1208
1817
  app.use(express.json());
@@ -1294,6 +1903,198 @@ export async function appCoworkerChatWeb(options) {
1294
1903
  res.send(`<span style="color:var(--red)">✗ Rejection failed: ${escapeHtml(msg)}</span>`);
1295
1904
  }
1296
1905
  });
1906
+ // ── GET /voice/config — whether voice is enabled ──────────────────────────
1907
// Tells the page whether voice chat is available (true when an xAI key was
// supplied at startup); the client uses it to reveal the voice button.
app.get("/voice/config", function reportVoiceConfig(_req, res) {
    const payload = { enabled: voiceEnabled };
    res.json(payload);
});
1910
+ // ── POST /voice/session — fetch ephemeral token from xAI ─────────────────
1911
// Creates a voice session for one coworker.
//
// Request body: { coworker: string }.
// Success: 200 { token, instructions, coworker } where token is a short-lived
// xAI client secret (the long-lived xaiKey never leaves the server) and
// instructions is the system prompt the browser passes to the realtime API.
// Failures: 403 when voice is disabled, 400 when coworker is missing or not a
// string, 502 when the status lookup or the xAI call fails.
app.post("/voice/session", async (req, res) => {
    if (!voiceEnabled || !xaiKey) {
        res.status(403).json({ error: "Voice is not enabled" });
        return;
    }
    // req.body can be undefined when no JSON body was parsed (depends on the
    // Express version); optional chaining turns that into a clean 400.
    const coworker = req.body?.coworker;
    if (typeof coworker !== "string" || !coworker) {
        res.status(400).json({ error: "coworker is required" });
        return;
    }
    try {
        // Fetch the coworker's status for context
        const status = await fetchCoworkerStatus(agentUrl, password, coworker);
        // Get ephemeral token from xAI (valid for 300 seconds)
        const tokenRes = await fetch("https://api.x.ai/v1/realtime/client_secrets", {
            method: "POST",
            headers: {
                "Authorization": `Bearer ${xaiKey}`,
                "Content-Type": "application/json",
            },
            body: JSON.stringify({ expires_after: { seconds: 300 } }),
        });
        if (!tokenRes.ok) {
            const errBody = await tokenRes.json().catch(() => ({}));
            // The error field may be a string or a structured object; render
            // either readably instead of "[object Object]".
            const rawErr = errBody?.error;
            const detail = rawErr == null
                ? `HTTP ${tokenRes.status}`
                : typeof rawErr === "string"
                    ? rawErr
                    : (rawErr.message ?? JSON.stringify(rawErr));
            res.status(502).json({ error: `xAI API error: ${detail}` });
            return;
        }
        // xAI returns { value: string, expires_at: number } at the top level
        const tokenData = await tokenRes.json();
        const token = tokenData?.value;
        if (!token) {
            console.error("Voice session: unexpected xAI response shape:", JSON.stringify(tokenData));
            res.status(502).json({ error: "No ephemeral token in xAI response" });
            return;
        }
        // Build voice instructions based on the coworker.
        // NOTE(review): escapeHtml() is applied to text that goes into a
        // plain-text prompt, not HTML, so names containing &, < or quotes
        // reach the model as entities — confirm whether that is intended.
        const instructions = [
            `You are ${escapeHtml(coworker)}, an AI coworker in the agent office.`,
            status ? `Your current status is: "${status}".` : "",
            `You are having a voice conversation with your human manager ${humanName}.`,
            `Be helpful, collaborative, and keep your responses concise since this is a voice conversation.`,
            `You can discuss work, answer questions, and collaborate on tasks.`,
            ``,
            `You have access to the agent-office CLI tool which can:`,
            `- Create and manage AI coworker sessions`,
            `- Send messages between coworkers`,
            `- Set status messages for visibility`,
            `- Schedule cron jobs for recurring tasks`,
            `- Run a web chat interface for human interaction`,
            `- Manage task boards with kanban-style workflows`,
            `- Send email notifications for unread messages`,
            ``,
            `You have access to coding tools that you can use when the human asks you to look at, create, or modify files, or run commands:`,
            `- read: Read a file from the filesystem. Use this to examine source code, config files, etc.`,
            `- write: Write content to a file, creating or overwriting it.`,
            `- edit: Edit a file by finding and replacing an exact string.`,
            `- bash: Execute a shell command and get the output.`,
            ``,
            `When using tools, briefly tell the human what you're doing before calling the tool.`,
            `After getting tool results, summarize the key information verbally rather than reading everything.`,
            `The working directory is: ${process.cwd()}`,
        ].filter(Boolean).join("\n"); // filter(Boolean) also drops the blank separator entries
        res.json({
            token,
            instructions,
            coworker,
        });
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        res.status(502).json({ error: `Failed to create voice session: ${msg}` });
    }
});
1984
+ // ── POST /voice/tool — execute a tool call server-side ────────────────────
1985
// Executes one tool call issued by the voice model on this machine.
//
// Request body: { name: "read" | "write" | "edit" | "bash", arguments: object | JSON string }.
// Responses: 403 when voice is disabled; 400 { error } for a missing name or
// unparseable arguments; otherwise 200 { output } where output is the tool
// result or an "Error: ..." string (tool failures are reported in-band so the
// model can react to them).
//
// NOTE(security): read/write/edit accept any path and bash runs any command
// with the privileges of this process. This handler performs no
// authentication of its own; it is only gated on voice being enabled.
app.post("/voice/tool", async (req, res) => {
    // Without this gate the shell and file tools would stay reachable even
    // when the server was started without an xAI key.
    if (!voiceEnabled) {
        res.status(403).json({ error: "Voice is not enabled" });
        return;
    }
    // req.body can be undefined when no JSON body was parsed.
    const { name, arguments: argsStr } = req.body ?? {};
    if (!name || typeof name !== "string") {
        res.status(400).json({ error: "name is required" });
        return;
    }
    let args;
    try {
        args = typeof argsStr === "string" ? JSON.parse(argsStr) : (argsStr ?? {});
    }
    catch {
        res.status(400).json({ error: "Invalid arguments JSON" });
        return;
    }
    // Valid JSON that is not an object (null, a number, ...) carries no arguments.
    if (args === null || typeof args !== "object") {
        args = {};
    }
    try {
        let result;
        switch (name) {
            case "read": {
                const filePath = String(args.path ?? "");
                if (!filePath) {
                    res.json({ output: "Error: path is required" });
                    return;
                }
                const content = await readFile(filePath, "utf-8");
                const lines = content.split("\n");
                // Whole numbers only: offset is at least 1, limit is within 1..2000
                // (default 200), so a negative or fractional value cannot produce
                // an odd slice or fractional line labels.
                const offset = Math.max(1, Math.floor(Number(args.offset)) || 1);
                const limit = Math.min(2000, Math.max(1, Math.floor(Number(args.limit)) || 200));
                const sliced = lines.slice(offset - 1, offset - 1 + limit);
                // Prefix each line with its 1-based line number
                result = sliced.map((line, i) => `${offset + i}: ${line}`).join("\n");
                if (lines.length > offset - 1 + limit) {
                    result += `\n... (${lines.length} total lines)`;
                }
                break;
            }
            case "write": {
                const filePath = String(args.path ?? "");
                const content = String(args.content ?? "");
                if (!filePath) {
                    res.json({ output: "Error: path is required" });
                    return;
                }
                // Create missing parent directories first
                await mkdir(dirname(filePath), { recursive: true });
                await writeFile(filePath, content, "utf-8");
                // Report encoded size; String.length counts UTF-16 units, not bytes
                result = `Written ${Buffer.byteLength(content, "utf-8")} bytes to ${filePath}`;
                break;
            }
            case "edit": {
                const filePath = String(args.path ?? "");
                const oldStr = String(args.oldText ?? "");
                const newStr = String(args.newText ?? "");
                if (!filePath) {
                    res.json({ output: "Error: path is required" });
                    return;
                }
                if (!oldStr) {
                    res.json({ output: "Error: oldText is required" });
                    return;
                }
                const fileContent = await readFile(filePath, "utf-8");
                const idx = fileContent.indexOf(oldStr);
                if (idx === -1) {
                    result = "Error: oldText not found in file";
                }
                else if (fileContent.indexOf(oldStr, idx + 1) !== -1) {
                    // Refuse ambiguous edits rather than guess which match was meant
                    result = "Error: oldText found multiple times. Provide more context to make it unique.";
                }
                else {
                    const edited = fileContent.slice(0, idx) + newStr + fileContent.slice(idx + oldStr.length);
                    await writeFile(filePath, edited, "utf-8");
                    result = `Edit applied to ${filePath}`;
                }
                break;
            }
            case "bash": {
                const command = String(args.command ?? "");
                if (!command) {
                    res.json({ output: "Error: command is required" });
                    return;
                }
                // Default 30 s, clamped to 1..120 s and rounded to whole
                // milliseconds so exec() always gets a non-negative integer.
                const requested = Number(args.timeout);
                const timeoutSec = Math.min(120, Math.max(1, Number.isFinite(requested) && requested > 0 ? requested : 30));
                const timeout = Math.round(timeoutSec * 1000);
                // Always resolves: failures become text for the model instead of rejections
                result = await new Promise((resolve) => {
                    exec(command, { timeout, maxBuffer: 1024 * 1024, cwd: process.cwd() }, (err, stdout, stderr) => {
                        const out = (stdout || "").trim();
                        const errOut = (stderr || "").trim();
                        if (err && err.killed) {
                            resolve(`Command timed out after ${timeout}ms`);
                        }
                        else if (err) {
                            resolve(`Exit code ${err.code ?? 1}\n${errOut}\n${out}`.trim());
                        }
                        else {
                            const combined = errOut ? `${out}\n${errOut}` : out;
                            resolve(combined || "(no output)");
                        }
                    });
                });
                // Truncate very long output for the voice context
                if (result.length > 4000) {
                    result = result.slice(0, 4000) + "\n... (output truncated)";
                }
                break;
            }
            default:
                result = `Unknown tool: ${name}`;
        }
        res.json({ output: result });
    }
    catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        res.json({ output: `Error: ${msg}` });
    }
});
1297
2098
  // ── GET / — full page ────────────────────────────────────────────────────
1298
2099
  app.get("/", async (req, res) => {
1299
2100
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-office",
3
- "version": "0.4.7",
3
+ "version": "0.4.8",
4
4
  "description": "An office for your AI agents",
5
5
  "type": "module",
6
6
  "license": "MIT",