agent-relay-server 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-server",
3
- "version": "0.13.0",
3
+ "version": "0.15.0",
4
4
  "description": "Lightweight HTTP message relay for inter-agent communication across machines",
5
5
  "module": "src/index.ts",
6
6
  "type": "module",
package/public/index.html CHANGED
@@ -10300,6 +10300,21 @@ async function apiPostAudio(path, blob) {
10300
10300
  }
10301
10301
  return response.json();
10302
10302
  }
10303
/**
 * POST a JSON payload to the relay API and resolve with the response body as a Blob.
 * Used by the voice layer to fetch server-synthesized speech.
 *
 * The request goes to `"api" + path` resolved against `baseUrl()`. When an auth token
 * is set it is sent in the `X-Agent-Relay-Token` header.
 *
 * @param path API path appended to "api" (for example "/connectors/...").
 * @param body Value serialized with JSON.stringify as the request body.
 * @returns The response body as a Blob.
 * @throws The error built by makeError: 401 "Authentication required", or the response
 *         status together with the message extracted by responseErrorMessage.
 */
async function apiPostJsonForBlob(path, body) {
  const requestHeaders = { "Content-Type": "application/json" };
  if (authToken) requestHeaders["X-Agent-Relay-Token"] = authToken;

  const endpoint = new URL("api" + path, baseUrl());
  const res = await fetch(endpoint, {
    method: "POST",
    headers: requestHeaders,
    body: JSON.stringify(body)
  });

  if (res.ok) return res.blob();
  if (res.status === 401) throw makeError(401, "Authentication required");
  throw makeError(res.status, await responseErrorMessage(res));
}
10303
10318
  async function apiBlob(path) {
10304
10319
  const opts = {
10305
10320
  method: "GET",
@@ -10377,13 +10392,18 @@ function chunkForSpeech(text) {
10377
10392
  var synthAvailable = typeof window !== "undefined" && "speechSynthesis" in window;
10378
10393
  var VoiceTts = class {
10379
10394
  enabled = false;
10395
+ lang = "en-US";
10396
+ mode = "kokoro";
10397
+ kokoroVoice = "am_michael";
10380
10398
  active = null;
10381
10399
  queue = [];
10382
10400
  currentChat = null;
10383
10401
  speaking = false;
10384
10402
  gen = 0;
10403
+ audioEl = null;
10404
+ audioUrl = null;
10385
10405
  get available() {
10386
- return synthAvailable;
10406
+ return synthAvailable || typeof Audio !== "undefined";
10387
10407
  }
10388
10408
  isEnabled() {
10389
10409
  return this.enabled;
@@ -10393,6 +10413,18 @@ var VoiceTts = class {
10393
10413
  this.enabled = on;
10394
10414
  if (!on) this.reset();
10395
10415
  }
10416
+ /** Set the spoken-voice language (BCP-47, e.g. "en-US"). Empty = browser locale. */
10417
+ setLang(lang) {
10418
+ this.lang = lang;
10419
+ }
10420
+ setMode(mode) {
10421
+ if (mode === this.mode) return;
10422
+ this.mode = mode;
10423
+ this.reset();
10424
+ }
10425
+ setKokoroVoice(voice) {
10426
+ this.kokoroVoice = voice;
10427
+ }
10396
10428
  setActiveChat(chatId) {
10397
10429
  if (chatId === this.active) return;
10398
10430
  this.active = chatId;
@@ -10400,7 +10432,7 @@ var VoiceTts = class {
10400
10432
  }
10401
10433
  /** A captured agent response turn arrived for `chatId`. */
10402
10434
  onResponse(chatId, rawText) {
10403
- if (!this.enabled || !synthAvailable || !chatId || chatId !== this.active) return;
10435
+ if (!this.enabled || !this.available || !chatId || chatId !== this.active) return;
10404
10436
  const text = speechify(rawText);
10405
10437
  if (!text) return;
10406
10438
  if (this.speaking && this.currentChat && this.currentChat !== chatId) {
@@ -10428,6 +10460,19 @@ var VoiceTts = class {
10428
10460
  try {
10429
10461
  window.speechSynthesis.cancel();
10430
10462
  } catch {}
10463
+ this.stopAudio();
10464
+ }
10465
+ stopAudio() {
10466
+ if (this.audioEl) try {
10467
+ this.audioEl.pause();
10468
+ this.audioEl.src = "";
10469
+ } catch {}
10470
+ if (this.audioUrl) {
10471
+ try {
10472
+ URL.revokeObjectURL(this.audioUrl);
10473
+ } catch {}
10474
+ this.audioUrl = null;
10475
+ }
10431
10476
  }
10432
10477
  pump() {
10433
10478
  if (this.speaking) return;
@@ -10437,24 +10482,78 @@ var VoiceTts = class {
10437
10482
  this.currentChat = item.chatId;
10438
10483
  const gen = ++this.gen;
10439
10484
  const chunks = chunkForSpeech(item.text);
10440
- const speakAt = (i) => {
10485
+ const done = () => {
10441
10486
  if (gen !== this.gen) return;
10442
- if (i >= chunks.length) {
10443
- this.speaking = false;
10444
- this.currentChat = null;
10445
- this.pump();
10446
- return;
10447
- }
10448
- const u = new SpeechSynthesisUtterance(chunks[i]);
10449
- u.lang = navigator.language || "en-US";
10450
- u.onend = () => speakAt(i + 1);
10451
- u.onerror = () => speakAt(i + 1);
10452
- window.speechSynthesis.speak(u);
10487
+ this.speaking = false;
10488
+ this.currentChat = null;
10489
+ this.pump();
10453
10490
  };
10454
- speakAt(0);
10491
+ if (this.mode === "kokoro") this.speakKokoro(chunks, 0, gen, done);
10492
+ else this.speakBrowser(chunks, 0, gen, done);
10493
+ }
10494
+ speakBrowser(chunks, i, gen, done) {
10495
+ if (gen !== this.gen) return;
10496
+ if (!synthAvailable || i >= chunks.length) return done();
10497
+ const u = new SpeechSynthesisUtterance(chunks[i]);
10498
+ u.lang = this.lang || navigator.language || "en-US";
10499
+ u.onend = () => this.speakBrowser(chunks, i + 1, gen, done);
10500
+ u.onerror = () => this.speakBrowser(chunks, i + 1, gen, done);
10501
+ window.speechSynthesis.speak(u);
10502
+ }
10503
+ speakKokoro(chunks, i, gen, done, prefetched) {
10504
+ if (gen !== this.gen) return;
10505
+ const text = chunks[i];
10506
+ if (text === void 0) return done();
10507
+ const cur = prefetched ?? this.fetchSpeech(text);
10508
+ const nextText = chunks[i + 1];
10509
+ const next = nextText !== void 0 ? this.fetchSpeech(nextText) : void 0;
10510
+ cur.then((blob) => {
10511
+ if (gen !== this.gen) return;
10512
+ this.playBlob(blob, gen, () => this.speakKokoro(chunks, i + 1, gen, done, next), () => {
10513
+ this.speakBrowser(chunks, i, gen, done);
10514
+ });
10515
+ }).catch(() => {
10516
+ if (gen !== this.gen) return;
10517
+ this.speakBrowser(chunks, i, gen, done);
10518
+ });
10519
+ }
10520
+ fetchSpeech(text) {
10521
+ return apiPostJsonForBlob("/connectors/voice/call/speak", {
10522
+ text,
10523
+ voice: this.kokoroVoice
10524
+ });
10525
+ }
10526
+ playBlob(blob, gen, onend, onerror) {
10527
+ if (gen !== this.gen) return;
10528
+ if (typeof Audio === "undefined") return onerror();
10529
+ this.stopAudio();
10530
+ if (!this.audioEl) this.audioEl = new Audio();
10531
+ const url = URL.createObjectURL(blob);
10532
+ this.audioUrl = url;
10533
+ const el = this.audioEl;
10534
+ el.src = url;
10535
+ el.onended = () => {
10536
+ if (gen === this.gen) onend();
10537
+ };
10538
+ el.onerror = () => {
10539
+ if (gen === this.gen) onerror();
10540
+ };
10541
+ el.play().catch(() => {
10542
+ if (gen === this.gen) onerror();
10543
+ });
10455
10544
  }
10456
10545
  };
10457
10546
  var voiceTts = new VoiceTts();
10547
/**
 * List the distinct language tags of the voices the browser's speech engine reports,
 * sorted with the default Array sort. Voices without a language tag are ignored.
 *
 * Returns an empty array when speech synthesis is unavailable or reading the voices
 * throws. The list can also be empty until the engine has loaded its voices, so
 * callers should re-read it on `voiceschanged`.
 */
function availableSpeechLangs() {
  if (!synthAvailable) return [];
  try {
    const seen = new Set();
    for (const voice of window.speechSynthesis.getVoices()) {
      if (voice.lang) seen.add(voice.lang);
    }
    return Array.from(seen).sort();
  } catch {
    return [];
  }
}
10458
10557
  var micAvailable = typeof navigator !== "undefined" && !!navigator.mediaDevices?.getUserMedia && typeof window !== "undefined" && "MediaRecorder" in window;
10459
10558
  function pickMimeType() {
10460
10559
  for (const m of [
@@ -11861,6 +11960,10 @@ var useRelayStore = create$1()(persist((set, get) => ({
11861
11960
  showBuiltIns: false,
11862
11961
  autoRefresh: true,
11863
11962
  voiceTtsEnabled: false,
11963
+ voiceTtsLang: "en-US",
11964
+ voiceTtsMode: "kokoro",
11965
+ voiceTtsKokoroVoice: "am_michael",
11966
+ voiceInputMode: "compose",
11864
11967
  agentSort: "status",
11865
11968
  agentSortDir: "asc",
11866
11969
  agentPresetFilter: "",
@@ -12052,6 +12155,21 @@ var useRelayStore = create$1()(persist((set, get) => ({
12052
12155
  voiceTts.setEnabled(on);
12053
12156
  set({ voiceTtsEnabled: on });
12054
12157
  },
12158
+ setVoiceTtsLang(lang) {
12159
+ voiceTts.setLang(lang);
12160
+ set({ voiceTtsLang: lang });
12161
+ },
12162
+ setVoiceTtsMode(mode) {
12163
+ voiceTts.setMode(mode);
12164
+ set({ voiceTtsMode: mode });
12165
+ },
12166
+ setVoiceTtsKokoroVoice(voice) {
12167
+ voiceTts.setKokoroVoice(voice);
12168
+ set({ voiceTtsKokoroVoice: voice });
12169
+ },
12170
+ setVoiceInputMode(mode) {
12171
+ set({ voiceInputMode: mode });
12172
+ },
12055
12173
  async init() {
12056
12174
  if (!useRelayStore.persist.hasHydrated()) await new Promise((resolve) => {
12057
12175
  const unsub = useRelayStore.persist.onFinishHydration(() => {
@@ -12062,6 +12180,9 @@ var useRelayStore = create$1()(persist((set, get) => ({
12062
12180
  const token = get().authToken;
12063
12181
  if (token) setAuthToken(token);
12064
12182
  voiceTts.setEnabled(get().voiceTtsEnabled);
12183
+ voiceTts.setLang(get().voiceTtsLang);
12184
+ voiceTts.setMode(get().voiceTtsMode);
12185
+ voiceTts.setKokoroVoice(get().voiceTtsKokoroVoice);
12065
12186
  syncVoiceActiveChat(get());
12066
12187
  setUnauthorizedHandler(() => {
12067
12188
  if (!get().authNeeded) set({
@@ -12576,9 +12697,9 @@ var useRelayStore = create$1()(persist((set, get) => ({
12576
12697
  const msgs = [...s.messages, msg];
12577
12698
  if (msgs.length > 500) msgs.splice(0, msgs.length - 500);
12578
12699
  set({ messages: msgs });
12579
- if (msg.kind === "session") {
12700
+ if (msg.kind === "session" && msg.from !== "user") {
12580
12701
  const sess = msg.payload?.session;
12581
- if ((sess?.type ?? "response") === "response" && (sess?.origin ?? "provider") === "provider") voiceTts.onResponse(inboxPeer(msg), msg.body);
12702
+ if (sess?.type === "response" && sess?.origin === "provider") voiceTts.onResponse(inboxPeer(msg), msg.body);
12582
12703
  }
12583
12704
  const peer = inboxPeer(msg);
12584
12705
  if (isHumanInboundMessage(msg) && peer && s.view === "chat" && s.selectedInboxThread === peer && !isDashboardHidden()) get().markInboxThreadReadTo(peer, msg.id);
@@ -14150,6 +14271,10 @@ var useRelayStore = create$1()(persist((set, get) => ({
14150
14271
  showBuiltIns: state.showBuiltIns,
14151
14272
  autoRefresh: state.autoRefresh,
14152
14273
  voiceTtsEnabled: state.voiceTtsEnabled,
14274
+ voiceTtsLang: state.voiceTtsLang,
14275
+ voiceTtsMode: state.voiceTtsMode,
14276
+ voiceTtsKokoroVoice: state.voiceTtsKokoroVoice,
14277
+ voiceInputMode: state.voiceInputMode,
14153
14278
  agentSort: state.agentSort,
14154
14279
  agentSortDir: state.agentSortDir,
14155
14280
  agentPresetFilter: state.agentPresetFilter,
@@ -125564,6 +125689,40 @@ var TIMELINE_STATUS_LABELS = {
125564
125689
  var TIMELINE_STATUSES = new Set(Object.keys(TIMELINE_STATUS_LABELS));
125565
125690
  var STATUS_DEDUPE_WINDOW_MS = 3e3;
125566
125691
  var CHAT_BOTTOM_THRESHOLD_PX = 96;
125692
/**
 * Voice presets listed in the chat panel's Kokoro voice picker, in display order.
 * Each entry is `{ id, label }`: `id` is the value stored and sent as the voice,
 * `label` is the text shown for the option.
 */
var KOKORO_VOICES = [
  ["am_michael", "Michael (US ♂)"],
  ["am_adam", "Adam (US ♂)"],
  ["af_heart", "Heart (US ♀)"],
  ["af_bella", "Bella (US ♀)"],
  ["af_nicole", "Nicole (US ♀)"],
  ["af_sarah", "Sarah (US ♀)"],
  ["bm_george", "George (UK ♂)"],
  ["bf_emma", "Emma (UK ♀)"]
].map(([id, label]) => ({ id, label }));
125567
125726
  function StatusMarker({ event }) {
125568
125727
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", {
125569
125728
  className: "flex items-center justify-center gap-2 py-2 my-1",
@@ -126599,6 +126758,21 @@ function ChatPanel({ threads, onBack, showBackButton }) {
126599
126758
  const fetchOrchestrators = useRelayStore((s) => s.fetchOrchestrators);
126600
126759
  const voiceTtsEnabled = useRelayStore((s) => s.voiceTtsEnabled);
126601
126760
  const setVoiceTtsEnabled = useRelayStore((s) => s.setVoiceTtsEnabled);
126761
+ const voiceTtsLang = useRelayStore((s) => s.voiceTtsLang);
126762
+ const setVoiceTtsLang = useRelayStore((s) => s.setVoiceTtsLang);
126763
+ const voiceTtsMode = useRelayStore((s) => s.voiceTtsMode);
126764
+ const setVoiceTtsMode = useRelayStore((s) => s.setVoiceTtsMode);
126765
+ const voiceTtsKokoroVoice = useRelayStore((s) => s.voiceTtsKokoroVoice);
126766
+ const setVoiceTtsKokoroVoice = useRelayStore((s) => s.setVoiceTtsKokoroVoice);
126767
+ const voiceInputMode = useRelayStore((s) => s.voiceInputMode);
126768
+ const [speechLangs, setSpeechLangs] = (0, import_react.useState)(() => availableSpeechLangs());
126769
+ (0, import_react.useEffect)(() => {
126770
+ if (!voiceTts.available) return;
126771
+ const refresh = () => setSpeechLangs(availableSpeechLangs());
126772
+ refresh();
126773
+ window.speechSynthesis.addEventListener?.("voiceschanged", refresh);
126774
+ return () => window.speechSynthesis.removeEventListener?.("voiceschanged", refresh);
126775
+ }, []);
126602
126776
  const fileInputRef = (0, import_react.useRef)(null);
126603
126777
  const pttRecorderRef = (0, import_react.useRef)(null);
126604
126778
  const [micState, setMicState] = (0, import_react.useState)("idle");
@@ -126915,7 +127089,21 @@ function ChatPanel({ threads, onBack, showBackButton }) {
126915
127089
  const clip = await recorder.stop();
126916
127090
  if (!clip) return;
126917
127091
  const text = await transcribeClip(clip);
126918
- if (text) setReplyDraft(selectedInboxThread, draft ? `${draft} ${text}` : text);
127092
+ if (!text) return;
127093
+ const combined = draft ? `${draft} ${text}` : text;
127094
+ if (voiceInputMode === "autosend" && !chatSending && !hasPendingUploads) {
127095
+ const attachments = readyAttachments.map((item) => ({
127096
+ artifactId: item.artifact.id,
127097
+ kind: item.artifact.kind,
127098
+ role: "media",
127099
+ title: item.artifact.filename || item.fileName
127100
+ }));
127101
+ sendChatMessage(combined, {
127102
+ peer: selectedInboxThread,
127103
+ lastMessage: thread?.lastMessage || null
127104
+ }, attachments);
127105
+ clearPendingAttachments();
127106
+ } else setReplyDraft(selectedInboxThread, combined);
126919
127107
  } catch (e) {
126920
127108
  showError("Transcription failed", e?.message || "Could not transcribe audio.");
126921
127109
  } finally {
@@ -126996,6 +127184,47 @@ function ChatPanel({ threads, onBack, showBackButton }) {
126996
127184
  onClick: () => setVoiceTtsEnabled(!voiceTtsEnabled),
126997
127185
  children: voiceTtsEnabled ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Volume2, { className: "w-3.5 h-3.5" }) : /* @__PURE__ */ (0, import_jsx_runtime.jsx)(VolumeX, { className: "w-3.5 h-3.5" })
126998
127186
  }),
127187
+ voiceTts.available && voiceTtsEnabled && /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", {
127188
+ value: voiceTtsMode,
127189
+ onChange: (e) => setVoiceTtsMode(e.target.value),
127190
+ title: "Voice engine — Kokoro (server, natural) falls back to browser automatically",
127191
+ className: "h-7 rounded border border-border bg-background px-1 text-xs",
127192
+ children: [/* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127193
+ value: "kokoro",
127194
+ children: "Kokoro"
127195
+ }), /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127196
+ value: "browser",
127197
+ children: "Browser"
127198
+ })]
127199
+ }), voiceTtsMode === "kokoro" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("select", {
127200
+ value: voiceTtsKokoroVoice,
127201
+ onChange: (e) => setVoiceTtsKokoroVoice(e.target.value),
127202
+ title: "Kokoro voice",
127203
+ className: "h-7 rounded border border-border bg-background px-1 text-xs",
127204
+ children: [...KOKORO_VOICES, ...KOKORO_VOICES.some((v) => v.id === voiceTtsKokoroVoice) ? [] : [{
127205
+ id: voiceTtsKokoroVoice,
127206
+ label: voiceTtsKokoroVoice
127207
+ }]].map((v) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127208
+ value: v.id,
127209
+ children: v.label
127210
+ }, v.id))
127211
+ }) : /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", {
127212
+ value: voiceTtsLang,
127213
+ onChange: (e) => setVoiceTtsLang(e.target.value),
127214
+ title: "Voice language",
127215
+ className: "h-7 rounded border border-border bg-background px-1 text-xs",
127216
+ children: [[...new Set([
127217
+ "en-US",
127218
+ ...speechLangs,
127219
+ ...voiceTtsLang ? [voiceTtsLang] : []
127220
+ ])].sort().map((l) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127221
+ value: l,
127222
+ children: l
127223
+ }, l)), /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127224
+ value: "",
127225
+ children: "Browser default"
127226
+ })]
127227
+ })] }),
126999
127228
  canOpenTerminal && /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Button, {
127000
127229
  variant: "ghost",
127001
127230
  size: "icon-sm",
@@ -153453,10 +153682,38 @@ function SettingsView() {
153453
153682
  ]
153454
153683
  })]
153455
153684
  }),
153456
- /* @__PURE__ */ (0, import_jsx_runtime.jsx)(StewardSettings, {})
153685
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(StewardSettings, {}),
153686
+ /* @__PURE__ */ (0, import_jsx_runtime.jsx)(VoiceSettings, {})
153457
153687
  ]
153458
153688
  });
153459
153689
  }
153690
/**
 * Settings section for push-to-talk behaviour. Offers two options for what happens
 * with a transcript: "compose" (fill the message box) or "autosend" (send
 * immediately). The current value is read from, and changes are written to,
 * `voiceInputMode` in the relay store.
 */
function VoiceSettings() {
  const voiceInputMode = useRelayStore((s) => s.voiceInputMode);
  const setVoiceInputMode = useRelayStore((s) => s.setVoiceInputMode);
  const { jsx, jsxs } = import_jsx_runtime;

  // Section heading and one-line description.
  const header = jsxs("div", {
    children: [
      jsx("h2", {
        className: "text-sm font-semibold",
        children: "Voice"
      }),
      jsx("p", {
        className: "text-xs text-muted-foreground",
        children: "How push-to-talk behaves in chat."
      })
    ]
  });

  // The two selectable input modes.
  const options = jsxs(SelectContent, {
    children: [
      jsx(SelectItem, {
        value: "compose",
        children: "Fill the message box (review, then Enter)"
      }),
      jsx(SelectItem, {
        value: "autosend",
        children: "Send immediately (speak-and-send)"
      })
    ]
  });

  const picker = jsxs(Select$1, {
    value: voiceInputMode,
    onValueChange: (v) => setVoiceInputMode(v),
    children: [jsx(SelectTrigger, { children: jsx(SelectValue, {}) }), options]
  });

  return jsxs("section", {
    className: "space-y-3 rounded-lg border p-4",
    children: [
      header,
      jsx(Field, {
        label: "Push-to-talk input",
        children: picker
      })
    ]
  });
}
153460
153717
  var EFFORTS = [
153461
153718
  "low",
153462
153719
  "medium",
package/src/routes.ts CHANGED
@@ -5679,7 +5679,7 @@ const putConnectorConfig: Handler = async (req, params) => {
5679
5679
  // Endpoints a connector daemon may expose for the dashboard to call through the
5680
5680
  // relay (single-origin — no CORS, no extra port exposure). Kept to a small,
5681
5681
  // non-mutating allowlist; the connector advertises its HTTP base via its status.
5682
- const PROXYABLE_CONNECTOR_CALLS = new Set(["transcribe", "utterance"]);
5682
+ const PROXYABLE_CONNECTOR_CALLS = new Set(["transcribe", "utterance", "speak"]);
5683
5683
 
5684
5684
  function connectorAdvertisedEndpoint(connector: NonNullable<ReturnType<typeof getConnector>>): string | null {
5685
5685
  const raw = connector.state?.raw;