agent-relay-server 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-server",
3
- "version": "0.13.0",
3
+ "version": "0.14.0",
4
4
  "description": "Lightweight HTTP message relay for inter-agent communication across machines",
5
5
  "module": "src/index.ts",
6
6
  "type": "module",
package/public/index.html CHANGED
@@ -10300,6 +10300,21 @@ async function apiPostAudio(path, blob) {
10300
10300
  }
10301
10301
  return response.json();
10302
10302
  }
10303
+ /** POST JSON and get an audio (or other) Blob back — used for server-side TTS. */
10304
+ async function apiPostJsonForBlob(path, body) {
10305
+ const headers = { "Content-Type": "application/json" };
10306
+ if (authToken) headers["X-Agent-Relay-Token"] = authToken;
10307
+ const response = await fetch(new URL("api" + path, baseUrl()), {
10308
+ method: "POST",
10309
+ headers,
10310
+ body: JSON.stringify(body)
10311
+ });
10312
+ if (!response.ok) {
10313
+ if (response.status === 401) throw makeError(401, "Authentication required");
10314
+ throw makeError(response.status, await responseErrorMessage(response));
10315
+ }
10316
+ return response.blob();
10317
+ }
10303
10318
  async function apiBlob(path) {
10304
10319
  const opts = {
10305
10320
  method: "GET",
@@ -10377,13 +10392,18 @@ function chunkForSpeech(text) {
10377
10392
  var synthAvailable = typeof window !== "undefined" && "speechSynthesis" in window;
10378
10393
  var VoiceTts = class {
10379
10394
  enabled = false;
10395
+ lang = "en-US";
10396
+ mode = "kokoro";
10397
+ kokoroVoice = "am_michael";
10380
10398
  active = null;
10381
10399
  queue = [];
10382
10400
  currentChat = null;
10383
10401
  speaking = false;
10384
10402
  gen = 0;
10403
+ audioEl = null;
10404
+ audioUrl = null;
10385
10405
  get available() {
10386
- return synthAvailable;
10406
+ return synthAvailable || typeof Audio !== "undefined";
10387
10407
  }
10388
10408
  isEnabled() {
10389
10409
  return this.enabled;
@@ -10393,6 +10413,18 @@ var VoiceTts = class {
10393
10413
  this.enabled = on;
10394
10414
  if (!on) this.reset();
10395
10415
  }
10416
+ /** Set the spoken-voice language (BCP-47, e.g. "en-US"). Empty = browser locale. */
10417
+ setLang(lang) {
10418
+ this.lang = lang;
10419
+ }
10420
+ setMode(mode) {
10421
+ if (mode === this.mode) return;
10422
+ this.mode = mode;
10423
+ this.reset();
10424
+ }
10425
+ setKokoroVoice(voice) {
10426
+ this.kokoroVoice = voice;
10427
+ }
10396
10428
  setActiveChat(chatId) {
10397
10429
  if (chatId === this.active) return;
10398
10430
  this.active = chatId;
@@ -10400,7 +10432,7 @@ var VoiceTts = class {
10400
10432
  }
10401
10433
  /** A captured agent response turn arrived for `chatId`. */
10402
10434
  onResponse(chatId, rawText) {
10403
- if (!this.enabled || !synthAvailable || !chatId || chatId !== this.active) return;
10435
+ if (!this.enabled || !this.available || !chatId || chatId !== this.active) return;
10404
10436
  const text = speechify(rawText);
10405
10437
  if (!text) return;
10406
10438
  if (this.speaking && this.currentChat && this.currentChat !== chatId) {
@@ -10428,6 +10460,19 @@ var VoiceTts = class {
10428
10460
  try {
10429
10461
  window.speechSynthesis.cancel();
10430
10462
  } catch {}
10463
+ this.stopAudio();
10464
+ }
10465
+ stopAudio() {
10466
+ if (this.audioEl) try {
10467
+ this.audioEl.pause();
10468
+ this.audioEl.src = "";
10469
+ } catch {}
10470
+ if (this.audioUrl) {
10471
+ try {
10472
+ URL.revokeObjectURL(this.audioUrl);
10473
+ } catch {}
10474
+ this.audioUrl = null;
10475
+ }
10431
10476
  }
10432
10477
  pump() {
10433
10478
  if (this.speaking) return;
@@ -10437,24 +10482,78 @@ var VoiceTts = class {
10437
10482
  this.currentChat = item.chatId;
10438
10483
  const gen = ++this.gen;
10439
10484
  const chunks = chunkForSpeech(item.text);
10440
- const speakAt = (i) => {
10485
+ const done = () => {
10441
10486
  if (gen !== this.gen) return;
10442
- if (i >= chunks.length) {
10443
- this.speaking = false;
10444
- this.currentChat = null;
10445
- this.pump();
10446
- return;
10447
- }
10448
- const u = new SpeechSynthesisUtterance(chunks[i]);
10449
- u.lang = navigator.language || "en-US";
10450
- u.onend = () => speakAt(i + 1);
10451
- u.onerror = () => speakAt(i + 1);
10452
- window.speechSynthesis.speak(u);
10487
+ this.speaking = false;
10488
+ this.currentChat = null;
10489
+ this.pump();
10490
+ };
10491
+ if (this.mode === "kokoro") this.speakKokoro(chunks, 0, gen, done);
10492
+ else this.speakBrowser(chunks, 0, gen, done);
10493
+ }
10494
+ speakBrowser(chunks, i, gen, done) {
10495
+ if (gen !== this.gen) return;
10496
+ if (!synthAvailable || i >= chunks.length) return done();
10497
+ const u = new SpeechSynthesisUtterance(chunks[i]);
10498
+ u.lang = this.lang || navigator.language || "en-US";
10499
+ u.onend = () => this.speakBrowser(chunks, i + 1, gen, done);
10500
+ u.onerror = () => this.speakBrowser(chunks, i + 1, gen, done);
10501
+ window.speechSynthesis.speak(u);
10502
+ }
10503
+ speakKokoro(chunks, i, gen, done, prefetched) {
10504
+ if (gen !== this.gen) return;
10505
+ const text = chunks[i];
10506
+ if (text === void 0) return done();
10507
+ const cur = prefetched ?? this.fetchSpeech(text);
10508
+ const nextText = chunks[i + 1];
10509
+ const next = nextText !== void 0 ? this.fetchSpeech(nextText) : void 0;
10510
+ cur.then((blob) => {
10511
+ if (gen !== this.gen) return;
10512
+ this.playBlob(blob, gen, () => this.speakKokoro(chunks, i + 1, gen, done, next), () => {
10513
+ this.speakBrowser(chunks, i, gen, done);
10514
+ });
10515
+ }).catch(() => {
10516
+ if (gen !== this.gen) return;
10517
+ this.speakBrowser(chunks, i, gen, done);
10518
+ });
10519
+ }
10520
+ fetchSpeech(text) {
10521
+ return apiPostJsonForBlob("/connectors/voice/call/speak", {
10522
+ text,
10523
+ voice: this.kokoroVoice
10524
+ });
10525
+ }
10526
+ playBlob(blob, gen, onend, onerror) {
10527
+ if (gen !== this.gen) return;
10528
+ if (typeof Audio === "undefined") return onerror();
10529
+ this.stopAudio();
10530
+ if (!this.audioEl) this.audioEl = new Audio();
10531
+ const url = URL.createObjectURL(blob);
10532
+ this.audioUrl = url;
10533
+ const el = this.audioEl;
10534
+ el.src = url;
10535
+ el.onended = () => {
10536
+ if (gen === this.gen) onend();
10537
+ };
10538
+ el.onerror = () => {
10539
+ if (gen === this.gen) onerror();
10453
10540
  };
10454
- speakAt(0);
10541
+ el.play().catch(() => {
10542
+ if (gen === this.gen) onerror();
10543
+ });
10455
10544
  }
10456
10545
  };
10457
10546
  var voiceTts = new VoiceTts();
10547
+ /** Sorted unique BCP-47 languages the browser's speech engine can speak. May be empty
10548
+ * until the engine finishes loading voices (listen for `voiceschanged` and re-read). */
10549
+ function availableSpeechLangs() {
10550
+ if (!synthAvailable) return [];
10551
+ try {
10552
+ return [...new Set(window.speechSynthesis.getVoices().map((v) => v.lang).filter(Boolean))].sort();
10553
+ } catch {
10554
+ return [];
10555
+ }
10556
+ }
10458
10557
  var micAvailable = typeof navigator !== "undefined" && !!navigator.mediaDevices?.getUserMedia && typeof window !== "undefined" && "MediaRecorder" in window;
10459
10558
  function pickMimeType() {
10460
10559
  for (const m of [
@@ -11861,6 +11960,9 @@ var useRelayStore = create$1()(persist((set, get) => ({
11861
11960
  showBuiltIns: false,
11862
11961
  autoRefresh: true,
11863
11962
  voiceTtsEnabled: false,
11963
+ voiceTtsLang: "en-US",
11964
+ voiceTtsMode: "kokoro",
11965
+ voiceTtsKokoroVoice: "am_michael",
11864
11966
  agentSort: "status",
11865
11967
  agentSortDir: "asc",
11866
11968
  agentPresetFilter: "",
@@ -12052,6 +12154,18 @@ var useRelayStore = create$1()(persist((set, get) => ({
12052
12154
  voiceTts.setEnabled(on);
12053
12155
  set({ voiceTtsEnabled: on });
12054
12156
  },
12157
+ setVoiceTtsLang(lang) {
12158
+ voiceTts.setLang(lang);
12159
+ set({ voiceTtsLang: lang });
12160
+ },
12161
+ setVoiceTtsMode(mode) {
12162
+ voiceTts.setMode(mode);
12163
+ set({ voiceTtsMode: mode });
12164
+ },
12165
+ setVoiceTtsKokoroVoice(voice) {
12166
+ voiceTts.setKokoroVoice(voice);
12167
+ set({ voiceTtsKokoroVoice: voice });
12168
+ },
12055
12169
  async init() {
12056
12170
  if (!useRelayStore.persist.hasHydrated()) await new Promise((resolve) => {
12057
12171
  const unsub = useRelayStore.persist.onFinishHydration(() => {
@@ -12062,6 +12176,9 @@ var useRelayStore = create$1()(persist((set, get) => ({
12062
12176
  const token = get().authToken;
12063
12177
  if (token) setAuthToken(token);
12064
12178
  voiceTts.setEnabled(get().voiceTtsEnabled);
12179
+ voiceTts.setLang(get().voiceTtsLang);
12180
+ voiceTts.setMode(get().voiceTtsMode);
12181
+ voiceTts.setKokoroVoice(get().voiceTtsKokoroVoice);
12065
12182
  syncVoiceActiveChat(get());
12066
12183
  setUnauthorizedHandler(() => {
12067
12184
  if (!get().authNeeded) set({
@@ -12576,9 +12693,9 @@ var useRelayStore = create$1()(persist((set, get) => ({
12576
12693
  const msgs = [...s.messages, msg];
12577
12694
  if (msgs.length > 500) msgs.splice(0, msgs.length - 500);
12578
12695
  set({ messages: msgs });
12579
- if (msg.kind === "session") {
12696
+ if (msg.kind === "session" && msg.from !== "user") {
12580
12697
  const sess = msg.payload?.session;
12581
- if ((sess?.type ?? "response") === "response" && (sess?.origin ?? "provider") === "provider") voiceTts.onResponse(inboxPeer(msg), msg.body);
12698
+ if (sess?.type === "response" && sess?.origin === "provider") voiceTts.onResponse(inboxPeer(msg), msg.body);
12582
12699
  }
12583
12700
  const peer = inboxPeer(msg);
12584
12701
  if (isHumanInboundMessage(msg) && peer && s.view === "chat" && s.selectedInboxThread === peer && !isDashboardHidden()) get().markInboxThreadReadTo(peer, msg.id);
@@ -14150,6 +14267,9 @@ var useRelayStore = create$1()(persist((set, get) => ({
14150
14267
  showBuiltIns: state.showBuiltIns,
14151
14268
  autoRefresh: state.autoRefresh,
14152
14269
  voiceTtsEnabled: state.voiceTtsEnabled,
14270
+ voiceTtsLang: state.voiceTtsLang,
14271
+ voiceTtsMode: state.voiceTtsMode,
14272
+ voiceTtsKokoroVoice: state.voiceTtsKokoroVoice,
14153
14273
  agentSort: state.agentSort,
14154
14274
  agentSortDir: state.agentSortDir,
14155
14275
  agentPresetFilter: state.agentPresetFilter,
@@ -125564,6 +125684,40 @@ var TIMELINE_STATUS_LABELS = {
125564
125684
  var TIMELINE_STATUSES = new Set(Object.keys(TIMELINE_STATUS_LABELS));
125565
125685
  var STATUS_DEDUPE_WINDOW_MS = 3e3;
125566
125686
  var CHAT_BOTTOM_THRESHOLD_PX = 96;
125687
+ var KOKORO_VOICES = [
125688
+ {
125689
+ id: "am_michael",
125690
+ label: "Michael (US ♂)"
125691
+ },
125692
+ {
125693
+ id: "am_adam",
125694
+ label: "Adam (US ♂)"
125695
+ },
125696
+ {
125697
+ id: "af_heart",
125698
+ label: "Heart (US ♀)"
125699
+ },
125700
+ {
125701
+ id: "af_bella",
125702
+ label: "Bella (US ♀)"
125703
+ },
125704
+ {
125705
+ id: "af_nicole",
125706
+ label: "Nicole (US ♀)"
125707
+ },
125708
+ {
125709
+ id: "af_sarah",
125710
+ label: "Sarah (US ♀)"
125711
+ },
125712
+ {
125713
+ id: "bm_george",
125714
+ label: "George (UK ♂)"
125715
+ },
125716
+ {
125717
+ id: "bf_emma",
125718
+ label: "Emma (UK ♀)"
125719
+ }
125720
+ ];
125567
125721
  function StatusMarker({ event }) {
125568
125722
  return /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("div", {
125569
125723
  className: "flex items-center justify-center gap-2 py-2 my-1",
@@ -126599,6 +126753,20 @@ function ChatPanel({ threads, onBack, showBackButton }) {
126599
126753
  const fetchOrchestrators = useRelayStore((s) => s.fetchOrchestrators);
126600
126754
  const voiceTtsEnabled = useRelayStore((s) => s.voiceTtsEnabled);
126601
126755
  const setVoiceTtsEnabled = useRelayStore((s) => s.setVoiceTtsEnabled);
126756
+ const voiceTtsLang = useRelayStore((s) => s.voiceTtsLang);
126757
+ const setVoiceTtsLang = useRelayStore((s) => s.setVoiceTtsLang);
126758
+ const voiceTtsMode = useRelayStore((s) => s.voiceTtsMode);
126759
+ const setVoiceTtsMode = useRelayStore((s) => s.setVoiceTtsMode);
126760
+ const voiceTtsKokoroVoice = useRelayStore((s) => s.voiceTtsKokoroVoice);
126761
+ const setVoiceTtsKokoroVoice = useRelayStore((s) => s.setVoiceTtsKokoroVoice);
126762
+ const [speechLangs, setSpeechLangs] = (0, import_react.useState)(() => availableSpeechLangs());
126763
+ (0, import_react.useEffect)(() => {
126764
+ if (!voiceTts.available) return;
126765
+ const refresh = () => setSpeechLangs(availableSpeechLangs());
126766
+ refresh();
126767
+ window.speechSynthesis.addEventListener?.("voiceschanged", refresh);
126768
+ return () => window.speechSynthesis.removeEventListener?.("voiceschanged", refresh);
126769
+ }, []);
126602
126770
  const fileInputRef = (0, import_react.useRef)(null);
126603
126771
  const pttRecorderRef = (0, import_react.useRef)(null);
126604
126772
  const [micState, setMicState] = (0, import_react.useState)("idle");
@@ -126996,6 +127164,47 @@ function ChatPanel({ threads, onBack, showBackButton }) {
126996
127164
  onClick: () => setVoiceTtsEnabled(!voiceTtsEnabled),
126997
127165
  children: voiceTtsEnabled ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Volume2, { className: "w-3.5 h-3.5" }) : /* @__PURE__ */ (0, import_jsx_runtime.jsx)(VolumeX, { className: "w-3.5 h-3.5" })
126998
127166
  }),
127167
+ voiceTts.available && voiceTtsEnabled && /* @__PURE__ */ (0, import_jsx_runtime.jsxs)(import_jsx_runtime.Fragment, { children: [/* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", {
127168
+ value: voiceTtsMode,
127169
+ onChange: (e) => setVoiceTtsMode(e.target.value),
127170
+ title: "Voice engine — Kokoro (server, natural) falls back to browser automatically",
127171
+ className: "h-7 rounded border border-border bg-background px-1 text-xs",
127172
+ children: [/* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127173
+ value: "kokoro",
127174
+ children: "Kokoro"
127175
+ }), /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127176
+ value: "browser",
127177
+ children: "Browser"
127178
+ })]
127179
+ }), voiceTtsMode === "kokoro" ? /* @__PURE__ */ (0, import_jsx_runtime.jsx)("select", {
127180
+ value: voiceTtsKokoroVoice,
127181
+ onChange: (e) => setVoiceTtsKokoroVoice(e.target.value),
127182
+ title: "Kokoro voice",
127183
+ className: "h-7 rounded border border-border bg-background px-1 text-xs",
127184
+ children: [...KOKORO_VOICES, ...KOKORO_VOICES.some((v) => v.id === voiceTtsKokoroVoice) ? [] : [{
127185
+ id: voiceTtsKokoroVoice,
127186
+ label: voiceTtsKokoroVoice
127187
+ }]].map((v) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127188
+ value: v.id,
127189
+ children: v.label
127190
+ }, v.id))
127191
+ }) : /* @__PURE__ */ (0, import_jsx_runtime.jsxs)("select", {
127192
+ value: voiceTtsLang,
127193
+ onChange: (e) => setVoiceTtsLang(e.target.value),
127194
+ title: "Voice language",
127195
+ className: "h-7 rounded border border-border bg-background px-1 text-xs",
127196
+ children: [[...new Set([
127197
+ "en-US",
127198
+ ...speechLangs,
127199
+ ...voiceTtsLang ? [voiceTtsLang] : []
127200
+ ])].sort().map((l) => /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127201
+ value: l,
127202
+ children: l
127203
+ }, l)), /* @__PURE__ */ (0, import_jsx_runtime.jsx)("option", {
127204
+ value: "",
127205
+ children: "Browser default"
127206
+ })]
127207
+ })] }),
126999
127208
  canOpenTerminal && /* @__PURE__ */ (0, import_jsx_runtime.jsx)(Button, {
127000
127209
  variant: "ghost",
127001
127210
  size: "icon-sm",
package/src/routes.ts CHANGED
@@ -5679,7 +5679,7 @@ const putConnectorConfig: Handler = async (req, params) => {
5679
5679
  // Endpoints a connector daemon may expose for the dashboard to call through the
5680
5680
  // relay (single-origin — no CORS, no extra port exposure). Kept to a small,
5681
5681
  // non-mutating allowlist; the connector advertises its HTTP base via its status.
5682
- const PROXYABLE_CONNECTOR_CALLS = new Set(["transcribe", "utterance"]);
5682
+ const PROXYABLE_CONNECTOR_CALLS = new Set(["transcribe", "utterance", "speak"]);
5683
5683
 
5684
5684
  function connectorAdvertisedEndpoint(connector: NonNullable<ReturnType<typeof getConnector>>): string | null {
5685
5685
  const raw = connector.state?.raw;