@livekit/agents 1.0.37 → 1.0.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.cjs.map +1 -1
- package/dist/inference/api_protos.cjs +68 -0
- package/dist/inference/api_protos.cjs.map +1 -1
- package/dist/inference/api_protos.d.cts +345 -4
- package/dist/inference/api_protos.d.ts +345 -4
- package/dist/inference/api_protos.d.ts.map +1 -1
- package/dist/inference/api_protos.js +60 -0
- package/dist/inference/api_protos.js.map +1 -1
- package/dist/inference/llm.cjs +7 -3
- package/dist/inference/llm.cjs.map +1 -1
- package/dist/inference/llm.d.cts +5 -6
- package/dist/inference/llm.d.ts +5 -6
- package/dist/inference/llm.d.ts.map +1 -1
- package/dist/inference/llm.js +7 -3
- package/dist/inference/llm.js.map +1 -1
- package/dist/inference/stt.cjs +32 -21
- package/dist/inference/stt.cjs.map +1 -1
- package/dist/inference/stt.d.cts +5 -4
- package/dist/inference/stt.d.ts +5 -4
- package/dist/inference/stt.d.ts.map +1 -1
- package/dist/inference/stt.js +34 -21
- package/dist/inference/stt.js.map +1 -1
- package/dist/inference/tts.cjs.map +1 -1
- package/dist/inference/tts.d.cts +10 -7
- package/dist/inference/tts.d.ts +10 -7
- package/dist/inference/tts.d.ts.map +1 -1
- package/dist/inference/tts.js.map +1 -1
- package/dist/ipc/inference_proc_executor.cjs.map +1 -1
- package/dist/ipc/job_proc_executor.cjs.map +1 -1
- package/dist/stt/stream_adapter.cjs +9 -1
- package/dist/stt/stream_adapter.cjs.map +1 -1
- package/dist/stt/stream_adapter.d.ts.map +1 -1
- package/dist/stt/stream_adapter.js +9 -1
- package/dist/stt/stream_adapter.js.map +1 -1
- package/dist/stt/stt.cjs +10 -0
- package/dist/stt/stt.cjs.map +1 -1
- package/dist/stt/stt.d.cts +12 -0
- package/dist/stt/stt.d.ts +12 -0
- package/dist/stt/stt.d.ts.map +1 -1
- package/dist/stt/stt.js +10 -0
- package/dist/stt/stt.js.map +1 -1
- package/dist/telemetry/traces.cjs +4 -3
- package/dist/telemetry/traces.cjs.map +1 -1
- package/dist/telemetry/traces.d.cts +2 -0
- package/dist/telemetry/traces.d.ts +2 -0
- package/dist/telemetry/traces.d.ts.map +1 -1
- package/dist/telemetry/traces.js +4 -3
- package/dist/telemetry/traces.js.map +1 -1
- package/dist/utils.cjs +11 -0
- package/dist/utils.cjs.map +1 -1
- package/dist/utils.d.cts +10 -0
- package/dist/utils.d.ts +10 -0
- package/dist/utils.d.ts.map +1 -1
- package/dist/utils.js +10 -0
- package/dist/utils.js.map +1 -1
- package/dist/voice/agent.cjs +6 -2
- package/dist/voice/agent.cjs.map +1 -1
- package/dist/voice/agent.d.ts.map +1 -1
- package/dist/voice/agent.js +6 -2
- package/dist/voice/agent.js.map +1 -1
- package/dist/voice/agent_activity.cjs +72 -37
- package/dist/voice/agent_activity.cjs.map +1 -1
- package/dist/voice/agent_activity.d.cts +2 -1
- package/dist/voice/agent_activity.d.ts +2 -1
- package/dist/voice/agent_activity.d.ts.map +1 -1
- package/dist/voice/agent_activity.js +73 -38
- package/dist/voice/agent_activity.js.map +1 -1
- package/dist/voice/agent_session.cjs +7 -5
- package/dist/voice/agent_session.cjs.map +1 -1
- package/dist/voice/agent_session.d.cts +5 -2
- package/dist/voice/agent_session.d.ts +5 -2
- package/dist/voice/agent_session.d.ts.map +1 -1
- package/dist/voice/agent_session.js +7 -5
- package/dist/voice/agent_session.js.map +1 -1
- package/dist/voice/audio_recognition.cjs +3 -1
- package/dist/voice/audio_recognition.cjs.map +1 -1
- package/dist/voice/audio_recognition.d.ts.map +1 -1
- package/dist/voice/audio_recognition.js +3 -1
- package/dist/voice/audio_recognition.js.map +1 -1
- package/dist/voice/avatar/datastream_io.cjs +6 -0
- package/dist/voice/avatar/datastream_io.cjs.map +1 -1
- package/dist/voice/avatar/datastream_io.d.cts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts +1 -0
- package/dist/voice/avatar/datastream_io.d.ts.map +1 -1
- package/dist/voice/avatar/datastream_io.js +6 -0
- package/dist/voice/avatar/datastream_io.js.map +1 -1
- package/dist/voice/background_audio.cjs.map +1 -1
- package/dist/voice/generation.cjs +14 -5
- package/dist/voice/generation.cjs.map +1 -1
- package/dist/voice/generation.d.cts +3 -2
- package/dist/voice/generation.d.ts +3 -2
- package/dist/voice/generation.d.ts.map +1 -1
- package/dist/voice/generation.js +14 -5
- package/dist/voice/generation.js.map +1 -1
- package/dist/voice/io.cjs +12 -0
- package/dist/voice/io.cjs.map +1 -1
- package/dist/voice/io.d.cts +19 -1
- package/dist/voice/io.d.ts +19 -1
- package/dist/voice/io.d.ts.map +1 -1
- package/dist/voice/io.js +12 -0
- package/dist/voice/io.js.map +1 -1
- package/dist/voice/recorder_io/recorder_io.cjs +91 -28
- package/dist/voice/recorder_io/recorder_io.cjs.map +1 -1
- package/dist/voice/recorder_io/recorder_io.d.cts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts +7 -1
- package/dist/voice/recorder_io/recorder_io.d.ts.map +1 -1
- package/dist/voice/recorder_io/recorder_io.js +91 -28
- package/dist/voice/recorder_io/recorder_io.js.map +1 -1
- package/dist/voice/room_io/_input.cjs +40 -11
- package/dist/voice/room_io/_input.cjs.map +1 -1
- package/dist/voice/room_io/_input.d.cts +4 -1
- package/dist/voice/room_io/_input.d.ts +4 -1
- package/dist/voice/room_io/_input.d.ts.map +1 -1
- package/dist/voice/room_io/_input.js +31 -2
- package/dist/voice/room_io/_input.js.map +1 -1
- package/dist/voice/room_io/_output.cjs +6 -0
- package/dist/voice/room_io/_output.cjs.map +1 -1
- package/dist/voice/room_io/_output.d.cts +1 -0
- package/dist/voice/room_io/_output.d.ts +1 -0
- package/dist/voice/room_io/_output.d.ts.map +1 -1
- package/dist/voice/room_io/_output.js +6 -0
- package/dist/voice/room_io/_output.js.map +1 -1
- package/dist/voice/room_io/room_io.cjs.map +1 -1
- package/dist/voice/room_io/room_io.d.cts +2 -2
- package/dist/voice/room_io/room_io.d.ts +2 -2
- package/dist/voice/room_io/room_io.d.ts.map +1 -1
- package/dist/voice/room_io/room_io.js.map +1 -1
- package/dist/voice/speech_handle.cjs +2 -0
- package/dist/voice/speech_handle.cjs.map +1 -1
- package/dist/voice/speech_handle.d.cts +3 -0
- package/dist/voice/speech_handle.d.ts +3 -0
- package/dist/voice/speech_handle.d.ts.map +1 -1
- package/dist/voice/speech_handle.js +2 -0
- package/dist/voice/speech_handle.js.map +1 -1
- package/package.json +2 -2
- package/src/inference/api_protos.ts +83 -0
- package/src/inference/llm.ts +20 -15
- package/src/inference/stt.ts +48 -29
- package/src/inference/tts.ts +36 -16
- package/src/stt/stream_adapter.ts +12 -1
- package/src/stt/stt.ts +21 -0
- package/src/telemetry/traces.ts +6 -2
- package/src/utils.ts +21 -0
- package/src/voice/agent.ts +11 -2
- package/src/voice/agent_activity.ts +108 -41
- package/src/voice/agent_session.ts +6 -5
- package/src/voice/audio_recognition.ts +2 -0
- package/src/voice/avatar/datastream_io.ts +8 -0
- package/src/voice/generation.ts +24 -12
- package/src/voice/io.ts +27 -5
- package/src/voice/recorder_io/recorder_io.ts +123 -31
- package/src/voice/room_io/_input.ts +32 -4
- package/src/voice/room_io/_output.ts +8 -0
- package/src/voice/room_io/room_io.ts +3 -1
- package/src/voice/speech_handle.ts +4 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"agent_activity.d.ts","sourceRoot":"","sources":["../../src/voice/agent_activity.ts"],"names":[],"mappings":";AAIA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAKpD,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAIL,KAAK,sBAAsB,EAC3B,KAAK,uBAAuB,EAC5B,KAAK,uBAAuB,EAC5B,KAAK,2BAA2B,EAChC,GAAG,EACH,aAAa,EAEb,KAAK,eAAe,EACpB,KAAK,UAAU,EACf,KAAK,WAAW,EACjB,MAAM,iBAAiB,CAAC;AAazB,OAAO,EAAE,GAAG,EAAiB,KAAK,WAAW,EAAE,MAAM,eAAe,CAAC;AAGrE,OAAO,EAAE,GAAG,EAAiB,MAAM,eAAe,CAAC;AACnD,OAAO,EAAE,MAAM,EAAE,IAAI,EAA0B,MAAM,aAAa,CAAC;AACnE,OAAO,EAAE,GAAG,EAAE,KAAK,QAAQ,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,KAAK,EAAE,KAAK,EAAiB,MAAM,YAAY,CAAC;AAEvD,OAAO,EAAE,KAAK,YAAY,EAAE,KAAK,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAC/E,OAAO,EAEL,KAAK,aAAa,EAClB,KAAK,wBAAwB,EAC7B,KAAK,gBAAgB,EAEtB,MAAM,wBAAwB,CAAC;AAqBhC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAclD,qBAAa,aAAc,YAAW,gBAAgB;IACpD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,yBAAyB,CAAQ;IACzD,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,gBAAgB,CAAC,CAAmB;IAC5C,OAAO,CAAC,eAAe,CAAC,CAAkB;IAC1C,OAAO,CAAC,aAAa,CAAC,CAAoB;IAC1C,OAAO,CAAC,iBAAiB,CAAC,CAA4C;IACtE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,cAAc,CAAC,CAAe;IACtC,OAAO,CAAC,WAAW,CAAuC;IAC1D,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,WAAW,CAA8B;IACjD,OAAO,CAAC,IAAI,CAAe;IAC3B,OAAO,CAAC,WAAW,CAA4C;IAE/D,OAAO,CAAC,UAAU,CAA2B;IAC7C,OAAO,CAAC,qBAAqB,CAAC,CAAuB;IAErD,KAAK,EAAE,KAAK,CAAC;IACb,YAAY,EAAE,YAAY,CAAC;IAE3B,gBAAgB;IAChB,SAAS,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;IACvB,sBAAsB,CAAC,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC;gBAE3B,KAAK,EAAE,KAAK,EAAE,YAAY,EAAE,YAAY;
|
|
1
|
+
{"version":3,"file":"agent_activity.d.ts","sourceRoot":"","sources":["../../src/voice/agent_activity.ts"],"names":[],"mappings":";AAIA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAKpD,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AACvE,OAAO,EAIL,KAAK,sBAAsB,EAC3B,KAAK,uBAAuB,EAC5B,KAAK,uBAAuB,EAC5B,KAAK,2BAA2B,EAChC,GAAG,EACH,aAAa,EAEb,KAAK,eAAe,EACpB,KAAK,UAAU,EACf,KAAK,WAAW,EACjB,MAAM,iBAAiB,CAAC;AAazB,OAAO,EAAE,GAAG,EAAiB,KAAK,WAAW,EAAE,MAAM,eAAe,CAAC;AAGrE,OAAO,EAAE,GAAG,EAAiB,MAAM,eAAe,CAAC;AACnD,OAAO,EAAE,MAAM,EAAE,IAAI,EAA0B,MAAM,aAAa,CAAC;AACnE,OAAO,EAAE,GAAG,EAAE,KAAK,QAAQ,EAAE,MAAM,WAAW,CAAC;AAC/C,OAAO,KAAK,EAAE,KAAK,EAAiB,MAAM,YAAY,CAAC;AAEvD,OAAO,EAAE,KAAK,YAAY,EAAE,KAAK,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAC/E,OAAO,EAEL,KAAK,aAAa,EAClB,KAAK,wBAAwB,EAC7B,KAAK,gBAAgB,EAEtB,MAAM,wBAAwB,CAAC;AAqBhC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAclD,qBAAa,aAAc,YAAW,gBAAgB;IACpD,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,yBAAyB,CAAQ;IACzD,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,gBAAgB,CAAC,CAAmB;IAC5C,OAAO,CAAC,eAAe,CAAC,CAAkB;IAC1C,OAAO,CAAC,aAAa,CAAC,CAAoB;IAC1C,OAAO,CAAC,iBAAiB,CAAC,CAA4C;IACtE,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,cAAc,CAAC,CAAe;IACtC,OAAO,CAAC,WAAW,CAAuC;IAC1D,OAAO,CAAC,SAAS,CAAS;IAC1B,OAAO,CAAC,WAAW,CAA8B;IACjD,OAAO,CAAC,IAAI,CAAe;IAC3B,OAAO,CAAC,WAAW,CAA4C;IAE/D,OAAO,CAAC,UAAU,CAA2B;IAC7C,OAAO,CAAC,qBAAqB,CAAC,CAAuB;IAErD,KAAK,EAAE,KAAK,CAAC;IACb,YAAY,EAAE,YAAY,CAAC;IAE3B,gBAAgB;IAChB,SAAS,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;IACvB,sBAAsB,CAAC,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC;gBAE3B,KAAK,EAAE,KAAK,EAAE,YAAY,EAAE,YAAY;IA8F9C,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAoH5B,IAAI,aAAa,IAAI,YAAY,GAAG,SAAS,CAE5C;IAED,IAAI,GAAG,IAAI,GAAG,GAAG,SAAS,CAEzB;IAED,IAAI,GAAG,IAAI,GAAG,GAAG,SAAS,CAEzB;IAED,IAAI,GAAG,IAAI,GAAG,GAAG,aAAa,GAAG,SAAS,CAEzC;IAED,IAAI,GAAG,IAAI,GAAG,GAAG,SAAS,CAEzB;IAED,IAAI,KAAK,IAAI,WAAW,CAEvB;IAED,IAAI,QAAQ,IAAI,OAAO,CAEtB;IAED,IAAI,kBAAkB,IAAI,eAAe,GAAG,SAAS,CAEpD;IAED,IAAI,kBAAkB,IAAI,OAAO,CAGhC;IAED,IAAI,aAAa,IAAI,iBAAiB,GAAG,SAAS,CAGjD;IAED,IAAI,OAAO,IAAI,WAAW,CAEzB;IAEK,aAAa,CAAC,OAAO,EAAE,WAAW,GAAG,OAAO,CAAC,IAAI,CAAC;IAiBxD,aAAa,CAAC,EAAE,UAAU,EAAE,EAAE;QAAE,UAAU,CAAC,EAAE,UAAU,GAAG,IAAI,CAAA;KAAE,GAAG,IAAI;IAUvE,gBAAgB,CAAC,WAAW,EAAE,cAAc,CAAC,UAAU,CAAC,GAAG,IAAI;IAwB/D,gBAAgB,IAAI,IAAI;IAIxB,cAAc;IAUd,aAAa;IAKb,GAAG,CACD,IAAI,EAAE,MAAM,GAAG,cAAc,CAAC,MAAM,CAAC,EACrC,OAAO,CAAC,EAAE;QACR,KAAK,CAAC,EAAE,cAAc,CAAC,UAAU,CAAC,CAAC;QACnC,kBAAkB,CAAC,EAAE,OAAO,CAAC;QAC7B,YAAY,CAAC,EAAE,OAAO,CAAC;KACxB,GACA,YAAY;IAwDf,OAAO,CAAC,kBAAkB,CAqBxB;IAEF,OAAO,CAAC,OAAO;IAoBf,oBAAoB,CAAC,GAAG,EAAE,uBAAuB,GAAG,IAAI;IAmBxD,oBAAoB,CAAC,EAAE,EAAE,uBAAuB,GAAG,IAAI;IAkBvD,kCAAkC,CAAC,EAAE,EAAE,2BAA2B,GAAG,IAAI;IAoBzE,mBAAmB,CAAC,EAAE,EAAE,sBAAsB,GAAG,IAAI;IAqCrD,eAAe,CAAC,EAAE,EAAE,QAAQ,GAAG,IAAI;IAQnC,aAAa,CAAC,EAAE,EAAE,QAAQ,GAAG,IAAI;IAQjC,kBAAkB,CAAC,EAAE,EAAE,QAAQ,GAAG,IAAI;IAWtC,OAAO,CAAC,wBAAwB;IAyChC,mBAAmB,CAAC,EAAE,EAAE,WAAW,GAAG,IAAI;IAqB1C,iBAAiB,CAAC,EAAE,EAAE,WAAW,GAAG,IAAI;IA+BxC,sBAAsB,CAAC,IAAI,EAAE,wBAAwB,GAAG,IAAI;IA0C5D,OAAO,CAAC,0BAA0B;IAOlC,OAAO,CAAC,gBAAgB;IA4BlB,WAAW,CAAC,IAAI,EAAE,aAAa,GAAG,OAAO,CAAC,OAAO,CAAC;IA0CxD,eAAe,IAAI,WAAW;YAIhB,QAAQ;IAuCtB,OAAO,CAAC,cAAc;IAItB,aAAa,CAAC,OAAO,EAAE;QACrB,WAAW,CAAC,EAAE,WAAW,CAAC;QAC1B,OAAO,CAAC,EAAE,WAAW,CAAC;QACtB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,UAAU,CAAC,EAAE,UAAU,GAAG,IAAI,CAAC;QAC/B,kBAAkB,CAAC,EAAE,OAAO,CAAC;QAC7B,cAAc,CAAC,EAAE,OAAO,CAAC;KAC1B,GAAG,YAAY;IAuGhB,SAAS,IAAI,MAAM,CAAC,IAAI,CAAC;IA0BzB,OAAO,CAAC,mBAAmB;YAMb,iBAAiB;YAyHjB,OAAO;IAmIrB,OAAO,CAAC,sBAAsB,CAkZ5B;IAEF,OAAO,CAAC,iBAAiB,CA2BrB;YAEU,sBAAsB;YAsBtB,2BAA2B;YAwc3B,iBAAiB;IAqD/B,OAAO,CAAC,cAAc;IAiBhB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;YAQd,UAAU;IA2BlB,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;CAyC7B"}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Mutex } from "@livekit/mutex";
|
|
2
|
-
import { ROOT_CONTEXT, trace } from "@opentelemetry/api";
|
|
2
|
+
import { ROOT_CONTEXT, context as otelContext, trace } from "@opentelemetry/api";
|
|
3
3
|
import { Heap } from "heap-js";
|
|
4
4
|
import { AsyncLocalStorage } from "node:async_hooks";
|
|
5
5
|
import { ReadableStream } from "node:stream/web";
|
|
@@ -119,9 +119,9 @@ class AgentActivity {
|
|
|
119
119
|
);
|
|
120
120
|
this.turnDetectionMode = void 0;
|
|
121
121
|
}
|
|
122
|
-
if (!this.vad && this.stt && this.llm instanceof LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
122
|
+
if (!this.vad && this.stt && !this.stt.capabilities.streaming && this.llm instanceof LLM && this.allowInterruptions && this.turnDetectionMode === void 0) {
|
|
123
123
|
this.logger.warn(
|
|
124
|
-
"VAD is not set. Enabling VAD is recommended when using LLM and STT for more responsive interruption handling."
|
|
124
|
+
"VAD is not set. Enabling VAD is recommended when using LLM and non-streaming STT for more responsive interruption handling."
|
|
125
125
|
);
|
|
126
126
|
}
|
|
127
127
|
}
|
|
@@ -455,8 +455,12 @@ class AgentActivity {
|
|
|
455
455
|
this.scheduleSpeech(handle, SpeechHandle.SPEECH_PRIORITY_NORMAL);
|
|
456
456
|
}
|
|
457
457
|
// recognition hooks
|
|
458
|
-
onStartOfSpeech(
|
|
459
|
-
|
|
458
|
+
onStartOfSpeech(ev) {
|
|
459
|
+
let speechStartTime = Date.now();
|
|
460
|
+
if (ev) {
|
|
461
|
+
speechStartTime = speechStartTime - ev.speechDuration;
|
|
462
|
+
}
|
|
463
|
+
this.agentSession._updateUserState("speaking", speechStartTime);
|
|
460
464
|
}
|
|
461
465
|
onEndOfSpeech(ev) {
|
|
462
466
|
let speechEndTime = Date.now();
|
|
@@ -466,14 +470,16 @@ class AgentActivity {
|
|
|
466
470
|
this.agentSession._updateUserState("listening", speechEndTime);
|
|
467
471
|
}
|
|
468
472
|
onVADInferenceDone(ev) {
|
|
469
|
-
var _a, _b;
|
|
470
473
|
if (this.turnDetection === "manual" || this.turnDetection === "realtime_llm") {
|
|
471
474
|
return;
|
|
472
475
|
}
|
|
473
|
-
if (
|
|
474
|
-
|
|
476
|
+
if (ev.speechDuration >= this.agentSession.options.minInterruptionDuration) {
|
|
477
|
+
this.interruptByAudioActivity();
|
|
475
478
|
}
|
|
476
|
-
|
|
479
|
+
}
|
|
480
|
+
interruptByAudioActivity() {
|
|
481
|
+
var _a, _b;
|
|
482
|
+
if (this.llm instanceof RealtimeModel && this.llm.capabilities.turnDetection) {
|
|
477
483
|
return;
|
|
478
484
|
}
|
|
479
485
|
if (this.stt && this.agentSession.options.minInterruptionWords > 0 && this.audioRecognition) {
|
|
@@ -486,7 +492,10 @@ class AgentActivity {
|
|
|
486
492
|
}
|
|
487
493
|
(_a = this.realtimeSession) == null ? void 0 : _a.startUserActivity();
|
|
488
494
|
if (this._currentSpeech && !this._currentSpeech.interrupted && this._currentSpeech.allowInterruptions) {
|
|
489
|
-
this.logger.info(
|
|
495
|
+
this.logger.info(
|
|
496
|
+
{ "speech id": this._currentSpeech.id },
|
|
497
|
+
"speech interrupted by audio activity"
|
|
498
|
+
);
|
|
490
499
|
(_b = this.realtimeSession) == null ? void 0 : _b.interrupt();
|
|
491
500
|
this._currentSpeech.interrupt();
|
|
492
501
|
}
|
|
@@ -504,6 +513,9 @@ class AgentActivity {
|
|
|
504
513
|
// TODO(AJS-106): add multi participant support
|
|
505
514
|
})
|
|
506
515
|
);
|
|
516
|
+
if (ev.alternatives[0].text) {
|
|
517
|
+
this.interruptByAudioActivity();
|
|
518
|
+
}
|
|
507
519
|
}
|
|
508
520
|
onFinalTranscript(ev) {
|
|
509
521
|
if (this.llm instanceof RealtimeModel && this.llm.capabilities.userTranscription) {
|
|
@@ -518,6 +530,9 @@ class AgentActivity {
|
|
|
518
530
|
// TODO(AJS-106): add multi participant support
|
|
519
531
|
})
|
|
520
532
|
);
|
|
533
|
+
if (this.audioRecognition && this.turnDetection !== "manual" && this.turnDetection !== "realtime_llm") {
|
|
534
|
+
this.interruptByAudioActivity();
|
|
535
|
+
}
|
|
521
536
|
}
|
|
522
537
|
onPreemptiveGeneration(info) {
|
|
523
538
|
if (!this.agentSession.options.preemptiveGeneration || this.draining || this._currentSpeech !== void 0 && !this._currentSpeech.interrupted || !(this.llm instanceof LLM)) {
|
|
@@ -833,6 +848,7 @@ ${instructions}` : instructions,
|
|
|
833
848
|
);
|
|
834
849
|
}
|
|
835
850
|
async ttsTask(speechHandle, text, addToChatCtx, modelSettings, replyAbortController, audio) {
|
|
851
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
836
852
|
speechHandleStorage.enterWith(speechHandle);
|
|
837
853
|
const transcriptionOutput = this.agentSession.output.transcriptionEnabled ? this.agentSession.output.transcription : null;
|
|
838
854
|
const audioOutput = this.agentSession.output.audioEnabled ? this.agentSession.output.audio : null;
|
|
@@ -864,12 +880,15 @@ ${instructions}` : instructions,
|
|
|
864
880
|
textOut = _textOut;
|
|
865
881
|
tasks.push(textForwardTask);
|
|
866
882
|
}
|
|
867
|
-
const onFirstFrame = () => {
|
|
868
|
-
this.agentSession._updateAgentState("speaking"
|
|
883
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
884
|
+
this.agentSession._updateAgentState("speaking", {
|
|
885
|
+
startTime: startedSpeakingAt,
|
|
886
|
+
otelContext: speechHandle._agentTurnContext
|
|
887
|
+
});
|
|
869
888
|
};
|
|
870
889
|
if (!audioOutput) {
|
|
871
890
|
if (textOut) {
|
|
872
|
-
textOut.firstTextFut.await.
|
|
891
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
873
892
|
}
|
|
874
893
|
} else {
|
|
875
894
|
let audioOut = null;
|
|
@@ -897,7 +916,7 @@ ${instructions}` : instructions,
|
|
|
897
916
|
tasks.push(forwardTask);
|
|
898
917
|
audioOut = _audioOut;
|
|
899
918
|
}
|
|
900
|
-
audioOut.firstFrameFut.await.
|
|
919
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
901
920
|
}
|
|
902
921
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
903
922
|
if (audioOutput) {
|
|
@@ -936,6 +955,7 @@ ${instructions}` : instructions,
|
|
|
936
955
|
span
|
|
937
956
|
}) => {
|
|
938
957
|
var _a, _b, _c;
|
|
958
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
939
959
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
940
960
|
if (instructions) {
|
|
941
961
|
span.setAttribute(traceTypes.ATTR_INSTRUCTIONS, instructions);
|
|
@@ -1012,8 +1032,11 @@ ${instructions}` : instructions,
|
|
|
1012
1032
|
tasks.push(textForwardTask);
|
|
1013
1033
|
textOut = _textOut;
|
|
1014
1034
|
}
|
|
1015
|
-
const onFirstFrame = () => {
|
|
1016
|
-
this.agentSession._updateAgentState("speaking"
|
|
1035
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1036
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1037
|
+
startTime: startedSpeakingAt,
|
|
1038
|
+
otelContext: speechHandle._agentTurnContext
|
|
1039
|
+
});
|
|
1017
1040
|
};
|
|
1018
1041
|
let audioOut = null;
|
|
1019
1042
|
if (audioOutput) {
|
|
@@ -1025,12 +1048,12 @@ ${instructions}` : instructions,
|
|
|
1025
1048
|
);
|
|
1026
1049
|
audioOut = _audioOut;
|
|
1027
1050
|
tasks.push(forwardTask);
|
|
1028
|
-
audioOut.firstFrameFut.await.
|
|
1051
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1029
1052
|
} else {
|
|
1030
1053
|
throw Error("ttsStream is null when audioOutput is enabled");
|
|
1031
1054
|
}
|
|
1032
1055
|
} else {
|
|
1033
|
-
textOut == null ? void 0 : textOut.firstTextFut.await.
|
|
1056
|
+
textOut == null ? void 0 : textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1034
1057
|
}
|
|
1035
1058
|
const onToolExecutionStarted = (f) => {
|
|
1036
1059
|
speechHandle._itemAdded([f]);
|
|
@@ -1061,7 +1084,12 @@ ${instructions}` : instructions,
|
|
|
1061
1084
|
msg.createdAt = replyStartedAt;
|
|
1062
1085
|
}
|
|
1063
1086
|
this.agent._chatCtx.insert(toolsMessages);
|
|
1064
|
-
|
|
1087
|
+
const toolCallOutputs = toolsMessages.filter(
|
|
1088
|
+
(m) => m.type === "function_call_output"
|
|
1089
|
+
);
|
|
1090
|
+
if (toolCallOutputs.length > 0) {
|
|
1091
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1092
|
+
}
|
|
1065
1093
|
}
|
|
1066
1094
|
if (speechHandle.interrupted) {
|
|
1067
1095
|
this.logger.debug(
|
|
@@ -1078,9 +1106,9 @@ ${instructions}` : instructions,
|
|
|
1078
1106
|
let forwardedText = (textOut == null ? void 0 : textOut.text) || "";
|
|
1079
1107
|
if (audioOutput) {
|
|
1080
1108
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1081
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1109
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1082
1110
|
this.logger.info(
|
|
1083
|
-
{ speech_id: speechHandle.id,
|
|
1111
|
+
{ speech_id: speechHandle.id, playbackPositionInS: playbackEv.playbackPosition },
|
|
1084
1112
|
"playout interrupted"
|
|
1085
1113
|
);
|
|
1086
1114
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1218,7 +1246,12 @@ ${instructions}` : instructions,
|
|
|
1218
1246
|
msg.createdAt = replyStartedAt;
|
|
1219
1247
|
}
|
|
1220
1248
|
this.agent._chatCtx.insert(toolMessages);
|
|
1221
|
-
|
|
1249
|
+
const toolCallOutputs = toolMessages.filter(
|
|
1250
|
+
(m) => m.type === "function_call_output"
|
|
1251
|
+
);
|
|
1252
|
+
if (toolCallOutputs.length > 0) {
|
|
1253
|
+
this.agentSession._toolItemsAdded(toolCallOutputs);
|
|
1254
|
+
}
|
|
1222
1255
|
}
|
|
1223
1256
|
};
|
|
1224
1257
|
pipelineReplyTask = async (speechHandle, chatCtx, toolCtx, modelSettings, replyAbortController, instructions, newMessage, toolsMessages) => tracer.startActiveSpan(
|
|
@@ -1261,6 +1294,7 @@ ${instructions}` : instructions,
|
|
|
1261
1294
|
span
|
|
1262
1295
|
}) {
|
|
1263
1296
|
var _a, _b, _c;
|
|
1297
|
+
speechHandle._agentTurnContext = otelContext.active();
|
|
1264
1298
|
span.setAttribute(traceTypes.ATTR_SPEECH_ID, speechHandle.id);
|
|
1265
1299
|
speechHandleStorage.enterWith(speechHandle);
|
|
1266
1300
|
if (!this.realtimeSession) {
|
|
@@ -1285,8 +1319,11 @@ ${instructions}` : instructions,
|
|
|
1285
1319
|
if (speechHandle.interrupted) {
|
|
1286
1320
|
return;
|
|
1287
1321
|
}
|
|
1288
|
-
const onFirstFrame = () => {
|
|
1289
|
-
this.agentSession._updateAgentState("speaking"
|
|
1322
|
+
const onFirstFrame = (startedSpeakingAt) => {
|
|
1323
|
+
this.agentSession._updateAgentState("speaking", {
|
|
1324
|
+
startTime: startedSpeakingAt,
|
|
1325
|
+
otelContext: speechHandle._agentTurnContext
|
|
1326
|
+
});
|
|
1290
1327
|
};
|
|
1291
1328
|
const readMessages = async (abortController, outputs) => {
|
|
1292
1329
|
replyAbortController.signal.addEventListener("abort", () => abortController.abort(), {
|
|
@@ -1361,10 +1398,10 @@ ${instructions}` : instructions,
|
|
|
1361
1398
|
);
|
|
1362
1399
|
forwardTasks.push(forwardTask);
|
|
1363
1400
|
audioOut = _audioOut;
|
|
1364
|
-
audioOut.firstFrameFut.await.
|
|
1401
|
+
audioOut.firstFrameFut.await.then((ts) => onFirstFrame(ts)).catch(() => this.logger.debug("firstFrameFut cancelled before first frame"));
|
|
1365
1402
|
}
|
|
1366
1403
|
} else if (textOut) {
|
|
1367
|
-
textOut.firstTextFut.await.
|
|
1404
|
+
textOut.firstTextFut.await.then(() => onFirstFrame()).catch(() => this.logger.debug("firstTextFut cancelled before first frame"));
|
|
1368
1405
|
}
|
|
1369
1406
|
outputs.push([msg.messageId, textOut, audioOut, msgModalities]);
|
|
1370
1407
|
}
|
|
@@ -1428,7 +1465,6 @@ ${instructions}` : instructions,
|
|
|
1428
1465
|
await speechHandle.waitIfNotInterrupted(tasks.map((task) => task.result));
|
|
1429
1466
|
if (audioOutput) {
|
|
1430
1467
|
await speechHandle.waitIfNotInterrupted([audioOutput.waitForPlayout()]);
|
|
1431
|
-
this.agentSession._updateAgentState("listening");
|
|
1432
1468
|
}
|
|
1433
1469
|
if (speechHandle.interrupted) {
|
|
1434
1470
|
this.logger.debug(
|
|
@@ -1443,10 +1479,10 @@ ${instructions}` : instructions,
|
|
|
1443
1479
|
if (audioOutput) {
|
|
1444
1480
|
audioOutput.clearBuffer();
|
|
1445
1481
|
const playbackEv = await audioOutput.waitForPlayout();
|
|
1446
|
-
let
|
|
1447
|
-
if (audioOut == null ? void 0 : audioOut.firstFrameFut.done) {
|
|
1482
|
+
let playbackPositionInS = playbackEv.playbackPosition;
|
|
1483
|
+
if ((audioOut == null ? void 0 : audioOut.firstFrameFut.done) && !audioOut.firstFrameFut.rejected) {
|
|
1448
1484
|
this.logger.info(
|
|
1449
|
-
{ speech_id: speechHandle.id,
|
|
1485
|
+
{ speech_id: speechHandle.id, playbackPositionInS },
|
|
1450
1486
|
"playout interrupted"
|
|
1451
1487
|
);
|
|
1452
1488
|
if (playbackEv.synchronizedTranscript) {
|
|
@@ -1454,11 +1490,11 @@ ${instructions}` : instructions,
|
|
|
1454
1490
|
}
|
|
1455
1491
|
} else {
|
|
1456
1492
|
forwardedText = "";
|
|
1457
|
-
|
|
1493
|
+
playbackPositionInS = 0;
|
|
1458
1494
|
}
|
|
1459
1495
|
this.realtimeSession.truncate({
|
|
1460
1496
|
messageId: msgId,
|
|
1461
|
-
audioEndMs: Math.floor(
|
|
1497
|
+
audioEndMs: Math.floor(playbackPositionInS * 1e3),
|
|
1462
1498
|
modalities: msgModalities,
|
|
1463
1499
|
audioTranscript: forwardedText
|
|
1464
1500
|
});
|
|
@@ -1496,14 +1532,13 @@ ${instructions}` : instructions,
|
|
|
1496
1532
|
this.agentSession._conversationItemAdded(message);
|
|
1497
1533
|
}
|
|
1498
1534
|
speechHandle._markGenerationDone();
|
|
1499
|
-
toolOutput.firstToolStartedFuture.await.finally(() => {
|
|
1500
|
-
this.agentSession._updateAgentState("thinking");
|
|
1501
|
-
});
|
|
1502
1535
|
await executeToolsTask.result;
|
|
1536
|
+
if (toolOutput.output.length > 0) {
|
|
1537
|
+
this.agentSession._updateAgentState("thinking");
|
|
1538
|
+
} else if (this.agentSession.agentState === "speaking") {
|
|
1539
|
+
this.agentSession._updateAgentState("listening");
|
|
1540
|
+
}
|
|
1503
1541
|
if (toolOutput.output.length === 0) {
|
|
1504
|
-
if (!speechHandle.interrupted) {
|
|
1505
|
-
this.agentSession._updateAgentState("listening");
|
|
1506
|
-
}
|
|
1507
1542
|
return;
|
|
1508
1543
|
}
|
|
1509
1544
|
const { maxToolSteps } = this.agentSession.options;
|