@iinm/plain-agent 1.7.18 → 1.7.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ import {
2
+ isObjectLike,
3
+ startWebSocketVoiceSession,
4
+ } from "./voiceInputSession.mjs";
5
+
6
+ /**
7
+ * @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
8
+ */
9
+
10
+ /**
11
+ * @typedef {Object} VoiceInputGeminiConfig
12
+ * @property {"gemini"} provider
13
+ * @property {string} apiKey
14
+ * @property {string} [model] - Defaults to "gemini-3.1-flash-live-preview".
15
+ * @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Passed to the model as a system instruction since Gemini Live has no native language hint for input transcription.
16
+ * @property {string} [baseURL]
17
+ * @property {VoiceRecorderConfig} [recorder]
18
+ * @property {string} [toggleKey]
19
+ */
20
+
/** Model used when `config.model` is not set. */
const GEMINI_DEFAULT_MODEL = "gemini-3.1-flash-live-preview";
/** WebSocket endpoint used when `config.baseURL` is not set. */
const GEMINI_DEFAULT_WS =
  "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";
/** Sample rate handed to the session as `hooks.sampleRate`. */
const GEMINI_SAMPLE_RATE = 16000;
/** Label handed to the session as `hooks.label`. */
const GEMINI_LABEL = "Gemini Live";

/**
 * Start a voice input session backed by the Gemini Live BidiGenerateContent
 * WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
 * forwards transcript deltas via `onTranscript`.
 *
 * Gemini Live was designed for voice agents, not pure STT, so the setup
 * message forces `maxOutputTokens: 1` and disables thinking on 2.5 models
 * to minimise wasted audio output.
 *
 * This function only assembles the Gemini-specific hooks; the session itself
 * is created by `startWebSocketVoiceSession`.
 *
 * @param {object} options
 * @param {VoiceInputGeminiConfig} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startGeminiVoiceSession({ config, callbacks }) {
  /** @type {VoiceProviderHooks<VoiceInputGeminiConfig>} */
  const hooks = {
    label: GEMINI_LABEL,
    sampleRate: GEMINI_SAMPLE_RATE,

    /**
     * Build the WebSocket URL, carrying the API key in the `key` query
     * parameter.
     */
    buildWsUrl(config) {
      const base = config.baseURL ?? GEMINI_DEFAULT_WS;
      // A custom baseURL (e.g. a proxy) may already carry a query string;
      // appending a second "?" would produce a malformed URL.
      const separator = base.includes("?") ? "&" : "?";
      return `${base}${separator}key=${encodeURIComponent(config.apiKey)}`;
    },

    /**
     * Build the `{ setup }` message: model, generation config, input
     * transcription, and an optional language hint.
     */
    buildSetupMessage(config) {
      const model = config.model ?? GEMINI_DEFAULT_MODEL;
      /** @type {Record<string, unknown>} */
      const generationConfig = {
        // https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
        // > The native audio models only support `AUDIO` response modality.
        responseModalities: ["AUDIO"],
        // Only the input transcription is wanted, so keep model output minimal.
        maxOutputTokens: 1,
      };
      if (model.includes("2.5")) {
        // Thinking is switched off only for models whose name contains "2.5".
        generationConfig.thinkingConfig = { thinkingBudget: 0 };
      }
      /** @type {Record<string, unknown>} */
      const setup = {
        model: `models/${model}`,
        generationConfig,
        // An empty object requests input transcription.
        inputAudioTranscription: {},
      };
      if (config.language) {
        // The language is passed as a system instruction (see the
        // `language` property of VoiceInputGeminiConfig).
        setup.systemInstruction = {
          parts: [{ text: `The user is speaking in ${config.language}.` }],
        };
      }
      return { setup };
    },

    /** True for an object-like message that has a `setupComplete` key. */
    isReadyMessage(message) {
      return isObjectLike(message) && "setupComplete" in message;
    },

    /**
     * Return the non-empty `serverContent.inputTranscription.text` string of
     * a message, or `undefined` when the message carries none.
     */
    extractTranscript(message) {
      if (!isObjectLike(message)) return undefined;
      const serverContent = message.serverContent;
      if (!isObjectLike(serverContent)) return undefined;
      const transcription = serverContent.inputTranscription;
      if (
        isObjectLike(transcription) &&
        typeof transcription.text === "string" &&
        transcription.text.length > 0
      ) {
        return transcription.text;
      }
      return undefined;
    },

    /**
     * Wrap one audio chunk as a `realtimeInput` message: base64 data plus a
     * PCM mime type that states the sample rate.
     */
    buildAudioPayload(chunk, sampleRate) {
      return {
        realtimeInput: {
          audio: {
            data: chunk.toString("base64"),
            mimeType: `audio/pcm;rate=${sampleRate}`,
          },
        },
      };
    },
  };

  return startWebSocketVoiceSession({ hooks, config, callbacks });
}
@@ -0,0 +1,104 @@
1
+ import {
2
+ isObjectLike,
3
+ startWebSocketVoiceSession,
4
+ } from "./voiceInputSession.mjs";
5
+
6
+ /**
7
+ * @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
8
+ */
9
+
10
+ /**
11
+ * @typedef {Object} VoiceInputOpenAIConfig
12
+ * @property {"openai"} provider
13
+ * @property {string} apiKey
14
+ * @property {string} [model] - Defaults to "gpt-4o-transcribe".
15
+ * @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Improves accuracy and latency when set.
16
+ * @property {string} [baseURL]
17
+ * @property {VoiceRecorderConfig} [recorder]
18
+ * @property {string} [toggleKey] - "ctrl-<char>". Defaults to "ctrl-o".
19
+ */
20
+
/** Transcription model used when `config.model` is not set. */
const OPENAI_DEFAULT_MODEL = "gpt-4o-transcribe";
/** WebSocket endpoint used when `config.baseURL` is not set. */
const OPENAI_DEFAULT_WS = "wss://api.openai.com/v1/realtime";
/** Sample rate handed to the session as `hooks.sampleRate`. */
const OPENAI_SAMPLE_RATE = 24000;
/** Label handed to the session as `hooks.label`. */
const OPENAI_LABEL = "OpenAI Realtime";

/**
 * Start a voice input session backed by the OpenAI Realtime transcription
 * WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
 * forwards transcript deltas via `onTranscript`.
 *
 * This function only assembles the OpenAI-specific hooks; the session itself
 * is created by `startWebSocketVoiceSession`.
 *
 * @param {object} options
 * @param {VoiceInputOpenAIConfig} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startOpenAIVoiceSession({ config, callbacks }) {
  /** @type {VoiceProviderHooks<VoiceInputOpenAIConfig>} */
  const hooks = {
    label: OPENAI_LABEL,
    sampleRate: OPENAI_SAMPLE_RATE,

    /** Build the WebSocket URL with the `intent=transcription` parameter. */
    buildWsUrl(config) {
      const base = config.baseURL ?? OPENAI_DEFAULT_WS;
      // A custom baseURL (e.g. a proxy) may already carry a query string;
      // appending a second "?" would produce a malformed URL.
      const separator = base.includes("?") ? "&" : "?";
      return `${base}${separator}intent=transcription`;
    },

    /** The API key travels in the Authorization header, not in the URL. */
    buildWsOptions(config) {
      return {
        headers: {
          Authorization: `Bearer ${config.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      };
    },

    /**
     * Build the `transcription_session.update` message: PCM16 input, the
     * transcription model (plus optional language), and server-side VAD.
     */
    buildSetupMessage(config) {
      const model = config.model ?? OPENAI_DEFAULT_MODEL;
      /** @type {{ model: string, language?: string }} */
      const transcription = { model };
      if (config.language) transcription.language = config.language;
      // The `?intent=transcription` endpoint uses the flat transcription-session
      // schema, not the nested `session.audio.input.*` realtime schema.
      return {
        type: "transcription_session.update",
        session: {
          input_audio_format: "pcm16",
          input_audio_transcription: transcription,
          turn_detection: { type: "server_vad" },
        },
      };
    },

    /** True for a `transcription_session.created` or `.updated` message. */
    isReadyMessage(message) {
      return (
        isObjectLike(message) &&
        (message.type === "transcription_session.created" ||
          message.type === "transcription_session.updated")
      );
    },

    /**
     * Return a description for a `type: "error"` message, or `undefined` for
     * any other message. Every error event yields a string, so an unexpected
     * payload shape is still surfaced instead of being dropped.
     */
    extractError(message) {
      if (!isObjectLike(message) || message.type !== "error") return undefined;
      const error = message.error;
      if (isObjectLike(error)) {
        return typeof error.message === "string"
          ? error.message
          : JSON.stringify(error);
      }
      // Non-object payloads: use a plain string as-is, otherwise serialise
      // the whole event so the failure stays visible.
      if (typeof error === "string" && error.length > 0) return error;
      return JSON.stringify(message);
    },

    /**
     * Return the non-empty `delta` string of a
     * `conversation.item.input_audio_transcription.delta` message, or
     * `undefined` for anything else.
     */
    extractTranscript(message) {
      if (
        isObjectLike(message) &&
        message.type === "conversation.item.input_audio_transcription.delta" &&
        typeof message.delta === "string" &&
        message.delta.length > 0
      ) {
        return message.delta;
      }
      return undefined;
    },

    /**
     * Wrap one audio chunk as an `input_audio_buffer.append` message. The
     * sample rate is unused here; the format is set in the setup message.
     */
    buildAudioPayload(chunk, _sampleRate) {
      return {
        type: "input_audio_buffer.append",
        audio: chunk.toString("base64"),
      };
    },
  };

  return startWebSocketVoiceSession({ hooks, config, callbacks });
}