@iinm/plain-agent 1.7.18 → 1.7.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -83
- package/config/config.predefined.json +15 -15
- package/package.json +1 -3
- package/src/agentLoop.mjs +3 -1
- package/src/cliArgs.mjs +31 -1
- package/src/cliBatch.mjs +22 -0
- package/src/cliCost.mjs +309 -0
- package/src/cliFormatter.mjs +1 -1
- package/src/cliInteractive.mjs +29 -1
- package/src/config.d.ts +2 -2
- package/src/config.mjs +1 -1
- package/src/costTracker.mjs +58 -19
- package/src/env.mjs +9 -6
- package/src/main.mjs +17 -6
- package/src/model.d.ts +1 -1
- package/src/tools/patchFile.mjs +11 -12
- package/src/usageStore.mjs +167 -0
- package/src/utils/notify.mjs +3 -2
- package/src/voiceInput.mjs +24 -634
- package/src/voiceInputGemini.mjs +105 -0
- package/src/voiceInputOpenAI.mjs +104 -0
- package/src/voiceInputSession.mjs +543 -0
- package/src/voiceToggleKey.mjs +62 -0
- package/bin/plain-notify-terminal-bell +0 -3
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import {
|
|
2
|
+
isObjectLike,
|
|
3
|
+
startWebSocketVoiceSession,
|
|
4
|
+
} from "./voiceInputSession.mjs";
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* @typedef {Object} VoiceInputGeminiConfig
|
|
12
|
+
* @property {"gemini"} provider
|
|
13
|
+
* @property {string} apiKey
|
|
14
|
+
* @property {string} [model] - Defaults to "gemini-3.1-flash-live-preview".
|
|
15
|
+
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Passed to the model as a system instruction since Gemini Live has no native language hint for input transcription.
|
|
16
|
+
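* @property {string} [baseURL] - WebSocket endpoint override. Defaults to the Gemini Live BidiGenerateContent endpoint on generativelanguage.googleapis.com.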
|
|
17
|
+
* @property {VoiceRecorderConfig} [recorder]
|
|
18
|
+
* @property {string} [toggleKey]
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
// Model requested when the config does not set `model`.
const GEMINI_DEFAULT_MODEL = "gemini-3.1-flash-live-preview";

// WebSocket endpoint used when the config does not set `baseURL`.
const GEMINI_DEFAULT_WS =
  "wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent";

// Sample rate exposed as the provider hooks' `sampleRate`; it ends up in the
// `audio/pcm;rate=<n>` MIME type of every audio payload.
const GEMINI_SAMPLE_RATE = 16000;

// Provider name exposed as the provider hooks' `label`.
const GEMINI_LABEL = "Gemini Live";
|
|
26
|
+
|
|
27
|
+
/**
 * Start a voice input session backed by the Gemini Live BidiGenerateContent
 * WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
 * forwards transcript deltas via `onTranscript`.
 *
 * This function itself only assembles the Gemini-specific provider hooks
 * (endpoint URL, setup message, readiness check, transcript extraction and
 * audio framing) and delegates the session to `startWebSocketVoiceSession`.
 *
 * Gemini Live was designed for voice agents, not pure STT, so the setup
 * message forces `maxOutputTokens: 1` and disables thinking on 2.5 models
 * to minimise wasted audio output.
 *
 * @param {object} options
 * @param {VoiceInputGeminiConfig} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startGeminiVoiceSession({ config, callbacks }) {
  /** @type {VoiceProviderHooks<VoiceInputGeminiConfig>} */
  const hooks = {
    label: GEMINI_LABEL,
    sampleRate: GEMINI_SAMPLE_RATE,

    /** Endpoint URL; the API key travels as the `key` query parameter. */
    buildWsUrl(config) {
      // Build through URL/searchParams instead of string concatenation so a
      // custom `baseURL` that already carries a query string gets `&key=...`
      // appended rather than a second, malformed `?`.
      const url = new URL(config.baseURL ?? GEMINI_DEFAULT_WS);
      url.searchParams.set("key", config.apiKey);
      return url.toString();
    },

    /** First message sent on the socket: model, generation limits, language hint. */
    buildSetupMessage(config) {
      const model = config.model ?? GEMINI_DEFAULT_MODEL;
      /** @type {Record<string, unknown>} */
      const generationConfig = {
        // https://ai.google.dev/gemini-api/docs/live-api/capabilities#response-modalities
        // > The native audio models only support `AUDIO` response modality.
        responseModalities: ["AUDIO"],
        maxOutputTokens: 1,
      };
      if (model.includes("2.5")) {
        generationConfig.thinkingConfig = { thinkingBudget: 0 };
      }
      /** @type {Record<string, unknown>} */
      const setup = {
        model: `models/${model}`,
        generationConfig,
        inputAudioTranscription: {},
      };
      if (config.language) {
        // No dedicated language field is set here; the hint is carried as a
        // system instruction instead.
        setup.systemInstruction = {
          parts: [{ text: `The user is speaking in ${config.language}.` }],
        };
      }
      return { setup };
    },

    /** True for any object message that carries a `setupComplete` key. */
    isReadyMessage(message) {
      return isObjectLike(message) && "setupComplete" in message;
    },

    /**
     * Returns `serverContent.inputTranscription.text` when it is a non-empty
     * string, otherwise `undefined`.
     */
    extractTranscript(message) {
      if (!isObjectLike(message)) return undefined;
      const serverContent = message.serverContent;
      if (!isObjectLike(serverContent)) return undefined;
      const transcription = serverContent.inputTranscription;
      if (
        isObjectLike(transcription) &&
        typeof transcription.text === "string" &&
        transcription.text.length > 0
      ) {
        return transcription.text;
      }
      return undefined;
    },

    /** Wraps one audio chunk as a base64 `realtimeInput.audio` message. */
    buildAudioPayload(chunk, sampleRate) {
      return {
        realtimeInput: {
          audio: {
            data: chunk.toString("base64"),
            mimeType: `audio/pcm;rate=${sampleRate}`,
          },
        },
      };
    },
  };

  return startWebSocketVoiceSession({ hooks, config, callbacks });
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import {
|
|
2
|
+
isObjectLike,
|
|
3
|
+
startWebSocketVoiceSession,
|
|
4
|
+
} from "./voiceInputSession.mjs";
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* @import { VoiceProviderHooks, VoiceRecorderConfig, VoiceSession, VoiceSessionCallbacks } from "./voiceInputSession.mjs"
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* @typedef {Object} VoiceInputOpenAIConfig
|
|
12
|
+
* @property {"openai"} provider
|
|
13
|
+
* @property {string} apiKey
|
|
14
|
+
* @property {string} [model] - Defaults to "gpt-4o-transcribe".
|
|
15
|
+
* @property {string} [language] - ISO-639-1 code (e.g. "ja", "en"). Improves accuracy and latency when set.
|
|
16
|
+
* @property {string} [baseURL] - WebSocket endpoint override. Defaults to "wss://api.openai.com/v1/realtime".
|
|
17
|
+
* @property {VoiceRecorderConfig} [recorder]
|
|
18
|
+
* @property {string} [toggleKey] - "ctrl-<char>". Defaults to "ctrl-o".
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
// Transcription model requested when the config does not set `model`.
const OPENAI_DEFAULT_MODEL = "gpt-4o-transcribe";

// WebSocket endpoint used when the config does not set `baseURL`.
const OPENAI_DEFAULT_WS = "wss://api.openai.com/v1/realtime";

// Sample rate exposed as the provider hooks' `sampleRate`.
const OPENAI_SAMPLE_RATE = 24000;

// Provider name exposed as the provider hooks' `label`.
const OPENAI_LABEL = "OpenAI Realtime";
|
|
25
|
+
|
|
26
|
+
/**
 * Start a voice input session backed by the OpenAI Realtime transcription
 * WebSocket. Spawns a recorder, streams PCM as base64 JSON messages, and
 * forwards transcript deltas via `onTranscript`.
 *
 * This function itself only assembles the OpenAI-specific provider hooks
 * (endpoint URL, auth headers, session setup, readiness/error/transcript
 * parsing and audio framing) and delegates the session to
 * `startWebSocketVoiceSession`.
 *
 * @param {object} options
 * @param {VoiceInputOpenAIConfig} options.config
 * @param {VoiceSessionCallbacks} options.callbacks
 * @returns {VoiceSession}
 */
export function startOpenAIVoiceSession({ config, callbacks }) {
  /** @type {VoiceProviderHooks<VoiceInputOpenAIConfig>} */
  const hooks = {
    label: OPENAI_LABEL,
    sampleRate: OPENAI_SAMPLE_RATE,

    /** Endpoint URL with `intent=transcription` set. */
    buildWsUrl(config) {
      // Build through URL/searchParams instead of string concatenation so a
      // custom `baseURL` that already carries a query string gets
      // `&intent=transcription` appended rather than a second, malformed `?`.
      const url = new URL(config.baseURL ?? OPENAI_DEFAULT_WS);
      url.searchParams.set("intent", "transcription");
      return url.toString();
    },

    /** Connection options: the API key is sent as a bearer token header. */
    buildWsOptions(config) {
      return {
        headers: {
          Authorization: `Bearer ${config.apiKey}`,
          "OpenAI-Beta": "realtime=v1",
        },
      };
    },

    /** Session update message: audio format, model/language, turn detection. */
    buildSetupMessage(config) {
      const model = config.model ?? OPENAI_DEFAULT_MODEL;
      /** @type {{ model: string, language?: string }} */
      const transcription = { model };
      if (config.language) transcription.language = config.language;
      // The `?intent=transcription` endpoint uses the flat transcription-session
      // schema, not the nested `session.audio.input.*` realtime schema.
      return {
        type: "transcription_session.update",
        session: {
          input_audio_format: "pcm16",
          input_audio_transcription: transcription,
          turn_detection: { type: "server_vad" },
        },
      };
    },

    /** True once the server reports the transcription session created or updated. */
    isReadyMessage(message) {
      return (
        isObjectLike(message) &&
        (message.type === "transcription_session.created" ||
          message.type === "transcription_session.updated")
      );
    },

    /**
     * For `type: "error"` messages with an object `error`, returns its
     * `message` string, or the JSON-serialised error object when there is
     * none. Returns `undefined` for everything else.
     */
    extractError(message) {
      if (!isObjectLike(message) || message.type !== "error") return undefined;
      const error = message.error;
      if (!isObjectLike(error)) return undefined;
      return typeof error.message === "string"
        ? error.message
        : JSON.stringify(error);
    },

    /** Returns the non-empty `delta` of a transcription delta event, else `undefined`. */
    extractTranscript(message) {
      if (
        isObjectLike(message) &&
        message.type === "conversation.item.input_audio_transcription.delta" &&
        typeof message.delta === "string" &&
        message.delta.length > 0
      ) {
        return message.delta;
      }
      return undefined;
    },

    /** Wraps one audio chunk as a base64 `input_audio_buffer.append` event. */
    buildAudioPayload(chunk, _sampleRate) {
      // The sample rate is not part of this payload, so the argument is unused.
      return {
        type: "input_audio_buffer.append",
        audio: chunk.toString("base64"),
      };
    },
  };

  return startWebSocketVoiceSession({ hooks, config, callbacks });
}
|