@tractorscorch/clank 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/dist/index.js +405 -36
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,30 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/).
|
|
|
6
6
|
|
|
7
7
|
---
|
|
8
8
|
|
|
9
|
+
## [1.3.0] — 2026-03-23
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **ElevenLabs integration** — text-to-speech via ElevenLabs API, configurable during onboarding
|
|
13
|
+
- **Whisper integration** — speech-to-text via OpenAI Whisper API or local whisper.cpp
|
|
14
|
+
- **Voice tools (3):** `text_to_speech`, `speech_to_text`, `list_voices` — agent can generate and transcribe audio
|
|
15
|
+
- **Telegram voice messages** — send a voice message → auto-transcribed via Whisper → routed to agent → response as voice (ElevenLabs) or text
|
|
16
|
+
- **Integrations config section** — unified config for third-party API services (ElevenLabs, Whisper, image gen, extensible)
|
|
17
|
+
- **Setup wizard integrations step** — configure ElevenLabs, Whisper, and other API services during onboarding
|
|
18
|
+
|
|
19
|
+
### Changed
|
|
20
|
+
- Tool count: 21 total (10 core + 11 self-config/voice)
|
|
21
|
+
- Setup wizard now asks about integrations for all users (not just advanced mode)
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## [1.2.1] — 2026-03-23
|
|
26
|
+
|
|
27
|
+
### Fixed
|
|
28
|
+
- **Gateway crash on restart** — stale Telegram messages queued while offline no longer flood the model. Messages older than 30s before startup are dropped.
|
|
29
|
+
- **Parallel model overload** — Telegram messages from the same chat are now processed sequentially (per-chat queue) instead of all at once.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
9
33
|
## [1.2.0] — 2026-03-22
|
|
10
34
|
|
|
11
35
|
### Added
|
package/dist/index.js
CHANGED
|
@@ -1901,7 +1901,8 @@ function defaultConfig() {
|
|
|
1901
1901
|
},
|
|
1902
1902
|
safety: {
|
|
1903
1903
|
confirmExternal: true
|
|
1904
|
-
}
|
|
1904
|
+
},
|
|
1905
|
+
integrations: {}
|
|
1905
1906
|
};
|
|
1906
1907
|
}
|
|
1907
1908
|
function substituteEnvVars(obj) {
|
|
@@ -3919,6 +3920,264 @@ var init_message_tool = __esm({
|
|
|
3919
3920
|
}
|
|
3920
3921
|
});
|
|
3921
3922
|
|
|
3923
|
+
// src/voice/tts.ts
|
|
3924
|
+
var TTSEngine, STTEngine;
|
|
3925
|
+
var init_tts = __esm({
|
|
3926
|
+
"src/voice/tts.ts"() {
|
|
3927
|
+
"use strict";
|
|
3928
|
+
init_esm_shims();
|
|
3929
|
+
TTSEngine = class {
  /** Application config object; only `integrations.elevenlabs` is read by this class. */
  config;
  /**
   * @param {object} config - Loaded application config.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Return the ElevenLabs settings when the integration is enabled and has an
   * API key, otherwise null.
   *
   * The `integrations` section is accessed with optional chaining so that a
   * config object lacking it yields "not available" instead of a TypeError.
   * @returns {object|null}
   */
  resolveElevenLabs() {
    const elevenlabs = this.config?.integrations?.elevenlabs;
    return elevenlabs?.enabled && elevenlabs.apiKey ? elevenlabs : null;
  }
  /**
   * Check if TTS is available.
   * @returns {boolean}
   */
  isAvailable() {
    return this.resolveElevenLabs() !== null;
  }
  /**
   * Convert text to speech via the ElevenLabs text-to-speech endpoint.
   * @param {string} text - Text to synthesize.
   * @param {{ voiceId?: string }} [opts] - Optional per-call voice override.
   * @returns {Promise<{ audioBuffer: Buffer, format: string } | null>}
   *   The audio bytes, or null when TTS is unavailable or the request fails
   *   (failures are logged, never thrown).
   */
  async synthesize(text, opts) {
    const elevenlabs = this.resolveElevenLabs();
    if (!elevenlabs) return null;
    // `||` (not `??`) on purpose: an empty-string voice ID must fall through
    // to the next candidate, ending at the hard-coded fallback voice.
    const voiceId = opts?.voiceId || elevenlabs.voiceId || "JBFqnCBsd6RMkjVDRZzb";
    const model = elevenlabs.model || "eleven_multilingual_v2";
    try {
      const res = await fetch(
        // The voice ID can come from tool arguments, so escape it before
        // placing it in the URL path.
        `https://api.elevenlabs.io/v1/text-to-speech/${encodeURIComponent(voiceId)}`,
        {
          method: "POST",
          headers: {
            "Content-Type": "application/json",
            "xi-api-key": elevenlabs.apiKey
          },
          body: JSON.stringify({
            text,
            model_id: model,
            voice_settings: {
              stability: 0.5,
              similarity_boost: 0.75
            }
          })
        }
      );
      if (!res.ok) {
        const err = await res.text().catch(() => "");
        console.error(`ElevenLabs TTS error ${res.status}: ${err}`);
        return null;
      }
      const arrayBuffer = await res.arrayBuffer();
      return {
        audioBuffer: Buffer.from(arrayBuffer),
        format: "mp3"
      };
    } catch (err) {
      console.error(`TTS error: ${err instanceof Error ? err.message : err}`);
      return null;
    }
  }
  /**
   * List available voices from ElevenLabs.
   * @returns {Promise<Array<{ id: string, name: string }>>}
   *   Empty array when TTS is unavailable or the request fails; failures are
   *   logged so an API error is distinguishable from "no voices" in the logs.
   */
  async listVoices() {
    const elevenlabs = this.resolveElevenLabs();
    if (!elevenlabs) return [];
    try {
      const res = await fetch("https://api.elevenlabs.io/v1/voices", {
        headers: { "xi-api-key": elevenlabs.apiKey }
      });
      if (!res.ok) {
        console.error(`ElevenLabs voices error ${res.status}`);
        return [];
      }
      const data = await res.json();
      const voices = Array.isArray(data?.voices) ? data.voices : [];
      return voices.map((v) => ({ id: v.voice_id, name: v.name }));
    } catch (err) {
      console.error(`ElevenLabs voices error: ${err instanceof Error ? err.message : err}`);
      return [];
    }
  }
};
|
|
3994
|
+
STTEngine = class {
  /** Application config object; reads `integrations.whisper` and `models.providers.openai`. */
  config;
  /**
   * @param {object} config - Loaded application config.
   */
  constructor(config) {
    this.config = config;
  }
  /**
   * Check if STT is available.
   *
   * True when the Whisper integration is enabled with either the "openai"
   * provider plus an API key or the "local" provider, or when an OpenAI model
   * provider key exists. `integrations` is accessed with optional chaining so
   * a config object lacking that section does not throw.
   * @returns {boolean}
   */
  isAvailable() {
    const whisper = this.config?.integrations?.whisper;
    if (whisper?.enabled) {
      if (whisper.provider === "openai" && whisper.apiKey) return true;
      if (whisper.provider === "local") return true;
    }
    return Boolean(this.config.models.providers.openai?.apiKey);
  }
  /**
   * Transcribe audio to text.
   *
   * Uses the OpenAI API when a key is available and the provider is not
   * explicitly "local"; otherwise falls back to the local whisper binary.
   * @param {Buffer} audioBuffer - Raw audio bytes.
   * @param {string} [format="ogg"] - Audio container/extension (e.g. "ogg", "mp3").
   * @returns {Promise<{ text: string, language?: string } | null>}
   */
  async transcribe(audioBuffer, format = "ogg") {
    const whisper = this.config?.integrations?.whisper;
    const apiKey = whisper?.apiKey || this.config.models.providers.openai?.apiKey;
    if (apiKey && whisper?.provider !== "local") {
      return this.transcribeOpenAI(audioBuffer, format, apiKey);
    }
    return this.transcribeLocal(audioBuffer, format);
  }
  /**
   * Transcribe via OpenAI Whisper API.
   * @param {Buffer} audioBuffer - Raw audio bytes.
   * @param {string} format - Used for the upload's MIME type and filename extension.
   * @param {string} apiKey - OpenAI API key sent as a Bearer token.
   * @returns {Promise<{ text: string, language?: string } | null>}
   *   Null on any failure; failures are logged rather than thrown.
   */
  async transcribeOpenAI(audioBuffer, format, apiKey) {
    try {
      const blob = new Blob([new Uint8Array(audioBuffer)], { type: `audio/${format}` });
      const formData = new FormData();
      formData.append("file", blob, `audio.${format}`);
      formData.append("model", "whisper-1");
      const res = await fetch("https://api.openai.com/v1/audio/transcriptions", {
        method: "POST",
        headers: { "Authorization": `Bearer ${apiKey}` },
        body: formData
      });
      if (!res.ok) {
        console.error(`Whisper API error ${res.status}`);
        return null;
      }
      const data = await res.json();
      return data.text ? { text: data.text, language: data.language } : null;
    } catch (err) {
      console.error(`STT error: ${err instanceof Error ? err.message : err}`);
      return null;
    }
  }
  /**
   * Transcribe via a locally installed `whisper` executable.
   *
   * The audio is written to a temp file, `whisper` is run on it, and its
   * stdout is returned as the transcription.
   * @param {Buffer} audioBuffer - Raw audio bytes.
   * @param {string} format - File extension for the temp file.
   * @returns {Promise<{ text: string } | null>} Null when whisper fails, times
   *   out, or prints nothing.
   */
  async transcribeLocal(audioBuffer, format) {
    const { writeFile, unlink } = await import("fs/promises");
    const { execFile } = await import("child_process");
    const { promisify } = await import("util");
    const { join } = await import("path");
    const { tmpdir } = await import("os");
    const { randomUUID } = await import("crypto");
    // `format` can originate from a caller-supplied file path. Restrict it to
    // a short alphanumeric token so it cannot inject path separators or shell
    // syntax into the temp filename; otherwise use the transcribe() default.
    const requested = String(format ?? "");
    const safeFormat = /^[a-z0-9]{1,8}$/i.test(requested) ? requested.toLowerCase() : "ogg";
    // The random suffix prevents two transcriptions started in the same
    // millisecond from overwriting each other's temp file.
    const tmpFile = join(tmpdir(), `clank-stt-${Date.now()}-${randomUUID()}.${safeFormat}`);
    try {
      await writeFile(tmpFile, audioBuffer);
      // execFile passes arguments as an array (no shell parsing) and, unlike
      // execSync, does not block the event loop while whisper runs.
      // NOTE(review): without a shell, a `whisper.cmd`/`.bat` wrapper on
      // Windows would not be resolved — confirm the supported install layout.
      const { stdout } = await promisify(execFile)(
        "whisper",
        [tmpFile, "--model", "base.en", "--output-txt"],
        { encoding: "utf-8", timeout: 6e4 }
      );
      const text = stdout.trim();
      return text ? { text } : null;
    } catch (err) {
      console.error(`Local STT error: ${err instanceof Error ? err.message : err}`);
      return null;
    } finally {
      // Best-effort cleanup that also runs when whisper fails or times out.
      await unlink(tmpFile).catch(() => {
      });
    }
  }
};
|
|
4058
|
+
}
|
|
4059
|
+
});
|
|
4060
|
+
|
|
4061
|
+
// src/voice/index.ts
|
|
4062
|
+
// Export namespace for src/voice/index.ts. Consumers elsewhere in this bundle
// obtain the engines via `(init_voice(), voice_exports)`.
var voice_exports = {};
__export(voice_exports, {
  // Getter functions, so each lookup reads the current value of the
  // module-level binding, which is only assigned when init_tts() runs.
  STTEngine: () => STTEngine,
  TTSEngine: () => TTSEngine
});
// Initializer for the voice module: runs the ESM shims, then init_tts(),
// which assigns TTSEngine and STTEngine.
// NOTE(review): __esm is presumably the bundler's run-once lazy-init wrapper —
// its definition is not visible here; confirm before relying on idempotence.
var init_voice = __esm({
  "src/voice/index.ts"() {
    "use strict";
    init_esm_shims();
    init_tts();
  }
});
|
|
4074
|
+
|
|
4075
|
+
// src/tools/self-config/voice-tool.ts
|
|
4076
|
+
var ttsTool, sttTool, voiceListTool;
|
|
4077
|
+
var init_voice_tool = __esm({
|
|
4078
|
+
"src/tools/self-config/voice-tool.ts"() {
|
|
4079
|
+
"use strict";
|
|
4080
|
+
init_esm_shims();
|
|
4081
|
+
init_voice();
|
|
4082
|
+
init_config2();
|
|
4083
|
+
ttsTool = {
  // Tool schema advertised to the agent.
  definition: {
    name: "text_to_speech",
    description: "Convert text to speech audio using ElevenLabs. Returns the audio file path. Requires ElevenLabs integration to be configured.",
    parameters: {
      type: "object",
      properties: {
        text: { type: "string", description: "Text to convert to speech" },
        voice_id: { type: "string", description: "ElevenLabs voice ID (optional, uses default)" }
      },
      required: ["text"]
    }
  },
  safetyLevel: "low",
  readOnly: true,
  /**
   * Accept only a non-empty string of at most 5000 characters.
   * @param {{ text?: unknown }} args
   * @returns {{ ok: true } | { ok: false, error: string }}
   */
  validate(args) {
    const candidate = args.text;
    if (typeof candidate !== "string" || candidate === "") {
      return { ok: false, error: "text is required" };
    }
    return candidate.length > 5e3 ? { ok: false, error: "text too long (max 5000 chars)" } : { ok: true };
  },
  /**
   * Synthesize `args.text`, write the audio into the OS temp directory and
   * return a human-readable summary string (or an "Error: ..." string).
   * @param {{ text: string, voice_id?: string }} args
   * @returns {Promise<string>}
   */
  async execute(args) {
    const tts = new TTSEngine(await loadConfig());
    if (!tts.isAvailable()) {
      return "Error: ElevenLabs not configured. Tell me to set it up, or run: clank setup --section integrations";
    }
    const audio = await tts.synthesize(args.text, { voiceId: args.voice_id });
    if (!audio) {
      return "Error: TTS synthesis failed";
    }
    const fsPromises = await import("fs/promises");
    const pathModule = await import("path");
    const osModule = await import("os");
    const fileName = `clank-tts-${Date.now()}.${audio.format}`;
    const outPath = pathModule.join(osModule.tmpdir(), fileName);
    await fsPromises.writeFile(outPath, audio.audioBuffer);
    const sizeKb = Math.round(audio.audioBuffer.length / 1024);
    return `Audio generated: ${outPath} (${audio.format}, ${sizeKb}KB)`;
  }
};
|
|
4121
|
+
sttTool = {
  // Tool schema advertised to the agent.
  definition: {
    name: "speech_to_text",
    description: "Transcribe an audio file to text using Whisper (OpenAI API or local whisper.cpp). Provide a file path to an audio file.",
    parameters: {
      type: "object",
      properties: {
        file_path: { type: "string", description: "Path to audio file (.mp3, .wav, .ogg, .m4a)" }
      },
      required: ["file_path"]
    }
  },
  safetyLevel: "low",
  readOnly: true,
  /**
   * Require `file_path` to be a non-empty string.
   * @param {{ file_path?: unknown }} args
   * @returns {{ ok: true } | { ok: false, error: string }}
   */
  validate(args) {
    if (!args.file_path || typeof args.file_path !== "string") return { ok: false, error: "file_path is required" };
    return { ok: true };
  },
  /**
   * Read the audio file at `args.file_path` and return its transcription, or
   * an "Error: ..." string when the file is missing, STT is not configured,
   * or transcription fails.
   * @param {{ file_path: string }} args
   * @returns {Promise<string>}
   */
  async execute(args) {
    const { readFile } = await import("fs/promises");
    const { existsSync } = await import("fs");
    const { extname } = await import("path");
    const filePath = args.file_path;
    if (!existsSync(filePath)) return `Error: File not found: ${filePath}`;
    const config = await loadConfig();
    const engine = new STTEngine(config);
    if (!engine.isAvailable()) {
      return "Error: Speech-to-text not configured. Need OpenAI API key or local whisper.cpp installed.";
    }
    const audioBuffer = await readFile(filePath);
    // extname() inspects only the final path segment, so an extension-less
    // file (or a dotted directory name) falls back to "wav" instead of
    // passing the whole path through as the "format".
    const ext = extname(filePath).slice(1).toLowerCase() || "wav";
    const result = await engine.transcribe(audioBuffer, ext);
    if (!result) return "Error: Transcription failed";
    return result.text;
  }
};
|
|
4156
|
+
voiceListTool = {
  // Tool schema advertised to the agent; takes no parameters.
  definition: {
    name: "list_voices",
    description: "List available ElevenLabs voices for text-to-speech.",
    parameters: { type: "object", properties: {} }
  },
  safetyLevel: "low",
  readOnly: true,
  /**
   * Nothing to validate: the tool has no arguments.
   * @returns {{ ok: true }}
   */
  validate() {
    return { ok: true };
  },
  /**
   * Return one "name: id" line per ElevenLabs voice, or a message string when
   * the integration is unavailable or no voices come back.
   * @returns {Promise<string>}
   */
  async execute() {
    const tts = new TTSEngine(await loadConfig());
    if (!tts.isAvailable()) {
      return "Error: ElevenLabs not configured.";
    }
    const available = await tts.listVoices();
    if (available.length === 0) {
      return "No voices found or API error.";
    }
    const lines = [];
    for (const voice of available) {
      lines.push(`${voice.name}: ${voice.id}`);
    }
    return lines.join("\n");
  }
};
|
|
4178
|
+
}
|
|
4179
|
+
});
|
|
4180
|
+
|
|
3922
4181
|
// src/tools/self-config/index.ts
|
|
3923
4182
|
function registerSelfConfigTools(registry) {
|
|
3924
4183
|
registry.register(configTool);
|
|
@@ -3929,6 +4188,9 @@ function registerSelfConfigTools(registry) {
|
|
|
3929
4188
|
registry.register(cronTool);
|
|
3930
4189
|
registry.register(gatewayTool);
|
|
3931
4190
|
registry.register(messageTool);
|
|
4191
|
+
registry.register(ttsTool);
|
|
4192
|
+
registry.register(sttTool);
|
|
4193
|
+
registry.register(voiceListTool);
|
|
3932
4194
|
}
|
|
3933
4195
|
var init_self_config = __esm({
|
|
3934
4196
|
"src/tools/self-config/index.ts"() {
|
|
@@ -3942,6 +4204,7 @@ var init_self_config = __esm({
|
|
|
3942
4204
|
init_cron_tool();
|
|
3943
4205
|
init_gateway_tool();
|
|
3944
4206
|
init_message_tool();
|
|
4207
|
+
init_voice_tool();
|
|
3945
4208
|
init_config_tool();
|
|
3946
4209
|
init_channel_tool();
|
|
3947
4210
|
init_agent_tool();
|
|
@@ -3950,6 +4213,7 @@ var init_self_config = __esm({
|
|
|
3950
4213
|
init_cron_tool();
|
|
3951
4214
|
init_gateway_tool();
|
|
3952
4215
|
init_message_tool();
|
|
4216
|
+
init_voice_tool();
|
|
3953
4217
|
}
|
|
3954
4218
|
});
|
|
3955
4219
|
|
|
@@ -4732,11 +4996,17 @@ var init_telegram = __esm({
|
|
|
4732
4996
|
try {
|
|
4733
4997
|
this.bot = new Bot(telegramConfig.botToken);
|
|
4734
4998
|
const bot = this.bot;
|
|
4999
|
+
const startupTime = Math.floor(Date.now() / 1e3);
|
|
5000
|
+
const chatLocks = /* @__PURE__ */ new Map();
|
|
4735
5001
|
bot.on("message:text", async (ctx) => {
|
|
4736
5002
|
const msg = ctx.message;
|
|
4737
5003
|
const chatId = msg.chat.id;
|
|
4738
5004
|
const userId = msg.from?.id;
|
|
4739
5005
|
const isGroup = msg.chat.type === "group" || msg.chat.type === "supergroup";
|
|
5006
|
+
if (msg.date < startupTime - 30) {
|
|
5007
|
+
console.log(` Telegram: dropping stale message from ${userId} (${startupTime - msg.date}s old)`);
|
|
5008
|
+
return;
|
|
5009
|
+
}
|
|
4740
5010
|
if (telegramConfig.allowFrom && telegramConfig.allowFrom.length > 0) {
|
|
4741
5011
|
const username = msg.from?.username ? `@${msg.from.username}` : "";
|
|
4742
5012
|
const userIdStr = String(userId || "");
|
|
@@ -4760,27 +5030,103 @@ var init_telegram = __esm({
|
|
|
4760
5030
|
}
|
|
4761
5031
|
return;
|
|
4762
5032
|
}
|
|
4763
|
-
|
|
4764
|
-
|
|
4765
|
-
|
|
4766
|
-
|
|
4767
|
-
|
|
4768
|
-
|
|
4769
|
-
|
|
4770
|
-
|
|
4771
|
-
|
|
4772
|
-
|
|
4773
|
-
|
|
4774
|
-
|
|
4775
|
-
|
|
4776
|
-
|
|
4777
|
-
|
|
5033
|
+
const processMessage = async () => {
|
|
5034
|
+
if (!this.gateway) return;
|
|
5035
|
+
try {
|
|
5036
|
+
await ctx.api.sendChatAction(chatId, "typing");
|
|
5037
|
+
const response = await this.gateway.handleInboundMessage(
|
|
5038
|
+
{
|
|
5039
|
+
channel: "telegram",
|
|
5040
|
+
peerId: chatId,
|
|
5041
|
+
peerKind: isGroup ? "group" : "dm"
|
|
5042
|
+
},
|
|
5043
|
+
msg.text
|
|
5044
|
+
);
|
|
5045
|
+
if (response) {
|
|
5046
|
+
const chunks = splitMessage(response, 4e3);
|
|
5047
|
+
for (const chunk of chunks) {
|
|
5048
|
+
await ctx.api.sendMessage(chatId, chunk);
|
|
5049
|
+
}
|
|
4778
5050
|
}
|
|
5051
|
+
} catch (err) {
|
|
5052
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5053
|
+
await ctx.api.sendMessage(chatId, `Error: ${errMsg.slice(0, 200)}`);
|
|
4779
5054
|
}
|
|
4780
|
-
}
|
|
4781
|
-
|
|
4782
|
-
|
|
5055
|
+
};
|
|
5056
|
+
const prev = chatLocks.get(chatId) || Promise.resolve();
|
|
5057
|
+
const next = prev.then(processMessage).catch(() => {
|
|
5058
|
+
});
|
|
5059
|
+
chatLocks.set(chatId, next);
|
|
5060
|
+
});
|
|
5061
|
+
bot.on("message:voice", async (ctx) => {
|
|
5062
|
+
const msg = ctx.message;
|
|
5063
|
+
const chatId = msg.chat.id;
|
|
5064
|
+
const userId = msg.from?.id;
|
|
5065
|
+
if (telegramConfig.allowFrom && telegramConfig.allowFrom.length > 0) {
|
|
5066
|
+
const username = msg.from?.username ? `@${msg.from.username}` : "";
|
|
5067
|
+
const userIdStr = String(userId || "");
|
|
5068
|
+
const allowed = telegramConfig.allowFrom.map(String);
|
|
5069
|
+
const isAllowed = allowed.some(
|
|
5070
|
+
(a) => a === userIdStr || a.toLowerCase() === username.toLowerCase() || a.toLowerCase() === (msg.from?.username || "").toLowerCase()
|
|
5071
|
+
);
|
|
5072
|
+
if (!isAllowed) return;
|
|
4783
5073
|
}
|
|
5074
|
+
if (msg.date < startupTime - 30) return;
|
|
5075
|
+
const processVoice = async () => {
|
|
5076
|
+
if (!this.gateway || !this.config) return;
|
|
5077
|
+
try {
|
|
5078
|
+
await ctx.api.sendChatAction(chatId, "typing");
|
|
5079
|
+
const file = await ctx.api.getFile(msg.voice.file_id);
|
|
5080
|
+
const fileUrl = `https://api.telegram.org/file/bot${telegramConfig.botToken}/${file.file_path}`;
|
|
5081
|
+
const res = await fetch(fileUrl);
|
|
5082
|
+
if (!res.ok) {
|
|
5083
|
+
await ctx.api.sendMessage(chatId, "Error: could not download voice message");
|
|
5084
|
+
return;
|
|
5085
|
+
}
|
|
5086
|
+
const audioBuffer = Buffer.from(await res.arrayBuffer());
|
|
5087
|
+
const { STTEngine: STTEngine2 } = await Promise.resolve().then(() => (init_voice(), voice_exports));
|
|
5088
|
+
const { loadConfig: loadConfig3 } = await Promise.resolve().then(() => (init_config2(), config_exports));
|
|
5089
|
+
const config = await loadConfig3();
|
|
5090
|
+
const stt = new STTEngine2(config);
|
|
5091
|
+
if (!stt.isAvailable()) {
|
|
5092
|
+
await ctx.api.sendMessage(chatId, "Voice messages require speech-to-text. Set up Whisper: /help");
|
|
5093
|
+
return;
|
|
5094
|
+
}
|
|
5095
|
+
const transcription = await stt.transcribe(audioBuffer, "ogg");
|
|
5096
|
+
if (!transcription?.text) {
|
|
5097
|
+
await ctx.api.sendMessage(chatId, "Could not transcribe voice message.");
|
|
5098
|
+
return;
|
|
5099
|
+
}
|
|
5100
|
+
const isGroup = msg.chat.type === "group" || msg.chat.type === "supergroup";
|
|
5101
|
+
const response = await this.gateway.handleInboundMessage(
|
|
5102
|
+
{ channel: "telegram", peerId: chatId, peerKind: isGroup ? "group" : "dm" },
|
|
5103
|
+
`[Voice message transcription]: ${transcription.text}`
|
|
5104
|
+
);
|
|
5105
|
+
if (response) {
|
|
5106
|
+
const { TTSEngine: TTSEngine2 } = await Promise.resolve().then(() => (init_voice(), voice_exports));
|
|
5107
|
+
const tts = new TTSEngine2(config);
|
|
5108
|
+
if (tts.isAvailable() && response.length < 2e3) {
|
|
5109
|
+
const audio = await tts.synthesize(response);
|
|
5110
|
+
if (audio) {
|
|
5111
|
+
const { InputFile } = await import("grammy");
|
|
5112
|
+
await ctx.api.sendVoice(chatId, new InputFile(audio.audioBuffer, "reply.mp3"));
|
|
5113
|
+
return;
|
|
5114
|
+
}
|
|
5115
|
+
}
|
|
5116
|
+
const chunks = splitMessage(response, 4e3);
|
|
5117
|
+
for (const chunk of chunks) {
|
|
5118
|
+
await ctx.api.sendMessage(chatId, chunk);
|
|
5119
|
+
}
|
|
5120
|
+
}
|
|
5121
|
+
} catch (err) {
|
|
5122
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
5123
|
+
await ctx.api.sendMessage(chatId, `Error: ${errMsg.slice(0, 200)}`);
|
|
5124
|
+
}
|
|
5125
|
+
};
|
|
5126
|
+
const prev = chatLocks.get(chatId) || Promise.resolve();
|
|
5127
|
+
const next = prev.then(processVoice).catch(() => {
|
|
5128
|
+
});
|
|
5129
|
+
chatLocks.set(chatId, next);
|
|
4784
5130
|
});
|
|
4785
5131
|
bot.start({
|
|
4786
5132
|
onStart: () => {
|
|
@@ -5327,7 +5673,7 @@ var init_server = __esm({
|
|
|
5327
5673
|
res.writeHead(200, { "Content-Type": "application/json" });
|
|
5328
5674
|
res.end(JSON.stringify({
|
|
5329
5675
|
status: "ok",
|
|
5330
|
-
version: "1.
|
|
5676
|
+
version: "1.3.0",
|
|
5331
5677
|
uptime: process.uptime(),
|
|
5332
5678
|
clients: this.clients.size,
|
|
5333
5679
|
agents: this.engines.size
|
|
@@ -5435,7 +5781,7 @@ var init_server = __esm({
|
|
|
5435
5781
|
const hello = {
|
|
5436
5782
|
type: "hello",
|
|
5437
5783
|
protocol: PROTOCOL_VERSION,
|
|
5438
|
-
version: "1.
|
|
5784
|
+
version: "1.3.0",
|
|
5439
5785
|
agents: this.config.agents.list.map((a) => ({
|
|
5440
5786
|
id: a.id,
|
|
5441
5787
|
name: a.name || a.id,
|
|
@@ -6278,21 +6624,44 @@ async function runSetup(opts) {
|
|
|
6278
6624
|
console.log(green4(" Brave Search configured"));
|
|
6279
6625
|
}
|
|
6280
6626
|
}
|
|
6281
|
-
|
|
6282
|
-
|
|
6283
|
-
|
|
6284
|
-
|
|
6285
|
-
|
|
6286
|
-
|
|
6287
|
-
|
|
6288
|
-
|
|
6289
|
-
|
|
6627
|
+
console.log("");
|
|
6628
|
+
console.log(" API Integrations:");
|
|
6629
|
+
console.log(dim4(" Add third-party services for voice, image gen, etc."));
|
|
6630
|
+
console.log(dim4(" You can also configure these later through conversation."));
|
|
6631
|
+
console.log("");
|
|
6632
|
+
const addElevenLabs = await ask(rl, cyan2(" Set up ElevenLabs (text-to-speech)? [y/N] "));
|
|
6633
|
+
if (addElevenLabs.toLowerCase() === "y") {
|
|
6634
|
+
console.log(dim4(" Get an API key at: https://elevenlabs.io/"));
|
|
6635
|
+
const key = await ask(rl, cyan2(" ElevenLabs API key: "));
|
|
6636
|
+
if (key.trim()) {
|
|
6637
|
+
config.integrations.elevenlabs = { enabled: true, apiKey: key.trim() };
|
|
6638
|
+
const voiceId = await ask(rl, cyan2(" Default voice ID (Enter to skip): "));
|
|
6639
|
+
if (voiceId.trim()) {
|
|
6640
|
+
config.integrations.elevenlabs.voiceId = voiceId.trim();
|
|
6641
|
+
}
|
|
6642
|
+
console.log(green4(" ElevenLabs configured (TTS available)"));
|
|
6643
|
+
}
|
|
6644
|
+
}
|
|
6645
|
+
const addWhisper = await ask(rl, cyan2(" Set up speech-to-text (Whisper)? [y/N] "));
|
|
6646
|
+
if (addWhisper.toLowerCase() === "y") {
|
|
6647
|
+
console.log(dim4(" 1. OpenAI Whisper API (cloud, uses OpenAI key)"));
|
|
6648
|
+
console.log(dim4(" 2. Local whisper.cpp (requires whisper installed)"));
|
|
6649
|
+
const whisperChoice = await ask(rl, cyan2(" Choice [1]: "));
|
|
6650
|
+
if (whisperChoice === "2") {
|
|
6651
|
+
config.integrations.whisper = { enabled: true, provider: "local" };
|
|
6652
|
+
console.log(green4(" Local whisper.cpp configured"));
|
|
6653
|
+
console.log(dim4(" Make sure whisper is installed and in PATH"));
|
|
6654
|
+
} else {
|
|
6655
|
+
const existingKey = config.models.providers.openai?.apiKey;
|
|
6656
|
+
if (existingKey) {
|
|
6657
|
+
config.integrations.whisper = { enabled: true, provider: "openai", apiKey: existingKey };
|
|
6658
|
+
console.log(green4(" Whisper configured (using existing OpenAI key)"));
|
|
6659
|
+
} else {
|
|
6660
|
+
const key = await ask(rl, cyan2(" OpenAI API key for Whisper: "));
|
|
6290
6661
|
if (key.trim()) {
|
|
6291
|
-
|
|
6662
|
+
config.integrations.whisper = { enabled: true, provider: "openai", apiKey: key.trim() };
|
|
6663
|
+
console.log(green4(" Whisper configured"));
|
|
6292
6664
|
}
|
|
6293
|
-
} else {
|
|
6294
|
-
console.log(dim4(" Local voice will use whisper.cpp (STT) + piper (TTS)"));
|
|
6295
|
-
console.log(dim4(" Make sure they're installed: whisper, piper"));
|
|
6296
6665
|
}
|
|
6297
6666
|
}
|
|
6298
6667
|
}
|
|
@@ -6769,7 +7138,7 @@ async function runTui(opts) {
|
|
|
6769
7138
|
ws.on("open", () => {
|
|
6770
7139
|
ws.send(JSON.stringify({
|
|
6771
7140
|
type: "connect",
|
|
6772
|
-
params: { auth: { token }, mode: "tui", version: "1.
|
|
7141
|
+
params: { auth: { token }, mode: "tui", version: "1.3.0" }
|
|
6773
7142
|
}));
|
|
6774
7143
|
});
|
|
6775
7144
|
ws.on("message", (data) => {
|
|
@@ -7198,7 +7567,7 @@ import { fileURLToPath as fileURLToPath5 } from "url";
|
|
|
7198
7567
|
import { dirname as dirname5, join as join18 } from "path";
|
|
7199
7568
|
var __filename3 = fileURLToPath5(import.meta.url);
|
|
7200
7569
|
var __dirname3 = dirname5(__filename3);
|
|
7201
|
-
var version = "1.
|
|
7570
|
+
var version = "1.3.0";
|
|
7202
7571
|
try {
|
|
7203
7572
|
const pkg = JSON.parse(readFileSync(join18(__dirname3, "..", "package.json"), "utf-8"));
|
|
7204
7573
|
version = pkg.version;
|