heyhank 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +83 -10
  3. package/bin/cli.ts +7 -7
  4. package/bin/ctl.ts +42 -42
  5. package/dist/assets/{AgentsPage-BPhirnCe.js → AgentsPage-B-AAmsMK.js} +3 -3
  6. package/dist/assets/AssistantPage-BV1Mfwdt.js +2 -0
  7. package/dist/assets/BusinessPage-tLpNEz19.js +1 -0
  8. package/dist/assets/{CronManager-DDbz-yiT.js → CronManager-B-K_n3Jg.js} +1 -1
  9. package/dist/assets/HelpPage-Bhf_j6Xr.js +1 -0
  10. package/dist/assets/{IntegrationsPage-CrOitCmJ.js → IntegrationsPage-DAMjs9tM.js} +1 -1
  11. package/dist/assets/JarvisHUD-C_TGXCCn.js +120 -0
  12. package/dist/assets/MediaPage-C48HTTrt.js +1 -0
  13. package/dist/assets/MemoryPage-JkC-qtgp.js +1 -0
  14. package/dist/assets/{PlatformDashboard-Do6F0O2p.js → PlatformDashboard-AUo7tNnE.js} +1 -1
  15. package/dist/assets/{Playground-Fc5cdc5p.js → Playground-AzNMsRBL.js} +1 -1
  16. package/dist/assets/{ProcessPanel-CslEiZkI.js → ProcessPanel-DpE_2sX3.js} +1 -1
  17. package/dist/assets/{PromptsPage-D2EhsdNO.js → PromptsPage-C2RQOs6p.js} +2 -2
  18. package/dist/assets/RunsPage-B9UOyO79.js +1 -0
  19. package/dist/assets/{SandboxManager-a1AVI5q2.js → SandboxManager-jHvYjwfh.js} +1 -1
  20. package/dist/assets/SettingsPage-BBJax6gt.js +51 -0
  21. package/dist/assets/SkillsMarketplace-IjmjfdjD.js +1 -0
  22. package/dist/assets/SocialMediaPage-DoPZHhr2.js +10 -0
  23. package/dist/assets/{TailscalePage-CHiFhZXF.js → TailscalePage-DDEY7ckO.js} +1 -1
  24. package/dist/assets/TelephonyPage-OPNBZYKt.js +9 -0
  25. package/dist/assets/{TerminalPage-Drwyrnfd.js → TerminalPage-BjMbHHW3.js} +1 -1
  26. package/dist/assets/{gemini-live-client-C7rqAW7G.js → gemini-live-client-C70FEtX2.js} +11 -8
  27. package/dist/assets/{index-CEqZnThB.js → index-BgYM4wXw.js} +94 -93
  28. package/dist/assets/index-BkjSoVgn.css +32 -0
  29. package/dist/assets/sw-register-C7NOHtIu.js +1 -0
  30. package/dist/assets/text-chat-client-BSbLJerZ.js +2 -0
  31. package/dist/index.html +2 -2
  32. package/dist/sw.js +1 -1
  33. package/package.json +6 -1
  34. package/server/agent-executor.ts +37 -2
  35. package/server/agent-store.ts +3 -3
  36. package/server/agent-types.ts +11 -0
  37. package/server/assistant-store.ts +232 -6
  38. package/server/auth-manager.ts +9 -0
  39. package/server/cache-headers.ts +1 -1
  40. package/server/calendar-service.ts +10 -0
  41. package/server/ceo/document-store.ts +129 -0
  42. package/server/ceo/finance-store.ts +343 -0
  43. package/server/ceo/kpi-store.ts +208 -0
  44. package/server/ceo/memory-import.ts +277 -0
  45. package/server/ceo/news-store.ts +208 -0
  46. package/server/ceo/template-store.ts +134 -0
  47. package/server/ceo/time-tracking-store.ts +227 -0
  48. package/server/claude-auth-monitor.ts +128 -0
  49. package/server/claude-code-worker.ts +86 -0
  50. package/server/claude-session-discovery.ts +74 -1
  51. package/server/cli-launcher.ts +32 -10
  52. package/server/codex-adapter.ts +2 -2
  53. package/server/codex-ws-proxy.cjs +1 -1
  54. package/server/container-manager.ts +4 -4
  55. package/server/content-intelligence/content-engine.ts +1112 -0
  56. package/server/content-intelligence/platform-knowledge.ts +870 -0
  57. package/server/cron-store.ts +3 -3
  58. package/server/embedding-service.ts +49 -0
  59. package/server/event-bus-types.ts +13 -0
  60. package/server/federation/node-store.ts +5 -4
  61. package/server/fs-utils.ts +28 -1
  62. package/server/hank-notifications-store.ts +91 -0
  63. package/server/hank-tool-executor.ts +1835 -0
  64. package/server/hank-tools.ts +2107 -0
  65. package/server/image-pull-manager.ts +2 -2
  66. package/server/index.ts +25 -2
  67. package/server/llm-providers-streaming.ts +541 -0
  68. package/server/llm-providers.ts +12 -0
  69. package/server/marketplace.ts +249 -0
  70. package/server/mcp-registry.ts +158 -0
  71. package/server/memory-service.ts +296 -0
  72. package/server/obsidian-sync.ts +184 -0
  73. package/server/provider-manager.ts +5 -2
  74. package/server/provider-registry.ts +12 -0
  75. package/server/reminder-scheduler.ts +37 -1
  76. package/server/routes/agent-routes.ts +2 -1
  77. package/server/routes/assistant-routes.ts +198 -5
  78. package/server/routes/ceo-finance-kpi-routes.ts +167 -0
  79. package/server/routes/ceo-news-time-routes.ts +137 -0
  80. package/server/routes/ceo-routes.ts +99 -0
  81. package/server/routes/content-routes.ts +116 -0
  82. package/server/routes/email-routes.ts +147 -0
  83. package/server/routes/env-routes.ts +3 -3
  84. package/server/routes/fs-routes.ts +12 -9
  85. package/server/routes/hank-chat-routes.ts +592 -0
  86. package/server/routes/llm-routes.ts +12 -0
  87. package/server/routes/marketplace-routes.ts +63 -0
  88. package/server/routes/media-routes.ts +1 -1
  89. package/server/routes/memory-routes.ts +127 -0
  90. package/server/routes/platform-routes.ts +14 -675
  91. package/server/routes/sandbox-routes.ts +1 -1
  92. package/server/routes/settings-routes.ts +51 -1
  93. package/server/routes/socialmedia-routes.ts +152 -2
  94. package/server/routes/system-routes.ts +2 -2
  95. package/server/routes/team-routes.ts +71 -0
  96. package/server/routes/telephony-routes.ts +98 -18
  97. package/server/routes.ts +36 -9
  98. package/server/session-creation-service.ts +2 -2
  99. package/server/session-orchestrator.ts +54 -2
  100. package/server/session-types.ts +2 -0
  101. package/server/settings-manager.ts +50 -2
  102. package/server/skill-discovery.ts +68 -0
  103. package/server/socialmedia/adapters/browser-adapter.ts +179 -0
  104. package/server/socialmedia/adapters/postiz-adapter.ts +291 -14
  105. package/server/socialmedia/manager.ts +234 -15
  106. package/server/socialmedia/store.ts +51 -1
  107. package/server/socialmedia/types.ts +35 -2
  108. package/server/socialview/browser-manager.ts +150 -0
  109. package/server/socialview/extractors.ts +1298 -0
  110. package/server/socialview/image-describe.ts +188 -0
  111. package/server/socialview/library.ts +119 -0
  112. package/server/socialview/poster.ts +276 -0
  113. package/server/socialview/routes.ts +371 -0
  114. package/server/socialview/style-analyzer.ts +187 -0
  115. package/server/socialview/style-profiles.ts +67 -0
  116. package/server/socialview/types.ts +166 -0
  117. package/server/socialview/vision.ts +127 -0
  118. package/server/socialview/vnc-manager.ts +110 -0
  119. package/server/style-injector.ts +135 -0
  120. package/server/team-service.ts +239 -0
  121. package/server/team-store.ts +75 -0
  122. package/server/team-types.ts +52 -0
  123. package/server/telephony/audio-bridge.ts +281 -35
  124. package/server/telephony/audio-recorder.ts +132 -0
  125. package/server/telephony/call-manager.ts +803 -104
  126. package/server/telephony/call-types.ts +67 -1
  127. package/server/telephony/esl-client.ts +319 -0
  128. package/server/telephony/freeswitch-sync.ts +155 -0
  129. package/server/telephony/phone-utils.ts +63 -0
  130. package/server/telephony/telephony-store.ts +9 -8
  131. package/server/url-validator.ts +82 -0
  132. package/server/vault-markdown.ts +317 -0
  133. package/server/vault-migration.ts +121 -0
  134. package/server/vault-store.ts +466 -0
  135. package/server/vault-watcher.ts +59 -0
  136. package/server/vector-store.ts +210 -0
  137. package/server/voice-pipeline/gemini-live-adapter.ts +97 -0
  138. package/server/voice-pipeline/greeting-cache.ts +200 -0
  139. package/server/voice-pipeline/manager.ts +249 -0
  140. package/server/voice-pipeline/pipeline.ts +335 -0
  141. package/server/voice-pipeline/providers/index.ts +47 -0
  142. package/server/voice-pipeline/providers/llm-internal.ts +527 -0
  143. package/server/voice-pipeline/providers/stt-google.ts +157 -0
  144. package/server/voice-pipeline/providers/tts-google.ts +126 -0
  145. package/server/voice-pipeline/types.ts +247 -0
  146. package/server/ws-bridge-types.ts +6 -1
  147. package/dist/assets/AssistantPage-DJ-cMQfb.js +0 -1
  148. package/dist/assets/HelpPage-DMfkzERp.js +0 -1
  149. package/dist/assets/MediaPage-CE5rdvkC.js +0 -1
  150. package/dist/assets/RunsPage-C5BZF5Rx.js +0 -1
  151. package/dist/assets/SettingsPage-DirhjQrJ.js +0 -51
  152. package/dist/assets/SocialMediaPage-DBuM28vD.js +0 -1
  153. package/dist/assets/TelephonyPage-x0VV0fOo.js +0 -1
  154. package/dist/assets/index-C8M_PUmX.css +0 -32
  155. package/dist/assets/sw-register-LSSpj6RU.js +0 -1
  156. package/server/socialmedia/adapters/ayrshare-adapter.ts +0 -169
@@ -0,0 +1,249 @@
1
+ // ─── Voice Engine Manager ────────────────────────────────────────────────────
2
+ // Decides per-call which engine to use (Pipeline vs Gemini Live).
3
+ // Handles caller-ID lookup for personalized greetings.
4
+ // On pipeline failure, falls back to Gemini Live.
5
+
6
+ import type { TelephonyContact, TranscriptEntry, CallState, TelephonySettings } from "../telephony/call-types.js";
7
+ import type { VoicePipelineSettings, PipelineSessionConfig, LLMToolCall, LLMToolDef, LLMToolResult } from "./types.js";
8
+ import { DEFAULT_VOICE_PIPELINE_SETTINGS } from "./types.js";
9
+ import { findContactByPhone, getOrRenderGreeting, buildGreetingText } from "./greeting-cache.js";
10
+ import { startPipelineSession } from "./pipeline.js";
11
+
12
/**
 * Tool that lets the LLM hang up the call. Takes no arguments; the description
 * carries the behavioural rules the model must follow before invoking it.
 */
const END_CALL_TOOL: LLMToolDef = {
  name: "end_call",
  description: "Hang up the phone. CRITICAL RULES: (1) NEVER call this tool proactively. (2) Only call this AFTER you said goodbye AND heard the other person's final response. (3) If you have a script, you MUST complete EVERY step first. (4) There must be a natural pause in the conversation before you use this. (5) When in doubt, do NOT hang up — let the other person hang up instead.",
  parameters: { type: "object", properties: {} },
};

/** Tool that lets the LLM request a hand-over to a human or another number. */
const TRANSFER_CALL_TOOL: LLMToolDef = {
  name: "transfer_call",
  description: "Request to transfer the call to a human or another number. Use when the AI can't handle the request.",
  parameters: {
    type: "object",
    properties: {
      reason: { type: "string", description: "Why the transfer is needed" },
    },
  },
};

/**
 * Tool definitions offered to the pipeline LLM during a call.
 * Parameters use JSON Schema rather than Gemini's OBJECT/STRING shape.
 */
export const PIPELINE_TELEPHONY_TOOLS: LLMToolDef[] = [END_CALL_TOOL, TRANSFER_CALL_TOOL];
30
+
31
/**
 * Per-call voice engine handle created by the call-manager.
 *
 * The shape intentionally mirrors the AudioBridge surface so the call-manager
 * can swap engines without a large refactor.
 */
export interface VoiceEngineSession {
  // ── Identity / state ──────────────────────────────────────────────────────

  /** Human-readable engine identifier, used for transcript labels and debugging. */
  readonly engineLabel: string;
  /** Whether the engine is currently ready. */
  readonly isReady: boolean;

  // ── Lifecycle ─────────────────────────────────────────────────────────────

  /** Connects/starts the engine. Idempotent. */
  connect(): Promise<void>;
  /** Tears the engine down. */
  disconnect(): void;

  // ── Audio in ──────────────────────────────────────────────────────────────

  /** Pushes caller audio into the engine (PCM 8kHz from FreeSWITCH). */
  sendCallerAudio(pcm: Buffer | Uint8Array): void;
  /** Once connected, sends the pre-rendered greeting immediately (Pipeline only). */
  playGreetingIfReady?(): void;

  // ── Callbacks assigned by the call-manager ────────────────────────────────

  /** Audio-out callback; the call-manager sets this to forward PCM to FreeSWITCH. */
  onAudio: (pcm8k: Uint8Array) => void;
  /** Turn-complete callback (kept for parity with Gemini). */
  onTurnComplete?: () => void;
}
52
+
53
/**
 * Extends the inbound system prompt for a caller whose number is unknown.
 *
 * Appends (after a blank line) a German instruction telling the assistant to
 * politely ask for the caller's name if they do not introduce themselves.
 *
 * @param basePrompt The inbound system prompt to extend.
 * @returns `basePrompt` followed by the extra instruction block.
 */
function augmentInboundPromptForUnknownCaller(basePrompt: string): string {
  const appendix = [
    "",
    "",
    "ZUSÄTZLICHE ANWEISUNG (unbekannte Nummer):",
    'Falls der Anrufer seinen Namen nicht von selbst nennt, frag höflich nach: "Mit wem spreche ich denn?"',
  ].join("\n");
  return `${basePrompt}${appendix}`;
}
63
+
64
/**
 * Tells the LLM that a greeting has already been played to the caller.
 *
 * A pre-rendered greeting is played server-side when the call is answered, so
 * the LLM has no idea it already greeted. Prompts that say things like
 * "Answer warmly" then make it greet a second time on its first turn. The
 * appended (German) block quotes the greeting that was spoken and instructs
 * the model not to repeat it and to skip step 1 of the default call flow.
 *
 * @param basePrompt   System prompt to extend.
 * @param greetingText Text of the greeting that was already played.
 * @returns The extended prompt, or `basePrompt` unchanged when `greetingText` is empty.
 */
function augmentPromptForPreRenderedGreeting(basePrompt: string, greetingText: string): string {
  if (greetingText) {
    const appendix = [
      "",
      "",
      "WICHTIG — BEGRÜSSUNG WURDE BEREITS GESPROCHEN:",
      "Du hast soeben folgende Begrüssung gesagt (sie wurde bereits abgespielt, der Anrufer hat sie gehört):",
      `"${greetingText}"`,
      'Wiederhole sie NICHT. Begrüsse den Anrufer nicht erneut. Reagiere stattdessen direkt und natürlich auf das, was der Anrufer als Nächstes sagt. Überspringe Schritt 1 ("Answer warmly" / "Greet the person") im Standard-Call-Flow — der ist bereits erledigt.',
    ].join("\n");
    return basePrompt + appendix;
  }
  return basePrompt;
}
80
+
81
/** Everything needed to build a voice engine session for one call. */
interface CreateEngineParams {
  // ── Call identity ─────────────────────────────────────────────────────────

  callId: string;
  direction: "inbound" | "outbound";
  /** Remote phone number: the dialled number for outbound, the caller's number for inbound. */
  remoteNumber: string;
  /** Outbound: the known contact. Inbound: the contact looked up from caller-ID (or null). */
  contact: TelephonyContact | null;

  // ── Prompt & settings ─────────────────────────────────────────────────────

  /** System prompt (the script for outbound calls, the inbound prompt for inbound calls). */
  systemPrompt: string;
  /** Telephony settings to consult. */
  telephonySettings: TelephonySettings;
  /** Voice pipeline settings to consult. */
  pipelineSettings: VoicePipelineSettings;

  // ── Tools ─────────────────────────────────────────────────────────────────

  /** LLM tool definitions the pipeline may invoke (default: PIPELINE_TELEPHONY_TOOLS). */
  tools?: LLMToolDef[];
  /** Tool-call handler — typically call-manager.handleToolCalls(callId, …). */
  onToolCall?: (calls: LLMToolCall[]) => Promise<LLMToolResult[]>;

  // ── Callbacks proxied through to the engine ───────────────────────────────

  onTranscript: (entry: TranscriptEntry) => void;
  onStatusChange: (status: CallState["status"]) => void;
}
101
+
102
/**
 * Picks the voice engine for a call.
 *
 * @param pipelineSettings Voice pipeline settings to consult.
 * @returns `"gemini-live"` whenever the pipeline is disabled; otherwise the
 *          engine configured in `pipelineSettings.engine`.
 */
export function resolveEngine(pipelineSettings: VoicePipelineSettings): "pipeline" | "gemini-live" {
  return pipelineSettings.enabled ? pipelineSettings.engine : "gemini-live";
}
110
+
111
/**
 * Determines which contact (if any) a call belongs to.
 *
 * An already-known contact always wins (for outbound calls it comes from the
 * UI selection). Otherwise inbound calls are matched against `contacts` by
 * phone number; outbound calls without a known contact resolve to `null`.
 *
 * @param direction          Call direction.
 * @param remoteNumber       The remote party's phone number.
 * @param knownContact       Contact already associated with the call, if any.
 * @param contacts           Contacts to search for inbound caller-ID lookup.
 * @param defaultCountryCode Passed through to the phone lookup (defaults to "").
 * @returns The resolved contact, or `null` when none applies.
 */
export function lookupContactForCall(
  direction: "inbound" | "outbound",
  remoteNumber: string,
  knownContact: TelephonyContact | null,
  contacts: TelephonyContact[],
  defaultCountryCode: string = "",
): TelephonyContact | null {
  if (knownContact) {
    return knownContact;
  }
  return direction === "inbound"
    ? findContactByPhone(contacts, remoteNumber, defaultCountryCode)
    : null;
}
128
+
129
/**
 * Obtains the greeting (text + audio) for an upcoming call.
 *
 * Intended usage per the original notes: outbound greetings are rendered right
 * away so they are ready when the call connects; inbound greetings are expected
 * to be cached already.
 *
 * @returns
 *  - `{ text: "", pcm: null }` when the pipeline is disabled;
 *  - the text and PCM from `getOrRenderGreeting` on success;
 *  - the plain greeting text with `pcm: null` if rendering throws (the error is logged).
 */
export async function prepareGreeting(params: {
  direction: "inbound" | "outbound";
  contact: TelephonyContact | null;
  pipelineSettings: VoicePipelineSettings;
}): Promise<{ text: string; pcm: Uint8Array | null }> {
  const { direction, contact, pipelineSettings } = params;
  if (!pipelineSettings.enabled) {
    return { text: "", pcm: null };
  }

  try {
    const rendered = await getOrRenderGreeting({ direction, contact, settings: pipelineSettings });
    return { text: rendered.text, pcm: rendered.pcm };
  } catch (err) {
    // Rendering failed: fall back to text only so the caller can still prime the LLM.
    console.error("[voice-pipeline] greeting prep failed:", err);
    return { text: buildGreetingText(direction, contact), pcm: null };
  }
}
152
+
153
/**
 * Creates a Pipeline-engine voice session for a call and wraps it in the
 * `VoiceEngineSession` surface the call-manager expects.
 *
 * The pipeline session is started as part of this call, so the returned
 * wrapper's `connect()` is a no-op.
 *
 * @param params Call parameters plus the prepared greeting (`greetingText` / `greetingPcm`).
 * @returns The engine wrapper. `onAudio` and `onTurnComplete` are assignable
 *          after creation; the pipeline always calls whatever is currently set.
 * @throws Propagates any failure from starting the pipeline session — the
 *         caller should catch it and fall back to Gemini Live.
 */
export async function createPipelineEngine(params: CreateEngineParams & {
  greetingText: string;
  greetingPcm: Uint8Array | null;
}): Promise<VoiceEngineSession> {
  // Unknown inbound callers get an extra "ask for the name" instruction.
  let finalPrompt = (params.direction === "inbound" && !params.contact)
    ? augmentInboundPromptForUnknownCaller(params.systemPrompt)
    : params.systemPrompt;
  // If a pre-rendered greeting will be played, tell the LLM so it does not
  // greet again on its first turn (the caller would hear the welcome twice).
  if (params.greetingPcm && params.greetingText) {
    finalPrompt = augmentPromptForPreRenderedGreeting(finalPrompt, params.greetingText);
  }

  // Late-bound callbacks: the call-manager assigns these on the wrapper after
  // creation, so the session config only holds indirections to them.
  let onAudioCallback: (pcm: Uint8Array) => void = () => {};
  let onTurnCompleteCallback: (() => void) | undefined;

  const sessionConfig: PipelineSessionConfig = {
    callId: params.callId,
    direction: params.direction,
    contact: params.contact
      ? { id: params.contact.id, name: params.contact.name, phone: params.contact.phone }
      : null,
    remoteNumber: params.remoteNumber,
    systemPrompt: finalPrompt,
    greetingPcm: params.greetingPcm,
    tools: params.tools ?? PIPELINE_TELEPHONY_TOOLS,
    settings: params.pipelineSettings,
    onTranscript: params.onTranscript,
    onStatusChange: params.onStatusChange,
    onAudioOut: (pcm) => onAudioCallback(pcm),
    onToolCall: params.onToolCall,
    onTurnComplete: () => onTurnCompleteCallback?.(),
  };

  const session = await startPipelineSession(sessionConfig);

  const wrapper: VoiceEngineSession = {
    engineLabel: "Pipeline (Google STT/TTS)",

    // Accessors route assignments (wrapper.onAudio = fn) to the late-bound
    // variables that the session config above dereferences on every call.
    get onAudio(): (pcm8k: Uint8Array) => void {
      return onAudioCallback;
    },
    set onAudio(fn: (pcm8k: Uint8Array) => void) {
      onAudioCallback = fn;
    },
    // Needed so the call-manager's pendingHangup logic receives turn-complete.
    get onTurnComplete(): (() => void) | undefined {
      return onTurnCompleteCallback;
    },
    set onTurnComplete(fn: (() => void) | undefined) {
      onTurnCompleteCallback = fn;
    },

    async connect() {
      // Pipeline starts on construction — no extra connect step needed.
    },
    sendCallerAudio(pcm) {
      session.pushAudio(pcm);
    },
    playGreetingIfReady() {
      if (params.greetingPcm) {
        session.sendGreetingNow(params.greetingText);
      }
    },
    disconnect() {
      // Best-effort teardown: never throw into the call-manager, but do not
      // hide failures either.
      session.close().catch((err: unknown) => {
        console.error(`[voice-pipeline] close failed on call ${params.callId}:`, err);
      });
    },
    get isReady() {
      return session.isReady;
    },
  };

  return wrapper;
}
229
+
230
+ // ─── Pipeline settings persistence ───────────────────────────────────────────
231
+ // Stored alongside telephony settings as a sub-object: settings.voicePipeline
232
+
233
/**
 * Extracts the voice pipeline settings stored under `settings.voicePipeline`
 * and fills every missing field from `DEFAULT_VOICE_PIPELINE_SETTINGS`.
 *
 * The result is always a fresh object whose `stt`, `tts` and `llm` sub-objects
 * are fresh copies too — including when nothing is stored — so callers can
 * mutate the result without altering the shared defaults.
 *
 * @param telephonySettings Telephony settings that may carry a `voicePipeline` sub-object.
 * @returns Fully populated pipeline settings.
 */
export function getPipelineSettingsFrom(telephonySettings: TelephonySettings): VoicePipelineSettings {
  const stored: Partial<VoicePipelineSettings> =
    (telephonySettings as TelephonySettings & { voicePipeline?: VoicePipelineSettings }).voicePipeline ?? {};

  const merged: VoicePipelineSettings = {
    ...DEFAULT_VOICE_PIPELINE_SETTINGS,
    ...stored,
    // Object spread ignores undefined/null, so absent sections fall back to defaults.
    stt: { ...DEFAULT_VOICE_PIPELINE_SETTINGS.stt, ...stored.stt },
    tts: { ...DEFAULT_VOICE_PIPELINE_SETTINGS.tts, ...stored.tts },
    llm: { ...DEFAULT_VOICE_PIPELINE_SETTINGS.llm, ...stored.llm },
  };

  // Migrate the legacy "auto" value to the current default provider. "auto"
  // was removed in favor of explicit per-pipeline selection.
  if ((merged.llm.provider as string) === "auto") {
    merged.llm.provider = DEFAULT_VOICE_PIPELINE_SETTINGS.llm.provider;
  }
  return merged;
}
@@ -0,0 +1,335 @@
1
+ // ─── Voice Pipeline Orchestrator ────────────────────────────────────────────
2
+ // Connects STT → LLM → TTS for a single call.
3
+ //
4
+ // Flow:
5
+ // 1. Caller audio (8kHz LINEAR16) → STT.pushAudio()
6
+ // 2. STT emits final transcript → LLM.generateStream()
7
+ // 3. LLM tokens accumulated until sentence boundary → TTS.synthesize()
8
+ // 4. TTS PCM 8kHz → onAudioOut()
9
+ //
10
+ // Turn-taking:
11
+ // - Whenever caller speaks (interim STT result), we cancel any in-flight LLM/TTS (barge-in).
12
+ // - Final transcript triggers a new LLM turn.
13
+
14
+ import type {
15
+ LLMMessage,
16
+ PipelineSession,
17
+ PipelineSessionConfig,
18
+ STTResult,
19
+ STTSession,
20
+ TTSConfig,
21
+ } from "./types.js";
22
+ import { getLLMProvider, getSTTProvider, getTTSProvider } from "./providers/index.js";
23
+
24
/**
 * Ordered clean-up rules applied by `sanitizeForTts`. Order matters: paired
 * tags are removed (with their content) before the generic bare-tag rule, and
 * whitespace is collapsed last.
 */
const TTS_SANITIZE_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/<function\b[^>]*>[\s\S]*?<\/function>/gi, ""], // <function=name>{...}</function>
  [/<function\b[^>]*\/>/gi, ""],                   // <function=name/>
  [/<tool_call\b[^>]*>[\s\S]*?<\/tool_call>/gi, ""], // <tool_call>...</tool_call>
  [/<\|python_tag\|>[\s\S]*$/g, ""],               // <|python_tag|> and everything after it
  [/<\|?[a-z_]+\|?>/gi, ""],                       // leftover bare tags such as <eot> or <|eot|>
  [/\s{2,}/g, " "],                                // collapse whitespace runs left behind
];

/**
 * Strips tool-call markup that some LLMs leak into the text stream instead of
 * using the provider's structured tool-call channel.
 *
 * Without this, a goodbye such as
 *   "Schönen Tag noch! <function=end_call></function>"
 * would be read out literally by the TTS voice.
 *
 * @param text Raw LLM text.
 * @returns The text with markup removed, whitespace runs collapsed, and trimmed.
 */
function sanitizeForTts(text: string): string {
  const stripped = TTS_SANITIZE_RULES.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    text,
  );
  return stripped.trim();
}
43
+
44
/**
 * Splits `buffer` into sentence-like chunks for incremental TTS playback.
 *
 * Yields one item per complete sentence (text ending in `.`, `!`, `?` or `…`
 * that is followed by whitespace or the end of the buffer), then one final
 * item with an empty `sentence`.
 *
 * For every item, `rest` is the part of `buffer` that has not been consumed
 * once that item is taken, so a caller that stops after the first sentence
 * can keep `rest` as its new buffer without losing text. The final item's
 * `rest` is the unfinished tail ("" when the buffer ended on a sentence).
 *
 * Text the boundary regex steps over — e.g. "Das kostet 3." in
 * "Das kostet 3.50 Euro.", where the first period is not followed by
 * whitespace — is kept as part of the next yielded sentence instead of being
 * dropped.
 *
 * @param buffer Accumulated text to split.
 */
function* splitSentences(buffer: string): Generator<{ sentence: string; rest: string }> {
  // Sentence-ending punctuation followed by whitespace or end of input.
  const boundary = /([^.!?…]+[.!?…]+)(\s+|$)/g;
  let consumed = 0;
  let match: RegExpExecArray | null;
  while ((match = boundary.exec(buffer)) !== null) {
    const sentenceEnd = match.index + match[1].length;
    // Start at `consumed`, not match.index, so skipped-over text is preserved.
    const sentence = buffer.slice(consumed, sentenceEnd).trim();
    consumed = match.index + match[0].length;
    yield { sentence, rest: buffer.slice(consumed) };
  }
  // Trailing item: whatever is left is an unfinished sentence (possibly empty).
  yield { sentence: "", rest: buffer.slice(consumed) };
}
61
+
62
/**
 * One STT → LLM → TTS session bound to a single call.
 *
 * Turn-taking is driven by `currentTurnId`: every interim STT result (the
 * caller is speaking), every new LLM turn and `close()` bump the counter, and
 * any in-flight LLM/TTS work compares its own id against it before producing
 * audio, history or callbacks. Stale work is silently dropped (barge-in).
 */
class PipelineSessionImpl implements PipelineSession {
  /** Live STT stream; null until start() and again after close(). */
  private stt: STTSession | null = null;
  /** Conversation so far, starting with the system prompt. */
  private history: LLMMessage[] = [];
  private closed = false;
  private ready = false;
  /** Counter to invalidate in-flight LLM/TTS when the caller starts talking. */
  private currentTurnId = 0;
  /** Pre-rendered greeting PCM (set in the constructor for instant playback). */
  private greetingPcm: Uint8Array | null = null;
  /** Whether the greeting has already been sent. */
  private greetingSent = false;

  constructor(private config: PipelineSessionConfig) {
    this.greetingPcm = config.greetingPcm ?? null;
    // Seed history with the system prompt. The greeting itself is added to the
    // history by sendGreetingNow(), once it is actually played.
    this.history.push({ role: "system", content: config.systemPrompt });
  }

  /**
   * Opens the STT stream (8 kHz, interim results on), wires its result/error
   * handlers, marks the session ready and reports status "active" plus a
   * system transcript line.
   */
  async start(): Promise<void> {
    const sttProvider = getSTTProvider(this.config.settings.stt.provider);
    this.stt = await sttProvider.start({
      language: this.config.settings.stt.language,
      sampleRateHertz: 8000,
      interimResults: true,
    });

    this.stt.onResult((r) => this.handleSTTResult(r));
    this.stt.onError((err) => {
      console.error(`[voice-pipeline] STT error on call ${this.config.callId}:`, err.message);
    });

    this.ready = true;
    this.config.onStatusChange("active");
    this.config.onTranscript({
      speaker: "system",
      text: `Pipeline engine connected (Google STT/TTS, ${this.config.settings.llm.provider} LLM)`,
      isFinal: true,
      ts: Date.now(),
    });
  }

  /**
   * Sends the pre-rendered greeting immediately on call answer.
   * Does nothing if there is no greeting audio or it was already sent.
   *
   * @param greetingText Text of the greeting; recorded as the assistant's
   *                     first message and emitted on the transcript.
   */
  sendGreetingNow(greetingText: string): void {
    if (this.greetingSent || !this.greetingPcm) return;
    this.greetingSent = true;

    // Stream PCM out (to FreeSWITCH via the configured callback).
    this.config.onAudioOut(this.greetingPcm);

    // Record the greeting so the LLM sees it as already said.
    this.history.push({ role: "assistant", content: greetingText });
    this.config.onTranscript({
      speaker: "ai",
      text: greetingText,
      isFinal: true,
      ts: Date.now(),
    });
  }

  /** Forwards caller audio to STT; ignored once closed or before start(). */
  pushAudio(pcm: Buffer | Uint8Array): void {
    if (this.closed || !this.stt) return;
    this.stt.pushAudio(pcm);
  }

  /**
   * Injects a text message into the conversation. A "user" message triggers
   * an LLM turn; a "system" message is only appended to the history.
   */
  async sendText(text: string, role: "user" | "system" = "user"): Promise<void> {
    this.history.push({ role, content: text });
    if (role === "user") {
      await this.runLLMTurn();
    }
  }

  getGreetingPcm(): Uint8Array | null {
    return this.greetingPcm;
  }

  /** Handles one STT result: interim → barge-in, final → new LLM turn. */
  private handleSTTResult(r: STTResult): void {
    if (this.closed) return;

    // Interim result → barge-in: cancel any in-flight LLM/TTS.
    if (!r.isFinal) {
      // Bumping the turn id makes every in-flight turn stale.
      this.currentTurnId++;
      // Emit the interim transcript (for live UI) when there is text.
      if (r.text.trim()) {
        this.config.onTranscript({
          speaker: "callee",
          text: r.text.trim(),
          isFinal: false,
          ts: Date.now(),
        });
      }
      return;
    }

    // Final transcript → add to history and run an LLM turn.
    const trimmed = r.text.trim();
    if (!trimmed) return;
    this.config.onTranscript({
      speaker: "callee",
      text: trimmed,
      isFinal: true,
      ts: Date.now(),
    });
    this.history.push({ role: "user", content: trimmed });
    const tSttFinal = Date.now();
    console.log(`[voice-pipeline][timing] call=${this.config.callId} STT_FINAL t=${tSttFinal} text="${trimmed.slice(0, 80)}"`);
    this.runLLMTurn(tSttFinal).catch((e) => {
      console.error(`[voice-pipeline] LLM turn error:`, e);
    });
  }

  /**
   * Runs one assistant turn: streams the LLM reply, synthesizes it sentence by
   * sentence, and — if the turn was not interrupted — records the reply and
   * signals turn-complete.
   *
   * @param tSttFinal Timestamp of the final STT result, used only for timing logs.
   */
  private async runLLMTurn(tSttFinal: number = Date.now()): Promise<void> {
    if (this.closed) return;
    const myTurnId = ++this.currentTurnId;
    /** True while this turn has not been superseded (barge-in / newer turn / close). */
    const isCurrentTurn = (): boolean => !this.closed && myTurnId === this.currentTurnId;

    const llm = getLLMProvider(this.config.settings.llm.provider);
    const ttsProvider = getTTSProvider(this.config.settings.tts.provider);
    const callId = this.config.callId;
    const tLlmStart = Date.now();

    // Turn-local state. The sentence buffer is deliberately NOT shared across
    // turns: a stale turn that is still streaming must not be able to append
    // to, or clear, the buffer of the turn that replaced it.
    let pendingText = "";
    let assistantText = "";
    let firstChunkLogged = false;
    let firstTtsLogged = false;
    /** Tail of the ordered TTS emission chain; awaited before turn-complete. */
    let ttsTail: Promise<void> = Promise.resolve();

    const ttsConfig: TTSConfig = {
      voice: this.config.settings.tts.voice,
      language: this.config.settings.stt.language,
      format: "PCM_8000",
      speakingRate: this.config.settings.tts.speakingRate,
    };

    /**
     * Synthesizes one sentence and emits its audio. Synthesis starts right
     * away (so sentences are synthesized in parallel), but the audio is only
     * emitted after `previous` has settled, which keeps playback in sentence
     * order even when a later, shorter sentence finishes synthesizing first.
     * Never rejects.
     */
    const speakSentence = async (sentence: string, previous: Promise<void>): Promise<void> => {
      let emitAudio: (() => void) | null = null;
      let tTtsDone = 0;
      const tTtsStart = Date.now();
      const cleaned = sanitizeForTts(sentence);
      if (cleaned && isCurrentTurn()) {
        try {
          // Phonetic fix: German TTS says "Hank" with a long A ("Haank").
          // Spelling it "Henk" yields the English short-A pronunciation.
          const ttsText = cleaned.replace(/\bHank\b/g, "Henk");
          const result = await ttsProvider.synthesize(ttsText, ttsConfig);
          tTtsDone = Date.now();
          emitAudio = () => this.config.onAudioOut(result.audio);
        } catch (e) {
          console.error(`[voice-pipeline] TTS error:`, e);
        }
      }

      // Always wait for the preceding sentence — even when this one was
      // skipped — so the sentences after it cannot overtake earlier audio.
      await previous;
      if (emitAudio === null || !isCurrentTurn()) return; // skipped, failed, or barge-in

      if (!firstTtsLogged) {
        firstTtsLogged = true;
        console.log(`[voice-pipeline][timing] call=${callId} FIRST_TTS_AUDIO t=${tTtsDone} dSinceSttFinal=${tTtsDone - tSttFinal}ms dSinceLlmStart=${tTtsDone - tLlmStart}ms synthMs=${tTtsDone - tTtsStart} len=${sentence.length}`);
      }
      try {
        emitAudio();
      } catch (e) {
        console.error(`[voice-pipeline] TTS error:`, e);
      }
    };

    const enqueueTts = (sentence: string): void => {
      ttsTail = speakSentence(sentence, ttsTail);
    };

    const onChunk = (chunk: string): void => {
      if (!isCurrentTurn()) return; // superseded turn: ignore late tokens
      if (!firstChunkLogged) {
        firstChunkLogged = true;
        const now = Date.now();
        console.log(`[voice-pipeline][timing] call=${callId} LLM_FIRST_TOKEN t=${now} dSinceSttFinal=${now - tSttFinal}ms dSinceLlmStart=${now - tLlmStart}ms`);
      }
      assistantText += chunk;
      pendingText += chunk;
      // Speak every complete sentence immediately (low latency) and keep only
      // the unfinished tail, which is the `rest` of the last yielded item.
      let tail = "";
      for (const part of splitSentences(pendingText)) {
        if (part.sentence) enqueueTts(part.sentence);
        tail = part.rest;
      }
      pendingText = tail;
    };

    /** Forward LLM tool calls to the call-manager handler. */
    const onToolCalls = this.config.onToolCall
      ? async (calls: import("./types.js").LLMToolCall[]) => {
          if (!isCurrentTurn()) return [];
          this.config.onTranscript({
            speaker: "system",
            text: `Tool calls: ${calls.map((c) => c.name).join(", ")}`,
            isFinal: true,
            ts: Date.now(),
          });
          return await this.config.onToolCall!(calls);
        }
      : undefined;

    // NOTE(review): in the non-streaming fallback onChunk is never invoked, so
    // nothing is spoken or recorded for that reply here — confirm whether the
    // generate() result text should be fed through onChunk.
    const result = llm.generateStream
      ? await llm.generateStream(
          this.history,
          { onChunk, onToolCalls },
          { ...this.config.settings.llm, tools: this.config.tools },
        )
      : await llm.generate(this.history, { ...this.config.settings.llm, tools: this.config.tools });

    if (!result.ok) {
      console.error(`[voice-pipeline] LLM error: ${result.error}`);
      return;
    }

    // Speak any leftover buffer (last sentence without trailing punctuation).
    if (pendingText.trim()) {
      enqueueTts(pendingText.trim());
      pendingText = "";
    }

    // Wait for all TTS to finish dispatching, so onTurnComplete fires only
    // after the audio has been sent.
    await ttsTail;
    const tTurnDone = Date.now();
    console.log(`[voice-pipeline][timing] call=${callId} TURN_COMPLETE t=${tTurnDone} totalFromSttFinal=${tTurnDone - tSttFinal}ms llmMs=${tTurnDone - tLlmStart}ms replyLen=${assistantText.length}`);

    // Save the assistant turn to history (strip any leaked tool-call markup).
    const cleanedAssistant = sanitizeForTts(assistantText).trim();
    if (isCurrentTurn() && cleanedAssistant) {
      this.history.push({ role: "assistant", content: cleanedAssistant });
      this.config.onTranscript({
        speaker: "ai",
        text: cleanedAssistant,
        isFinal: true,
        ts: Date.now(),
      });
    }

    // Signal turn-complete (used by the call-manager's pendingHangup logic).
    if (isCurrentTurn()) {
      try { this.config.onTurnComplete?.(); } catch { /* ignore */ }
    }
  }

  /**
   * Closes the session: invalidates in-flight LLM/TTS work, closes the STT
   * stream (errors ignored) and reports status "ended". Idempotent.
   */
  async close(): Promise<void> {
    if (this.closed) return;
    this.closed = true;
    this.ready = false;
    // Make any in-flight turn stale so no audio is emitted after teardown.
    this.currentTurnId++;
    if (this.stt) {
      try { await this.stt.close(); } catch { /* ignore */ }
      this.stt = null;
    }
    this.config.onStatusChange("ended");
  }

  get isReady(): boolean {
    return this.ready && !this.closed;
  }
}
324
+
325
/**
 * Constructs a pipeline session for `config` and starts it.
 *
 * @param config Session configuration (prompt, settings, callbacks, optional greeting PCM).
 * @returns The started session. Rejects if starting the session fails.
 */
export async function startPipelineSession(
  config: PipelineSessionConfig,
): Promise<PipelineSessionImpl> {
  const created = new PipelineSessionImpl(config);
  return created.start().then(() => created);
}
@@ -0,0 +1,47 @@
1
+ // ─── Provider Registry ───────────────────────────────────────────────────────
2
+
3
+ import type {
4
+ LLMProvider,
5
+ LLMProviderId,
6
+ STTProvider,
7
+ STTProviderId,
8
+ TTSProvider,
9
+ TTSProviderId,
10
+ } from "../types.js";
11
+ import { GoogleSTTProvider } from "./stt-google.js";
12
+ import { GoogleTTSProvider } from "./tts-google.js";
13
+ import { InternalLLMProvider } from "./llm-internal.js";
14
+
15
/** Speech-to-text providers, keyed by provider id. */
const sttRegistry = new Map<STTProviderId, STTProvider>();
/** Text-to-speech providers, keyed by provider id. */
const ttsRegistry = new Map<TTSProviderId, TTSProvider>();
/** Keyed by LLMProviderId — each instance is pre-configured with its preferred provider. */
const llmRegistry = new Map<LLMProviderId, LLMProvider>();

/**
 * Registers the Google STT and TTS providers on first use.
 * Providers are created lazily (rather than at import time) so that missing
 * credentials don't break importing this module.
 */
function ensureRegistered(): void {
  const needsStt = !sttRegistry.has("google");
  if (needsStt) {
    sttRegistry.set("google", new GoogleSTTProvider());
  }
  const needsTts = !ttsRegistry.has("google");
  if (needsTts) {
    ttsRegistry.set("google", new GoogleTTSProvider());
  }
}
25
+
26
/**
 * Returns the speech-to-text provider registered under `id`.
 *
 * @throws Error when no STT provider is registered for `id`.
 */
export function getSTTProvider(id: STTProviderId): STTProvider {
  ensureRegistered();
  const provider = sttRegistry.get(id);
  if (provider) {
    return provider;
  }
  throw new Error(`STT provider not available: ${id}`);
}
32
+
33
/**
 * Returns the text-to-speech provider registered under `id`.
 *
 * @throws Error when no TTS provider is registered for `id`.
 */
export function getTTSProvider(id: TTSProviderId): TTSProvider {
  ensureRegistered();
  const provider = ttsRegistry.get(id);
  if (provider) {
    return provider;
  }
  throw new Error(`TTS provider not available: ${id}`);
}
39
+
40
/**
 * Returns the LLM provider for `id`, creating and caching an
 * `InternalLLMProvider` for that id on first request.
 */
export function getLLMProvider(id: LLMProviderId): LLMProvider {
  const cached = llmRegistry.get(id);
  if (cached) {
    return cached;
  }
  const created: LLMProvider = new InternalLLMProvider(id);
  llmRegistry.set(id, created);
  return created;
}