@alexkroman1/aai 1.2.3 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +14 -12
- package/CHANGELOG.md +20 -0
- package/dist/{constants-VTFoymJ-.js → constants-BL3nvg4I.js} +8 -1
- package/dist/host/_pipeline-test-fakes.d.ts +117 -0
- package/dist/host/pipeline-session-ctx.d.ts +24 -0
- package/dist/host/pipeline-session.d.ts +48 -0
- package/dist/host/providers/llm.d.ts +2 -0
- package/dist/host/providers/stt/assemblyai.d.ts +31 -0
- package/dist/host/providers/stt-barrel.d.ts +8 -0
- package/dist/host/providers/stt-barrel.js +92 -0
- package/dist/host/providers/stt.d.ts +2 -0
- package/dist/host/providers/tts/cartesia.d.ts +39 -0
- package/dist/host/providers/tts-barrel.d.ts +8 -0
- package/dist/host/providers/tts-barrel.js +182 -0
- package/dist/host/providers/tts.d.ts +2 -0
- package/dist/host/runtime-barrel.js +565 -81
- package/dist/host/runtime.d.ts +17 -0
- package/dist/host/s2s.d.ts +5 -0
- package/dist/host/session-ctx.d.ts +22 -4
- package/dist/host/to-vercel-tools.d.ts +45 -0
- package/dist/index.js +7 -2
- package/dist/sdk/_internal-types.d.ts +15 -1
- package/dist/sdk/constants.d.ts +7 -0
- package/dist/sdk/define.d.ts +21 -0
- package/dist/sdk/manifest.d.ts +22 -0
- package/dist/sdk/protocol.d.ts +3 -3
- package/dist/sdk/protocol.js +1 -1
- package/dist/sdk/providers.d.ts +70 -0
- package/dist/sdk/types.d.ts +16 -0
- package/exports-no-dev-deps.test.ts +39 -14
- package/host/_pipeline-test-fakes.ts +357 -0
- package/host/_test-utils.ts +1 -0
- package/host/integration/fixtures/README.md +49 -0
- package/host/integration/pipeline-reference.integration.test.ts +124 -0
- package/host/pipeline-session-ctx.test.ts +31 -0
- package/host/pipeline-session-ctx.ts +36 -0
- package/host/pipeline-session.test.ts +572 -0
- package/host/pipeline-session.ts +489 -0
- package/host/providers/llm.ts +3 -0
- package/host/providers/providers.test-d.ts +31 -0
- package/host/providers/stt/assemblyai.test.ts +100 -0
- package/host/providers/stt/assemblyai.ts +154 -0
- package/host/providers/stt/fixtures/assemblyai/basic-turn.json +30 -0
- package/host/providers/stt-barrel.ts +13 -0
- package/host/providers/stt.ts +3 -0
- package/host/providers/tts/cartesia.test.ts +210 -0
- package/host/providers/tts/cartesia.ts +251 -0
- package/host/providers/tts-barrel.ts +13 -0
- package/host/providers/tts.ts +3 -0
- package/host/runtime.test.ts +81 -1
- package/host/runtime.ts +61 -0
- package/host/s2s.test.ts +19 -0
- package/host/s2s.ts +10 -0
- package/host/session-ctx.ts +35 -8
- package/host/to-vercel-tools.test.ts +187 -0
- package/host/to-vercel-tools.ts +74 -0
- package/package.json +15 -1
- package/sdk/__snapshots__/exports.test.ts.snap +2 -0
- package/sdk/_internal-types.ts +16 -0
- package/sdk/constants.ts +8 -0
- package/sdk/define.test-d.ts +21 -0
- package/sdk/define.test.ts +33 -0
- package/sdk/define.ts +21 -0
- package/sdk/manifest.test-d.ts +14 -0
- package/sdk/manifest.test.ts +51 -0
- package/sdk/manifest.ts +39 -0
- package/sdk/providers.ts +90 -0
- package/sdk/types.ts +16 -0
- package/vitest.config.ts +1 -0
package/.turbo/turbo-build.log
CHANGED
|
@@ -1,20 +1,22 @@
|
|
|
1
1
|
|
|
2
|
-
> @alexkroman1/aai@1.
|
|
2
|
+
> @alexkroman1/aai@1.3.1 build /home/runner/work/agent/agent/packages/aai
|
|
3
3
|
> tsdown && tsc -p tsconfig.build.json
|
|
4
4
|
|
|
5
5
|
[34mℹ[39m [34mtsdown v0.21.7[39m powered by [38;2;255;126;23mrolldown v1.0.0-rc.12[39m
|
|
6
6
|
[34mℹ[39m config file: [4m/home/runner/work/agent/agent/packages/aai/tsdown.config.ts[24m
|
|
7
|
-
[34mℹ[39m entry: [34mindex.ts, sdk/protocol.ts, host/runtime-barrel.ts, sdk/manifest-barrel.ts[39m
|
|
7
|
+
[34mℹ[39m entry: [34mindex.ts, sdk/protocol.ts, host/runtime-barrel.ts, sdk/manifest-barrel.ts, host/providers/stt-barrel.ts, host/providers/tts-barrel.ts[39m
|
|
8
8
|
[34mℹ[39m target: [34mnode22[39m
|
|
9
9
|
[34mℹ[39m tsconfig: [34mtsconfig.json[39m
|
|
10
10
|
[34mℹ[39m Build start
|
|
11
|
-
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m
|
|
12
|
-
[34mℹ[39m [2mdist/[22m[1mindex.js[22m
|
|
13
|
-
[34mℹ[39m [2mdist/[22m[
|
|
14
|
-
[34mℹ[39m [2mdist/[22m[1msdk/
|
|
15
|
-
[34mℹ[39m [2mdist/[
|
|
16
|
-
[34mℹ[39m [2mdist/[
|
|
17
|
-
[34mℹ[39m [2mdist/[
|
|
18
|
-
[34mℹ[39m [2mdist/[
|
|
19
|
-
[34mℹ[39m
|
|
20
|
-
[
|
|
11
|
+
[34mℹ[39m [2mdist/[22m[1mhost/runtime-barrel.js[22m [2m63.51 kB[22m [2m│ gzip: 19.17 kB[22m
|
|
12
|
+
[34mℹ[39m [2mdist/[22m[1mindex.js[22m [2m 6.67 kB[22m [2m│ gzip: 2.65 kB[22m
|
|
13
|
+
[34mℹ[39m [2mdist/[22m[1mhost/providers/tts-barrel.js[22m [2m 5.52 kB[22m [2m│ gzip: 2.12 kB[22m
|
|
14
|
+
[34mℹ[39m [2mdist/[22m[1msdk/protocol.js[22m [2m 4.75 kB[22m [2m│ gzip: 1.76 kB[22m
|
|
15
|
+
[34mℹ[39m [2mdist/[22m[1mhost/providers/stt-barrel.js[22m [2m 3.08 kB[22m [2m│ gzip: 1.26 kB[22m
|
|
16
|
+
[34mℹ[39m [2mdist/[22m[1msdk/manifest-barrel.js[22m [2m 0.26 kB[22m [2m│ gzip: 0.17 kB[22m
|
|
17
|
+
[34mℹ[39m [2mdist/[22mconstants-BL3nvg4I.js [2m 3.10 kB[22m [2m│ gzip: 1.38 kB[22m
|
|
18
|
+
[34mℹ[39m [2mdist/[22m_internal-types-CoDTiBd1.js [2m 2.33 kB[22m [2m│ gzip: 0.99 kB[22m
|
|
19
|
+
[34mℹ[39m [2mdist/[22mtypes-Cfx_4QDK.js [2m 1.74 kB[22m [2m│ gzip: 0.93 kB[22m
|
|
20
|
+
[34mℹ[39m [2mdist/[22mws-upgrade-BeOQ7fXL.js [2m 1.14 kB[22m [2m│ gzip: 0.54 kB[22m
|
|
21
|
+
[34mℹ[39m 10 files, total: 92.10 kB
|
|
22
|
+
[32m✔[39m Build complete in [32m43ms[39m
|
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,25 @@
|
|
|
1
1
|
# @alexkroman1/aai
|
|
2
2
|
|
|
3
|
+
## 1.3.1
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 5a9f3d5: Pipeline session concurrency fixes: serialize turns across duplicate STT finals, bound TTS flush with abort+timeout, cascade provider errors to terminate session, atomic provider open, snapshot conversation history in tool executions.
|
|
8
|
+
|
|
9
|
+
## 1.3.0
|
|
10
|
+
|
|
11
|
+
### Minor Changes
|
|
12
|
+
|
|
13
|
+
- f1a9764: Internal: manifests now classify session mode (`s2s` | `pipeline`) at parse time, and expose optional `stt`, `llm`, and `tts` fields on the `Manifest` type. Groundwork for upcoming pluggable provider support — no user-visible behavior change yet.
|
|
14
|
+
|
|
15
|
+
### Patch Changes
|
|
16
|
+
|
|
17
|
+
- c95212a: Fix runtime crash when loading the host runtime without the provider SDKs installed. `ai`, `assemblyai`, and `@cartesia/cartesia-js` are now regular dependencies instead of optional peer dependencies — the runtime eagerly imports `pipeline-session.ts`, so they were already required at module load even for S2S-mode agents. Optional peer deps described a design the code didn't enforce; now the metadata matches behavior.
|
|
18
|
+
- f1a9764: Fix PipelineSession: thread agentConfig.maxSteps into streamText via stopWhen: stepCountIs(n). Vercel AI SDK v6 defaults to a single step, so multi-step tool use would silently terminate after the first tool-result.
|
|
19
|
+
- f1a9764: `agent()` helper accepts `stt`/`llm`/`tts` fields directly, removing the need for the spread workaround in pipeline-mode agents.
|
|
20
|
+
- 0231114: Simplify pipeline-session state management and parallelize provider open. Removes redundant PipelineState variable (equivalent to turnController != null), opens STT+TTS concurrently via Promise.allSettled (halves session-start latency), and cleans up either session if one open fails or the session aborts mid-open.
|
|
21
|
+
- 8a79282: Add `sendAudioRaw` to `S2sHandle` for batch-encoded audio frames.
|
|
22
|
+
|
|
3
23
|
## 1.2.3
|
|
4
24
|
|
|
5
25
|
### Patch Changes
|
|
@@ -21,6 +21,13 @@ const FETCH_TIMEOUT_MS = 15e3;
|
|
|
21
21
|
const RUN_CODE_TIMEOUT_MS = 5e3;
|
|
22
22
|
/** Maximum time to wait for sessions to stop during graceful shutdown. */
|
|
23
23
|
const DEFAULT_SHUTDOWN_TIMEOUT_MS = 3e4;
|
|
24
|
+
/**
|
|
25
|
+
* Maximum time to wait for a pipeline-mode TTS drain after `flush()` before
|
|
26
|
+
* forcing the turn to complete. Prevents a stuck TTS provider from wedging
|
|
27
|
+
* the session. Short relative to `DEFAULT_SHUTDOWN_TIMEOUT_MS` so stop()
|
|
28
|
+
* can still reclaim the socket cleanly.
|
|
29
|
+
*/
|
|
30
|
+
const PIPELINE_FLUSH_TIMEOUT_MS = 1e4;
|
|
24
31
|
/** Maximum length for tool result strings sent to clients. */
|
|
25
32
|
const MAX_TOOL_RESULT_CHARS = 4e3;
|
|
26
33
|
/** Maximum chars for webpage text after HTML-to-text conversion. */
|
|
@@ -44,4 +51,4 @@ const WS_OPEN = 1;
|
|
|
44
51
|
*/
|
|
45
52
|
const AGENT_CSP = "default-src 'self'; script-src 'self' 'unsafe-eval' blob:; style-src 'self' 'unsafe-inline' https://fonts.googleapis.com; connect-src 'self' wss: ws:; img-src 'self' data:; font-src 'self' https://fonts.gstatic.com; object-src 'none'; base-uri 'self'";
|
|
46
53
|
//#endregion
|
|
47
|
-
export {
|
|
54
|
+
export { TOOL_EXECUTION_TIMEOUT_MS as _, DEFAULT_SHUTDOWN_TIMEOUT_MS as a, FETCH_TIMEOUT_MS as c, MAX_PAGE_CHARS as d, MAX_TOOL_RESULT_CHARS as f, RUN_CODE_TIMEOUT_MS as g, PIPELINE_FLUSH_TIMEOUT_MS as h, DEFAULT_SESSION_START_TIMEOUT_MS as i, MAX_HTML_BYTES as l, MAX_WS_PAYLOAD_BYTES as m, DEFAULT_IDLE_TIMEOUT_MS as n, DEFAULT_STT_SAMPLE_RATE as o, MAX_VALUE_SIZE as p, DEFAULT_MAX_HISTORY as r, DEFAULT_TTS_SAMPLE_RATE as s, AGENT_CSP as t, MAX_MESSAGE_BUFFER_SIZE as u, WS_OPEN as v };
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-memory fake providers + fake `LanguageModel` for pipeline-session tests.
|
|
3
|
+
*
|
|
4
|
+
* These fakes do not touch the network. Each `createFake*Provider()` returns a
|
|
5
|
+
* provider whose `open()` records the most recently opened session so tests
|
|
6
|
+
* can reach into it via `.last()` and drive events (partial/final transcripts,
|
|
7
|
+
* TTS chunks) or observe calls (`sendText`, `flush`, `cancel`).
|
|
8
|
+
*
|
|
9
|
+
* The fake `LanguageModel` implements the minimum of {@link LanguageModelV3}
|
|
10
|
+
* required by `streamText` — `doStream()` returns a `ReadableStream` of
|
|
11
|
+
* {@link LanguageModelV3StreamPart}s produced from a scripted sequence.
|
|
12
|
+
*
|
|
13
|
+
* @internal Not part of the public API.
|
|
14
|
+
*/
|
|
15
|
+
import type { LanguageModel } from "ai";
|
|
16
|
+
import { type Emitter } from "nanoevents";
|
|
17
|
+
import { vi } from "vitest";
|
|
18
|
+
import type { SttEvents, SttOpenOptions, SttProvider, SttSession, TtsEvents, TtsOpenOptions, TtsProvider, TtsSession } from "../sdk/providers.ts";
|
|
19
|
+
export type FakeSttSession = SttSession & {
|
|
20
|
+
readonly emitter: Emitter<SttEvents>;
|
|
21
|
+
readonly opts: SttOpenOptions;
|
|
22
|
+
readonly audioFrames: Int16Array[];
|
|
23
|
+
readonly closed: {
|
|
24
|
+
value: boolean;
|
|
25
|
+
};
|
|
26
|
+
firePartial(text: string): void;
|
|
27
|
+
fireFinal(text: string): void;
|
|
28
|
+
fireError(code: "stt_stream_error" | "stt_connect_failed" | "stt_auth_failed", message: string): void;
|
|
29
|
+
};
|
|
30
|
+
export type FakeSttProvider = SttProvider & {
|
|
31
|
+
/** The most recently opened session, or undefined if `open()` hasn't been called. */
|
|
32
|
+
last(): FakeSttSession | undefined;
|
|
33
|
+
readonly sessions: FakeSttSession[];
|
|
34
|
+
};
|
|
35
|
+
/**
 * Create an in-memory fake STT provider for tests. As described in the module
 * header, `open()` records the opened session so a test can retrieve the most
 * recent one through `.last()` (all of them are exposed via `.sessions`).
 */
export declare function createFakeSttProvider(): FakeSttProvider;
|
|
36
|
+
export type FakeTtsSession = TtsSession & {
|
|
37
|
+
readonly emitter: Emitter<TtsEvents>;
|
|
38
|
+
readonly opts: TtsOpenOptions;
|
|
39
|
+
readonly textChunks: string[];
|
|
40
|
+
readonly closed: {
|
|
41
|
+
value: boolean;
|
|
42
|
+
};
|
|
43
|
+
readonly sendText: ReturnType<typeof vi.fn<(text: string) => void>>;
|
|
44
|
+
readonly flush: ReturnType<typeof vi.fn<() => void>>;
|
|
45
|
+
readonly cancel: ReturnType<typeof vi.fn<() => void>>;
|
|
46
|
+
fireAudio(pcm: Int16Array): void;
|
|
47
|
+
fireError(code: "tts_stream_error" | "tts_connect_failed" | "tts_auth_failed", message: string): void;
|
|
48
|
+
};
|
|
49
|
+
export type FakeTtsProvider = TtsProvider & {
|
|
50
|
+
/** The most recently opened session, or undefined if `open()` hasn't been called. */
|
|
51
|
+
last(): FakeTtsSession | undefined;
|
|
52
|
+
readonly sessions: FakeTtsSession[];
|
|
53
|
+
};
|
|
54
|
+
/**
|
|
55
|
+
* Fake TTS provider. By default, `flush()` synchronously emits a single `done`
|
|
56
|
+
* event so tests don't have to script the drain separately. Pass
|
|
57
|
+
* `{ autoDoneOnFlush: false }` to drive `done` manually.
|
|
58
|
+
*/
|
|
59
|
+
export declare function createFakeTtsProvider(options?: {
|
|
60
|
+
autoDoneOnFlush?: boolean;
|
|
61
|
+
}): FakeTtsProvider;
|
|
62
|
+
/**
|
|
63
|
+
* Fake STT provider that throws on `open()` with a given error code. Used to
|
|
64
|
+
* test atomic provider open — TTS should not be opened at all when STT fails.
|
|
65
|
+
*/
|
|
66
|
+
export declare function createFailingSttProvider(code: "stt_connect_failed" | "stt_auth_failed" | "stt_stream_error", message: string): SttProvider;
|
|
67
|
+
/**
|
|
68
|
+
* Fake TTS provider that throws on `open()` with a given error code. Used to
|
|
69
|
+
* test atomic provider open — STT should be closed when TTS fails.
|
|
70
|
+
*/
|
|
71
|
+
export declare function createFailingTtsProvider(code: "tts_connect_failed" | "tts_auth_failed" | "tts_stream_error", message: string): TtsProvider;
|
|
72
|
+
/**
|
|
73
|
+
* A scripted stream part. `text` yields a `text-delta` in the LLM provider's
|
|
74
|
+
* raw wire format; `tool-call` / `tool-result` emit the corresponding parts
|
|
75
|
+
* (v3 provider spec: `toolCallId`, `toolName`, `input` as JSON string for
|
|
76
|
+
* calls, `result` as JSON-serialisable value for results).
|
|
77
|
+
*/
|
|
78
|
+
export type ScriptedPart = {
|
|
79
|
+
type: "text";
|
|
80
|
+
text: string;
|
|
81
|
+
} | {
|
|
82
|
+
type: "tool-call";
|
|
83
|
+
toolCallId: string;
|
|
84
|
+
toolName: string;
|
|
85
|
+
input: string;
|
|
86
|
+
} | {
|
|
87
|
+
type: "tool-result";
|
|
88
|
+
toolCallId: string;
|
|
89
|
+
toolName: string;
|
|
90
|
+
result: unknown;
|
|
91
|
+
} | {
|
|
92
|
+
type: "error";
|
|
93
|
+
error: unknown;
|
|
94
|
+
};
|
|
95
|
+
/**
|
|
96
|
+
* Create a fake {@link LanguageModel} that yields a scripted sequence of
|
|
97
|
+
* parts when `streamText` drives `doStream()`. The fake ignores the prompt
|
|
98
|
+
* and tools — it simply replays the script.
|
|
99
|
+
*
|
|
100
|
+
* Pass `{ delayMs: N }` to space out parts with `setTimeout(N)` so that
|
|
101
|
+
* barge-in tests can abort mid-stream deterministically.
|
|
102
|
+
*
|
|
103
|
+
* Pass `{ steps: ScriptedPart[][] }` (instead of `script`) for multi-step
|
|
104
|
+
* scenarios: each call to `doStream()` consumes the next step's parts.
|
|
105
|
+
* This is how `streamText` drives multi-turn tool loops under `stopWhen`.
|
|
106
|
+
*
|
|
107
|
+
* The returned value is cast to the `LanguageModel` union because we
|
|
108
|
+
* implement the provider shape structurally rather than importing the
|
|
109
|
+
* full `@ai-sdk/provider` types into the aai package.
|
|
110
|
+
*/
|
|
111
|
+
export declare function createFakeLanguageModel(options: {
|
|
112
|
+
script: ScriptedPart[];
|
|
113
|
+
delayMs?: number;
|
|
114
|
+
} | {
|
|
115
|
+
steps: ScriptedPart[][];
|
|
116
|
+
delayMs?: number;
|
|
117
|
+
}): LanguageModel;
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/** Pipeline session context — base ctx + STT/TTS session slots. */
|
|
2
|
+
import type { AgentConfig, ExecuteTool } from "../sdk/_internal-types.ts";
|
|
3
|
+
import type { ClientSink } from "../sdk/protocol.ts";
|
|
4
|
+
import type { SttSession, TtsSession } from "../sdk/providers.ts";
|
|
5
|
+
import type { Logger } from "./runtime-config.ts";
|
|
6
|
+
import { type BaseSessionCtx } from "./session-ctx.ts";
|
|
7
|
+
/**
|
|
8
|
+
* Pipeline session context — {@link BaseSessionCtx} plus STT/TTS provider
|
|
9
|
+
* session handles. Replaces the S2S `s2s` field with decoupled `stt` + `tts`
|
|
10
|
+
* slots so the pipeline orchestrator can drive independent providers.
|
|
11
|
+
*/
|
|
12
|
+
export type PipelineSessionCtx = BaseSessionCtx & {
|
|
13
|
+
stt: SttSession | null;
|
|
14
|
+
tts: TtsSession | null;
|
|
15
|
+
};
|
|
16
|
+
export declare function buildPipelineCtx(opts: {
|
|
17
|
+
id: string;
|
|
18
|
+
agent: string;
|
|
19
|
+
client: ClientSink;
|
|
20
|
+
agentConfig: AgentConfig;
|
|
21
|
+
executeTool: ExecuteTool;
|
|
22
|
+
log: Logger;
|
|
23
|
+
maxHistory?: number | undefined;
|
|
24
|
+
}): PipelineSessionCtx;
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pipeline session — pluggable STT → LLM → TTS orchestrator.
|
|
3
|
+
*
|
|
4
|
+
* Alternative to the S2S session (see `session.ts`) that drives three
|
|
5
|
+
* independent providers. A new partial STT event while the agent is replying
|
|
6
|
+
* triggers barge-in (aborts the LLM stream and cancels TTS).
|
|
7
|
+
*/
|
|
8
|
+
import type { LanguageModel } from "ai";
|
|
9
|
+
import type { AgentConfig, ExecuteTool, ToolSchema } from "../sdk/_internal-types.ts";
|
|
10
|
+
import type { ClientSink } from "../sdk/protocol.ts";
|
|
11
|
+
import type { SttProvider, TtsProvider } from "../sdk/providers.ts";
|
|
12
|
+
import { type Logger } from "./runtime-config.ts";
|
|
13
|
+
import type { Session } from "./session.ts";
|
|
14
|
+
/** Configuration options for {@link createPipelineSession}. */
|
|
15
|
+
export interface PipelineSessionOptions {
|
|
16
|
+
/** Unique session identifier. */
|
|
17
|
+
id: string;
|
|
18
|
+
/** Agent slug. */
|
|
19
|
+
agent: string;
|
|
20
|
+
/** Sink for wire events + audio back to the browser client. */
|
|
21
|
+
client: ClientSink;
|
|
22
|
+
/** Serializable agent config (name, system prompt, maxSteps, etc.). */
|
|
23
|
+
agentConfig: AgentConfig;
|
|
24
|
+
/** JSON Schema definitions for the agent's tools. */
|
|
25
|
+
toolSchemas: readonly ToolSchema[];
|
|
26
|
+
/** Optional natural-language guidance appended to the system prompt. */
|
|
27
|
+
toolGuidance?: readonly string[] | undefined;
|
|
28
|
+
/** Function to invoke tools by name. */
|
|
29
|
+
executeTool: ExecuteTool;
|
|
30
|
+
/** STT provider (injected via manifest in pipeline mode). */
|
|
31
|
+
stt: SttProvider;
|
|
32
|
+
/** LLM provider (Vercel AI SDK `LanguageModel`). */
|
|
33
|
+
llm: LanguageModel;
|
|
34
|
+
/** TTS provider (injected via manifest in pipeline mode). */
|
|
35
|
+
tts: TtsProvider;
|
|
36
|
+
/** STT API key. */
|
|
37
|
+
sttApiKey: string;
|
|
38
|
+
/** TTS API key. */
|
|
39
|
+
ttsApiKey: string;
|
|
40
|
+
/** Audio sample rate (PCM16, Hz). Defaults to {@link DEFAULT_STT_SAMPLE_RATE}. */
|
|
41
|
+
sampleRate?: number | undefined;
|
|
42
|
+
/** Logger. Defaults to the console logger. */
|
|
43
|
+
logger?: Logger | undefined;
|
|
44
|
+
/** Sliding-window conversation history size. */
|
|
45
|
+
maxHistory?: number | undefined;
|
|
46
|
+
}
|
|
47
|
+
/** Create a pluggable-provider voice session. */
|
|
48
|
+
export declare function createPipelineSession(opts: PipelineSessionOptions): Session;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AssemblyAI Universal-Streaming STT adapter.
|
|
3
|
+
*
|
|
4
|
+
* Wraps the `assemblyai` Node SDK's {@link StreamingTranscriber} and
|
|
5
|
+
* normalizes its event surface onto the {@link SttProvider} /
|
|
6
|
+
* {@link SttEvents} contract consumed by the pipeline orchestrator.
|
|
7
|
+
*
|
|
8
|
+
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
9
|
+
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
10
|
+
* string is forwarded verbatim.
|
|
11
|
+
*/
|
|
12
|
+
import { type StreamingTranscriber } from "assemblyai";
|
|
13
|
+
import type { SttProvider, SttSession } from "../../../sdk/providers.ts";
|
|
14
|
+
export interface AssemblyAIOptions {
|
|
15
|
+
/**
|
|
16
|
+
* Streaming speech model. Defaults to `"u3pro-rt"` (Universal-3 Pro
|
|
17
|
+
* Real-Time). Arbitrary strings are forwarded to the SDK unchanged.
|
|
18
|
+
*/
|
|
19
|
+
model?: "u3pro-rt" | string;
|
|
20
|
+
/**
|
|
21
|
+
* AssemblyAI API key. Falls back to `SttOpenOptions.apiKey`, then
|
|
22
|
+
* `process.env.ASSEMBLYAI_API_KEY`.
|
|
23
|
+
*/
|
|
24
|
+
apiKey?: string;
|
|
25
|
+
}
|
|
26
|
+
/** Internal: SttSession with a test-only handle to the raw SDK transcriber. */
|
|
27
|
+
export interface AssemblyAISession extends SttSession {
|
|
28
|
+
/** @internal Test-only: exposes the underlying SDK transcriber for fixture replay. */
|
|
29
|
+
readonly _transcriber: StreamingTranscriber;
|
|
30
|
+
}
|
|
31
|
+
/**
 * Create the AssemblyAI streaming STT provider.
 *
 * @param opts Optional model and API-key overrides; see {@link AssemblyAIOptions}.
 * @returns An {@link SttProvider} for the pipeline orchestrator.
 */
export declare function assemblyAI(opts?: AssemblyAIOptions): SttProvider;
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@alexkroman1/aai/stt` subpath barrel. Re-exports the STT provider
|
|
3
|
+
* contract types (via `stt.ts` → `sdk/providers.ts`) alongside the
|
|
4
|
+
* concrete AssemblyAI adapter factory. Task 9 owns wiring this file
|
|
5
|
+
* into `package.json` exports.
|
|
6
|
+
*/
|
|
7
|
+
export * from "./stt/assemblyai.ts";
|
|
8
|
+
export type * from "./stt.ts";
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { createNanoEvents } from "nanoevents";
|
|
2
|
+
import { AssemblyAI } from "assemblyai";
|
|
3
|
+
//#region host/providers/stt/assemblyai.ts
|
|
4
|
+
/**
|
|
5
|
+
* AssemblyAI Universal-Streaming STT adapter.
|
|
6
|
+
*
|
|
7
|
+
* Wraps the `assemblyai` Node SDK's {@link StreamingTranscriber} and
|
|
8
|
+
* normalizes its event surface onto the {@link SttProvider} /
|
|
9
|
+
* {@link SttEvents} contract consumed by the pipeline orchestrator.
|
|
10
|
+
*
|
|
11
|
+
* Default model: `"u3pro-rt"` (Universal-3 Pro Real-Time). The adapter
|
|
12
|
+
* maps that to the SDK's `"u3-rt-pro"` `speechModel` value; any other
|
|
13
|
+
* string is forwarded verbatim.
|
|
14
|
+
*/
|
|
15
|
+
/**
 * Map the adapter-level model alias onto the `speechModel` identifier the
 * AssemblyAI SDK expects. Only `"u3pro-rt"` is rewritten (to `"u3-rt-pro"`);
 * every other value is handed back untouched.
 */
function resolveSpeechModel(model) {
  return model === "u3pro-rt" ? "u3-rt-pro" : model;
}
|
|
20
|
+
/**
 * Build an `Error` carrying `message`, tagged with the `"stt_stream_error"`
 * code so listeners can branch on `err.code`.
 */
function makeError(message) {
  return Object.assign(new Error(message), { code: "stt_stream_error" });
}
|
|
25
|
+
/**
 * Create the AssemblyAI streaming STT provider.
 *
 * @param {object} [opts] Factory options.
 * @param {string} [opts.model] Speech model alias. Defaults to `"u3pro-rt"`
 *   and is passed through `resolveSpeechModel` before reaching the SDK.
 * @param {string} [opts.apiKey] API key. Wins over `openOpts.apiKey`, which in
 *   turn wins over `process.env.ASSEMBLYAI_API_KEY`.
 * @returns {{ name: string, open: Function }} Provider whose `open()` resolves
 *   to a connected session (`sendAudio`, `on`, `close`, `_transcriber`).
 */
function assemblyAI(opts = {}) {
  return {
    name: "assemblyai",
    /**
     * Connect a streaming transcriber and wrap it in a session.
     *
     * @param {object} openOpts Uses `apiKey`, `sampleRate`, `sttPrompt` and
     *   `signal` (an `AbortSignal`; aborting it closes the session).
     * @throws {Error} `code === "stt_auth_failed"` when no API key is found;
     *   `code === "stt_connect_failed"` when `connect()` rejects (the
     *   underlying failure is attached as `cause`).
     */
    async open(openOpts) {
      const apiKey = opts.apiKey ?? openOpts.apiKey ?? process.env.ASSEMBLYAI_API_KEY;
      if (!apiKey) {
        const err = new Error("AssemblyAI STT adapter: missing API key. Provide via the factory option, SttOpenOptions, or the ASSEMBLYAI_API_KEY environment variable.");
        err.code = "stt_auth_failed";
        throw err;
      }

      const client = new AssemblyAI({ apiKey });
      const speechModel = resolveSpeechModel(opts.model ?? "u3pro-rt");
      const transcriber = client.streaming.transcriber({
        sampleRate: openOpts.sampleRate,
        speechModel,
        // Only forward a prompt when the caller supplied a non-empty one.
        ...(openOpts.sttPrompt ? { prompt: openOpts.sttPrompt } : {}),
      });

      const emitter = createNanoEvents();
      let closed = false;

      transcriber.on("turn", (event) => {
        if (closed) return;
        const text = event.transcript ?? "";
        // Empty transcripts are never surfaced, final or partial.
        if (text.length === 0) return;
        emitter.emit(event.end_of_turn ? "final" : "partial", text);
      });
      transcriber.on("error", (err) => {
        if (closed) return;
        emitter.emit("error", makeError(err?.message ?? String(err)));
      });
      transcriber.on("close", (code) => {
        if (closed) return;
        // 1000 is a normal closure; anything else is reported as a stream error.
        if (code !== 1e3) emitter.emit("error", makeError(`socket closed ${code}`));
      });

      try {
        await transcriber.connect();
      } catch (cause) {
        const detail = cause instanceof Error ? cause.message : String(cause);
        // Keep the original failure reachable for debugging via `err.cause`.
        const err = new Error(`AssemblyAI STT: connect failed: ${detail}`, { cause });
        err.code = "stt_connect_failed";
        throw err;
      }

      const { signal } = openOpts;
      const onAbort = () => void close();
      const close = async () => {
        if (closed) return;
        closed = true;
        // Detach from the signal so an explicit close() does not leave a
        // listener (and this closure) pinned to a long-lived AbortSignal.
        signal.removeEventListener("abort", onAbort);
        try {
          await transcriber.close();
        } catch {
          // Best-effort shutdown: the session is already marked closed.
        }
      };

      if (signal.aborted) void close();
      else signal.addEventListener("abort", onAbort, { once: true });

      return {
        sendAudio(pcm) {
          if (closed) return;
          // Send a private copy of exactly the view's bytes, so the SDK never
          // sees the rest of a shared/pooled backing buffer.
          const copy = new Uint8Array(pcm.byteLength);
          copy.set(new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength));
          transcriber.sendAudio(copy.buffer);
        },
        on(event, fn) {
          return emitter.on(event, fn);
        },
        close,
        _transcriber: transcriber,
      };
    },
  };
}
|
|
91
|
+
//#endregion
|
|
92
|
+
export { assemblyAI };
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cartesia TTS adapter — streaming WebSocket with per-turn `context_id`.
|
|
3
|
+
*
|
|
4
|
+
* Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
|
|
5
|
+
* onto the {@link TtsProvider} / {@link TtsEvents} contract consumed by the
|
|
6
|
+
* pipeline orchestrator.
|
|
7
|
+
*
|
|
8
|
+
* **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
|
|
9
|
+
* appends to the same Cartesia context. On `flush()` or `cancel()`, a new
|
|
10
|
+
* context is minted for the next turn — so concurrent `cancel({ contextId })`
|
|
11
|
+
* only targets the in-flight turn, never the one that follows.
|
|
12
|
+
*
|
|
13
|
+
* **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
|
|
14
|
+
* negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
|
|
15
|
+
* conversion.
|
|
16
|
+
*/
|
|
17
|
+
import type { TTSWS } from "@cartesia/cartesia-js/resources/tts";
|
|
18
|
+
import type { TtsProvider, TtsSession } from "../../../sdk/providers.ts";
|
|
19
|
+
export interface CartesiaOptions {
|
|
20
|
+
/** Cartesia voice ID. Required. */
|
|
21
|
+
voice: string;
|
|
22
|
+
/** Model ID. Defaults to `"sonic-2"`. */
|
|
23
|
+
model?: string;
|
|
24
|
+
/**
|
|
25
|
+
* Cartesia API key. Falls back to `TtsOpenOptions.apiKey`, then
|
|
26
|
+
* `process.env.CARTESIA_API_KEY`.
|
|
27
|
+
*/
|
|
28
|
+
apiKey?: string;
|
|
29
|
+
/** Spoken language hint. Defaults to `"en"`. */
|
|
30
|
+
language?: string;
|
|
31
|
+
}
|
|
32
|
+
/** Internal: TtsSession with a test-only handle to the raw SDK socket. */
|
|
33
|
+
export interface CartesiaSession extends TtsSession {
|
|
34
|
+
/** @internal Test-only: exposes the underlying SDK WebSocket wrapper. */
|
|
35
|
+
readonly _ws: TTSWS;
|
|
36
|
+
/** @internal Test-only: id of the currently-active context. */
|
|
37
|
+
readonly _currentContextId: () => string;
|
|
38
|
+
}
|
|
39
|
+
/**
 * Create the Cartesia streaming TTS provider.
 *
 * @param opts Voice (required) plus optional model, API-key and language
 *   overrides; see {@link CartesiaOptions}.
 * @returns A {@link TtsProvider} for the pipeline orchestrator.
 */
export declare function cartesia(opts: CartesiaOptions): TtsProvider;
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `@alexkroman1/aai/tts` subpath barrel. Re-exports the TTS provider
|
|
3
|
+
* contract types (via `tts.ts` → `sdk/providers.ts`) alongside the
|
|
4
|
+
* concrete Cartesia adapter factory. Task 9 owns wiring this file
|
|
5
|
+
* into `package.json` exports.
|
|
6
|
+
*/
|
|
7
|
+
export * from "./tts/cartesia.ts";
|
|
8
|
+
export type * from "./tts.ts";
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import { createNanoEvents } from "nanoevents";
|
|
2
|
+
import { randomUUID } from "node:crypto";
|
|
3
|
+
import { Cartesia } from "@cartesia/cartesia-js";
|
|
4
|
+
//#region host/providers/tts/cartesia.ts
|
|
5
|
+
/**
|
|
6
|
+
* Cartesia TTS adapter — streaming WebSocket with per-turn `context_id`.
|
|
7
|
+
*
|
|
8
|
+
* Wraps `@cartesia/cartesia-js`'s `TTSWS` / `TTSWSContext` and normalizes it
|
|
9
|
+
* onto the {@link TtsProvider} / {@link TtsEvents} contract consumed by the
|
|
10
|
+
* pipeline orchestrator.
|
|
11
|
+
*
|
|
12
|
+
* **Per-turn context lifecycle.** Each `sendText(...)` within the same turn
|
|
13
|
+
* appends to the same Cartesia context. On `flush()` or `cancel()`, a new
|
|
14
|
+
* context is minted for the next turn — so concurrent `cancel({ contextId })`
|
|
15
|
+
* only targets the in-flight turn, never the one that follows.
|
|
16
|
+
*
|
|
17
|
+
* **Audio format.** The adapter requests `raw` / `pcm_s16le` at the
|
|
18
|
+
* negotiated `sampleRate` so it can forward chunks as `Int16Array` with no
|
|
19
|
+
* conversion.
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Build an `Error` carrying `message`, tagged with the `"tts_stream_error"`
 * code so listeners can branch on `err.code`.
 */
function makeError(message) {
  return Object.assign(new Error(message), { code: "tts_stream_error" });
}
|
|
26
|
+
/** Sample rates (Hz) this adapter accepts for Cartesia's `raw` PCM16 output. */
const CARTESIA_PCM16_RATES = [8000, 16000, 22050, 24000, 44100, 48000];

/**
 * Validate a requested sample rate against {@link CARTESIA_PCM16_RATES}.
 *
 * @param {number} rate Requested sample rate in Hz.
 * @returns {number} `rate`, unchanged, when it is supported.
 * @throws {Error} With `code === "tts_connect_failed"` when it is not.
 */
function assertSupportedSampleRate(rate) {
  if (!CARTESIA_PCM16_RATES.includes(rate)) {
    const supported = CARTESIA_PCM16_RATES.join(", ");
    throw Object.assign(
      new Error(`Cartesia TTS adapter: unsupported sample rate ${rate}. Supported: ${supported}.`),
      { code: "tts_connect_failed" },
    );
  }
  return rate;
}
|
|
41
|
+
/**
 * Create the Cartesia streaming TTS provider.
 *
 * @param {object} opts Factory options.
 * @param {string} opts.voice Cartesia voice id, sent as `voice.id`.
 * @param {string} [opts.model] Model id; defaults to `"sonic-2"`.
 * @param {string} [opts.apiKey] API key. Wins over `openOpts.apiKey`, which in
 *   turn wins over `process.env.CARTESIA_API_KEY`.
 * @param {string} [opts.language] Language hint; defaults to `"en"`.
 * @returns {{ name: string, open: Function }} Provider whose `open()` resolves
 *   to a session (`sendText`, `flush`, `cancel`, `on`, `close`, plus the
 *   test-only `_ws` / `_currentContextId`).
 */
function cartesia(opts) {
  return {
    name: "cartesia",
    /**
     * Open the TTS WebSocket and wrap it in a session.
     *
     * @param {object} openOpts Uses `apiKey`, `sampleRate` and `signal` (an
     *   `AbortSignal`; aborting it closes the session).
     * @throws {Error} `code === "tts_auth_failed"` when no API key is found;
     *   `code === "tts_connect_failed"` for an unsupported sample rate or when
     *   the socket cannot be opened (the underlying failure is attached as
     *   `cause`).
     */
    async open(openOpts) {
      const apiKey = opts.apiKey ?? openOpts.apiKey ?? process.env.CARTESIA_API_KEY;
      if (!apiKey) {
        const err = new Error("Cartesia TTS adapter: missing API key. Provide via the factory option, TtsOpenOptions, or the CARTESIA_API_KEY environment variable.");
        err.code = "tts_auth_failed";
        throw err;
      }

      const sampleRate = assertSupportedSampleRate(openOpts.sampleRate);
      const model = opts.model ?? "sonic-2";
      const language = opts.language ?? "en";
      const client = new Cartesia({ apiKey });

      let ws;
      try {
        ws = await client.tts.websocket();
      } catch (cause) {
        const detail = cause instanceof Error ? cause.message : String(cause);
        // Keep the original failure reachable for debugging via `err.cause`.
        const err = new Error(`Cartesia TTS: connect failed: ${detail}`, { cause });
        err.code = "tts_connect_failed";
        throw err;
      }

      const emitter = createNanoEvents();
      let closed = false;

      /** Mint a fresh context (new random id) on the shared connection. */
      const mintContext = () => ws.context({
        model_id: model,
        voice: { mode: "id", id: opts.voice },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          sample_rate: sampleRate,
        },
        contextId: randomUUID(),
      });
      let context = mintContext();

      /**
       * Guards against emitting `done` twice for the same context. Reset
       * whenever a fresh context is minted.
       */
      let doneEmitted = false;
      const rotateContext = () => {
        context = mintContext();
        doneEmitted = false;
      };
      const emitDoneOnce = () => {
        if (doneEmitted || closed) return;
        doneEmitted = true;
        emitter.emit("done");
      };

      ws.on("chunk", (event) => {
        if (closed) return;
        // Only the current context is forwarded.
        // NOTE(review): flush()/cancel() rotate the context synchronously, so
        // chunks of the previous context that arrive afterwards are dropped
        // here — confirm that is intended for flush().
        if (event.context_id !== context.contextId) return;
        const buf = event.audio;
        if (!buf || buf.byteLength === 0) return;
        // PCM16 needs whole 2-byte samples; drop a trailing odd byte.
        const evenBytes = buf.byteLength - (buf.byteLength % 2);
        if (evenBytes === 0) return;
        // slice() copies, so the Int16Array starts at offset 0 of its own
        // buffer regardless of the source view's byteOffset.
        const pcm = new Int16Array(buf.buffer.slice(buf.byteOffset, buf.byteOffset + evenBytes));
        emitter.emit("audio", pcm);
      });
      ws.on("done", (event) => {
        if (closed) return;
        if (event.context_id !== context.contextId) return;
        emitDoneOnce();
      });
      ws.on("error", (err) => {
        if (closed) return;
        emitter.emit("error", makeError(err?.message ?? String(err)));
      });

      const { signal } = openOpts;
      const onAbort = () => void close();
      const close = async () => {
        if (closed) return;
        closed = true;
        // Detach from the signal so an explicit close() does not leave a
        // listener (and this closure) pinned to a long-lived AbortSignal.
        signal.removeEventListener("abort", onAbort);
        try {
          ws.close({ code: 1e3, reason: "client close" });
        } catch {
          // Best-effort shutdown: the session is already marked closed.
        }
      };
      if (signal.aborted) void close();
      else signal.addEventListener("abort", onAbort, { once: true });

      /**
       * Static part of each generation request; only `transcript` and
       * `continue` vary per send. Pinned here so `language` threads through.
       */
      const baseRequest = {
        model_id: model,
        voice: { mode: "id", id: opts.voice },
        output_format: {
          container: "raw",
          encoding: "pcm_s16le",
          sample_rate: sampleRate,
        },
        language,
      };

      /**
       * Per-send rejections are intentionally dropped here; stream failures
       * are reported through the `error` listener registered on `ws`.
       */
      const ignoreRejection = (_err) => {};

      return {
        sendText(text) {
          if (closed || text.length === 0) return;
          context.send({ ...baseRequest, transcript: text, continue: true }).catch(ignoreRejection);
        },
        flush() {
          if (closed) return;
          // An empty transcript with `continue: false` ends the context.
          context.send({ ...baseRequest, transcript: "", continue: false }).catch(ignoreRejection);
          // Order matters: the microtask runs after rotateContext() has reset
          // `doneEmitted`, so `done` is emitted once on the next microtask
          // rather than when the socket reports completion.
          queueMicrotask(emitDoneOnce);
          rotateContext();
        },
        cancel() {
          if (closed) return;
          context.cancel().catch(ignoreRejection);
          emitDoneOnce();
          rotateContext();
        },
        on(event, fn) {
          return emitter.on(event, fn);
        },
        close,
        _ws: ws,
        _currentContextId: () => context.contextId,
      };
    },
  };
}
|
|
181
|
+
//#endregion
|
|
182
|
+
export { cartesia };
|