@rajnandan1/atticus 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,510 @@
1
+ import { RealtimeItem } from '@openai/agents/realtime';
2
+
3
+ /**
4
+ * Configuration for the AI agent's behavior and personality (the `agent` field of `AtticusConfig`).
5
+ */
6
+ interface AgentConfig {
7
+ /** The name of the agent (used for display/logging purposes). */
8
+ name: string;
9
+ /** System instructions that define the agent's behavior and personality. */
10
+ instructions: string;
11
+ }
12
+ /**
13
+ * D2Snap options for DOM compression (the `d2SnapOptions` field of `UIConfig`).
14
+ */
15
+ interface D2SnapOptions {
16
+ /** Maximum tokens for adaptive compression. @default 4096 */
17
+ maxTokens?: number;
18
+ /** Whether to assign unique IDs to interactive elements. @default true */
19
+ assignUniqueIDs?: boolean;
20
+ }
21
+ /**
22
+ * Configuration for UI awareness (the `ui` field of `AtticusConfig`).
23
+ */
24
+ interface UIConfig {
25
+ /**
26
+ * Whether to enable UI-aware interactions.
27
+ * When enabled, the agent can understand and interact with the DOM.
28
+ */
29
+ enabled: boolean;
30
+ /**
31
+ * The root DOM element to capture. Required and non-null.
32
+ * The library will use element.innerHTML and compress it with d2snap.
33
+ *
34
+ * @example document.body
35
+ * @example document.getElementById('app') — this call can return null, so narrow the result before passing it here
36
+ */
37
+ rootElement: Element;
38
+ /**
39
+ * D2Snap options for DOM compression.
40
+ */
41
+ d2SnapOptions?: D2SnapOptions;
42
+ /**
43
+ * Whether to automatically update DOM context periodically (see `autoUpdateInterval`).
44
+ * @default false
45
+ */
46
+ autoUpdate?: boolean;
47
+ /**
48
+ * Interval in milliseconds for auto-updating DOM context.
49
+ * Only used if autoUpdate is true.
50
+ * @default 5000
51
+ */
52
+ autoUpdateInterval?: number;
53
+ }
54
+ /**
55
+ * Available voice options for the agent (the type of `AtticusConfig.voice`).
56
+ * These are OpenAI's text-to-speech voices.
57
+ */
58
+ type AtticusVoice = "alloy" | "ash" | "ballad" | "coral" | "echo" | "sage" | "shimmer" | "verse";
59
+ /**
60
+ * Configuration options for the Atticus voice agent; passed to the `Atticus` constructor.
61
+ */
62
+ interface AtticusConfig {
63
+ /**
64
+ * The OpenAI client secret (ephemeral key) for the Realtime API.
65
+ * Obtain this from your backend server.
66
+ *
67
+ * @example 'ek_...'
68
+ */
69
+ clientSecret: string;
70
+ /**
71
+ * Configuration for the AI agent.
72
+ */
73
+ agent: AgentConfig;
74
+ /**
75
+ * The voice to use for the agent's speech.
76
+ * Available voices: 'alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'
77
+ * @default 'alloy'
78
+ */
79
+ voice?: AtticusVoice;
80
+ /**
81
+ * The language code for the voice conversation.
82
+ * The agent will speak and understand this language; use `isTranscriptionSupported` to check whether a code is supported for transcription.
83
+ *
84
+ * @example 'en' (English)
85
+ * @example 'hi' (Hindi)
86
+ * @example 'es' (Spanish)
87
+ * @example 'fr' (French)
88
+ * @default 'en'
89
+ */
90
+ language?: string;
91
+ /**
92
+ * The OpenAI model to use for realtime conversations.
93
+ * @default 'gpt-4o-realtime-preview'
94
+ */
95
+ model?: string;
96
+ /**
97
+ * Whether to automatically send a greeting message when connected.
98
+ * @default true
99
+ */
100
+ autoGreet?: boolean;
101
+ /**
102
+ * The initial message to send when connected (if autoGreet is true).
103
+ * @default 'Hello!'
104
+ */
105
+ greetingMessage?: string;
106
+ /**
107
+ * Enable debug logging.
108
+ * @default false
109
+ */
110
+ debug?: boolean;
111
+ /**
112
+ * If true, UI actions will not be automatically executed.
113
+ * The 'action' event will still be emitted for you to handle manually (for example by passing the action to `Atticus.executeAction`).
114
+ * @default false
115
+ */
116
+ doNotExecuteActions?: boolean;
117
+ /**
118
+ * Configuration for UI-aware interactions.
119
+ * When enabled, the agent can understand and interact with the page's DOM.
120
+ */
121
+ ui?: UIConfig;
122
+ }
123
+ /**
124
+ * The connection status of the voice agent (exposed by `Atticus.status` and the `statusChange` event).
125
+ *
126
+ * - `idle`: Not connected, ready to connect
127
+ * - `connecting`: Currently establishing connection
128
+ * - `connected`: Successfully connected and ready for conversation
129
+ * - `error`: Connection failed or encountered an error
130
+ */
131
+ type AtticusStatus = "idle" | "connecting" | "connected" | "error";
132
+ /**
133
+ * The current state of the conversation (exposed by `Atticus.conversationState` and the `conversationStateChange` event).
134
+ *
135
+ * - `idle`: No active conversation
136
+ * - `ai_speaking`: The AI assistant is currently speaking
137
+ * - `user_turn`: Waiting for the user to speak
138
+ * - `user_speaking`: The user is currently speaking
139
+ */
140
+ type ConversationState = "idle" | "ai_speaking" | "user_turn" | "user_speaking";
141
+ /**
142
+ * The complete state of the voice agent at any given moment, as returned by `Atticus.getState()` and passed to `stateChange` listeners.
143
+ */
144
+ interface AtticusState {
145
+ /** Current connection status */
146
+ status: AtticusStatus;
147
+ /** Current conversation state */
148
+ conversationState: ConversationState;
149
+ /** Error message if status is 'error', null otherwise */
150
+ error: string | null;
151
+ /** Conversation history */
152
+ history: Message[];
153
+ /** Whether the agent is currently connected */
154
+ isConnected: boolean;
155
+ /** Whether the AI is currently speaking */
156
+ isAiSpeaking: boolean;
157
+ /** Whether the user is currently speaking */
158
+ isUserSpeaking: boolean;
159
+ }
160
+ /**
161
+ * Content types that can appear in a message, discriminated by `type`; the `transcript` of audio content may be null.
162
+ */
163
+ type MessageContent = {
164
+ type: "text";
165
+ text: string;
166
+ } | {
167
+ type: "audio";
168
+ transcript: string | null;
169
+ };
170
+ /**
171
+ * A parsed message from the conversation history.
172
+ */
173
+ interface Message {
174
+ /** Unique identifier for this message */
175
+ id: string;
176
+ /** Who sent this message */
177
+ role: "user" | "assistant";
178
+ /** The content of the message (text, or audio with an optional transcript) */
179
+ content: MessageContent;
180
+ /** The original raw item from the API (a `RealtimeItem` from `@openai/agents/realtime`) */
181
+ raw: RealtimeItem;
182
+ /** Timestamp when this message was created */
183
+ timestamp: Date;
184
+ }
185
+ /**
186
+ * A UI action requested by the agent.
187
+ * Always contains the spoken response; the executable code and the descriptive fields are nullable.
188
+ */
189
+ interface UIAction {
190
+ /** Unique identifier for this action */
191
+ id: string;
192
+ /** The text the agent spoke (explanation of the action) */
193
+ outputText: string;
194
+ /** JavaScript code to execute the UI interaction (may be null) */
195
+ outputCode: string | null;
196
+ /** Description of what the code does (may be null) */
197
+ actionDescription: string | null;
198
+ /** The element selector/identifier being targeted (may be null) */
199
+ targetElement: string | null;
200
+ /** Type of action (click, type, scroll, etc.); may be null */
201
+ actionType: UIActionType | null;
202
+ /** Timestamp when this action was created */
203
+ timestamp: Date;
204
+ }
205
+ /**
206
+ * Types of UI actions the agent can perform (the type of `UIAction.actionType`).
207
+ */
208
+ type UIActionType = "click" | "type" | "scroll" | "focus" | "hover" | "select" | "navigate" | "read" | "other";
209
+ /**
210
+ * All events emitted by Atticus, keyed by event name with the listener signature as the value.
211
+ *
212
+ * Subscribe to these events using `atticus.on(eventName, callback)`.
213
+ */
214
+ interface AtticusEvents {
215
+ /**
216
+ * Emitted when the connection status changes.
217
+ * @param status - The new status
218
+ */
219
+ statusChange: (status: AtticusStatus) => void;
220
+ /**
221
+ * Emitted when the conversation state changes.
222
+ * @param state - The new conversation state
223
+ */
224
+ conversationStateChange: (state: ConversationState) => void;
225
+ /**
226
+ * Emitted when an error occurs.
227
+ * @param error - The error message
228
+ */
229
+ error: (error: string) => void;
230
+ /**
231
+ * Emitted when a new message is added to the history.
232
+ * @param message - The new message
233
+ */
234
+ message: (message: Message) => void;
235
+ /**
236
+ * Emitted when the conversation history is updated.
237
+ * @param history - The complete conversation history
238
+ */
239
+ historyChange: (history: Message[]) => void;
240
+ /**
241
+ * Emitted when the complete state changes.
242
+ * Useful for frameworks that want a single state object.
243
+ * @param state - The complete current state
244
+ */
245
+ stateChange: (state: AtticusState) => void;
246
+ /**
247
+ * Emitted when the agent starts speaking.
248
+ */
249
+ agentStart: () => void;
250
+ /**
251
+ * Emitted when the agent stops speaking (response generation ended).
252
+ */
253
+ agentEnd: () => void;
254
+ /**
255
+ * Emitted when audio playback starts.
256
+ */
257
+ audioStart: () => void;
258
+ /**
259
+ * Emitted when audio playback ends.
260
+ * This is when the user can start speaking.
261
+ */
262
+ audioEnd: () => void;
263
+ /**
264
+ * Emitted when user audio is detected.
265
+ */
266
+ userAudio: () => void;
267
+ /**
268
+ * Emitted when successfully connected.
269
+ */
270
+ connected: () => void;
271
+ /**
272
+ * Emitted when disconnected.
273
+ */
274
+ disconnected: () => void;
275
+ /**
276
+ * Emitted when the agent requests a UI action.
277
+ * Per the `AtticusConfig.doNotExecuteActions` docs, this is emitted even when that flag is true; in that case the listener must handle the action itself.
278
+ * @param action - The UI action to perform
279
+ */
280
+ action: (action: UIAction) => void;
281
+ }
282
+ /**
283
+ * Event names for Atticus (the keys of `AtticusEvents`).
284
+ */
285
+ type AtticusEventName = keyof AtticusEvents;
286
+
287
+ /**
288
+ * Map of language codes to their full names (see `getLanguageName` for a lookup with fallback).
289
+ */
290
+ declare const LANGUAGE_NAMES: Record<string, string>;
291
+ /**
292
+ * Language codes officially supported by OpenAI's transcription API (see `isTranscriptionSupported`).
293
+ * Only these language codes can be passed to inputAudioTranscription.language.
294
+ */
295
+ declare const SUPPORTED_TRANSCRIPTION_LANGUAGES: Set<string>;
296
+ /**
297
+ * Native greetings for each supported language, keyed by language code (see `getLanguageGreeting` for a lookup with fallback).
298
+ */
299
+ declare const LANGUAGE_GREETINGS: Record<string, string>;
300
+ /**
301
+ * Get the full name of a language from its code.
302
+ * @param code - Language code (e.g., 'en', 'hi')
303
+ * @returns The full language name, or `code` itself if not found
304
+ */
305
+ declare function getLanguageName(code: string): string;
306
+ /**
307
+ * Get the native greeting for a language.
308
+ * @param code - Language code (e.g., 'en', 'hi')
309
+ * @returns The native greeting, or "Hello!" if no greeting is found for `code`
310
+ */
311
+ declare function getLanguageGreeting(code: string): string;
312
+ /**
313
+ * Check if a language is supported for transcription.
314
+ * @param code - Language code to check (e.g., 'en', 'hi')
315
+ * @returns True if the language is supported for transcription, false otherwise
316
+ */
317
+ declare function isTranscriptionSupported(code: string): boolean;
318
+
319
+ /**
320
+ * Atticus - A framework-agnostic voice agent for voice-controlled UI interactions.
321
+ *
322
+ * @example
323
+ * ```ts
324
+ * import { Atticus } from '@rajnandan1/atticus';
325
+ *
326
+ * const agent = new Atticus({
327
+ * clientSecret: 'ek_...',
328
+ * agent: {
329
+ * name: 'Assistant',
330
+ * instructions: 'You are a helpful assistant.'
331
+ * }
332
+ * });
333
+ *
334
+ * agent.on('connected', () => console.log('Connected!'));
335
+ * agent.on('message', (msg) => console.log('Message:', msg));
336
+ *
337
+ * await agent.connect();
338
+ * ```
339
+ */
340
+ declare class Atticus {
341
+ private config;
342
+ private agent;
343
+ private session;
344
+ private listeners;
345
+ private _status;
346
+ private _conversationState;
347
+ private _error;
348
+ private _history;
349
+ private _messageIdCounter;
350
+ private _actionIdCounter;
351
+ private _currentDOM;
352
+ private _autoUpdateTimer;
353
+ /**
354
+ * Create a new Atticus instance.
355
+ *
356
+ * @param config - Configuration options (see `AtticusConfig`)
357
+ */
358
+ constructor(config: AtticusConfig);
359
+ private createGetUIStateTool;
360
+ private createUIActionTool;
361
+ private buildInstructions;
362
+ private getLanguageDirective;
363
+ private getUIDirective;
364
+ /**
365
+ * Subscribe to an event.
366
+ *
367
+ * @param event - The event name (a key of `AtticusEvents`)
368
+ * @param callback - The callback function; its signature is the matching `AtticusEvents` entry
369
+ * @returns A function to unsubscribe
370
+ */
371
+ on<T extends AtticusEventName>(event: T, callback: AtticusEvents[T]): () => void;
372
+ /**
373
+ * Subscribe to an event for one invocation only.
374
+ *
375
+ * @param event - The event name (a key of `AtticusEvents`)
376
+ * @param callback - The callback function; its signature is the matching `AtticusEvents` entry
377
+ * @returns A function to unsubscribe
378
+ */
379
+ once<T extends AtticusEventName>(event: T, callback: AtticusEvents[T]): () => void;
380
+ /**
381
+ * Unsubscribe from an event.
382
+ *
383
+ * @param event - The event name
384
+ * @param callback - The callback function to remove
385
+ */
386
+ off<T extends AtticusEventName>(event: T, callback: AtticusEvents[T]): void;
387
+ /**
388
+ * Remove all event listeners.
389
+ */
390
+ removeAllListeners(): void;
391
+ /**
392
+ * Get the current connection status.
393
+ */
394
+ get status(): AtticusStatus;
395
+ /**
396
+ * Get the current conversation state.
397
+ */
398
+ get conversationState(): ConversationState;
399
+ /**
400
+ * Get the current error message, or null if there is none.
401
+ */
402
+ get error(): string | null;
403
+ /**
404
+ * Get the conversation history.
405
+ */
406
+ get history(): Message[];
407
+ /**
408
+ * Check if the agent is currently connected.
409
+ */
410
+ get isConnected(): boolean;
411
+ /**
412
+ * Check if the AI is currently speaking.
413
+ */
414
+ get isAiSpeaking(): boolean;
415
+ /**
416
+ * Check if the user is currently speaking.
417
+ */
418
+ get isUserSpeaking(): boolean;
419
+ /**
420
+ * Get the configured language code.
421
+ */
422
+ get language(): string;
423
+ /**
424
+ * Get the complete current state as a single object.
425
+ */
426
+ getState(): AtticusState;
427
+ /**
428
+ * Connect to the voice agent and start a conversation.
429
+ *
430
+ * @throws Error if connection fails
431
+ */
432
+ connect(): Promise<void>;
433
+ /**
434
+ * Disconnect from the voice agent and end the conversation.
435
+ */
436
+ disconnect(): void;
437
+ /**
438
+ * Interrupt the AI while it's speaking.
439
+ */
440
+ interrupt(): void;
441
+ /**
442
+ * Send a text message to the agent.
443
+ *
444
+ * @param message - The text message to send
445
+ */
446
+ sendMessage(message: string): void;
447
+ /**
448
+ * Toggle the connection state (presumably connects when disconnected and disconnects when connected; confirm against the implementation).
449
+ */
450
+ toggle(): Promise<void>;
451
+ /**
452
+ * Destroy the agent instance and clean up all resources.
453
+ */
454
+ destroy(): void;
455
+ /**
456
+ * Update the DOM context manually.
457
+ *
458
+ * @param dom - The DOM Element or HTML string
459
+ */
460
+ updateDOM(dom: string | Element): Promise<void>;
461
+ /**
462
+ * Refresh the DOM context from the configured root element.
463
+ */
464
+ refreshDOM(): Promise<void>;
465
+ /**
466
+ * Start auto-updating the DOM context.
467
+ */
468
+ startAutoUpdate(): void;
469
+ /**
470
+ * Stop auto-updating the DOM context.
471
+ */
472
+ stopAutoUpdate(): void;
473
+ /**
474
+ * Get the current DOM context (may be null).
475
+ */
476
+ get currentDOM(): string | null;
477
+ /**
478
+ * Check if UI mode is enabled.
479
+ */
480
+ get isUIEnabled(): boolean;
481
+ private setupSessionListeners;
482
+ private handleHistoryUpdate;
483
+ private setStatus;
484
+ private setConversationState;
485
+ private emitStateChange;
486
+ private emit;
487
+ private log;
488
+ private captureDOM;
489
+ private sendDOMContext;
490
+ /**
491
+ * Get the text content of a message.
492
+ *
493
+ * @param message - The message to extract text from
494
+ * @returns The text content or transcript. NOTE(review): an audio transcript may be null while this returns `string`; confirm the fallback value.
495
+ */
496
+ getMessageText(message: Message): string;
497
+ /**
498
+ * Execute a UI action's code.
499
+ *
500
+ * @param action - The UI action to execute
501
+ * @returns A promise for an object with a `success` flag and optional `result` and `error` fields
502
+ */
503
+ executeAction(action: UIAction): Promise<{
504
+ success: boolean;
505
+ result?: unknown;
506
+ error?: string;
507
+ }>;
508
+ }
509
+
510
+ export { type AgentConfig, Atticus, type AtticusConfig, type AtticusEventName, type AtticusEvents, type AtticusState, type AtticusStatus, type AtticusVoice, type ConversationState, type D2SnapOptions, LANGUAGE_GREETINGS, LANGUAGE_NAMES, type Message, type MessageContent, SUPPORTED_TRANSCRIPTION_LANGUAGES, type UIAction, type UIActionType, type UIConfig, getLanguageGreeting, getLanguageName, isTranscriptionSupported };