livekit-plugins-aws 1.0.0rc6__py3-none-any.whl → 1.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/aws/__init__.py +47 -7
- livekit/plugins/aws/experimental/realtime/__init__.py +11 -0
- livekit/plugins/aws/experimental/realtime/events.py +545 -0
- livekit/plugins/aws/experimental/realtime/pretty_printer.py +49 -0
- livekit/plugins/aws/experimental/realtime/realtime_model.py +2106 -0
- livekit/plugins/aws/experimental/realtime/turn_tracker.py +171 -0
- livekit/plugins/aws/experimental/realtime/types.py +38 -0
- livekit/plugins/aws/llm.py +109 -71
- livekit/plugins/aws/log.py +4 -0
- livekit/plugins/aws/models.py +4 -3
- livekit/plugins/aws/stt.py +214 -71
- livekit/plugins/aws/tts.py +96 -116
- livekit/plugins/aws/utils.py +29 -125
- livekit/plugins/aws/version.py +1 -1
- livekit_plugins_aws-1.3.9.dist-info/METADATA +385 -0
- livekit_plugins_aws-1.3.9.dist-info/RECORD +18 -0
- {livekit_plugins_aws-1.0.0rc6.dist-info → livekit_plugins_aws-1.3.9.dist-info}/WHEEL +1 -1
- livekit_plugins_aws-1.0.0rc6.dist-info/METADATA +0 -43
- livekit_plugins_aws-1.0.0rc6.dist-info/RECORD +0 -12
livekit/plugins/aws/__init__.py
CHANGED
|
@@ -12,19 +12,59 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
from .stt import STT, SpeechStream
|
|
17
|
-
from .tts import TTS, ChunkedStream
|
|
18
|
-
from .version import __version__
|
|
15
|
+
"""AWS plugin for LiveKit Agents
|
|
19
16
|
|
|
20
|
-
|
|
17
|
+
Support for AWS AI including Bedrock, Polly, Transcribe and optionally Nova Sonic.
|
|
21
18
|
|
|
22
|
-
|
|
19
|
+
See https://docs.livekit.io/agents/integrations/aws/ for more information.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import typing # noqa: I001
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
if typing.TYPE_CHECKING:
|
|
26
|
+
from .experimental import realtime
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def __getattr__(name: str) -> typing.Any:
    """Lazily resolve the optional ``realtime`` submodule (PEP 562 module getattr).

    The realtime integration pulls in extra dependencies, so it is imported only
    on first attribute access rather than at package import time.
    """
    if name != "realtime":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    try:
        from .experimental import realtime
    except ImportError as e:
        raise ImportError(
            "The 'realtime' module requires optional dependencies. "
            "Please install them with: pip install 'livekit-plugins-aws[realtime]'"
        ) from e

    return realtime
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
from .llm import LLM # noqa: E402
|
|
45
|
+
from .stt import STT, SpeechStream # noqa: E402
|
|
46
|
+
from .tts import TTS, ChunkedStream # noqa: E402
|
|
47
|
+
from .version import __version__ # noqa: E402
|
|
48
|
+
|
|
49
|
+
__all__ = ["STT", "SpeechStream", "TTS", "ChunkedStream", "LLM", "realtime", "__version__"]
|
|
50
|
+
|
|
51
|
+
from livekit.agents import Plugin # noqa: E402
|
|
52
|
+
|
|
53
|
+
from .log import logger # noqa: E402
|
|
23
54
|
|
|
24
55
|
|
|
25
56
|
class AWSPlugin(Plugin):
    """LiveKit plugin registration entry point for the AWS integrations."""

    def __init__(self) -> None:
        # Pass the package logger so the agents framework routes plugin logs
        # through the shared logging configuration.
        super().__init__(__name__, __version__, __package__, logger)


# Register at import time so the plugin is discoverable by the agents runtime.
Plugin.register_plugin(AWSPlugin())
|
|
62
|
+
|
|
63
|
+
# Cleanup docs of unexported modules
# Hide every module-level name that is not part of the public API from pdoc.
_module = dir()
NOT_IN_ALL = [name for name in _module if name not in __all__]

__pdoc__ = {hidden: False for hidden in NOT_IN_ALL}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .realtime_model import RealtimeModel, RealtimeSession
|
|
2
|
+
from .types import MODALITIES, REALTIME_MODELS, SONIC1_VOICES, SONIC2_VOICES
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"RealtimeSession",
|
|
6
|
+
"RealtimeModel",
|
|
7
|
+
"MODALITIES",
|
|
8
|
+
"REALTIME_MODELS",
|
|
9
|
+
"SONIC1_VOICES",
|
|
10
|
+
"SONIC2_VOICES",
|
|
11
|
+
]
|
|
@@ -0,0 +1,545 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import uuid
|
|
3
|
+
from typing import Any, Literal, Optional, Union, cast
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel as _BaseModel, ConfigDict, Field
|
|
6
|
+
|
|
7
|
+
from livekit.agents import llm
|
|
8
|
+
|
|
9
|
+
from ...log import logger
|
|
10
|
+
from .types import TURN_DETECTION
|
|
11
|
+
|
|
12
|
+
# Wire-format vocabulary for the Nova Sonic bidirectional stream.
# These Literal aliases double as documentation of the values the protocol accepts.
MEDIA_TYPE = Literal["text/plain", "audio/lpcm", "application/json"]
TYPE = Literal["TEXT", "AUDIO", "TOOL"]
ROLE = Literal["USER", "ASSISTANT", "TOOL", "SYSTEM"]
GENERATION_STAGE = Literal["SPECULATIVE", "FINAL"]
STOP_REASON = Literal["PARTIAL_TURN", "END_TURN", "INTERRUPTED"]
SAMPLE_RATE_HERTZ = Literal[8_000, 16_000, 24_000]
AUDIO_ENCODING = Literal["base64"]  # all audio data must be base64 encoded
SAMPLE_SIZE_BITS = Literal[16]  # only supports 16-bit audio
CHANNEL_COUNT = Literal[1]  # only supports monochannel audio
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BaseModel(_BaseModel):
    """Shared base for all Sonic event models.

    ``populate_by_name`` lets fields be set by either name or alias (needed for
    ``ToolInputSchema.json_``); ``extra="forbid"`` rejects unknown keys so wire
    payloads that drift from the schema fail loudly.
    """

    model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class InferenceConfiguration(BaseModel):
    """Sampling parameters for the session; fields are frozen after construction."""

    maxTokens: int = Field(default=1024, ge=1, le=10_000, frozen=True)
    topP: float = Field(default=0.9, ge=0.0, le=1.0, frozen=True)
    temperature: float = Field(default=0.7, ge=0.0, le=1.0, frozen=True)


class AudioInputConfiguration(BaseModel):
    """Format of microphone audio sent to the model (16-bit mono base64 LPCM)."""

    mediaType: MEDIA_TYPE = "audio/lpcm"
    sampleRateHertz: SAMPLE_RATE_HERTZ = Field(default=16000)
    sampleSizeBits: SAMPLE_SIZE_BITS = 16
    channelCount: CHANNEL_COUNT = 1
    audioType: str = "SPEECH"
    encoding: AUDIO_ENCODING = "base64"


class AudioOutputConfiguration(BaseModel):
    """Format of synthesized audio returned by the model; ``voiceId`` is required."""

    mediaType: MEDIA_TYPE = "audio/lpcm"
    sampleRateHertz: SAMPLE_RATE_HERTZ = Field(default=24_000)
    sampleSizeBits: SAMPLE_SIZE_BITS = 16
    channelCount: CHANNEL_COUNT = 1
    voiceId: str = Field(...)
    encoding: AUDIO_ENCODING = "base64"
    audioType: str = "SPEECH"


class TextInputConfiguration(BaseModel):
    """Plain-text input content configuration."""

    mediaType: MEDIA_TYPE = "text/plain"


class TextOutputConfiguration(BaseModel):
    """Plain-text output content configuration."""

    mediaType: MEDIA_TYPE = "text/plain"


class ToolUseOutputConfiguration(BaseModel):
    """Tool-use payloads are emitted by the model as JSON."""

    mediaType: MEDIA_TYPE = "application/json"


class ToolResultInputConfiguration(BaseModel):
    """Links a tool result back to the originating ``toolUseId``."""

    toolUseId: str
    type: TYPE = "TEXT"
    # NOTE: pydantic deep-copies model defaults per instance, so this shared
    # default instance is safe (unlike a plain mutable default argument).
    textInputConfiguration: TextInputConfiguration = TextInputConfiguration()


class ToolInputSchema(BaseModel):
    """JSON-schema string for a tool's parameters, serialized under the key ``json``."""

    # Defaults to an empty object schema; ``json`` is a reserved-looking name,
    # hence the trailing-underscore field with an alias.
    json_: str = Field(
        default_factory=lambda: json.dumps(
            {
                "type": "object",
                "properties": {},
                "required": [],
            }
        ),
        alias="json",
    )


class ToolSpec(BaseModel):
    """Declaration of a single callable tool (name, description, parameter schema)."""

    name: str
    description: str
    inputSchema: ToolInputSchema


class Tool(BaseModel):
    """Wrapper matching the wire shape ``{"toolSpec": {...}}``."""

    toolSpec: ToolSpec


class ToolConfiguration(BaseModel):
    """Set of tools exposed to the model plus an optional tool-choice directive."""

    toolChoice: Optional[dict[str, dict[str, str]]] = None
    tools: list[Tool]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class SessionStart(BaseModel):
    """Payload opening a session; carries inference and endpointing settings."""

    inferenceConfiguration: InferenceConfiguration
    # None means no explicit sensitivity preference is sent.
    endpointingSensitivity: Optional[TURN_DETECTION] = "MEDIUM"


class InputTextContentStart(BaseModel):
    """Opens a text content block (system prompt or chat-history message)."""

    promptName: str
    contentName: str
    type: TYPE = "TEXT"
    interactive: bool = False
    role: ROLE
    textInputConfiguration: TextInputConfiguration


class InputAudioContentStart(BaseModel):
    """Opens the (interactive) user audio content block."""

    promptName: str
    contentName: str
    type: TYPE = "AUDIO"
    interactive: bool = True
    role: ROLE = "USER"
    audioInputConfiguration: AudioInputConfiguration


class InputToolContentStart(BaseModel):
    """Opens a tool-result content block."""

    promptName: str
    contentName: str
    type: TYPE = "TOOL"
    interactive: bool = False
    role: ROLE = "TOOL"
    toolResultInputConfiguration: ToolResultInputConfiguration


class PromptStart(BaseModel):
    """Payload configuring output formats and the tool set for a prompt."""

    promptName: str
    textOutputConfiguration: TextOutputConfiguration
    audioOutputConfiguration: AudioOutputConfiguration
    toolUseOutputConfiguration: ToolUseOutputConfiguration
    toolConfiguration: ToolConfiguration


class TextInput(BaseModel):
    """One chunk of text content within an open text block."""

    promptName: str
    contentName: str
    content: str


class AudioInput(BaseModel):
    """One chunk of audio content; ``content`` is base64-encoded (see AUDIO_ENCODING)."""

    promptName: str
    contentName: str
    content: str


class ToolResult(BaseModel):
    """Tool execution result content (stringified JSON or plain text)."""

    promptName: str
    contentName: str
    content: str


class ContentEndEvent(BaseModel):
    """Closes an open content block."""

    promptName: str
    contentName: str


class PromptEnd(BaseModel):
    """Closes the prompt identified by ``promptName``."""

    promptName: str


class SessionEnd(BaseModel):
    """Closes the session; carries no fields."""

    pass
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# Single-key wrapper models: each wraps one payload under the field name the
# Sonic wire protocol expects (e.g. {"sessionStart": {...}}).


class SessionStartEvent(BaseModel):
    sessionStart: SessionStart


class InputTextContentStartEvent(BaseModel):
    contentStart: InputTextContentStart


class InputAudioContentStartEvent(BaseModel):
    contentStart: InputAudioContentStart


class InputToolContentStartEvent(BaseModel):
    contentStart: InputToolContentStart


class PromptStartEvent(BaseModel):
    promptStart: PromptStart


class TextInputContentEvent(BaseModel):
    textInput: TextInput


class AudioInputContentEvent(BaseModel):
    audioInput: AudioInput


class ToolResultContentEvent(BaseModel):
    toolResult: ToolResult


class InputContentEndEvent(BaseModel):
    contentEnd: ContentEndEvent


class PromptEndEvent(BaseModel):
    promptEnd: PromptEnd


class SessionEndEvent(BaseModel):
    sessionEnd: SessionEnd


class Event(BaseModel):
    """Top-level envelope: every outbound message is ``{"event": <one payload>}``.

    The union has no explicit discriminator; the distinct inner field names
    (``sessionStart``, ``textInput``, ...) disambiguate validation.
    """

    event: Union[
        SessionStartEvent,
        InputTextContentStartEvent,
        InputAudioContentStartEvent,
        InputToolContentStartEvent,
        PromptStartEvent,
        TextInputContentEvent,
        AudioInputContentEvent,
        ToolResultContentEvent,
        InputContentEndEvent,
        PromptEndEvent,
        SessionEndEvent,
    ]
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class SonicEventBuilder:
    """Builds the JSON strings sent over the Nova Sonic bidirectional stream.

    Each ``create_*_event`` method serializes one ``Event`` envelope; the
    ``create_*_block`` helpers bundle the start/content/end sequences the
    protocol expects. ``prompt_name`` and ``audio_content_name`` identify the
    long-lived prompt and its audio content stream for the whole session.
    """

    def __init__(self, prompt_name: str, audio_content_name: str):
        self.prompt_name = prompt_name
        self.audio_content_name = audio_content_name

    @classmethod
    def get_event_type(cls, json_data: dict) -> str:
        """Classify an incoming wire event dict into a coarse event-type label.

        Raises ``ValueError`` when the dict has no truthy ``event`` payload.
        """
        payload = json_data.get("event")
        if not payload:
            raise ValueError(f"Unknown event type: {json_data}")

        # Hoist the two nested lookups the original chain repeated per branch.
        start_kind = payload.get("contentStart", {}).get("type")
        end_kind = payload.get("contentEnd", {}).get("type")

        if start_kind == "AUDIO":
            return "audio_output_content_start"
        if end_kind == "AUDIO":
            return "audio_output_content_end"
        if start_kind == "TEXT":
            return "text_output_content_start"
        if end_kind == "TEXT":
            return "text_output_content_end"
        if start_kind == "TOOL":
            return "tool_output_content_start"
        if end_kind == "TOOL":
            return "tool_output_content_end"
        if payload.get("textOutput"):
            return "text_output_content"
        if payload.get("audioOutput"):
            return "audio_output_content"
        if payload.get("toolUse"):
            return "tool_output_content"
        if "completionStart" in payload:
            return "completion_start"
        if "completionEnd" in payload:
            return "completion_end"
        if "usageEvent" in payload:
            return "usage"
        return "other_event"

    def create_text_content_block(
        self,
        content_name: str,
        role: ROLE,
        content: str,
    ) -> list[str]:
        """Start/content/end triple carrying a single text message."""
        opening = self.create_text_content_start_event(content_name, role)
        body = self.create_text_content_event(content_name, content)
        closing = self.create_content_end_event(content_name)
        return [opening, body, closing]

    def create_tool_content_block(
        self,
        content_name: str,
        tool_use_id: str,
        content: str,
    ) -> list[str]:
        """Start/result/end triple delivering one tool result."""
        opening = self.create_tool_content_start_event(content_name, tool_use_id)
        body = self.create_tool_result_event(content_name, content)
        closing = self.create_content_end_event(content_name)
        return [opening, body, closing]

    def create_prompt_end_block(self) -> list[str]:
        """Events that close the audio stream, the prompt, and the session."""
        return [
            self.create_content_end_event(self.audio_content_name, is_audio=True),
            self.create_prompt_end_event(),
            self.create_session_end_event(),
        ]

    def create_prompt_start_block(
        self,
        voice_id: str,
        sample_rate: SAMPLE_RATE_HERTZ,
        system_content: str,
        chat_ctx: llm.ChatContext,
        tool_configuration: Optional[Union[ToolConfiguration, dict[str, Any], str]] = None,
        max_tokens: int = 1024,
        top_p: float = 0.9,
        temperature: float = 0.7,
        endpointing_sensitivity: Optional[TURN_DETECTION] = "MEDIUM",
    ) -> list[str]:
        """Session-start, prompt-start, system prompt, plus any prior chat history."""
        events = [
            self.create_session_start_event(
                max_tokens, top_p, temperature, endpointing_sensitivity
            ),
            self.create_prompt_start_event(voice_id, sample_rate, tool_configuration),
        ]
        events.extend(
            self.create_text_content_block(str(uuid.uuid4()), "SYSTEM", system_content)
        )

        # note: tool call events are not supported yet
        if chat_ctx.items:
            logger.debug("initiating session with chat context")
            for item in chat_ctx.items:
                if item.type != "message":
                    continue

                role = item.role.upper()
                if role not in ["USER", "ASSISTANT", "SYSTEM"]:
                    continue

                # Only string parts of the message content are forwarded.
                text = "".join(part for part in item.content if isinstance(part, str))
                events.extend(
                    self.create_text_content_block(
                        str(uuid.uuid4()),
                        cast(ROLE, role),
                        text,
                    )
                )

        return events

    def create_session_start_event(
        self,
        max_tokens: int = 1024,
        top_p: float = 0.9,
        temperature: float = 0.7,
        endpointing_sensitivity: Optional[TURN_DETECTION] = "MEDIUM",
    ) -> str:
        """Serialized sessionStart event."""
        inference = InferenceConfiguration(
            maxTokens=max_tokens,
            topP=top_p,
            temperature=temperature,
        )
        envelope = Event(
            event=SessionStartEvent(
                sessionStart=SessionStart(
                    inferenceConfiguration=inference,
                    endpointingSensitivity=endpointing_sensitivity,
                )
            )
        )
        # exclude_none=False on purpose: a None endpointingSensitivity is still sent.
        return envelope.model_dump_json(exclude_none=False)

    def create_audio_content_start_event(
        self,
        sample_rate: SAMPLE_RATE_HERTZ = 16_000,
    ) -> str:
        """Serialized contentStart event opening the session's audio stream."""
        envelope = Event(
            event=InputAudioContentStartEvent(
                contentStart=InputAudioContentStart(
                    promptName=self.prompt_name,
                    contentName=self.audio_content_name,
                    audioInputConfiguration=AudioInputConfiguration(
                        sampleRateHertz=sample_rate,
                    ),
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_text_content_start_event(
        self,
        content_name: str,
        role: ROLE,
    ) -> str:
        """Serialized contentStart event opening a non-interactive text block."""
        envelope = Event(
            event=InputTextContentStartEvent(
                contentStart=InputTextContentStart(
                    promptName=self.prompt_name,
                    contentName=content_name,
                    role=role,
                    textInputConfiguration=TextInputConfiguration(),
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_text_content_start_event_interactive(
        self,
        content_name: str,
        role: ROLE,
    ) -> str:
        """Create text content start event with interactive=True for Nova Sonic 2.0."""
        envelope = Event(
            event=InputTextContentStartEvent(
                contentStart=InputTextContentStart(
                    promptName=self.prompt_name,
                    contentName=content_name,
                    role=role,
                    interactive=True,
                    textInputConfiguration=TextInputConfiguration(),
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_tool_content_start_event(
        self,
        content_name: str,
        tool_use_id: str,
    ) -> str:
        """Serialized contentStart event opening a tool-result block."""
        envelope = Event(
            event=InputToolContentStartEvent(
                contentStart=InputToolContentStart(
                    promptName=self.prompt_name,
                    contentName=content_name,
                    toolResultInputConfiguration=ToolResultInputConfiguration(
                        toolUseId=tool_use_id,
                        textInputConfiguration=TextInputConfiguration(),
                    ),
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_audio_input_event(
        self,
        audio_content: str,
    ) -> str:
        """Serialized audioInput event carrying one base64-encoded audio chunk."""
        envelope = Event(
            event=AudioInputContentEvent(
                audioInput=AudioInput(
                    promptName=self.prompt_name,
                    contentName=self.audio_content_name,
                    content=audio_content,
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_text_content_event(
        self,
        content_name: str,
        content: str,
    ) -> str:
        """Serialized textInput event carrying one text chunk."""
        envelope = Event(
            event=TextInputContentEvent(
                textInput=TextInput(
                    promptName=self.prompt_name,
                    contentName=content_name,
                    content=content,
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_tool_result_event(
        self,
        content_name: str,
        content: Union[str, dict[str, Any]],
    ) -> str:
        """Serialized toolResult event; dict results are JSON-encoded first."""
        content_str = json.dumps(content) if isinstance(content, dict) else content

        envelope = Event(
            event=ToolResultContentEvent(
                toolResult=ToolResult(
                    promptName=self.prompt_name,
                    contentName=content_name,
                    content=content_str,
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_content_end_event(
        self,
        content_name: str,
        is_audio: bool = False,
    ) -> str:
        """Serialized contentEnd event; ``is_audio`` targets the session audio stream."""
        # When closing the audio stream, the builder's own audio content name wins.
        target_name = self.audio_content_name if is_audio else content_name
        envelope = Event(
            event=InputContentEndEvent(
                contentEnd=ContentEndEvent(
                    promptName=self.prompt_name,
                    contentName=target_name,
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_prompt_end_event(self) -> str:
        """Serialized promptEnd event for this builder's prompt."""
        envelope = Event(
            event=PromptEndEvent(
                promptEnd=PromptEnd(promptName=self.prompt_name),
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_session_end_event(self) -> str:
        """Serialized sessionEnd event."""
        envelope = Event(
            event=SessionEndEvent(sessionEnd=SessionEnd()),
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)

    def create_prompt_start_event(
        self,
        voice_id: str,
        sample_rate: SAMPLE_RATE_HERTZ,
        tool_configuration: Optional[Union[ToolConfiguration, dict[str, Any], str]] = None,
    ) -> str:
        """Serialized promptStart event configuring output formats and tools.

        ``tool_configuration`` may be a ``ToolConfiguration``, a raw dict, or a
        JSON string; ``None`` means "no tools".
        """
        if tool_configuration is None:
            config = ToolConfiguration(tools=[])
        elif isinstance(tool_configuration, str):
            config = ToolConfiguration.model_validate_json(tool_configuration)
        elif isinstance(tool_configuration, dict):
            config = ToolConfiguration.model_validate(tool_configuration)
        else:
            config = tool_configuration

        for tool in config.tools:
            logger.debug(f"TOOL JSON SCHEMA: {tool.toolSpec.inputSchema}")

        envelope = Event(
            event=PromptStartEvent(
                promptStart=PromptStart(
                    promptName=self.prompt_name,
                    textOutputConfiguration=TextOutputConfiguration(),
                    audioOutputConfiguration=AudioOutputConfiguration(
                        voiceId=voice_id, sampleRateHertz=sample_rate
                    ),
                    toolUseOutputConfiguration=ToolUseOutputConfiguration(),
                    toolConfiguration=ToolConfiguration(
                        tools=list(config.tools), toolChoice=config.toolChoice
                    ),
                )
            )
        )
        return envelope.model_dump_json(exclude_none=True, by_alias=True)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from .events import SonicEventBuilder
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger("livekit.plugins.aws")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# https://jakob-bagterp.github.io/colorist-for-python/ansi-escape-codes/standard-16-colors/#bright-colors
class AnsiColors:
    """ANSI escape sequences used to colorize debug log output."""

    # Bright foreground colors.
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"

    # Text attributes; ENDC resets all styling.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
    ENDC = "\033[0m"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Maps every label returned by SonicEventBuilder.get_event_type to a color:
# audio=green, text=blue, tool=yellow, completion=magenta, usage=cyan.
EVENT_COLOR_MAP = {
    "audio_output_content_start": AnsiColors.GREEN,
    "audio_output_content_end": AnsiColors.GREEN,
    "text_output_content_start": AnsiColors.BLUE,
    "text_output_content_end": AnsiColors.BLUE,
    "tool_output_content_start": AnsiColors.YELLOW,
    "tool_output_content_end": AnsiColors.YELLOW,
    "text_output_content": AnsiColors.BLUE,
    "audio_output_content": AnsiColors.GREEN,
    "tool_output_content": AnsiColors.YELLOW,
    "completion_start": AnsiColors.MAGENTA,
    "completion_end": AnsiColors.MAGENTA,
    "usage": AnsiColors.CYAN,
    "other_event": AnsiColors.UNDERLINE,
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def log_event_data(event_data: dict) -> None:
    """Log a Sonic wire event at DEBUG level, colorized by its event type.

    Raises ``ValueError`` (from ``get_event_type``) when *event_data* has no
    truthy ``event`` payload.
    """
    # Skip the comparatively expensive event classification and
    # json.dumps(indent=2) pretty-printing when DEBUG logging is disabled;
    # the original f-string evaluated both eagerly on every call.
    if not logger.isEnabledFor(logging.DEBUG):
        return
    event_type = SonicEventBuilder.get_event_type(event_data)
    color = EVENT_COLOR_MAP[event_type]
    logger.debug(
        f"{color}{event_type.upper()}: {json.dumps(event_data, indent=2)}{AnsiColors.ENDC}"
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def log_message(message: str, color: str) -> None:
    """Emit *message* at DEBUG level wrapped in the given ANSI color code."""
    logger.debug(color + message + AnsiColors.ENDC)
|