voiceground-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,546 @@
+ """VoicegroundObserver - Track conversation events from pipecat pipelines."""
+
+ import uuid
+ from collections.abc import Callable
+ from dataclasses import dataclass, field
+ from typing import TYPE_CHECKING, Any
+
+ from pipecat.frames.frames import (
+     BotStartedSpeakingFrame,
+     BotStoppedSpeakingFrame,
+     CancelFrame,
+     EndFrame,
+     FunctionCallCancelFrame,
+     FunctionCallResultFrame,
+     FunctionCallsStartedFrame,
+     LLMContextFrame,
+     LLMFullResponseEndFrame,
+     LLMRunFrame,
+     LLMTextFrame,
+     StartFrame,
+     TranscriptionFrame,
+     TTSAudioRawFrame,
+     TTSStartedFrame,
+     TTSStoppedFrame,
+     UserStartedSpeakingFrame,
+     UserStoppedSpeakingFrame,
+     VADUserStartedSpeakingFrame,
+ )
+ from pipecat.observers.base_observer import BaseObserver, FrameProcessed, FramePushed
+ from pipecat.processors.aggregators.llm_response import (
+     LLMAssistantContextAggregator,
+     LLMUserContextAggregator,
+ )
+ from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContextFrame
+ from pipecat.processors.frame_processor import FrameDirection
+ from pipecat.services.llm_service import LLMService
+
+ from voiceground.events import EventCategory, EventType, VoicegroundEvent
+
+ if TYPE_CHECKING:
+     from voiceground.reporters.base import BaseReporter
+
+
+ @dataclass
+ class FrameTrigger:
+     """Trigger configuration for a specific frame type.
+
+     Attributes:
+         frame: The frame type that can trigger this event.
+         source_class: Only trigger if source is an instance of one of these classes.
+         direction: Only trigger if frame is pushed in this direction (None = any direction).
+         data_extractor: Optional function to extract event data from the frame.
+             Takes the frame as argument and returns a dict to merge into event.data.
+     """
+
+     frame: type
+     source_class: tuple[type, ...] | None = None
+     direction: FrameDirection | None = None
+     data_extractor: Callable[[Any], dict[str, Any]] | None = field(default=None)
+
+
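As an illustration of how these fields combine (an editorial sketch, not part of the packaged source), a trigger that fires only on downstream TranscriptionFrames and copies the transcript into the event's data would be declared as:

    transcript_trigger = FrameTrigger(
        frame=TranscriptionFrame,
        direction=FrameDirection.DOWNSTREAM,  # ignore upstream copies of the frame
        data_extractor=lambda f: {"text": getattr(f, "text", "") or ""},
    )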
+ @dataclass
+ class EventTrigger:
+     """Trigger configuration for an event type.
+
+     Attributes:
+         frame_triggers: List of frame triggers, each with its own optional source filter.
+     """
+
+     frame_triggers: list[FrameTrigger]
+
+
+ @dataclass
+ class CategoryEvents:
+     """Event configuration for a category.
+
+     Each category can define:
+     - start: Trigger for the start event (closes category gate until end)
+     - end: Trigger for the end event (reopens category gate)
+     - first_byte: Trigger for first byte event (auto-resets on start)
+     - data_accumulator: Function to accumulate data from frames between start and end.
+       Called with (frame, accumulated_data) and returns updated accumulated_data dict.
+       The accumulated data is merged into the end event's data.
+
+     Attributes:
+         category: The event category.
+         description: Human-readable description of what this category measures.
+         start: Start event trigger (optional).
+         end: End event trigger (optional).
+         first_byte: First byte event trigger (optional, resets on start).
+         data_accumulator: Function to accumulate data from frames (optional).
+     """
+
+     category: EventCategory
+     description: str = ""
+     start: EventTrigger | None = None
+     end: EventTrigger | None = None
+     first_byte: EventTrigger | None = None
+     data_accumulator: Callable[[Any, dict[str, Any]], dict[str, Any]] | None = field(default=None)
+
+
+ # =============================================================================
+ # EVENT CONFIGURATION - Edit this to customize event tracking
+ # =============================================================================
+ # Each category defines triggers for start, end, and first_byte events.
+ #
+ # Gates (implicit, per category):
+ #   - start: closes the category gate (prevents re-firing until end)
+ #   - end: reopens the category gate
+ #   - first_byte: only fires once per start (auto-resets when start fires)
+ # =============================================================================
+
+ CATEGORIES: list[CategoryEvents] = [
+     # --- User Speech ---
+     CategoryEvents(
+         category=EventCategory.USER_SPEAK,
+         start=EventTrigger(frame_triggers=[FrameTrigger(frame=UserStartedSpeakingFrame)]),
+         end=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=UserStoppedSpeakingFrame,
+                     direction=FrameDirection.DOWNSTREAM,  # Only process DOWNSTREAM to avoid duplicates
+                 )
+             ]
+         ),
+     ),
+     # --- Bot Speech ---
+     CategoryEvents(
+         category=EventCategory.BOT_SPEAK,
+         start=EventTrigger(frame_triggers=[FrameTrigger(frame=BotStartedSpeakingFrame)]),
+         end=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=BotStoppedSpeakingFrame,
+                     direction=FrameDirection.DOWNSTREAM,  # Only process DOWNSTREAM to avoid duplicates
+                 )
+             ]
+         ),
+     ),
+     # --- STT (Speech-to-Text) ---
+     CategoryEvents(
+         category=EventCategory.STT,
+         start=EventTrigger(frame_triggers=[FrameTrigger(frame=VADUserStartedSpeakingFrame)]),
+         end=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=TranscriptionFrame,
+                     data_extractor=lambda f: {"text": getattr(f, "text", "") or ""},
+                 )
+             ]
+         ),
+     ),
+     # --- LLM (Large Language Model) ---
+     CategoryEvents(
+         category=EventCategory.LLM,
+         start=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=OpenAILLMContextFrame,
+                     source_class=(LLMUserContextAggregator,),
+                 ),
+                 FrameTrigger(
+                     frame=LLMContextFrame,
+                     source_class=(LLMUserContextAggregator,),
+                 ),
+                 FrameTrigger(
+                     frame=OpenAILLMContextFrame,
+                     source_class=(LLMAssistantContextAggregator,),
+                     direction=FrameDirection.UPSTREAM,  # Only when triggering LLM run after tool calls
+                 ),
+                 FrameTrigger(
+                     frame=LLMContextFrame,
+                     source_class=(LLMAssistantContextAggregator,),
+                     direction=FrameDirection.UPSTREAM,  # Only when triggering LLM run after tool calls
+                 ),
+                 FrameTrigger(
+                     frame=LLMRunFrame,
+                 ),
+             ]
+         ),
+         end=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=LLMFullResponseEndFrame,
+                     source_class=(LLMService,),  # Only from LLM services, not downstream services
+                 )
+             ]
+         ),
+         first_byte=EventTrigger(frame_triggers=[FrameTrigger(frame=LLMTextFrame)]),
+         data_accumulator=lambda frame, acc: {
+             **acc,
+             "text": acc.get("text", "") + (getattr(frame, "text", "") or ""),
+         }
+         if isinstance(frame, LLMTextFrame)
+         else acc,
+     ),
+     # --- TTS (Text-to-Speech) ---
+     CategoryEvents(
+         category=EventCategory.TTS,
+         start=EventTrigger(frame_triggers=[FrameTrigger(frame=TTSStartedFrame)]),
+         end=EventTrigger(frame_triggers=[FrameTrigger(frame=TTSStoppedFrame)]),
+         first_byte=EventTrigger(frame_triggers=[FrameTrigger(frame=TTSAudioRawFrame)]),
+     ),
+     # --- Tool Calling (Function Calling) ---
+     CategoryEvents(
+         category=EventCategory.TOOL_CALL,
+         description="LLM function/tool calling",
+         start=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=FunctionCallsStartedFrame,
+                     data_extractor=lambda f: {
+                         "description": ", ".join(fc.function_name for fc in f.function_calls),
+                         "operation": ", ".join(fc.function_name for fc in f.function_calls),
+                     }
+                     if f.function_calls
+                     else {},
+                 )
+             ]
+         ),
+         end=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=FunctionCallResultFrame,
+                     data_extractor=lambda f: {
+                         "operation": f.function_name,
+                     },
+                 ),
+                 FrameTrigger(
+                     frame=FunctionCallCancelFrame,
+                     data_extractor=lambda f: {
+                         "operation": f.function_name,
+                     },
+                 ),
+             ]
+         ),
+     ),
+     # --- System: Context Aggregation ---
+     CategoryEvents(
+         category=EventCategory.SYSTEM,
+         description="Context aggregation timeout",
+         start=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=TranscriptionFrame,
+                     data_extractor=lambda f: {
+                         "operation": "context_aggregation_timeout",
+                     },
+                 )
+             ]
+         ),
+         end=EventTrigger(
+             frame_triggers=[
+                 FrameTrigger(
+                     frame=OpenAILLMContextFrame,
+                     source_class=(LLMUserContextAggregator,),
+                     data_extractor=lambda f: {
+                         "operation": "context_aggregation_timeout",
+                     },
+                 ),
+                 FrameTrigger(
+                     frame=LLMContextFrame,
+                     source_class=(LLMUserContextAggregator,),
+                     data_extractor=lambda f: {
+                         "operation": "context_aggregation_timeout",
+                     },
+                 ),
+                 FrameTrigger(
+                     frame=LLMRunFrame,
+                     source_class=(LLMUserContextAggregator,),
+                     data_extractor=lambda f: {
+                         "operation": "context_aggregation_timeout",
+                     },
+                 ),
+             ]
+         ),
+     ),
+ ]
+
+
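To make the implicit gates concrete, here is how the TTS category above processes a typical frame sequence. This is an editorial walkthrough of the logic implemented in VoicegroundObserver.on_push_frame below, not captured output:

    Frame pushed         Gate state before       Event emitted
    TTSStartedFrame      ready=True              TTS START (closes gate, re-arms first_byte)
    TTSAudioRawFrame     first_byte pending      TTS FIRST_BYTE (first audio chunk only)
    TTSAudioRawFrame     first_byte seen         (none)
    TTSStartedFrame      ready=False             (none; gate closed, start cannot re-fire)
    TTSStoppedFrame      ready=False             TTS END (reopens gate)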
+ class VoicegroundObserver(BaseObserver):
+     """Observer for tracking conversation events in pipecat pipelines.
+
+     This observer monitors frames flowing through the pipeline and emits
+     normalized VoicegroundEvents to registered reporters.
+
+     Args:
+         reporters: List of reporters to receive events.
+         conversation_id: Unique identifier for this conversation; a UUID is
+             generated when one is not provided.
+     """
+
+     def __init__(
+         self, reporters: list["BaseReporter"] | None = None, conversation_id: str | None = None
+     ):
+         super().__init__()
+         self._reporters: list[BaseReporter] = reporters or []
+         # Generate conversation_id if not provided
+         self._conversation_id: str = conversation_id or str(uuid.uuid4())
+         # Track processed frames to avoid duplicate events from same frame.
+         # Using frame.id which is a unique identifier for each frame instance
+         self._processed_frames: set[int] = set()
+         # Category gates: True = ready to fire start event
+         self._category_ready: dict[EventCategory, bool] = {}
+         # First byte gates: True = first byte not yet seen for this category
+         self._first_byte_pending: dict[EventCategory, bool] = {}
+         # Track accumulated data per category (for data_accumulator)
+         self._category_accumulated_data: dict[EventCategory, dict[str, Any]] = {}
+         # Prevent multiple end() calls
+         self._ended: bool = False
+         self._init_gates()
+
+     def _init_gates(self) -> None:
+         """Initialize all gates."""
+         for cat_config in CATEGORIES:
+             # Categories with start triggers have gates (start open)
+             if cat_config.start:
+                 self._category_ready[cat_config.category] = True
+             # Categories with first_byte triggers track first byte
+             if cat_config.first_byte:
+                 self._first_byte_pending[cat_config.category] = True
+             # Initialize accumulated data for categories with data accumulators
+             if cat_config.data_accumulator:
+                 self._category_accumulated_data[cat_config.category] = {}
+
324
+ def add_reporter(self, reporter: "BaseReporter") -> None:
325
+ """Add a reporter to receive events."""
326
+ self._reporters.append(reporter)
327
+
328
+ def remove_reporter(self, reporter: "BaseReporter") -> None:
329
+ """Remove a reporter."""
330
+ self._reporters.remove(reporter)
331
+
332
+ async def _emit_event(self, event: VoicegroundEvent) -> None:
333
+ """Emit an event to all registered reporters."""
334
+ for reporter in self._reporters:
335
+ await reporter.on_event(event)
336
+
337
+ async def end(self) -> None:
338
+ """End the observation session and finalize all reporters.
339
+
340
+ Called automatically when EndFrame or CancelFrame is detected.
341
+ Can also be called manually. Safe to call multiple times.
342
+ """
343
+ if self._ended:
344
+ return
345
+ self._ended = True
346
+
347
+ for reporter in self._reporters:
348
+ await reporter.on_end()
349
+
350
+ def _create_event(
351
+ self,
352
+ category: EventCategory,
353
+ event_type: EventType,
354
+ timestamp: int,
355
+ source: str = "",
356
+ data: dict[str, Any] | None = None,
357
+ ) -> VoicegroundEvent:
358
+ """Create a VoicegroundEvent with the given parameters."""
359
+ timestamp_seconds = timestamp / 1_000_000_000
360
+ return VoicegroundEvent(
361
+ id=str(uuid.uuid4()),
362
+ timestamp=timestamp_seconds,
363
+ category=category,
364
+ type=event_type,
365
+ source=source,
366
+ data=data or {},
367
+ )
368
+
369
+ def _check_source(
370
+ self,
371
+ source_class: tuple[type, ...] | None,
372
+ source_obj: object,
373
+ ) -> bool:
374
+ """Check if source matches filter. Returns True if no filter."""
375
+ if source_class is not None:
376
+ return isinstance(source_obj, source_class)
377
+ return True
378
+
379
+ async def _try_emit(
380
+ self,
381
+ category: EventCategory,
382
+ event_type: EventType,
383
+ trigger: EventTrigger,
384
+ frame,
385
+ timestamp: int,
386
+ timestamp_seconds: float,
387
+ source_obj: object,
388
+ source_name: str,
389
+ direction: FrameDirection,
390
+ ) -> bool:
391
+ """Try to emit an event if conditions are met. Returns True if emitted."""
392
+ matched_trigger: FrameTrigger | None = None
393
+
394
+ # Check if frame matches any frame trigger and its source filter
395
+ for frame_trigger in trigger.frame_triggers:
396
+ if isinstance(frame, frame_trigger.frame):
397
+ # Check source filter for this frame type
398
+ if not self._check_source(frame_trigger.source_class, source_obj):
399
+ continue
400
+ # Check direction filter if specified
401
+ if frame_trigger.direction is not None and direction != frame_trigger.direction:
402
+ continue
403
+ # Frame matches and all filters pass
404
+ matched_trigger = frame_trigger
405
+ break
406
+ else:
407
+ # No matching frame trigger found
408
+ return False
409
+
410
+ # Extract frame-specific data using the matched trigger's extractor
411
+ event_data: dict[str, Any] = {}
412
+ if matched_trigger.data_extractor:
413
+ event_data = matched_trigger.data_extractor(frame)
414
+
415
+ # Emit the event
416
+ event = self._create_event(category, event_type, timestamp, source_name, event_data)
417
+ await self._emit_event(event)
418
+ return True
419
+
+     async def on_push_frame(self, data: FramePushed) -> None:
+         """Handle frame push events."""
+         frame = data.frame
+         timestamp = data.timestamp
+         timestamp_seconds = timestamp / 1_000_000_000
+         source_name = data.source.name if hasattr(data.source, "name") else ""
+
+         # Call on_start when StartFrame is encountered
+         if isinstance(frame, StartFrame):
+             if not hasattr(self, "_started"):
+                 self._started = True
+                 for reporter in self._reporters:
+                     await reporter.on_start(self._conversation_id)
+             return
+
+         # Check for pipeline end frames
+         if isinstance(frame, (EndFrame, CancelFrame)):
+             await self.end()
+             return
+
+         # Early exit if we've already processed this frame instance.
+         # Use frame.id which is a unique identifier for each frame instance
+         if frame.id in self._processed_frames:
+             return
+         self._processed_frames.add(frame.id)
+
+         # Update data accumulators for categories that are currently active (gate closed)
+         for cat_config in CATEGORIES:
+             if cat_config.data_accumulator and not self._category_ready.get(
+                 cat_config.category, True
+             ):
+                 # Category is active (gate closed), accumulate data from this frame
+                 acc_data = self._category_accumulated_data.get(cat_config.category, {})
+                 self._category_accumulated_data[cat_config.category] = cat_config.data_accumulator(
+                     frame, acc_data
+                 )
+
+         # Process each category (no early returns - allow multiple events per frame)
+         for cat_config in CATEGORIES:
+             category = cat_config.category
+
+             # --- START event ---
+             if cat_config.start:
+                 # Check if category gate is open
+                 gate_open = self._category_ready.get(category, True)
+
+                 if gate_open and await self._try_emit(
+                     category,
+                     EventType.START,
+                     cat_config.start,
+                     frame,
+                     timestamp,
+                     timestamp_seconds,
+                     data.source,
+                     source_name,
+                     data.direction,
+                 ):
+                     # Close gate and reset first_byte
+                     self._category_ready[category] = False
+                     if category in self._first_byte_pending:
+                         self._first_byte_pending[category] = True
+                     # Reset accumulated data when category starts
+                     if cat_config.data_accumulator:
+                         self._category_accumulated_data[category] = {}
+
+             # --- FIRST_BYTE event ---
+             if cat_config.first_byte and self._first_byte_pending.get(category, False):
+                 if await self._try_emit(
+                     category,
+                     EventType.FIRST_BYTE,
+                     cat_config.first_byte,
+                     frame,
+                     timestamp,
+                     timestamp_seconds,
+                     data.source,
+                     source_name,
+                     data.direction,
+                 ):
+                     self._first_byte_pending[category] = False
+
+             # --- END event ---
+             if cat_config.end:
+                 # Check if this frame matches the end trigger
+                 matched_end_trigger: FrameTrigger | None = None
+                 for frame_trigger in cat_config.end.frame_triggers:
+                     if isinstance(frame, frame_trigger.frame):
+                         if not self._check_source(frame_trigger.source_class, data.source):
+                             continue
+                         if (
+                             frame_trigger.direction is not None
+                             and data.direction != frame_trigger.direction
+                         ):
+                             continue
+                         matched_end_trigger = frame_trigger
+                         break
+
+                 if matched_end_trigger:
+                     # Extract frame-specific data
+                     frame_data: dict[str, Any] = {}
+                     if matched_end_trigger.data_extractor:
+                         frame_data = matched_end_trigger.data_extractor(frame)
+
+                     # Merge accumulated data if accumulator exists
+                     accumulated_data = self._category_accumulated_data.get(category, {})
+                     if cat_config.data_accumulator and accumulated_data:
+                         event_data = {**frame_data, **accumulated_data}
+                     else:
+                         event_data = frame_data
+
+                     # Emit event with merged data
+                     event = self._create_event(
+                         category, EventType.END, timestamp, source_name, event_data
+                     )
+                     await self._emit_event(event)
+
+                     # Clear accumulated data after emitting
+                     if cat_config.data_accumulator:
+                         self._category_accumulated_data[category] = {}
+                     # Reopen category gate
+                     self._category_ready[category] = True
+
+     async def on_process_frame(self, data: FrameProcessed) -> None:
+         """Handle frame process events.
+
+         Currently not used - we primarily track push events.
+         """
+         pass
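With the observer defined, wiring it into a pipecat pipeline might look like the sketch below. The voiceground.observer import path, the MetricsReporter constructor arguments, and PipelineTask's observers parameter are assumptions based on this diff and typical pipecat usage, not confirmed by it:

    from pipecat.pipeline.task import PipelineTask

    from voiceground.observer import VoicegroundObserver  # module path assumed
    from voiceground.reporters import MetricsReporter

    observer = VoicegroundObserver(reporters=[MetricsReporter()])  # constructor args assumed
    task = PipelineTask(pipeline, observers=[observer])  # attachment point varies by pipecat version

Events then flow to every registered reporter, and more reporters can be attached later with observer.add_reporter(...).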
voiceground/py.typed ADDED
Empty file (PEP 561 py.typed marker)
@@ -0,0 +1,19 @@
+ """Voiceground reporters for event output."""
+
+ from voiceground.reporters.base import BaseReporter
+ from voiceground.reporters.html import HTMLReporter
+ from voiceground.reporters.metrics import (
+     MetricsReporter,
+     SystemOverheadData,
+     ToolCallData,
+     TurnMetricsData,
+ )
+
+ __all__ = [
+     "BaseReporter",
+     "HTMLReporter",
+     "MetricsReporter",
+     "SystemOverheadData",
+     "ToolCallData",
+     "TurnMetricsData",
+ ]
@@ -0,0 +1,43 @@
+ """Base reporter interface for Voiceground."""
+
+ from abc import ABC, abstractmethod
+
+ from voiceground.events import VoicegroundEvent
+
+
+ class BaseReporter(ABC):
+     """Abstract base class for event reporters.
+
+     Reporters receive VoicegroundEvents from the observer and can
+     process them in various ways (logging, storage, streaming, etc.).
+     """
+
+     @abstractmethod
+     async def on_start(self, conversation_id: str) -> None:
+         """Handle pipeline start.
+
+         Called when the pipeline starts. Reporters can initialize
+         any resources or set metadata here.
+
+         Args:
+             conversation_id: Unique identifier for this conversation session.
+         """
+         pass
+
+     @abstractmethod
+     async def on_event(self, event: VoicegroundEvent) -> None:
+         """Handle a new event.
+
+         Args:
+             event: The event to process.
+         """
+         pass
+
+     @abstractmethod
+     async def on_end(self) -> None:
+         """Handle pipeline termination.
+
+         Called when the pipeline ends (EndFrame or CancelFrame).
+         Reporters should finalize any pending operations here.
+         """
+         pass
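To make the contract concrete, a minimal reporter that prints events to stdout could look like the following. This is a hypothetical example built only from the interface above; PrintReporter is not shipped in the package:

    from voiceground.events import VoicegroundEvent
    from voiceground.reporters import BaseReporter


    class PrintReporter(BaseReporter):
        """Hypothetical reporter that logs every event to stdout."""

        async def on_start(self, conversation_id: str) -> None:
            print(f"conversation started: {conversation_id}")

        async def on_event(self, event: VoicegroundEvent) -> None:
            # Fields shown in the observer's _create_event: id, timestamp,
            # category, type, source, data.
            print(f"{event.timestamp:.3f} {event.category} {event.type} from {event.source}")

        async def on_end(self) -> None:
            print("conversation ended")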