atom-audio-engine 0.1.0-py3-none-any.whl → 0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {atom_audio_engine-0.1.0.dist-info → atom_audio_engine-0.1.2.dist-info}/METADATA +10 -5
- atom_audio_engine-0.1.2.dist-info/RECORD +57 -0
- atom_audio_engine-0.1.2.dist-info/top_level.txt +1 -0
- audio_engine/__init__.py +80 -0
- audio_engine/examples/__init__.py +1 -0
- audio_engine/examples/basic_stt_llm_tts.py +200 -0
- audio_engine/examples/geneface_animation.py +99 -0
- audio_engine/examples/personaplex_pipeline.py +116 -0
- audio_engine/examples/websocket_server.py +86 -0
- audio_engine/pipelines/personaplex/__init__.py +41 -0
- audio_engine/pipelines/personaplex/client.py +259 -0
- audio_engine/pipelines/personaplex/config.py +69 -0
- audio_engine/pipelines/personaplex/pipeline.py +301 -0
- audio_engine/pipelines/personaplex/types.py +173 -0
- audio_engine/pipelines/personaplex/utils.py +192 -0
- audio_engine/scripts/debug_pipeline.py +79 -0
- audio_engine/scripts/debug_tts.py +162 -0
- audio_engine/scripts/test_cartesia_connect.py +57 -0
- audio_engine/tests/__init__.py +1 -0
- audio_engine/tests/test_personaplex/__init__.py +1 -0
- audio_engine/tests/test_personaplex/test_personaplex.py +10 -0
- audio_engine/tests/test_personaplex/test_personaplex_client.py +259 -0
- audio_engine/tests/test_personaplex/test_personaplex_config.py +71 -0
- audio_engine/tests/test_personaplex/test_personaplex_message.py +80 -0
- audio_engine/tests/test_personaplex/test_personaplex_pipeline.py +226 -0
- audio_engine/tests/test_personaplex/test_personaplex_session.py +184 -0
- audio_engine/tests/test_personaplex/test_personaplex_transcript.py +184 -0
- audio_engine/tests/test_traditional_pipeline/__init__.py +1 -0
- audio_engine/tests/test_traditional_pipeline/test_cartesia_asr.py +474 -0
- audio_engine/tests/test_traditional_pipeline/test_config_env.py +97 -0
- audio_engine/tests/test_traditional_pipeline/test_conversation_context.py +115 -0
- audio_engine/tests/test_traditional_pipeline/test_pipeline_creation.py +64 -0
- audio_engine/tests/test_traditional_pipeline/test_pipeline_with_mocks.py +173 -0
- audio_engine/tests/test_traditional_pipeline/test_provider_factories.py +61 -0
- audio_engine/tests/test_traditional_pipeline/test_websocket_server.py +58 -0
- atom_audio_engine-0.1.0.dist-info/RECORD +0 -25
- atom_audio_engine-0.1.0.dist-info/top_level.txt +0 -8
- {atom_audio_engine-0.1.0.dist-info → atom_audio_engine-0.1.2.dist-info}/WHEEL +0 -0
- {asr → audio_engine/asr}/__init__.py +0 -0
- {asr → audio_engine/asr}/base.py +0 -0
- {asr → audio_engine/asr}/cartesia.py +0 -0
- {asr → audio_engine/asr}/deepgram.py +0 -0
- {core → audio_engine/core}/__init__.py +0 -0
- {core → audio_engine/core}/config.py +0 -0
- {core → audio_engine/core}/pipeline.py +0 -0
- {core → audio_engine/core}/types.py +0 -0
- {integrations → audio_engine/integrations}/__init__.py +0 -0
- {integrations → audio_engine/integrations}/geneface.py +0 -0
- {llm → audio_engine/llm}/__init__.py +0 -0
- {llm → audio_engine/llm}/base.py +0 -0
- {llm → audio_engine/llm}/groq.py +0 -0
- {pipelines → audio_engine/pipelines}/__init__.py +0 -0
- {streaming → audio_engine/streaming}/__init__.py +0 -0
- {streaming → audio_engine/streaming}/websocket_server.py +0 -0
- {tts → audio_engine/tts}/__init__.py +0 -0
- {tts → audio_engine/tts}/base.py +0 -0
- {tts → audio_engine/tts}/cartesia.py +0 -0
- {utils → audio_engine/utils}/__init__.py +0 -0
- {utils → audio_engine/utils}/audio.py +0 -0
{atom_audio_engine-0.1.0.dist-info → atom_audio_engine-0.1.2.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: atom-audio-engine
-Version: 0.1.0
+Version: 0.1.2
 Summary: A pluggable, async-first Python framework for real-time audio-to-audio conversational AI
 Author-email: ATOM Group <info@atomgroup.ng>
 License-Expression: MIT
@@ -63,8 +63,13 @@ A pluggable audio-to-audio conversational engine with real-time streaming support
 ## Installation
 
 ```bash
-
-
+pip install atom-audio-engine
+```
+
+For development with all optional dependencies:
+
+```bash
+pip install atom-audio-engine[all,dev]
 ```
 
 ## Quick Start
@@ -79,8 +84,8 @@ from audio_engine.tts import CartesiaTTS
 
 # Create pipeline with your providers
 pipeline = Pipeline(
-    asr=
-    llm=
+    asr=CartesiaASR(api_key="your-cartesia-key"),
+    llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
     tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
     system_prompt="You are a helpful assistant.",
 )
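The Quick Start hunk above ends at the constructor. The snippet below is a sketch of driving the assembled pipeline end to end; it relies on the `stream_text_input()` and `AudioChunk.data` APIs that appear in `audio_engine/examples/basic_stt_llm_tts.py` further down this diff, and the exact call pattern outside that example is an assumption:

```python
# Sketch only — stream_text_input() usage is taken from the package's own
# example script; imports come from the audio_engine/__init__.py shown below.
import asyncio
from audio_engine import Pipeline, CartesiaASR, GroqLLM, CartesiaTTS

async def demo():
    pipeline = Pipeline(
        asr=CartesiaASR(api_key="your-cartesia-key"),
        llm=GroqLLM(api_key="your-groq-key", model="mixtral-8x7b-32768"),
        tts=CartesiaTTS(api_key="your-cartesia-key", voice_id="your-voice-id"),
        system_prompt="You are a helpful assistant.",
    )
    # Text → LLM → TTS: consume the synthesized audio chunks
    async for chunk in pipeline.stream_text_input("Hello!"):
        print(f"audio chunk: {len(chunk.data)} bytes")

asyncio.run(demo())
```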
atom_audio_engine-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,57 @@
+audio_engine/__init__.py,sha256=AQ0uto-Jn3cNqW35MMtSyX5mhXJMFv9AQhjcAkqZ7L4,1499
+audio_engine/asr/__init__.py,sha256=w0t2ahxgApZbZjSc748tN3tmKDeXzasfBh51ZjPF9uc,1203
+audio_engine/asr/base.py,sha256=MFC_7HmyEDnhDwUn62CWZsiF9_-mBVVsUK-Yppiq4Vk,2378
+audio_engine/asr/cartesia.py,sha256=BXnvscO9VaR3LsfEGn7lJ66udzUjz44JzZTmSizZqIg,13321
+audio_engine/asr/deepgram.py,sha256=M59lgrVFMS6-3YQcYaUY7cUdt2-MBptt_VExdfnSXr0,6429
+audio_engine/core/__init__.py,sha256=7naTEkqDjrPsejviXk662OR86xVCyckU7eMKVpjwYys,301
+audio_engine/core/config.py,sha256=EF98O2Gt8q29FX3T6UeDwWNIbm77bni99SThiJKl5Tk,5203
+audio_engine/core/pipeline.py,sha256=jX9jAlIfwU6V8GjqjivyK8Y7P41S-QS8xKYv5c9_qG0,8850
+audio_engine/core/types.py,sha256=iFQPajgeS1YgMWXJvubA8sWbxLI1Z8nF-z1uucrgNm4,2295
+audio_engine/examples/__init__.py,sha256=4oFCZaD-vg0o48hnj03ZsktG2JrtwJ7HXUYOwEYSNCY,44
+audio_engine/examples/basic_stt_llm_tts.py,sha256=tw8IIAL0WSG2M9U5SuLri75AOb7YM-twvAVAspaYVQM,6354
+audio_engine/examples/geneface_animation.py,sha256=ogjQAqPHT5EW6X3R8hn0tJwj-_QBbPiBFDZDl_olTGo,2945
+audio_engine/examples/personaplex_pipeline.py,sha256=OcpN8i5qoAS3Nmuc62tESzpRwPxsjxTGTrY_qICLETo,3641
+audio_engine/examples/websocket_server.py,sha256=HhTlAFnJQXJyOs_prwFJASuh6h-0FKEh2JGeJSChf_c,2398
+audio_engine/integrations/__init__.py,sha256=1y4CTaqybOwmfk_xxkWANYkc-A7PgH0JFMZCTq33fe4,126
+audio_engine/integrations/geneface.py,sha256=2oeVZazp2R9gN-YmQhzzrZb87CBpEiAyKA8hHUxUZJk,8788
+audio_engine/llm/__init__.py,sha256=mwr0C1E1Wf5589fVt7emOFMA2fHoXxQ5t-3dOxkXQEI,997
+audio_engine/llm/base.py,sha256=C-ZNOab0Ca-vlxWgnPzB8uZXFNYbPgAYfQLNvaal2KU,2873
+audio_engine/llm/groq.py,sha256=oGSjJBW0TiCmOzzl1HTE8zUhPC78I3ywhAYFq7Te2IA,6694
+audio_engine/pipelines/__init__.py,sha256=Q1iZjX38TigrZPBaFgv_5AXw21wBN1Z-4nfXPjV-xDI,49
+audio_engine/pipelines/personaplex/__init__.py,sha256=nX37MS93pYUPKiYwY2aa9G-PEI4x2yKjdLqGeab7wWI,916
+audio_engine/pipelines/personaplex/client.py,sha256=NAiG6V9nTWh8ozrb5jT-6h8fesTuJZDgh-l7DlHQm6M,8667
+audio_engine/pipelines/personaplex/config.py,sha256=6fBteI-HjJJl3ZcK5QZCCa9kcKVNDgPptLIkJNZc9kg,2935
+audio_engine/pipelines/personaplex/pipeline.py,sha256=WUkFalPQ9sxICeFpF-58HJxzfQ30vfZ4WAs-E5aI60s,10411
+audio_engine/pipelines/personaplex/types.py,sha256=6MvU2hBukBflJxat3MtC6bGQY1b33jaOIiOi2tZJRnU,4727
+audio_engine/pipelines/personaplex/utils.py,sha256=um_7nGRFH0QaLIIfLwPnBXgFW0fVGU7gkjF8Gm-Hq4U,5000
+audio_engine/scripts/debug_pipeline.py,sha256=HkrrVzimrmFsbltbEPKoAuJ_5yzBWBCWyrEH0_ZHOQM,2276
+audio_engine/scripts/debug_tts.py,sha256=Aj-vW8kmcR7lDa2FdTn1_6wrFw1vpP8Kjnh1rLwQ_ag,4479
+audio_engine/scripts/test_cartesia_connect.py,sha256=KoaBWxmfzdMBqpnDXwT2fFzAJsJlKg3hMsUYvAeU-L8,1529
+audio_engine/streaming/__init__.py,sha256=Pd_ICcYeW75DXMsFpMrJnn9N-RU5s1_Wb3WZ3YbOTC4,136
+audio_engine/streaming/websocket_server.py,sha256=miqHoVkUjznpmpQQrgkyaURR6DsDJLzkP_OGrBFOBYk,10994
+audio_engine/tests/__init__.py,sha256=1JoGYWcW0zfdTZAgxs7NZaK4Zo0zlvq79dXzVwKMP3I,34
+audio_engine/tests/test_personaplex/__init__.py,sha256=1JoGYWcW0zfdTZAgxs7NZaK4Zo0zlvq79dXzVwKMP3I,34
+audio_engine/tests/test_personaplex/test_personaplex.py,sha256=BrYWbWmWqlzdK3H5YZtpLr4DxtK5UeLpbdwUabuUTnE,457
+audio_engine/tests/test_personaplex/test_personaplex_client.py,sha256=RlGNHa-IcKC7CCiTQJDhUYN9HNMun7Q45AsFSu5swZ8,8377
+audio_engine/tests/test_personaplex/test_personaplex_config.py,sha256=c-86tJ81NSfPOk8tIV_JfDn3IcJnFrgCHVqJGyw14lM,2487
+audio_engine/tests/test_personaplex/test_personaplex_message.py,sha256=6gAbQUk954x4-PXkFdNb0GadxuJIJ49tRixPteFCiw4,2636
+audio_engine/tests/test_personaplex/test_personaplex_pipeline.py,sha256=GCvNRgUN72d81RK0klc3z5ecBhBMgf4rJXgq5auXv6M,7424
+audio_engine/tests/test_personaplex/test_personaplex_session.py,sha256=pF2s649MAh0TlRs4ooQBCExN-VSuc_DntknyfLw8Pxw,5780
+audio_engine/tests/test_personaplex/test_personaplex_transcript.py,sha256=XdNAghb1Gjg68BBcj6BPt-1K-6rzS9gD3tufnp8vVPo,6400
+audio_engine/tests/test_traditional_pipeline/__init__.py,sha256=1JoGYWcW0zfdTZAgxs7NZaK4Zo0zlvq79dXzVwKMP3I,34
+audio_engine/tests/test_traditional_pipeline/test_cartesia_asr.py,sha256=rLM_7s-UQJEJGL98A8ewXrgckruog6ei-lFtpPetIkk,15353
+audio_engine/tests/test_traditional_pipeline/test_config_env.py,sha256=pZd0doTKzZg7e_ZwEKLe3pfmZTBdXIlrO1-CUU1lPmc,3192
+audio_engine/tests/test_traditional_pipeline/test_conversation_context.py,sha256=t6lk_5QwGE1CfU1RIAGVIB6d6flfoqVLNgPYs-aE1PA,4049
+audio_engine/tests/test_traditional_pipeline/test_pipeline_creation.py,sha256=U8s4vc36JU79YTFVyv7HQlFN3Hj2KRfh-gWQKhsjiSA,2278
+audio_engine/tests/test_traditional_pipeline/test_pipeline_with_mocks.py,sha256=N5ajn2QevssnP0xEBeR87FumT0w1j7BdVAiMmBLqL2A,5583
+audio_engine/tests/test_traditional_pipeline/test_provider_factories.py,sha256=a9Da5wjhXV6-E_Q7E8AquKxbcTKAhjd2eVKUGBj3zpo,2240
+audio_engine/tests/test_traditional_pipeline/test_websocket_server.py,sha256=InR8GCRiRW09zJk9Htx6YQE_--_KJhpEJCuCs_lJjKE,1936
+audio_engine/tts/__init__.py,sha256=85XrpIkxFrRvOn19mWphkeBjTaEcsrFECYK_ZoGv1dQ,987
+audio_engine/tts/base.py,sha256=vo0MSiep9QJQtpdCmDJWN-okK-ERYRA6Sk_g6IXCYZk,4475
+audio_engine/tts/cartesia.py,sha256=bxhkNbWpQmlPTZ8RWcVCQzG_Q2mYr3t1aAd9OonSSWQ,17011
+audio_engine/utils/__init__.py,sha256=WIeVykg3MqyOoCYEWsuzGyVniP8SIl9FE881ieR7WuE,250
+audio_engine/utils/audio.py,sha256=Z7avyNqhzZ2fnBxZ_d0qUglOCCvHSffBveg5CQWTCM0,5529
+atom_audio_engine-0.1.2.dist-info/METADATA,sha256=l8ztaq4vAmVNT4qg1mHhJW7R2sjTHs1BJsjTPpM108w,6690
+atom_audio_engine-0.1.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+atom_audio_engine-0.1.2.dist-info/top_level.txt,sha256=IyumwgFrsDL7nlZlBijX-0shiSVhhBCFPUNBRNKzWP4,13
+atom_audio_engine-0.1.2.dist-info/RECORD,,
atom_audio_engine-0.1.2.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+audio_engine
audio_engine/__init__.py ADDED
@@ -0,0 +1,80 @@
+"""
+Audio Engine - Pluggable audio-to-audio conversational AI framework.
+
+Orchestrates ASR → LLM → TTS pipeline with real-time streaming support.
+"""
+
+__version__ = "0.1.0"
+
+# Core exports
+from .core.pipeline import Pipeline
+from .core.config import (
+    AudioEngineConfig,
+    ASRConfig,
+    LLMConfig,
+    TTSConfig,
+    StreamingConfig,
+)
+from .core.types import (
+    AudioChunk,
+    TranscriptChunk,
+    ResponseChunk,
+    ConversationContext,
+)
+
+# ASR Providers
+from .asr.base import BaseASR
+from .asr.cartesia import CartesiaASR
+
+try:
+    from .asr.deepgram import DeepgramASR
+except ImportError:
+    pass
+
+# LLM Providers
+from .llm.base import BaseLLM
+from .llm.groq import GroqLLM
+
+# TTS Providers
+from .tts.base import BaseTTS
+from .tts.cartesia import CartesiaTTS
+
+# Streaming
+from .streaming.websocket_server import WebSocketServer
+
+# Integrations
+try:
+    from .integrations.geneface import GeneFacePipelineWrapper, GeneFaceConfig
+except ImportError:
+    pass
+
+__all__ = [
+    # Version
+    "__version__",
+    # Core
+    "Pipeline",
+    "AudioEngineConfig",
+    "ASRConfig",
+    "LLMConfig",
+    "TTSConfig",
+    "StreamingConfig",
+    "AudioChunk",
+    "TranscriptChunk",
+    "ResponseChunk",
+    "ConversationContext",
+    # ASR
+    "BaseASR",
+    "CartesiaASR",
+    "DeepgramASR",
+    # LLM
+    "BaseLLM",
+    "GroqLLM",
+    # TTS
+    "BaseTTS",
+    "CartesiaTTS",
+    # Streaming
+    "WebSocketServer",
+    # Integrations
+    "GeneFacePipelineWrapper",
+    "GeneFaceConfig",
+]
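Note that `__init__.py` guards the optional providers with `try/except ImportError` but still lists them in `__all__`, so when the optional dependency is missing, `from audio_engine import *` raises `AttributeError` while plain imports of the module succeed. A hedged consumer-side probe (the constructor arguments here are assumptions, not confirmed signatures):

```python
# Hypothetical consumer code: probe for the optional Deepgram provider and
# fall back to the always-imported Cartesia one.
import audio_engine

if hasattr(audio_engine, "DeepgramASR"):
    asr = audio_engine.DeepgramASR(api_key="your-deepgram-key")  # assumed kwargs
else:
    asr = audio_engine.CartesiaASR(api_key="your-cartesia-key")
```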
audio_engine/examples/__init__.py ADDED
@@ -0,0 +1 @@
+"""Example scripts for the audio engine."""
audio_engine/examples/basic_stt_llm_tts.py ADDED
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""
+Example: Basic STT-LLM-TTS Pipeline
+
+Simple example showing how to use the core audio-to-audio pipeline:
+Audio Input → ASR → LLM → TTS → Audio Output
+
+Usage:
+    # Text input only (skips ASR)
+    python examples/basic_stt_llm_tts.py
+
+    # Audio file input (full STT-LLM-TTS)
+    python examples/basic_stt_llm_tts.py examples/audio_clip_1.mp3
+
+Setup:
+    export CARTESIA_API_KEY="your-cartesia-key"
+    export GROQ_API_KEY="your-groq-key"
+"""
+
+import asyncio
+import logging
+import sys
+import wave
+from pathlib import Path
+
+# Add parent to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from dotenv import load_dotenv
+from core.config import AudioEngineConfig
+from core.types import AudioChunk
+
+# Load environment variables from .env file
+load_dotenv(Path(__file__).parent.parent.parent / ".env")
+
+# Setup logging to see debug messages
+logging.basicConfig(
+    level=logging.DEBUG, format="%(name)s - %(levelname)s - %(message)s"
+)
+
+
+async def stream_audio_file(file_path: str, chunk_size: int = 4096):
+    """Stream audio from file (MP3, WAV, etc.) as chunks."""
+    try:
+        from pydub import AudioSegment
+    except ImportError:
+        print("❌ pydub required. Install: pip install pydub")
+        raise
+
+    file_path = Path(file_path)
+    if not file_path.exists():
+        raise FileNotFoundError(f"Audio file not found: {file_path}")
+
+    print(f"📁 Loading: {file_path.name}")
+
+    # Load and convert to 16kHz mono 16-bit PCM
+    audio = AudioSegment.from_file(str(file_path))
+    audio = audio.set_channels(1).set_frame_rate(16000).set_sample_width(2)
+
+    duration_sec = len(audio) / 1000.0
+    print(f"✓ Audio: {duration_sec:.2f}s @ 16kHz mono\n")
+
+    # Stream as chunks
+    audio_bytes = audio.raw_data
+    chunk_index = 0
+    offset = 0
+    total_chunks = (len(audio_bytes) + chunk_size - 1) // chunk_size
+
+    while offset < len(audio_bytes):
+        chunk_data = audio_bytes[offset : offset + chunk_size]
+        chunk_index += 1
+        is_final = offset + chunk_size >= len(audio_bytes)
+
+        # Yield as AudioChunk object
+        yield AudioChunk(
+            data=chunk_data, sample_rate=16000, format="pcm_s16le", is_final=is_final
+        )
+        offset += chunk_size
+
+
+async def main():
+    """Run basic pipeline example."""
+    print("=" * 60)
+    print("Audio Engine: Basic STT-LLM-TTS Pipeline")
+    print("=" * 60)
+    print()
+
+    # Check if audio file provided as argument
+    audio_file = None
+    if len(sys.argv) > 1:
+        audio_file = sys.argv[1]
+
+    # Load configuration from environment
+    config = AudioEngineConfig.from_env()
+    print(f"✓ Config loaded:")
+    print(f"  - ASR: {config.asr.provider}")
+    print(f"  - LLM: {config.llm.provider}")
+    print(f"  - TTS: {config.tts.provider}")
+    print()
+
+    # Create pipeline from config
+    pipeline = config.create_pipeline(
+        system_prompt="You are a helpful assistant. Keep responses brief."
+    )
+    print(f"✓ Pipeline created")
+    print()
+
+    if audio_file:
+        # Full pipeline: Audio → ASR → LLM → TTS
+        print("-" * 60)
+        print("FULL PIPELINE: STT → LLM → TTS")
+        print("-" * 60 + "\n")
+
+        transcript = ""
+        llm_response = ""
+        audio_output = bytearray()
+        chunk_count = 0
+
+        try:
+            audio_generator = stream_audio_file(audio_file)
+
+            async for result in pipeline.stream(audio_generator):
+                if hasattr(result, "text"):
+                    class_name = result.__class__.__name__
+                    if class_name == "TranscriptChunk":
+                        transcript += result.text
+                        print(f"🎤 STT: {result.text!r}")
+                    elif class_name == "ResponseChunk":
+                        llm_response += result.text
+                        print(f"🧠 LLM: {result.text!r}")
+
+                elif hasattr(result, "data") and result.data:
+                    audio_output.extend(result.data)
+                    chunk_count += 1
+                    if chunk_count % 5 == 0:
+                        print(f"🔊 TTS: {len(audio_output)} bytes...")
+
+            if chunk_count > 0:
+                print(f"🔊 TTS: {len(audio_output)} bytes total")
+
+            # Save output
+            if audio_output:
+                output_dir = Path(__file__).parent / "output_samples"
+                output_dir.mkdir(exist_ok=True)
+                output_path = output_dir / "output_audio.wav"
+                with wave.open(str(output_path), "wb") as wav_file:
+                    wav_file.setnchannels(1)
+                    wav_file.setsampwidth(2)
+                    wav_file.setframerate(16000)
+                    wav_file.writeframes(bytes(audio_output))
+
+                duration_sec = len(audio_output) / 32000
+                print(f"\n✓ Audio saved to {output_path} ({duration_sec:.2f}s)")
+
+            # Print results
+            print("\n" + "=" * 60)
+            print("RESULTS")
+            print("=" * 60)
+            if transcript:
+                print(f"🎤 Transcribed: {transcript!r}")
+            if llm_response:
+                print(f"🧠 Response: {llm_response!r}")
+            print("=" * 60 + "\n")
+
+        except Exception as e:
+            print(f"❌ Error: {e}")
+            import traceback
+
+            traceback.print_exc()
+            return
+
+    else:
+        # Simplified pipeline: Text → LLM → TTS (no ASR)
+        print("-" * 60)
+        print("SIMPLIFIED PIPELINE: Text → LLM → TTS")
+        print("-" * 60 + "\n")
+
+        user_text = "What is the capital of France?"
+        print(f"User: {user_text}\n")
+
+        chunk_count = 0
+        total_bytes = 0
+        async for audio_chunk in pipeline.stream_text_input(user_text):
+            chunk_count += 1
+            total_bytes += len(audio_chunk.data)
+            print(f"  • Audio chunk {chunk_count}: {len(audio_chunk.data)} bytes")
+
+        print()
+        print("✓ Pipeline complete")
+        print(f"  Total audio: {total_bytes} bytes across {chunk_count} chunks")
+        print()
+        print("Conversation history:")
+        for msg in pipeline.context.messages:
+            role = msg.role.upper()
+            content = msg.content[:60]
+            print(f"  {role}: {content}...")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
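The example above streams from a file. A microphone-backed source could follow the same `AudioChunk` contract; the sketch below is hypothetical and assumes the third-party `sounddevice` library, which is not a dependency of this package:

```python
# Sketch: microphone variant of stream_audio_file() above. AudioChunk fields
# (data, sample_rate, format, is_final) match the example in this diff.
import sounddevice as sd  # assumed third-party dependency
from audio_engine import AudioChunk

async def stream_microphone(seconds: float = 5.0, sample_rate: int = 16000,
                            chunk_size: int = 4096):
    """Yield AudioChunk objects captured from the default input device."""
    frames = int(seconds * sample_rate)
    # Blocking capture kept simple for the sketch; 16-bit mono PCM as above.
    recording = sd.rec(frames, samplerate=sample_rate, channels=1, dtype="int16")
    sd.wait()
    audio_bytes = recording.tobytes()
    for offset in range(0, len(audio_bytes), chunk_size):
        yield AudioChunk(
            data=audio_bytes[offset : offset + chunk_size],
            sample_rate=sample_rate,
            format="pcm_s16le",
            is_final=offset + chunk_size >= len(audio_bytes),
        )
```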
audio_engine/examples/geneface_animation.py ADDED
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+"""
+Example: GeneFace++ Face Animation Integration
+
+Combines audio with GeneFace++ to generate an animated face
+that speaks the assistant's responses.
+
+GeneFaceIntegration takes a GeneFaceConfig (no pipeline parameter).
+It generates videos from audio bytes using:
+- generate_video(audio_bytes, sample_rate) -> video_path
+- generate_video_stream(audio_chunks) -> video_path
+
+Setup:
+    1. Clone GeneFace++ repository:
+       git clone https://github.com/yerfor/GeneFace-plusplus.git
+
+    2. Install GeneFace++ (see their docs)
+
+    3. Update geneface_path in this script
+
+Run:
+    python examples/geneface_animation.py
+"""
+
+import asyncio
+import sys
+import logging
+from pathlib import Path
+
+# Add parent to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from dotenv import load_dotenv
+from integrations.geneface import GeneFaceIntegration, GeneFaceConfig
+
+# Load environment variables from .env file
+load_dotenv(Path(__file__).parent.parent / ".env")
+
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+async def main():
+    """Run GeneFace++ animation example."""
+    print("=" * 60)
+    print("Audio Engine: GeneFace++ Face Animation")
+    print("=" * 60)
+    print()
+
+    # Setup GeneFace++ configuration
+    geneface_path = ""
+    if not geneface_path.exists():
+        logger.error(f"GeneFace++ path not found: {geneface_path}")
+        logger.info("Clone GeneFace-plusplus:")
+        logger.info("  git clone https://github.com/yerfor/GeneFace-plusplus.git")
+        logger.info()
+        logger.info("Update geneface_path variable in this script")
+        return
+
+    geneface_config = GeneFaceConfig(
+        geneface_path=str(geneface_path),
+        checkpoint_path=None,  # Specify trained model checkpoint if available
+        output_resolution=(512, 512),
+        fps=25,
+        device="cuda",  # or "cpu"
+    )
+
+    # Initialize GeneFace integration
+    try:
+        geneface = GeneFaceIntegration(config=geneface_config)
+        logger.info("GeneFace++ integration initialized")
+        print()
+
+        # Example: Generate video from audio
+        sample_rate = 16000
+        duration_seconds = 2
+        # For demo, create silence (all zeros)
+        audio_bytes = bytes(sample_rate * duration_seconds * 2)  # 16-bit PCM
+
+        logger.info(f"Generating video from {duration_seconds}s audio...")
+        video_path = await geneface.generate_video(
+            audio=audio_bytes,
+            sample_rate=sample_rate,
+            output_path=None,  # Uses temp file
+        )
+
+        logger.info(f"✓ Video saved to: {video_path}")
+        logger.info("GeneFace++ video generation working!")
+
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        logger.info("Make sure GeneFace++ is properly installed and configured")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
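The example only exercises `generate_video()`; the docstring also names a streaming entry point, `generate_video_stream(audio_chunks) -> video_path`. A speculative sketch of wiring streamed TTS audio into it (the pipeline setup and call pattern are assumptions beyond the documented names):

```python
# Sketch only — generate_video_stream() comes from the docstring above;
# stream_text_input() comes from basic_stt_llm_tts.py earlier in this diff.
async def animate_response(geneface, pipeline, user_text: str):
    """Feed the pipeline's TTS audio chunks straight into GeneFace++."""
    audio_chunks = pipeline.stream_text_input(user_text)  # async iterator of AudioChunk
    video_path = await geneface.generate_video_stream(audio_chunks)
    return video_path
```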
audio_engine/examples/personaplex_pipeline.py ADDED
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+"""
+Example: PersonaPlex Full-Duplex Pipeline
+
+PersonaPlex is a full-duplex audio pipeline allowing simultaneous
+input/output - user can interrupt while assistant is speaking.
+
+PersonaPlexPipeline API:
+- __init__(config=None, system_prompt="...", save_transcripts=True)
+- await start() -> connects to server
+- await stop() -> closes connection, saves transcript
+- async for (audio_chunk, text_chunk) in stream(audio_stream=None)
+- PersonaPlexConfig uses voice_prompt (not voice_id)
+
+Setup:
+    Obtain PersonaPlex server URL from RunPod or similar service
+    Set environment (optional, PersonaPlexConfig has defaults):
+    export PERSONAPLEX_SERVER="wss://your-server/"
+    export PERSONAPLEX_VOICE_PROMPT="NATF0.pt"
+
+Run:
+    python examples/personaplex_pipeline.py
+
+Note:
+    PersonaPlex is full-duplex (simultaneous I/O) unlike
+    sequential STT-LLM-TTS pipeline.
+"""
+
+import asyncio
+import sys
+import logging
+from pathlib import Path
+
+# Add parent to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from dotenv import load_dotenv
+from pipelines.personaplex import PersonaPlexPipeline, PersonaPlexConfig
+
+# Load environment variables from .env file
+load_dotenv(Path(__file__).parent.parent / ".env")
+
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+async def main():
+    """Run PersonaPlex full-duplex pipeline example."""
+    print("=" * 60)
+    print("Audio Engine: PersonaPlex Full-Duplex Pipeline")
+    print("=" * 60)
+    print()
+
+    # Configure PersonaPlex
+    config = PersonaPlexConfig(
+        # server_url="wss://your-personaplex-server",  # Uses default if not set
+        voice_prompt="NATF0.pt",  # Voice ID (not voice_id)
+        text_temperature=0.7,
+        audio_temperature=0.8,
+        save_transcripts=True,
+    )
+
+    logger.info("PersonaPlex Configuration:")
+    logger.info(f"  Server: {config.server_url[:50]}...")
+    logger.info(f"  Voice Prompt: {config.voice_prompt}")
+    logger.info(f"  Text Temp: {config.text_temperature}")
+    logger.info(f"  Audio Temp: {config.audio_temperature}")
+    print()
+
+    # Create pipeline
+    pipeline = PersonaPlexPipeline(
+        config=config,
+        system_prompt="You are a helpful, friendly AI assistant.",
+        save_transcripts=True,
+    )
+
+    try:
+        # Start connection
+        await pipeline.start()
+        logger.info("✓ Connected to PersonaPlex server")
+        logger.info("✓ Ready for full-duplex streaming")
+        print()
+
+        # Example: Stream bidirectional audio/text
+        logger.info("Streaming audio and text (press Ctrl+C to stop)...")
+        chunk_count = 0
+
+        async for audio_chunk, text_chunk in pipeline.stream():
+            if text_chunk:
+                logger.info(f"[Assistant] {text_chunk.text}")
+            if audio_chunk:
+                chunk_count += 1
+                logger.info(
+                    f"[Audio] Chunk {chunk_count}: {len(audio_chunk.data)} bytes"
+                )
+
+        # Stop and save transcript
+        transcript_data = await pipeline.stop()
+        logger.info("✓ Pipeline stopped")
+        if transcript_data:
+            logger.info(f"✓ Transcript saved (session: {transcript_data.session_id})")
+
+    except KeyboardInterrupt:
+        logger.info("\n⏹️ Stopped by user")
+        await pipeline.stop()
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        logger.info("Make sure PersonaPlex server is running and accessible")
+        await pipeline.stop()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
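The example calls `stream()` with no arguments, but the docstring documents an `audio_stream=None` parameter for caller-supplied input. A minimal sketch of using it (the chunk source is an assumption; any `AudioChunk` async iterator, such as `stream_audio_file()` from `basic_stt_llm_tts.py`, should fit):

```python
# Sketch: feeding user audio into the documented stream(audio_stream=None)
# parameter while consuming the full-duplex output.
async def duplex_with_input(pipeline, audio_source):
    """Run full-duplex streaming while sending user audio to the server."""
    async for audio_chunk, text_chunk in pipeline.stream(audio_stream=audio_source):
        if text_chunk:
            print(f"[Assistant] {text_chunk.text}")
        if audio_chunk:
            print(f"[Audio] {len(audio_chunk.data)} bytes")
```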
audio_engine/examples/websocket_server.py ADDED
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""
+Example: WebSocket Server for Real-Time Audio Streaming
+
+Runs a WebSocket server that accepts live audio streams and returns
+responses in real-time. Clients can send binary audio and receive text
+transcripts + response audio.
+
+Setup:
+    export DEEPGRAM_API_KEY="your-deepgram-key"
+    export GROQ_API_KEY="your-groq-key"
+    export CARTESIA_API_KEY="your-cartesia-key"
+
+Run:
+    python examples/websocket_server.py
+
+Then connect a WebSocket client to ws://localhost:8765
+
+Example client (JavaScript):
+    const ws = new WebSocket("ws://localhost:8765");
+    ws.binaryType = "arraybuffer";
+    ws.onopen = () => {
+        // Send audio data
+        ws.send(audioBuffer);
+        // Signal end of speech
+        ws.send(JSON.stringify({"type": "end_of_speech"}));
+    };
+    ws.onmessage = (event) => {
+        if (typeof event.data === "string") {
+            console.log("Event:", JSON.parse(event.data));
+        } else {
+            console.log("Audio data:", event.data);
+        }
+    };
+"""
+
+import asyncio
+import sys
+import logging
+from pathlib import Path
+
+# Add parent to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from dotenv import load_dotenv
+from core.config import AudioEngineConfig
+from streaming.websocket_server import run_server_from_config
+
+# Load environment variables from .env file
+load_dotenv(Path(__file__).parent.parent / ".env")
+
+# Setup logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+async def main():
+    """Run WebSocket server."""
+    print("=" * 60)
+    print("Audio Engine: WebSocket Server")
+    print("=" * 60)
+    print()
+
+    # Load configuration from environment
+    config = AudioEngineConfig.from_env()
+    logger.info(f"Config loaded")
+    logger.info(f"  ASR: {config.asr.provider}")
+    logger.info(f"  LLM: {config.llm.provider}")
+    logger.info(f"  TTS: {config.tts.provider}")
+    logger.info(f"  Host: {config.streaming.host}")
+    logger.info(f"  Port: {config.streaming.port}")
+    print()
+
+    try:
+        await run_server_from_config(
+            config,
+            system_prompt="You are a helpful audio assistant. Keep responses brief and natural.",
+        )
+    except KeyboardInterrupt:
+        logger.info("Server shutting down...")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
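For completeness, a Python counterpart to the JavaScript client in the docstring above. This is a sketch only: it assumes the third-party `websockets` package, and the message protocol (binary audio in, an `end_of_speech` JSON marker, then string events and binary audio out) is taken from the docstring:

```python
# Hypothetical Python client for the server above; not part of the package.
import asyncio
import json
import websockets  # assumed third-party dependency

async def send_audio(audio_bytes: bytes, url: str = "ws://localhost:8765"):
    async with websockets.connect(url) as ws:
        await ws.send(audio_bytes)                            # binary audio
        await ws.send(json.dumps({"type": "end_of_speech"}))  # end-of-speech marker
        async for message in ws:
            if isinstance(message, str):
                print("Event:", json.loads(message))          # JSON event
            else:
                print(f"Audio data: {len(message)} bytes")    # response audio

# Usage (with 16-bit PCM audio on disk):
# asyncio.run(send_audio(open("input.pcm", "rb").read()))
```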