@agentunion/kite 1.0.6 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. package/cli.js +127 -25
  2. package/core/event_hub/entry.py +384 -61
  3. package/core/event_hub/hub.py +8 -0
  4. package/core/event_hub/module.md +0 -1
  5. package/core/event_hub/server.py +169 -38
  6. package/core/kite_log.py +241 -0
  7. package/core/launcher/entry.py +1306 -425
  8. package/core/launcher/module_scanner.py +10 -9
  9. package/core/launcher/process_manager.py +555 -121
  10. package/core/registry/entry.py +335 -30
  11. package/core/registry/server.py +339 -256
  12. package/core/registry/store.py +13 -2
  13. package/extensions/agents/__init__.py +1 -0
  14. package/extensions/agents/assistant/__init__.py +1 -0
  15. package/extensions/agents/assistant/entry.py +380 -0
  16. package/extensions/agents/assistant/module.md +22 -0
  17. package/extensions/agents/assistant/server.py +236 -0
  18. package/extensions/channels/__init__.py +1 -0
  19. package/extensions/channels/acp_channel/__init__.py +1 -0
  20. package/extensions/channels/acp_channel/entry.py +380 -0
  21. package/extensions/channels/acp_channel/module.md +22 -0
  22. package/extensions/channels/acp_channel/server.py +236 -0
  23. package/{core → extensions}/event_hub_bench/entry.py +664 -371
  24. package/{core → extensions}/event_hub_bench/module.md +4 -2
  25. package/extensions/services/backup/__init__.py +1 -0
  26. package/extensions/services/backup/entry.py +380 -0
  27. package/extensions/services/backup/module.md +22 -0
  28. package/extensions/services/backup/server.py +244 -0
  29. package/extensions/services/model_service/__init__.py +1 -0
  30. package/extensions/services/model_service/entry.py +380 -0
  31. package/extensions/services/model_service/module.md +22 -0
  32. package/extensions/services/model_service/server.py +236 -0
  33. package/extensions/services/watchdog/entry.py +460 -143
  34. package/extensions/services/watchdog/module.md +3 -0
  35. package/extensions/services/watchdog/monitor.py +128 -13
  36. package/extensions/services/watchdog/server.py +75 -13
  37. package/extensions/services/web/__init__.py +1 -0
  38. package/extensions/services/web/config.yaml +149 -0
  39. package/extensions/services/web/entry.py +487 -0
  40. package/extensions/services/web/module.md +24 -0
  41. package/extensions/services/web/routes/__init__.py +1 -0
  42. package/extensions/services/web/routes/routes_call.py +189 -0
  43. package/extensions/services/web/routes/routes_config.py +512 -0
  44. package/extensions/services/web/routes/routes_contacts.py +98 -0
  45. package/extensions/services/web/routes/routes_devlog.py +99 -0
  46. package/extensions/services/web/routes/routes_phone.py +81 -0
  47. package/extensions/services/web/routes/routes_sms.py +48 -0
  48. package/extensions/services/web/routes/routes_stats.py +17 -0
  49. package/extensions/services/web/routes/routes_voicechat.py +554 -0
  50. package/extensions/services/web/routes/schemas.py +216 -0
  51. package/extensions/services/web/server.py +332 -0
  52. package/extensions/services/web/static/css/style.css +1064 -0
  53. package/extensions/services/web/static/index.html +1445 -0
  54. package/extensions/services/web/static/js/app.js +4671 -0
  55. package/extensions/services/web/vendor/__init__.py +1 -0
  56. package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
  57. package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
  58. package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
  59. package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
  60. package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
  61. package/extensions/services/web/vendor/config.py +139 -0
  62. package/extensions/services/web/vendor/conversation/__init__.py +0 -0
  63. package/extensions/services/web/vendor/conversation/asr.py +936 -0
  64. package/extensions/services/web/vendor/conversation/engine.py +548 -0
  65. package/extensions/services/web/vendor/conversation/llm.py +534 -0
  66. package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
  67. package/extensions/services/web/vendor/conversation/tts.py +322 -0
  68. package/extensions/services/web/vendor/conversation/vad.py +138 -0
  69. package/extensions/services/web/vendor/storage/__init__.py +1 -0
  70. package/extensions/services/web/vendor/storage/identity.py +312 -0
  71. package/extensions/services/web/vendor/storage/store.py +507 -0
  72. package/extensions/services/web/vendor/task/__init__.py +0 -0
  73. package/extensions/services/web/vendor/task/manager.py +864 -0
  74. package/extensions/services/web/vendor/task/models.py +45 -0
  75. package/extensions/services/web/vendor/task/webhook.py +263 -0
  76. package/extensions/services/web/vendor/tools/__init__.py +0 -0
  77. package/extensions/services/web/vendor/tools/registry.py +321 -0
  78. package/main.py +344 -4
  79. package/package.json +11 -2
  80. package/core/__pycache__/__init__.cpython-313.pyc +0 -0
  81. package/core/__pycache__/data_dir.cpython-313.pyc +0 -0
  82. package/core/data_dir.py +0 -62
  83. package/core/event_hub/__pycache__/__init__.cpython-313.pyc +0 -0
  84. package/core/event_hub/__pycache__/bench.cpython-313.pyc +0 -0
  85. package/core/event_hub/__pycache__/bench_perf.cpython-313.pyc +0 -0
  86. package/core/event_hub/__pycache__/dedup.cpython-313.pyc +0 -0
  87. package/core/event_hub/__pycache__/entry.cpython-313.pyc +0 -0
  88. package/core/event_hub/__pycache__/hub.cpython-313.pyc +0 -0
  89. package/core/event_hub/__pycache__/router.cpython-313.pyc +0 -0
  90. package/core/event_hub/__pycache__/server.cpython-313.pyc +0 -0
  91. package/core/event_hub/bench_results/2026-02-28_13-26-48.json +0 -51
  92. package/core/event_hub/bench_results/2026-02-28_13-44-45.json +0 -51
  93. package/core/event_hub/bench_results/2026-02-28_13-45-39.json +0 -51
  94. package/core/launcher/__pycache__/__init__.cpython-313.pyc +0 -0
  95. package/core/launcher/__pycache__/entry.cpython-313.pyc +0 -0
  96. package/core/launcher/__pycache__/module_scanner.cpython-313.pyc +0 -0
  97. package/core/launcher/__pycache__/process_manager.cpython-313.pyc +0 -0
  98. package/core/launcher/data/log/lifecycle.jsonl +0 -1158
  99. package/core/launcher/data/token.txt +0 -1
  100. package/core/registry/__pycache__/__init__.cpython-313.pyc +0 -0
  101. package/core/registry/__pycache__/entry.cpython-313.pyc +0 -0
  102. package/core/registry/__pycache__/server.cpython-313.pyc +0 -0
  103. package/core/registry/__pycache__/store.cpython-313.pyc +0 -0
  104. package/core/registry/data/port.txt +0 -1
  105. package/core/registry/data/port_484.txt +0 -1
  106. package/extensions/__pycache__/__init__.cpython-313.pyc +0 -0
  107. package/extensions/services/__pycache__/__init__.cpython-313.pyc +0 -0
  108. package/extensions/services/watchdog/__pycache__/__init__.cpython-313.pyc +0 -0
  109. package/extensions/services/watchdog/__pycache__/entry.cpython-313.pyc +0 -0
  110. package/extensions/services/watchdog/__pycache__/monitor.cpython-313.pyc +0 -0
  111. package/extensions/services/watchdog/__pycache__/server.cpython-313.pyc +0 -0
  112. /package/{core/event_hub/bench_results/.gitkeep → extensions/services/web/vendor/bluetooth/__init__.py} +0 -0
@@ -0,0 +1,190 @@
1
+ """MCP tool definitions for in-call use by the LLM."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ from typing import Any
7
+
8
+ # ---------------------------------------------------------------------------
9
+ # Canonical tool definitions (provider-agnostic)
10
+ # ---------------------------------------------------------------------------
11
+
12
def _text_param(description: str) -> dict[str, Any]:
    """Return the JSON-schema fragment for a plain string parameter."""
    return {"type": "string", "description": description}


def _tool_spec(
    name: str,
    description: str,
    properties: dict[str, Any],
    required: list[str],
) -> dict[str, Any]:
    """Assemble one provider-agnostic tool definition.

    Every tool takes a single JSON object as its argument, so the
    ``parameters`` schema is always of type ``"object"``.
    """
    return {
        "name": name,
        "description": description,
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


# Canonical list of tools the LLM may invoke during a call. The provider
# converters below reshape these entries; they never modify them in place.
MCP_TOOLS: list[dict[str, Any]] = [
    _tool_spec(
        "confirm_with_caller",
        (
            "Ask the task caller to confirm something or provide information. "
            "Use when you need to verify details with the person who initiated "
            "this call task."
        ),
        {
            "question": _text_param("The question to ask the caller"),
            "options": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Optional multiple-choice answers",
            },
            "urgent": {
                "type": "boolean",
                "default": False,
                "description": "Whether this requires an immediate response",
            },
        },
        ["question"],
    ),
    _tool_spec(
        "send_sms",
        "Send an SMS message to a phone number.",
        {
            "phone_number": _text_param("The recipient phone number"),
            "content": _text_param("The SMS text content"),
        },
        ["phone_number", "content"],
    ),
    _tool_spec(
        "add_contact",
        "Add a new contact to the address book.",
        {
            "name": _text_param("Contact name"),
            "phone": _text_param("Contact phone number"),
            "company": _text_param("Company or organization name"),
            "notes": _text_param("Additional notes about this contact"),
        },
        ["name", "phone"],
    ),
    _tool_spec(
        "search_contacts",
        "Search the contact list by name, phone, or company.",
        {
            "query": _text_param("Search query string"),
        },
        ["query"],
    ),
    _tool_spec(
        "notify_caller",
        (
            "Send a one-way notification to the task caller. "
            "No reply is expected."
        ),
        {
            "message": _text_param("Notification message content"),
        },
        ["message"],
    ),
]
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # Provider-specific format converters
121
+ # ---------------------------------------------------------------------------
122
+
123
def get_tools_for_provider(provider: str) -> list[dict[str, Any]]:
    """Return ``MCP_TOOLS`` rendered in the schema a given LLM provider expects.

    Parameters
    ----------
    provider:
        Provider key; ``"openai"``, ``"claude"`` and ``"gemini"`` are
        recognised. The comparison is exact (case-sensitive).

    Returns
    -------
    A freshly built list of tool definitions in the provider's native layout.

    Raises
    ------
    ValueError
        If *provider* is not one of the recognised keys.
    """
    # Guard-clause dispatch: each recognised key hands off to its converter.
    if provider == "openai":
        return _to_openai_format()
    if provider == "claude":
        return _to_claude_format()
    if provider == "gemini":
        return _to_gemini_format()
    raise ValueError(f"Unknown provider: {provider}")
143
+
144
+
145
def _to_openai_format() -> list[dict[str, Any]]:
    """Render ``MCP_TOOLS`` for OpenAI and OpenAI-compatible APIs.

    Each entry becomes ``{"type": "function", "function": {...}}``. The
    parameter schema is deep-copied so callers can modify the result without
    touching the canonical definitions.
    """
    return [
        {
            "type": "function",
            "function": {
                "name": spec["name"],
                "description": spec["description"],
                "parameters": copy.deepcopy(spec["parameters"]),
            },
        }
        for spec in MCP_TOOLS
    ]
161
+
162
+
163
def _to_claude_format() -> list[dict[str, Any]]:
    """Render ``MCP_TOOLS`` for Anthropic Claude.

    Each entry becomes ``{"name": ..., "description": ..., "input_schema": ...}``
    where ``input_schema`` is a deep copy of the canonical parameter schema.
    """
    return [
        {
            "name": spec["name"],
            "description": spec["description"],
            "input_schema": copy.deepcopy(spec["parameters"]),
        }
        for spec in MCP_TOOLS
    ]
176
+
177
+
178
def _to_gemini_format() -> list[dict[str, Any]]:
    """Render ``MCP_TOOLS`` for Google Gemini.

    The result is a single-element list, ``[{"function_declarations": [...]}]``,
    whose declarations each carry a deep copy of the canonical parameter schema.
    """
    function_declarations: list[dict[str, Any]] = [
        {
            "name": spec["name"],
            "description": spec["description"],
            "parameters": copy.deepcopy(spec["parameters"]),
        }
        for spec in MCP_TOOLS
    ]
    return [{"function_declarations": function_declarations}]
@@ -0,0 +1,322 @@
1
+ """TTS (Text-to-Speech) abstraction with multiple provider implementations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ import hashlib
7
+ import hmac
8
+ import json
9
+ import logging
10
+ import time
11
+ import uuid
12
+ from abc import ABC, abstractmethod
13
+ from datetime import datetime, timezone
14
+ from typing import Any
15
+
16
+ import httpx
17
+
18
+ from .. import config as cfg
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Abstract base
25
+ # ---------------------------------------------------------------------------
26
+
27
class TTSProvider(ABC):
    """Common interface implemented by every TTS (text-to-speech) backend."""

    @abstractmethod
    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """Synthesize *text* and return the encoded audio as bytes.

        The audio format of the result is decided by the concrete provider
        and its configuration -- it is NOT guaranteed to be MP3. In this
        module ``VolcengineTTSProvider`` defaults to ``encoding="mp3"`` while
        ``TencentTTSProvider`` defaults to ``codec="pcm"``, so callers must
        decode according to the provider settings they selected.

        Parameters
        ----------
        text:
            The text to speak. The providers in this module return ``b""``
            for empty or whitespace-only text.
        language:
            Language hint. None of the providers in this module currently
            reads it; the voice is chosen through provider settings instead.

        Returns
        -------
        The synthesized audio, encoded as configured on the provider.
        """
        ...
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Edge-TTS (Microsoft Edge online TTS, free)
42
+ # ---------------------------------------------------------------------------
43
+
44
class EdgeTTSProvider(TTSProvider):
    """TTS backend built on the ``edge-tts`` library (Microsoft Edge online voices)."""

    def __init__(
        self,
        voice: str = "zh-CN-XiaoxiaoNeural",
        rate: str = "+0%",
        volume: str = "+0%",
    ) -> None:
        """Remember the voice name and the rate/volume adjustment strings."""
        self.voice, self.rate, self.volume = voice, rate, volume

    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """Stream *text* through edge-tts and return the concatenated audio bytes.

        Empty or whitespace-only text short-circuits to ``b""``. *language*
        is accepted for interface compatibility but not used here.
        """
        if not (text and text.strip()):
            return b""

        # Imported lazily so the dependency is only needed when this
        # provider is actually used.
        import edge_tts

        stream = edge_tts.Communicate(
            text=text,
            voice=self.voice,
            rate=self.rate,
            volume=self.volume,
        ).stream()

        # The stream interleaves audio with other chunk types; keep only audio.
        return b"".join(
            [part["data"] async for part in stream if part["type"] == "audio"]
        )
75
+
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Volcengine TTS (火山引擎)
80
+ # ---------------------------------------------------------------------------
81
+
82
class VolcengineTTSProvider(TTSProvider):
    """Volcengine (ByteDance) text-to-speech over its HTTP API."""

    API_URL = "https://openspeech.bytedance.com/api/v1/tts"

    def __init__(
        self,
        app_id: str = "",
        access_token: str = "",
        cluster: str = "volcano_tts",
        voice_type: str = "BV001_streaming",
        speed_ratio: float = 1.0,
        volume_ratio: float = 1.0,
        pitch_ratio: float = 1.0,
        encoding: str = "mp3",
        sample_rate: int = 24000,
    ) -> None:
        """Store credentials and voice settings; no network access happens here."""
        # Account / routing
        self.app_id = app_id
        self.access_token = access_token
        self.cluster = cluster
        # Voice rendering
        self.voice_type = voice_type
        self.speed_ratio = speed_ratio
        self.volume_ratio = volume_ratio
        self.pitch_ratio = pitch_ratio
        # Output format
        self.encoding = encoding
        self.sample_rate = sample_rate

    def _request_body(self, text: str) -> dict[str, Any]:
        """Build the JSON body for one synthesis request (fresh ``reqid`` each call)."""
        app_section = {
            "appid": self.app_id,
            "token": self.access_token,
            "cluster": self.cluster,
        }
        audio_section = {
            "voice_type": self.voice_type,
            "encoding": self.encoding,
            "speed_ratio": self.speed_ratio,
            "volume_ratio": self.volume_ratio,
            "pitch_ratio": self.pitch_ratio,
            "sample_rate": self.sample_rate,
        }
        request_section = {
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query",
        }
        return {
            "app": app_section,
            "user": {"uid": "tts-test"},
            "audio": audio_section,
            "request": request_section,
        }

    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """POST *text* to the Volcengine endpoint and return the decoded audio.

        Returns ``b""`` for empty or whitespace-only text. Raises
        ``RuntimeError`` when the response ``code`` is not ``3000`` or when
        the response carries no audio data; HTTP-level failures propagate
        from ``raise_for_status``. *language* is not used.
        """
        if not (text and text.strip()):
            return b""

        body = self._request_body(text)
        request_headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer;{self.access_token}",
        }

        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(self.API_URL, json=body, headers=request_headers)
            resp.raise_for_status()
            result = resp.json()

        code = result.get("code")
        if code != 3000:
            msg = result.get("message", "unknown error")
            raise RuntimeError(f"Volcengine TTS error: {msg} (code={code})")

        encoded_audio = result.get("data", "")
        if not encoded_audio:
            raise RuntimeError("Volcengine TTS returned empty audio data")

        # The audio arrives base64-encoded inside the JSON response.
        return base64.b64decode(encoded_audio)
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # Tencent Cloud TTS (腾讯云)
158
+ # ---------------------------------------------------------------------------
159
+
160
class TencentTTSProvider(TTSProvider):
    """Tencent Cloud text-to-speech via the ``TextToVoice`` HTTP action."""

    API_HOST = "tts.tencentcloudapi.com"
    SERVICE = "tts"
    ACTION = "TextToVoice"
    VERSION = "2019-08-23"

    def __init__(
        self,
        secret_id: str = "",
        secret_key: str = "",
        app_id: int = 0,
        voice_type: int = 101001,
        codec: str = "pcm",
        sample_rate: int = 16000,
        speed: float = 0,
        volume: float = 0,
    ) -> None:
        """Store credentials and voice settings; no network access happens here."""
        # Credentials
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.app_id = app_id
        # Voice and output format
        self.voice_type = voice_type
        self.codec = codec
        self.sample_rate = sample_rate
        self.speed = speed
        self.volume = volume

    def _sign(self, payload_json: str, timestamp: int) -> dict[str, str]:
        """Return the request headers, including a TC3-HMAC-SHA256 ``Authorization``.

        *payload_json* must be the exact body string that will be sent, since
        its SHA-256 digest is part of the signed canonical request.
        *timestamp* is the Unix time (seconds) used both for the
        ``X-TC-Timestamp`` header and for the UTC date in the credential scope.
        """
        content_type = "application/json; charset=utf-8"
        signed_headers = "content-type;host;x-tc-action"
        utc_date = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m-%d")

        def sha256_hex(text: str) -> str:
            return hashlib.sha256(text.encode()).hexdigest()

        # Step 1: canonical request (method, URI, empty query string,
        # canonical headers, blank line, signed header names, payload hash).
        canonical_request = "\n".join(
            [
                "POST",
                "/",
                "",
                f"content-type:{content_type}",
                f"host:{self.API_HOST}",
                f"x-tc-action:{self.ACTION.lower()}",
                "",
                signed_headers,
                sha256_hex(payload_json),
            ]
        )

        # Step 2: string to sign.
        scope = f"{utc_date}/{self.SERVICE}/tc3_request"
        string_to_sign = "\n".join(
            ["TC3-HMAC-SHA256", str(timestamp), scope, sha256_hex(canonical_request)]
        )

        # Step 3: derive the signing key by chaining HMACs over date,
        # service and the fixed terminator, then sign.
        signing_key = f"TC3{self.secret_key}".encode()
        for component in (utc_date, self.SERVICE, "tc3_request"):
            signing_key = hmac.new(signing_key, component.encode(), hashlib.sha256).digest()
        signature = hmac.new(signing_key, string_to_sign.encode(), hashlib.sha256).hexdigest()

        authorization = (
            f"TC3-HMAC-SHA256 Credential={self.secret_id}/{scope}, "
            f"SignedHeaders={signed_headers}, "
            f"Signature={signature}"
        )

        return {
            "Content-Type": content_type,
            "Host": self.API_HOST,
            "X-TC-Action": self.ACTION,
            "X-TC-Version": self.VERSION,
            "X-TC-Timestamp": str(timestamp),
            "Authorization": authorization,
        }

    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """Send *text* to Tencent Cloud and return the decoded audio bytes.

        Returns ``b""`` for empty or whitespace-only text. Raises
        ``RuntimeError`` when the response contains an ``Error`` object or no
        ``Audio`` field; HTTP-level failures propagate from
        ``raise_for_status``. *language* is not used.
        """
        if not (text and text.strip()):
            return b""

        request_fields = {
            "Text": text,
            "SessionId": uuid.uuid4().hex,
            "Volume": self.volume,
            "Speed": self.speed,
            "VoiceType": self.voice_type,
            "Codec": self.codec,
            "SampleRate": self.sample_rate,
            "ModelType": 1,
        }

        # Serialize once: the same string is hashed for the signature and sent.
        payload_json = json.dumps(request_fields)
        signed = self._sign(payload_json, int(time.time()))

        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                f"https://{self.API_HOST}",
                content=payload_json,
                headers=signed,
            )
            resp.raise_for_status()
            result = resp.json()

        response = result.get("Response", {})
        err = response.get("Error")
        if err:
            raise RuntimeError(
                f"Tencent TTS error: {err.get('Message', '')} ({err.get('Code', '')})"
            )

        encoded_audio = response.get("Audio", "")
        if not encoded_audio:
            raise RuntimeError("Tencent TTS returned empty audio data")

        # The audio arrives base64-encoded inside the JSON response.
        return base64.b64decode(encoded_audio)
275
+
276
+
277
+ # ---------------------------------------------------------------------------
278
+ # Factory
279
+ # ---------------------------------------------------------------------------
280
+
281
def create_tts_provider(provider: str | None = None, **overrides: Any) -> TTSProvider:
    """Create a TTS provider instance from configuration.

    Parameters
    ----------
    provider:
        ``"edge-tts"``, ``"volcengine"`` or ``"tencent"``. When ``None`` the
        ``tts.provider`` config value is used (default ``"edge-tts"``).
    **overrides:
        Constructor arguments that take precedence over config values.
        Keys that the selected provider does not use are ignored.

    Returns
    -------
    A configured ``TTSProvider``.

    Raises
    ------
    ValueError
        If the provider name is not recognised.

    Notes
    -----
    For Volcengine and Tencent, credentials fall back from ``tts.*`` to the
    matching ``asr.*`` keys when the ``tts.*`` value is missing or falsy.
    Numeric settings that resolve to ``None`` or a blank string (e.g. an
    empty placeholder left in the config file) use the built-in default
    instead of failing inside ``int()`` / ``float()``.
    """

    def _num(cast: Any, value: Any, fallback: Any) -> Any:
        """Cast *value* with *cast*, using *fallback* when the value is unset."""
        unset = value is None or (isinstance(value, str) and not value.strip())
        return cast(fallback if unset else value)

    if provider is None:
        provider = cfg.get("tts.provider", "edge-tts")

    if provider == "edge-tts":
        return EdgeTTSProvider(
            voice=overrides.get("voice", cfg.get("tts.edge_tts.voice", "zh-CN-XiaoxiaoNeural")),
            rate=overrides.get("rate", cfg.get("tts.edge_tts.rate", "+0%")),
            volume=overrides.get("volume", cfg.get("tts.edge_tts.volume", "+0%")),
        )

    if provider == "volcengine":
        return VolcengineTTSProvider(
            app_id=overrides.get(
                "app_id",
                cfg.get("tts.volcengine.app_id") or cfg.get("asr.volcengine.app_id", ""),
            ),
            access_token=overrides.get(
                "access_token",
                cfg.get("tts.volcengine.access_token") or cfg.get("asr.volcengine.access_token", ""),
            ),
            cluster=overrides.get("cluster", cfg.get("tts.volcengine.cluster", "volcano_tts")),
            voice_type=overrides.get("voice_type", cfg.get("tts.volcengine.voice_type", "BV001_streaming")),
            speed_ratio=_num(
                float, overrides.get("speed_ratio", cfg.get("tts.volcengine.speed_ratio", 1.0)), 1.0
            ),
            volume_ratio=_num(
                float, overrides.get("volume_ratio", cfg.get("tts.volcengine.volume_ratio", 1.0)), 1.0
            ),
            pitch_ratio=_num(
                float, overrides.get("pitch_ratio", cfg.get("tts.volcengine.pitch_ratio", 1.0)), 1.0
            ),
            encoding=overrides.get("encoding", cfg.get("tts.volcengine.encoding", "mp3")),
            sample_rate=_num(
                int, overrides.get("sample_rate", cfg.get("tts.volcengine.sample_rate", 24000)), 24000
            ),
        )

    if provider == "tencent":
        return TencentTTSProvider(
            secret_id=overrides.get(
                "secret_id",
                cfg.get("tts.tencent.secret_id") or cfg.get("asr.tencent.secret_id", ""),
            ),
            secret_key=overrides.get(
                "secret_key",
                cfg.get("tts.tencent.secret_key") or cfg.get("asr.tencent.secret_key", ""),
            ),
            # The ``or`` fallback can still yield "" when both keys are present
            # but blank; ``_num`` maps that to 0 instead of raising in int("").
            app_id=_num(
                int,
                overrides.get(
                    "app_id",
                    cfg.get("tts.tencent.app_id") or cfg.get("asr.tencent.app_id", 0),
                ),
                0,
            ),
            voice_type=_num(
                int, overrides.get("voice_type", cfg.get("tts.tencent.voice_type", 101001)), 101001
            ),
            codec=overrides.get("codec", cfg.get("tts.tencent.codec", "pcm")),
            sample_rate=_num(
                int, overrides.get("sample_rate", cfg.get("tts.tencent.sample_rate", 16000)), 16000
            ),
            speed=_num(float, overrides.get("speed", cfg.get("tts.tencent.speed", 0)), 0),
            volume=_num(float, overrides.get("volume", cfg.get("tts.tencent.volume", 0)), 0),
        )

    raise ValueError(f"Unknown TTS provider: {provider}")
@@ -0,0 +1,138 @@
1
+ """Voice Activity Detection (VAD) using webrtcvad."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ import webrtcvad
8
+
9
+ from .. import config as cfg
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class VADDetector:
    """Frame-level voice activity detector backed by ``webrtcvad``.

    Feed it S16LE mono PCM at the configured sample rate and it will emit
    events:

    * ``"speech_start"`` -- the user started speaking
    * ``"speech_end"``   -- the user stopped speaking (silence exceeded threshold)
    * ``None``           -- no state change

    webrtcvad requires frames of exactly 10, 20, or 30 ms. Incoming audio of
    any length is buffered internally and split into 20 ms frames whose byte
    size is derived from the sample rate (640 bytes at 16 kHz, 320 bytes at
    8 kHz, and so on).
    """

    FRAME_DURATION_MS = 20
    # Size of a 20 ms frame at 16 kHz S16LE mono (16000 * 2 * 20 / 1000).
    # Kept as a public constant for existing references and used as the
    # fallback frame size; the detector itself sizes frames per sample rate.
    BYTES_PER_FRAME = 640
    # Sample rates accepted by webrtcvad.
    SUPPORTED_SAMPLE_RATES = (8000, 16000, 32000, 48000)

    def __init__(
        self,
        mode: int = 3,
        sample_rate: int = 16000,
        silence_threshold_ms: int = 800,
        min_speech_ms: int = 250,
    ) -> None:
        """
        Parameters
        ----------
        mode:
            webrtcvad aggressiveness (0-3). Higher = more aggressive filtering.
        sample_rate:
            Audio sample rate in Hz (must be 8000, 16000, 32000, or 48000).
            Any other value is accepted but logged as a warning, because
            webrtcvad will reject every frame and no speech will be detected.
        silence_threshold_ms:
            How many milliseconds of silence before emitting ``speech_end``.
        min_speech_ms:
            Minimum speech duration before we consider it real speech
            (avoids false triggers on short noise bursts).
        """
        self._vad = webrtcvad.Vad(mode)
        self._sample_rate = sample_rate
        self._silence_threshold_ms = silence_threshold_ms
        self._min_speech_ms = min_speech_ms

        if sample_rate not in self.SUPPORTED_SAMPLE_RATES:
            logger.warning(
                "VAD: unsupported sample rate %s Hz (webrtcvad accepts %s)",
                sample_rate,
                ", ".join(str(rate) for rate in self.SUPPORTED_SAMPLE_RATES),
            )

        # A 20 ms frame must be sized for the actual sample rate; a fixed
        # 640 bytes is only 20 ms at 16 kHz and is an invalid frame length
        # at every other rate.
        self._bytes_per_frame = self._frame_bytes_for(sample_rate)

        # How many *consecutive* silent frames required to declare end-of-speech
        self._silence_frames_needed = silence_threshold_ms // self.FRAME_DURATION_MS
        # How many *consecutive* speech frames required to declare start-of-speech
        self._speech_frames_needed = max(1, min_speech_ms // self.FRAME_DURATION_MS)

        # Internal state
        self._in_speech = False
        self._speech_frame_count = 0
        self._silence_frame_count = 0
        self._buffer = bytearray()

    @classmethod
    def _frame_bytes_for(cls, sample_rate: int) -> int:
        """Return the byte size of one S16LE mono frame at *sample_rate*.

        Falls back to ``BYTES_PER_FRAME`` for a non-positive result so that
        ``feed`` can never loop on a zero-length frame.
        """
        size = int(sample_rate) * 2 * cls.FRAME_DURATION_MS // 1000
        return size if size > 0 else cls.BYTES_PER_FRAME

    def feed(self, chunk: bytes) -> str | None:
        """Feed audio data and return an event string or ``None``.

        The chunk can be any size; it is appended to an internal buffer and
        processed in whole 20 ms frames, with any remainder kept for the next
        call. Only the *last* event produced by the batch is returned (to
        keep the API simple), so earlier events in the same chunk are not
        reported.
        """
        self._buffer.extend(chunk)
        frame_bytes = self._bytes_per_frame
        buffered = self._buffer
        consumed = 0
        last_event: str | None = None

        while len(buffered) - consumed >= frame_bytes:
            frame = bytes(buffered[consumed : consumed + frame_bytes])
            consumed += frame_bytes
            event = self._process_frame(frame)
            if event is not None:
                last_event = event

        # Drop all processed bytes in one step instead of re-slicing the
        # whole buffer after every frame.
        if consumed:
            del buffered[:consumed]

        return last_event

    def reset(self) -> None:
        """Reset detector state for a new utterance (also drops buffered audio)."""
        self._in_speech = False
        self._speech_frame_count = 0
        self._silence_frame_count = 0
        self._buffer = bytearray()

    # ------------------------------------------------------------------

    def _process_frame(self, frame: bytes) -> str | None:
        """Classify a single 20 ms frame and update the speech/silence state.

        Returns ``"speech_start"`` once enough consecutive speech frames have
        been seen, ``"speech_end"`` once enough consecutive silent frames
        follow speech, and ``None`` otherwise.
        """
        try:
            is_speech = self._vad.is_speech(frame, self._sample_rate)
        except Exception:
            # webrtcvad can raise on malformed frames; treat as silence, but
            # leave a trace so a misconfiguration is diagnosable.
            logger.debug("VAD: frame rejected by webrtcvad; treating as silence", exc_info=True)
            is_speech = False

        if is_speech:
            self._silence_frame_count = 0
            self._speech_frame_count += 1

            if not self._in_speech and self._speech_frame_count >= self._speech_frames_needed:
                self._in_speech = True
                logger.debug("VAD: speech_start (after %d frames)", self._speech_frame_count)
                return "speech_start"
        else:
            # Any silent frame breaks the run of consecutive speech frames.
            self._speech_frame_count = 0
            if self._in_speech:
                self._silence_frame_count += 1
                if self._silence_frame_count >= self._silence_frames_needed:
                    self._in_speech = False
                    self._silence_frame_count = 0
                    logger.debug("VAD: speech_end")
                    return "speech_end"

        return None

    @property
    def is_speaking(self) -> bool:
        """``True`` if the detector currently believes the user is speaking."""
        return self._in_speech
129
+
130
+
131
def create_vad_detector() -> VADDetector:
    """Build a ``VADDetector`` from the ``vad.*`` and ``audio.sample_rate`` config values."""
    # Each lookup carries the same default the detector's constructor uses.
    settings = {
        "mode": cfg.get("vad.mode", 3),
        "sample_rate": cfg.get("audio.sample_rate", 16000),
        "silence_threshold_ms": cfg.get("vad.silence_threshold_ms", 800),
        "min_speech_ms": cfg.get("vad.min_speech_ms", 250),
    }
    return VADDetector(**settings)
@@ -0,0 +1 @@
1
+ from .store import * # noqa: F401,F403