@agentunion/kite 1.0.7 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +208 -0
- package/README.md +48 -0
- package/cli.js +1 -1
- package/extensions/agents/__init__.py +1 -0
- package/extensions/agents/assistant/__init__.py +1 -0
- package/extensions/agents/assistant/entry.py +329 -0
- package/extensions/agents/assistant/module.md +22 -0
- package/extensions/agents/assistant/server.py +197 -0
- package/extensions/channels/__init__.py +1 -0
- package/extensions/channels/acp_channel/__init__.py +1 -0
- package/extensions/channels/acp_channel/entry.py +329 -0
- package/extensions/channels/acp_channel/module.md +22 -0
- package/extensions/channels/acp_channel/server.py +197 -0
- package/extensions/event_hub_bench/entry.py +624 -379
- package/extensions/event_hub_bench/module.md +2 -1
- package/extensions/services/backup/__init__.py +1 -0
- package/extensions/services/backup/entry.py +508 -0
- package/extensions/services/backup/module.md +22 -0
- package/extensions/services/model_service/__init__.py +1 -0
- package/extensions/services/model_service/entry.py +508 -0
- package/extensions/services/model_service/module.md +22 -0
- package/extensions/services/watchdog/entry.py +468 -102
- package/extensions/services/watchdog/module.md +3 -0
- package/extensions/services/watchdog/monitor.py +170 -69
- package/extensions/services/web/__init__.py +1 -0
- package/extensions/services/web/config.yaml +149 -0
- package/extensions/services/web/entry.py +390 -0
- package/extensions/services/web/module.md +24 -0
- package/extensions/services/web/routes/__init__.py +1 -0
- package/extensions/services/web/routes/routes_call.py +189 -0
- package/extensions/services/web/routes/routes_config.py +512 -0
- package/extensions/services/web/routes/routes_contacts.py +98 -0
- package/extensions/services/web/routes/routes_devlog.py +99 -0
- package/extensions/services/web/routes/routes_phone.py +81 -0
- package/extensions/services/web/routes/routes_sms.py +48 -0
- package/extensions/services/web/routes/routes_stats.py +17 -0
- package/extensions/services/web/routes/routes_voicechat.py +554 -0
- package/extensions/services/web/routes/schemas.py +216 -0
- package/extensions/services/web/server.py +375 -0
- package/extensions/services/web/static/css/style.css +1064 -0
- package/extensions/services/web/static/index.html +1445 -0
- package/extensions/services/web/static/js/app.js +4671 -0
- package/extensions/services/web/vendor/__init__.py +1 -0
- package/extensions/services/web/vendor/bluetooth/audio.py +348 -0
- package/extensions/services/web/vendor/bluetooth/contacts.py +251 -0
- package/extensions/services/web/vendor/bluetooth/manager.py +395 -0
- package/extensions/services/web/vendor/bluetooth/sms.py +290 -0
- package/extensions/services/web/vendor/bluetooth/telephony.py +274 -0
- package/extensions/services/web/vendor/config.py +139 -0
- package/extensions/services/web/vendor/conversation/asr.py +936 -0
- package/extensions/services/web/vendor/conversation/engine.py +548 -0
- package/extensions/services/web/vendor/conversation/llm.py +534 -0
- package/extensions/services/web/vendor/conversation/mcp_tools.py +190 -0
- package/extensions/services/web/vendor/conversation/tts.py +322 -0
- package/extensions/services/web/vendor/conversation/vad.py +138 -0
- package/extensions/services/web/vendor/storage/__init__.py +1 -0
- package/extensions/services/web/vendor/storage/identity.py +312 -0
- package/extensions/services/web/vendor/storage/store.py +507 -0
- package/extensions/services/web/vendor/task/manager.py +864 -0
- package/extensions/services/web/vendor/task/models.py +45 -0
- package/extensions/services/web/vendor/task/webhook.py +263 -0
- package/extensions/services/web/vendor/tools/registry.py +321 -0
- package/kernel/__init__.py +0 -0
- package/kernel/entry.py +407 -0
- package/{core/event_hub/hub.py → kernel/event_hub.py} +62 -74
- package/kernel/module.md +33 -0
- package/{core/registry/store.py → kernel/registry_store.py} +23 -8
- package/kernel/rpc_router.py +388 -0
- package/kernel/server.py +267 -0
- package/launcher/__init__.py +10 -0
- package/launcher/__main__.py +6 -0
- package/launcher/count_lines.py +258 -0
- package/launcher/entry.py +1778 -0
- package/launcher/logging_setup.py +289 -0
- package/{core/launcher → launcher}/module_scanner.py +11 -6
- package/launcher/process_manager.py +880 -0
- package/main.py +11 -210
- package/package.json +6 -9
- package/__init__.py +0 -1
- package/__main__.py +0 -15
- package/core/event_hub/BENCHMARK.md +0 -94
- package/core/event_hub/bench.py +0 -459
- package/core/event_hub/bench_extreme.py +0 -308
- package/core/event_hub/bench_perf.py +0 -350
- package/core/event_hub/entry.py +0 -157
- package/core/event_hub/module.md +0 -20
- package/core/event_hub/server.py +0 -206
- package/core/launcher/entry.py +0 -1158
- package/core/launcher/process_manager.py +0 -470
- package/core/registry/entry.py +0 -110
- package/core/registry/module.md +0 -30
- package/core/registry/server.py +0 -289
- package/extensions/services/watchdog/server.py +0 -167
- /package/{core → extensions/services/web/vendor/bluetooth}/__init__.py +0 -0
- /package/{core/event_hub → extensions/services/web/vendor/conversation}/__init__.py +0 -0
- /package/{core/launcher → extensions/services/web/vendor/task}/__init__.py +0 -0
- /package/{core/registry → extensions/services/web/vendor/tools}/__init__.py +0 -0
- /package/{core/event_hub → kernel}/dedup.py +0 -0
- /package/{core/event_hub → kernel}/router.py +0 -0
- /package/{core/launcher → launcher}/module.md +0 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""MCP tool definitions for in-call use by the LLM."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import copy
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# ---------------------------------------------------------------------------
|
|
9
|
+
# Canonical tool definitions (provider-agnostic)
|
|
10
|
+
# ---------------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
# Canonical, provider-agnostic tool catalogue.  Each entry carries a
# ``name``, a human-readable ``description`` and a JSON-Schema style
# ``parameters`` object; the converter functions in this module re-wrap
# these three fields into each provider's layout.
MCP_TOOLS: list[dict[str, Any]] = [
    # Two-way: ask the person who initiated the call task a question.
    {
        "name": "confirm_with_caller",
        "description": (
            "Ask the task caller to confirm something or provide information. "
            "Use when you need to verify details with the person who initiated "
            "this call task."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "question": {
                    "type": "string",
                    "description": "The question to ask the caller",
                },
                "options": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Optional multiple-choice answers",
                },
                "urgent": {
                    "type": "boolean",
                    "default": False,
                    "description": "Whether this requires an immediate response",
                },
            },
            # Only the question itself is mandatory.
            "required": ["question"],
        },
    },
    # Send a text message to an arbitrary number.
    {
        "name": "send_sms",
        "description": "Send an SMS message to a phone number.",
        "parameters": {
            "type": "object",
            "properties": {
                "phone_number": {
                    "type": "string",
                    "description": "The recipient phone number",
                },
                "content": {
                    "type": "string",
                    "description": "The SMS text content",
                },
            },
            "required": ["phone_number", "content"],
        },
    },
    # Create an address-book entry; company and notes are optional.
    {
        "name": "add_contact",
        "description": "Add a new contact to the address book.",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {
                    "type": "string",
                    "description": "Contact name",
                },
                "phone": {
                    "type": "string",
                    "description": "Contact phone number",
                },
                "company": {
                    "type": "string",
                    "description": "Company or organization name",
                },
                "notes": {
                    "type": "string",
                    "description": "Additional notes about this contact",
                },
            },
            "required": ["name", "phone"],
        },
    },
    # Free-text lookup over the contact list.
    {
        "name": "search_contacts",
        "description": "Search the contact list by name, phone, or company.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query string",
                },
            },
            "required": ["query"],
        },
    },
    # One-way counterpart of ``confirm_with_caller``: no reply expected.
    {
        "name": "notify_caller",
        "description": (
            "Send a one-way notification to the task caller. "
            "No reply is expected."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "message": {
                    "type": "string",
                    "description": "Notification message content",
                },
            },
            "required": ["message"],
        },
    },
]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# Provider-specific format converters
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
def get_tools_for_provider(provider: str) -> list[dict[str, Any]]:
    """Return the in-call tool list rendered in *provider*'s native schema.

    Parameters
    ----------
    provider:
        ``"openai"``, ``"claude"`` or ``"gemini"`` (exact match).

    Returns
    -------
    Whatever the matching ``_to_*_format`` converter produces.

    Raises
    ------
    ValueError
        If *provider* is not one of the three supported names.
    """
    # Reject unsupported names up front so the dispatch below stays flat.
    if provider not in ("openai", "claude", "gemini"):
        raise ValueError(f"Unknown provider: {provider}")

    if provider == "openai":
        return _to_openai_format()
    if provider == "claude":
        return _to_claude_format()
    return _to_gemini_format()
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _to_openai_format() -> list[dict[str, Any]]:
    """Render ``MCP_TOOLS`` for OpenAI and OpenAI-compatible APIs.

    Every entry has the shape ``{"type": "function", "function": {...}}``.
    The parameter schema is deep-copied, so mutating the returned list
    does not touch the canonical definitions.
    """
    return [
        {
            "type": "function",
            "function": {
                "name": spec["name"],
                "description": spec["description"],
                "parameters": copy.deepcopy(spec["parameters"]),
            },
        }
        for spec in MCP_TOOLS
    ]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _to_claude_format() -> list[dict[str, Any]]:
    """Render ``MCP_TOOLS`` for Anthropic Claude.

    Every entry has the shape
    ``{"name": ..., "description": ..., "input_schema": ...}`` where
    ``input_schema`` is a deep copy of the canonical ``parameters`` object.
    """
    return [
        {
            "name": spec["name"],
            "description": spec["description"],
            "input_schema": copy.deepcopy(spec["parameters"]),
        }
        for spec in MCP_TOOLS
    ]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _to_gemini_format() -> list[dict[str, Any]]:
    """Render ``MCP_TOOLS`` for Google Gemini.

    The result is a single-element list,
    ``[{"function_declarations": [...]}]``, whose declarations each hold the
    tool's name, description and a deep copy of its parameter schema.
    """
    function_declarations: list[dict[str, Any]] = [
        {
            "name": spec["name"],
            "description": spec["description"],
            "parameters": copy.deepcopy(spec["parameters"]),
        }
        for spec in MCP_TOOLS
    ]
    return [{"function_declarations": function_declarations}]
|
|
@@ -0,0 +1,322 @@
|
|
|
1
|
+
"""TTS (Text-to-Speech) abstraction with multiple provider implementations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import hashlib
|
|
7
|
+
import hmac
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import time
|
|
11
|
+
import uuid
|
|
12
|
+
from abc import ABC, abstractmethod
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import httpx
|
|
17
|
+
|
|
18
|
+
from .. import config as cfg
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Abstract base
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
class TTSProvider(ABC):
    """Base class for all TTS (text-to-speech) providers."""

    @abstractmethod
    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """Synthesize *text* into encoded audio bytes.

        Parameters
        ----------
        text:
            The text to speak.  The concrete providers in this module return
            ``b""`` for empty or whitespace-only text.
        language:
            Language hint, ``"zh"`` by default.  None of the providers in
            this module currently read it.

        Returns
        -------
        The audio produced by the provider.  The encoding is NOT uniform
        across implementations: ``VolcengineTTSProvider`` defaults to
        ``encoding="mp3"`` whereas ``TencentTTSProvider`` defaults to
        ``codec="pcm"``, so callers must not assume MP3.  Callers that need
        raw PCM from a compressed result should decode separately (e.g. via
        ffmpeg or a pure-Python decoder).
        """
        ...
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Edge-TTS (Microsoft Edge online TTS, free)
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
class EdgeTTSProvider(TTSProvider):
    """Free TTS backend built on the ``edge-tts`` library (Microsoft Edge voices)."""

    def __init__(
        self,
        voice: str = "zh-CN-XiaoxiaoNeural",
        rate: str = "+0%",
        volume: str = "+0%",
    ) -> None:
        """Remember the voice settings that are forwarded to ``edge_tts.Communicate``."""
        self.voice, self.rate, self.volume = voice, rate, volume

    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """Stream *text* through edge-tts and return the concatenated audio.

        Empty or whitespace-only text yields ``b""`` without importing or
        contacting edge-tts.  *language* is accepted for interface
        compatibility and is not read here.
        """
        if not (text and text.strip()):
            return b""

        # Function-scope import: edge_tts is only loaded once there is
        # actually something to synthesize.
        import edge_tts

        stream = edge_tts.Communicate(
            text=text,
            voice=self.voice,
            rate=self.rate,
            volume=self.volume,
        ).stream()

        # The stream also yields non-audio items; keep only audio payloads.
        pieces: list[bytes] = [
            part["data"] async for part in stream if part["type"] == "audio"
        ]
        return b"".join(pieces)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
# Volcengine TTS (火山引擎)
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
class VolcengineTTSProvider(TTSProvider):
    """TTS backend that posts to the Volcengine (ByteDance) HTTP endpoint."""

    API_URL = "https://openspeech.bytedance.com/api/v1/tts"

    def __init__(
        self,
        app_id: str = "",
        access_token: str = "",
        cluster: str = "volcano_tts",
        voice_type: str = "BV001_streaming",
        speed_ratio: float = 1.0,
        volume_ratio: float = 1.0,
        pitch_ratio: float = 1.0,
        encoding: str = "mp3",
        sample_rate: int = 24000,
    ) -> None:
        """Store credentials and audio settings used to build each request."""
        # Credentials / routing -> "app" section of the request body.
        self.app_id = app_id
        self.access_token = access_token
        self.cluster = cluster
        # Voice and audio shaping -> "audio" section of the request body.
        self.voice_type = voice_type
        self.speed_ratio = speed_ratio
        self.volume_ratio = volume_ratio
        self.pitch_ratio = pitch_ratio
        self.encoding = encoding
        self.sample_rate = sample_rate

    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """POST *text* to ``API_URL`` and return the decoded audio bytes.

        Empty or whitespace-only text yields ``b""`` without a network call.
        *language* is accepted for interface compatibility and is not read.

        Raises
        ------
        httpx.HTTPStatusError
            If the HTTP response status is an error.
        RuntimeError
            If the response ``code`` is not 3000 or no audio data is returned.
        """
        if not (text and text.strip()):
            return b""

        app_section = {
            "appid": self.app_id,
            "token": self.access_token,
            "cluster": self.cluster,
        }
        audio_section = {
            "voice_type": self.voice_type,
            "encoding": self.encoding,
            "speed_ratio": self.speed_ratio,
            "volume_ratio": self.volume_ratio,
            "pitch_ratio": self.pitch_ratio,
            "sample_rate": self.sample_rate,
        }
        request_section = {
            # Fresh request id for every call.
            "reqid": str(uuid.uuid4()),
            "text": text,
            "operation": "query",
        }
        body = {
            "app": app_section,
            "user": {"uid": "tts-test"},
            "audio": audio_section,
            "request": request_section,
        }

        request_headers = {
            "Content-Type": "application/json",
            # The token follows "Bearer" after a semicolon, not a space.
            "Authorization": f"Bearer;{self.access_token}",
        }

        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(self.API_URL, json=body, headers=request_headers)
            resp.raise_for_status()
            result = resp.json()

        # Any ``code`` other than 3000 is treated as a failure.
        if result.get("code") != 3000:
            msg = result.get("message", "unknown error")
            raise RuntimeError(f"Volcengine TTS error: {msg} (code={result.get('code')})")

        encoded_audio = result.get("data", "")
        if not encoded_audio:
            raise RuntimeError("Volcengine TTS returned empty audio data")

        return base64.b64decode(encoded_audio)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# ---------------------------------------------------------------------------
|
|
157
|
+
# Tencent Cloud TTS (腾讯云)
|
|
158
|
+
# ---------------------------------------------------------------------------
|
|
159
|
+
|
|
160
|
+
class TencentTTSProvider(TTSProvider):
    """Tencent Cloud TTS via HTTP API (TextToVoice action).

    Requests are JSON POSTs to ``https://{API_HOST}`` carrying headers
    produced by :meth:`_sign` (TC3-HMAC-SHA256 scheme).
    """

    # Endpoint host; also used as the signed ``host`` header value.
    API_HOST = "tts.tencentcloudapi.com"
    # Service name used in the credential scope and signing-key derivation.
    SERVICE = "tts"
    # Sent as ``X-TC-Action``; its lowercase form is part of the signed headers.
    ACTION = "TextToVoice"
    # Sent as ``X-TC-Version``.
    VERSION = "2019-08-23"

    def __init__(
        self,
        secret_id: str = "",
        secret_key: str = "",
        app_id: int = 0,
        voice_type: int = 101001,
        codec: str = "pcm",
        sample_rate: int = 16000,
        speed: float = 0,
        volume: float = 0,
    ) -> None:
        """Store credentials and synthesis settings.

        ``secret_id`` / ``secret_key`` are used for request signing; the
        remaining values are copied into each request payload, except
        ``app_id`` which is stored but not referenced elsewhere in this class.
        """
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.app_id = app_id
        self.voice_type = voice_type
        self.codec = codec
        self.sample_rate = sample_rate
        self.speed = speed
        self.volume = volume

    def _sign(self, payload_json: str, timestamp: int) -> dict[str, str]:
        """Build TC3-HMAC-SHA256 signed headers.

        Parameters
        ----------
        payload_json:
            The exact request body string; its SHA-256 digest is part of the
            canonical request, so the same string must be sent on the wire.
        timestamp:
            Unix timestamp in seconds; used for ``X-TC-Timestamp`` and to
            derive the UTC date in the credential scope.

        Returns
        -------
        The headers to send with the request, including ``Authorization``.
        """
        # The credential scope uses the UTC calendar date of the timestamp.
        date = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime("%Y-%m-%d")

        # 1. Canonical request
        #    method, URI, (empty) query string, the three signed headers in
        #    lowercase, the signed-header list, and the hex SHA-256 of the body.
        ct = "application/json; charset=utf-8"
        canonical = (
            f"POST\n/\n\n"
            f"content-type:{ct}\n"
            f"host:{self.API_HOST}\n"
            f"x-tc-action:{self.ACTION.lower()}\n\n"
            f"content-type;host;x-tc-action\n"
            f"{hashlib.sha256(payload_json.encode()).hexdigest()}"
        )

        # 2. String to sign
        #    algorithm, timestamp, credential scope, and the hex SHA-256 of
        #    the canonical request.
        scope = f"{date}/{self.SERVICE}/tc3_request"
        string_to_sign = (
            f"TC3-HMAC-SHA256\n{timestamp}\n{scope}\n"
            f"{hashlib.sha256(canonical.encode()).hexdigest()}"
        )

        # 3. Signing key
        #    Chained HMACs: "TC3" + secret key -> date -> service -> "tc3_request".
        def _hmac_sha256(key: bytes, msg: str) -> bytes:
            """Return the raw HMAC-SHA256 digest of *msg* under *key*."""
            return hmac.new(key, msg.encode(), hashlib.sha256).digest()

        secret_date = _hmac_sha256(f"TC3{self.secret_key}".encode(), date)
        secret_service = _hmac_sha256(secret_date, self.SERVICE)
        secret_signing = _hmac_sha256(secret_service, "tc3_request")
        # The final signature is hex-encoded, unlike the intermediate keys.
        signature = hmac.new(secret_signing, string_to_sign.encode(), hashlib.sha256).hexdigest()

        auth = (
            f"TC3-HMAC-SHA256 Credential={self.secret_id}/{scope}, "
            f"SignedHeaders=content-type;host;x-tc-action, "
            f"Signature={signature}"
        )

        # Content-Type, Host and X-TC-Action must match the values signed above.
        return {
            "Content-Type": ct,
            "Host": self.API_HOST,
            "X-TC-Action": self.ACTION,
            "X-TC-Version": self.VERSION,
            "X-TC-Timestamp": str(timestamp),
            "Authorization": auth,
        }

    async def synthesize(self, text: str, language: str = "zh") -> bytes:
        """Request speech for *text* and return the base64-decoded audio.

        Empty or whitespace-only text yields ``b""`` without a network call.
        *language* is accepted for interface compatibility and is not read.
        The returned bytes are in whatever format ``self.codec`` requested
        (``"pcm"`` by default).

        Raises
        ------
        httpx.HTTPStatusError
            If the HTTP response status is an error.
        RuntimeError
            If the response carries an ``Error`` object or no ``Audio`` data.
        """
        if not text or not text.strip():
            return b""

        # Hyphen-free UUID used as a per-request session identifier.
        session_id = str(uuid.uuid4()).replace("-", "")
        payload = {
            "Text": text,
            "SessionId": session_id,
            "Volume": self.volume,
            "Speed": self.speed,
            "VoiceType": self.voice_type,
            "Codec": self.codec,
            "SampleRate": self.sample_rate,
            # NOTE(review): meaning of ModelType=1 is not visible here -- confirm against the API docs.
            "ModelType": 1,
        }

        # Serialize once: the signature covers this exact string, so the same
        # string (not a re-serialized dict) is sent as the request body.
        payload_json = json.dumps(payload)
        timestamp = int(time.time())
        headers = self._sign(payload_json, timestamp)

        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                f"https://{self.API_HOST}",
                content=payload_json,
                headers=headers,
            )
            resp.raise_for_status()
            result = resp.json()

        # API-level failures are reported inside the body under Response.Error.
        response = result.get("Response", {})
        if response.get("Error"):
            err = response["Error"]
            raise RuntimeError(
                f"Tencent TTS error: {err.get('Message', '')} ({err.get('Code', '')})"
            )

        audio_b64 = response.get("Audio", "")
        if not audio_b64:
            raise RuntimeError("Tencent TTS returned empty audio data")

        return base64.b64decode(audio_b64)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ---------------------------------------------------------------------------
|
|
278
|
+
# Factory
|
|
279
|
+
# ---------------------------------------------------------------------------
|
|
280
|
+
|
|
281
|
+
def create_tts_provider(provider: str | None = None, **overrides: Any) -> TTSProvider:
    """Create a TTS provider instance.

    Parameters
    ----------
    provider:
        ``"edge-tts"``, ``"volcengine"`` or ``"tencent"``.  If ``None``, the
        ``tts.provider`` config value is used (default ``"edge-tts"``).
    **overrides:
        Constructor arguments that take precedence over config values.

    Returns
    -------
    A configured ``TTSProvider`` for the selected backend.  Volcengine and
    Tencent credentials fall back to the matching ``asr.*`` config keys when
    the ``tts.*`` value is missing or falsy.

    Raises
    ------
    ValueError
        If the provider name is not recognised.
    """
    if provider is None:
        provider = cfg.get("tts.provider", "edge-tts")

    if provider == "edge-tts":
        return EdgeTTSProvider(
            voice=overrides.get("voice", cfg.get("tts.edge_tts.voice", "zh-CN-XiaoxiaoNeural")),
            rate=overrides.get("rate", cfg.get("tts.edge_tts.rate", "+0%")),
            volume=overrides.get("volume", cfg.get("tts.edge_tts.volume", "+0%")),
        )

    if provider == "volcengine":
        return VolcengineTTSProvider(
            app_id=overrides.get("app_id", cfg.get("tts.volcengine.app_id") or cfg.get("asr.volcengine.app_id", "")),
            access_token=overrides.get("access_token", cfg.get("tts.volcengine.access_token") or cfg.get("asr.volcengine.access_token", "")),
            cluster=overrides.get("cluster", cfg.get("tts.volcengine.cluster", "volcano_tts")),
            voice_type=overrides.get("voice_type", cfg.get("tts.volcengine.voice_type", "BV001_streaming")),
            speed_ratio=float(overrides.get("speed_ratio", cfg.get("tts.volcengine.speed_ratio", 1.0))),
            volume_ratio=float(overrides.get("volume_ratio", cfg.get("tts.volcengine.volume_ratio", 1.0))),
            pitch_ratio=float(overrides.get("pitch_ratio", cfg.get("tts.volcengine.pitch_ratio", 1.0))),
            encoding=overrides.get("encoding", cfg.get("tts.volcengine.encoding", "mp3")),
            sample_rate=int(overrides.get("sample_rate", cfg.get("tts.volcengine.sample_rate", 24000))),
        )

    if provider == "tencent":
        return TencentTTSProvider(
            secret_id=overrides.get("secret_id", cfg.get("tts.tencent.secret_id") or cfg.get("asr.tencent.secret_id", "")),
            secret_key=overrides.get("secret_key", cfg.get("tts.tencent.secret_key") or cfg.get("asr.tencent.secret_key", "")),
            # A blank or None app id (e.g. an unfilled config entry) must not
            # abort provider creation: int("") raises ValueError and
            # int(None) raises TypeError, so falsy values are coerced to 0.
            app_id=int(
                overrides.get("app_id", cfg.get("tts.tencent.app_id") or cfg.get("asr.tencent.app_id", 0))
                or 0
            ),
            voice_type=int(overrides.get("voice_type", cfg.get("tts.tencent.voice_type", 101001))),
            codec=overrides.get("codec", cfg.get("tts.tencent.codec", "pcm")),
            sample_rate=int(overrides.get("sample_rate", cfg.get("tts.tencent.sample_rate", 16000))),
            speed=float(overrides.get("speed", cfg.get("tts.tencent.speed", 0))),
            volume=float(overrides.get("volume", cfg.get("tts.tencent.volume", 0))),
        )

    raise ValueError(f"Unknown TTS provider: {provider}")
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Voice Activity Detection (VAD) using webrtcvad."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
import webrtcvad
|
|
8
|
+
|
|
9
|
+
from .. import config as cfg
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class VADDetector:
    """Frame-level voice activity detector backed by ``webrtcvad``.

    Feed it S16LE mono PCM (16 kHz by default) and it will emit events:

    * ``"speech_start"`` -- the user started speaking
    * ``"speech_end"``   -- the user stopped speaking (silence exceeded threshold)
    * ``None``           -- no state change

    webrtcvad requires frames of exactly 10, 20, or 30 ms.  Incoming audio
    of any length is buffered internally and split into 20 ms frames whose
    byte length is derived from the configured sample rate (640 bytes at
    16 kHz, 320 bytes at 8 kHz, and so on).
    """

    FRAME_DURATION_MS = 20
    # Frame size at the default 16 kHz rate.  Kept as the public constant and
    # used as the fallback; instances slice with a size computed from their
    # own sample rate.
    BYTES_PER_FRAME = 640  # 16000 * 2 * 20 / 1000
    # S16LE mono: one 16-bit sample per sampling instant.
    _BYTES_PER_SAMPLE = 2

    def __init__(
        self,
        mode: int = 3,
        sample_rate: int = 16000,
        silence_threshold_ms: int = 800,
        min_speech_ms: int = 250,
    ) -> None:
        """
        Parameters
        ----------
        mode:
            webrtcvad aggressiveness (0-3).  Higher = more aggressive filtering.
        sample_rate:
            Audio sample rate in Hz (must be 8000, 16000, 32000, or 48000).
        silence_threshold_ms:
            How many milliseconds of silence before emitting ``speech_end``.
        min_speech_ms:
            Minimum speech duration before we consider it real speech
            (avoids false triggers on short noise bursts).
        """
        self._vad = webrtcvad.Vad(mode)
        self._sample_rate = sample_rate
        self._silence_threshold_ms = silence_threshold_ms
        self._min_speech_ms = min_speech_ms

        # A 20 ms frame only has 640 bytes at 16 kHz.  Slicing with a fixed
        # 640 at any other rate hands webrtcvad frames of an invalid duration,
        # which it rejects -- and the detector would then report silence forever.
        self._bytes_per_frame = self._frame_size_bytes(sample_rate)

        # How many *consecutive* silent frames required to declare end-of-speech
        self._silence_frames_needed = silence_threshold_ms // self.FRAME_DURATION_MS
        # How many *consecutive* speech frames required to declare start-of-speech
        self._speech_frames_needed = max(1, min_speech_ms // self.FRAME_DURATION_MS)

        # Internal state
        self._in_speech = False
        self._speech_frame_count = 0
        self._silence_frame_count = 0
        self._buffer = bytearray()

    @classmethod
    def _frame_size_bytes(cls, sample_rate: int) -> int:
        """Return the byte length of one ``FRAME_DURATION_MS`` S16LE mono frame.

        Falls back to ``BYTES_PER_FRAME`` when *sample_rate* is not a usable
        positive number.
        """
        try:
            size = int(sample_rate) * cls._BYTES_PER_SAMPLE * cls.FRAME_DURATION_MS // 1000
        except (TypeError, ValueError):
            size = 0
        # Never return 0: feed() would loop forever on a zero-length frame.
        return size if size > 0 else cls.BYTES_PER_FRAME

    def feed(self, chunk: bytes) -> str | None:
        """Feed audio data and return an event string or ``None``.

        The chunk can be any size; it will be buffered internally and
        processed in 20 ms frames.  Only the *last* event produced by
        the batch is returned (to keep the API simple).  Bytes that do not
        fill a whole frame stay buffered for the next call.
        """
        self._buffer.extend(chunk)
        frame_len = self._bytes_per_frame
        last_event: str | None = None

        while len(self._buffer) >= frame_len:
            frame = bytes(self._buffer[:frame_len])
            # Trim in place instead of rebuilding the remainder every frame.
            del self._buffer[:frame_len]
            event = self._process_frame(frame)
            if event is not None:
                last_event = event

        return last_event

    def reset(self) -> None:
        """Reset detector state for a new utterance (also drops buffered audio)."""
        self._in_speech = False
        self._speech_frame_count = 0
        self._silence_frame_count = 0
        self._buffer = bytearray()

    # ------------------------------------------------------------------

    def _process_frame(self, frame: bytes) -> str | None:
        """Classify a single 20 ms frame and update the speech state machine.

        Returns ``"speech_start"`` or ``"speech_end"`` on a transition and
        ``None`` otherwise.
        """
        try:
            is_speech = self._vad.is_speech(frame, self._sample_rate)
        except Exception:
            # webrtcvad can raise on malformed frames; treat as silence, but
            # leave a trace so a misconfiguration is diagnosable.
            logger.debug("VAD: is_speech failed; treating frame as silence", exc_info=True)
            is_speech = False

        if is_speech:
            self._silence_frame_count = 0
            self._speech_frame_count += 1

            if not self._in_speech and self._speech_frame_count >= self._speech_frames_needed:
                self._in_speech = True
                logger.debug("VAD: speech_start (after %d frames)", self._speech_frame_count)
                return "speech_start"
        else:
            # Any silent frame breaks the run of consecutive speech frames.
            self._speech_frame_count = 0
            if self._in_speech:
                self._silence_frame_count += 1
                if self._silence_frame_count >= self._silence_frames_needed:
                    self._in_speech = False
                    self._silence_frame_count = 0
                    logger.debug("VAD: speech_end")
                    return "speech_end"

        return None

    @property
    def is_speaking(self) -> bool:
        """``True`` if the detector currently believes the user is speaking."""
        return self._in_speech
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def create_vad_detector() -> VADDetector:
    """Build a ``VADDetector`` from the ``vad.*`` and ``audio.sample_rate`` settings.

    Each setting falls back to the default passed to ``cfg.get`` below.
    """
    settings = {
        "mode": cfg.get("vad.mode", 3),
        "sample_rate": cfg.get("audio.sample_rate", 16000),
        "silence_threshold_ms": cfg.get("vad.silence_threshold_ms", 800),
        "min_speech_ms": cfg.get("vad.min_speech_ms", 250),
    }
    return VADDetector(**settings)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .store import * # noqa: F401,F403
|