xiaozhi-sdk 0.2.5__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {xiaozhi_sdk-0.2.5/xiaozhi_sdk.egg-info → xiaozhi_sdk-0.2.8}/PKG-INFO +15 -3
  2. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/README.md +14 -2
  3. xiaozhi_sdk-0.2.8/file/audio/test_16k.wav +0 -0
  4. xiaozhi_sdk-0.2.8/file/audio/test_24k.wav +0 -0
  5. xiaozhi_sdk-0.2.8/file/audio/test_48k.wav +0 -0
  6. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/pyproject.toml +1 -0
  7. xiaozhi_sdk-0.2.8/tests/test_wake_word.py +55 -0
  8. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/tests/test_xiaozhi.py +14 -10
  9. xiaozhi_sdk-0.2.8/tests/test_xiaozhi_opus.py +88 -0
  10. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/__init__.py +1 -1
  11. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/cli.py +87 -24
  12. xiaozhi_sdk-0.2.8/xiaozhi_sdk/config.py +7 -0
  13. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/core.py +32 -23
  14. xiaozhi_sdk-0.2.8/xiaozhi_sdk/opus.py +74 -0
  15. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/utils/__init__.py +3 -2
  16. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8/xiaozhi_sdk.egg-info}/PKG-INFO +15 -3
  17. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk.egg-info/SOURCES.txt +15 -8
  18. xiaozhi_sdk-0.2.5/tests/test_wake_word.py +0 -33
  19. xiaozhi_sdk-0.2.5/xiaozhi_sdk/config.py +0 -3
  20. xiaozhi_sdk-0.2.5/xiaozhi_sdk/opus.py +0 -61
  21. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/LICENSE +0 -0
  22. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/MANIFEST.in +0 -0
  23. /xiaozhi_sdk-0.2.5/file/audio/greet.wav → /xiaozhi_sdk-0.2.8/file/audio/16k_greet.wav +0 -0
  24. /xiaozhi_sdk-0.2.5/file/audio/play_music.wav → /xiaozhi_sdk-0.2.8/file/audio/16k_play_music.wav +0 -0
  25. /xiaozhi_sdk-0.2.5/file/audio/say_hello.wav → /xiaozhi_sdk-0.2.8/file/audio/16k_say_hello.wav +0 -0
  26. /xiaozhi_sdk-0.2.5/file/audio/take_photo.wav → /xiaozhi_sdk-0.2.8/file/audio/16k_take_photo.wav +0 -0
  27. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/file/image/leijun.jpg +0 -0
  28. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/file/opus/linux-arm64-libopus.so +0 -0
  29. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/file/opus/linux-x64-libopus.so +0 -0
  30. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/file/opus/macos-arm64-libopus.dylib +0 -0
  31. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/file/opus/macos-x64-libopus.dylib +0 -0
  32. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/file/opus/windows-opus.dll +0 -0
  33. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/setup.cfg +0 -0
  34. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/tests/test_iot.py +0 -0
  35. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/tests/test_pic.py +0 -0
  36. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/__main__.py +0 -0
  37. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/iot.py +0 -0
  38. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/mcp.py +0 -0
  39. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/utils/mcp_tool.py +0 -0
  40. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk/utils/tool_func.py +0 -0
  41. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk.egg-info/dependency_links.txt +0 -0
  42. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk.egg-info/requires.txt +0 -0
  43. {xiaozhi_sdk-0.2.5 → xiaozhi_sdk-0.2.8}/xiaozhi_sdk.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xiaozhi-sdk
3
- Version: 0.2.5
3
+ Version: 0.2.8
4
4
  Summary: 一个用于连接和控制小智智能设备的Python SDK,支持实时音频通信、MCP工具集成和设备管理功能。
5
5
  Author-email: dairoot <623815825@qq.com>
6
6
  License-Expression: MIT
@@ -43,7 +43,8 @@ Dynamic: license-file
43
43
  ## 📦 安装
44
44
 
45
45
  ```bash
46
- pip install xiaozhi-sdk
46
+ pip install uv
47
+ uv pip install xiaozhi-sdk -U
47
48
  ```
48
49
 
49
50
  ---
@@ -60,10 +61,21 @@ pip install xiaozhi-sdk
60
61
  python -m xiaozhi_sdk --help
61
62
  ```
62
63
 
63
- #### 连接设备(需要提供 MAC 地址)
64
+ #### 连接设备
64
65
 
65
66
  ```bash
67
+ # 默认本机 mac 地址
68
+ python -m xiaozhi_sdk
69
+
70
+ # 指定 mac 地址
66
71
  python -m xiaozhi_sdk 00:22:44:66:88:00
72
+
73
+ # 更多常用操作
74
+ ## --url 指定服务端 websocket 地址
75
+ ## --wake_word 指定唤醒词
76
+ python -m xiaozhi_sdk 00:22:44:66:88:00 \
77
+ --url ws://127.0.0.1:8180 \
78
+ --wake_word="你好啊"
67
79
  ```
68
80
 
69
81
  ### 2. 编程使用 (高阶用法)
@@ -16,7 +16,8 @@
16
16
  ## 📦 安装
17
17
 
18
18
  ```bash
19
- pip install xiaozhi-sdk
19
+ pip install uv
20
+ uv pip install xiaozhi-sdk -U
20
21
  ```
21
22
 
22
23
  ---
@@ -33,10 +34,21 @@ pip install xiaozhi-sdk
33
34
  python -m xiaozhi_sdk --help
34
35
  ```
35
36
 
36
- #### 连接设备(需要提供 MAC 地址)
37
+ #### 连接设备
37
38
 
38
39
  ```bash
40
+ # 默认本机 mac 地址
41
+ python -m xiaozhi_sdk
42
+
43
+ # 指定 mac 地址
39
44
  python -m xiaozhi_sdk 00:22:44:66:88:00
45
+
46
+ # 更多常用操作
47
+ ## --url 指定服务端 websocket 地址
48
+ ## --wake_word 指定唤醒词
49
+ python -m xiaozhi_sdk 00:22:44:66:88:00 \
50
+ --url ws://127.0.0.1:8180 \
51
+ --wake_word="你好啊"
40
52
  ```
41
53
 
42
54
  ### 2. 编程使用 (高阶用法)
@@ -43,6 +43,7 @@ include = ["xiaozhi_sdk*"]
43
43
  xiaozhi_sdk = ["../file/**/*"]
44
44
 
45
45
  [tool.uv]
46
+ index-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
46
47
  dev-dependencies = [
47
48
  "black>=24.8.0",
48
49
  "flake8>=5.0.4",
@@ -0,0 +1,55 @@
1
+ import asyncio
2
+ import os
3
+ import sys
4
+
5
+ import pytest
6
+
7
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
8
+
9
+ from xiaozhi_sdk import XiaoZhiWebsocket
10
+ from xiaozhi_sdk.utils import read_audio_file
11
+
12
+ sample_rate = 16000
13
+ frame_duration = 60
14
+
15
+ MAC_ADDR = "00:22:44:66:88:00"
16
+
17
+ URL = None
18
+ ota_url = None
19
+
20
+
21
+ async def test_main():
22
+ is_end = asyncio.Event()
23
+ async def message_handler_callback(message):
24
+ if message.get("state") == "stop":
25
+ is_end.set()
26
+ print("message received:", message)
27
+
28
+ xiaozhi = XiaoZhiWebsocket(
29
+ message_handler_callback, url=URL, ota_url=ota_url,
30
+ audio_sample_rate=sample_rate, audio_frame_duration=frame_duration)
31
+ await xiaozhi.init_connection(MAC_ADDR)
32
+
33
+ await xiaozhi.send_wake_word("你好")
34
+ await asyncio.sleep(5)
35
+
36
+ # await xiaozhi.send_wake_word("1+1")
37
+ # await asyncio.sleep(5)
38
+ #
39
+ # await xiaozhi.send_wake_word("你是什么大语言模型")
40
+ # await asyncio.sleep(5)
41
+
42
+ # say hellow
43
+ for pcm in read_audio_file("./file/audio/16k_say_hello.wav", sample_rate, frame_duration):
44
+ await xiaozhi.send_audio(pcm)
45
+ await xiaozhi.send_silence_audio()
46
+ await asyncio.sleep(5)
47
+
48
+ await xiaozhi.close()
49
+
50
+
51
+ if __name__ == "__main__":
52
+ asyncio.run(test_main())
53
+
54
+
55
+
@@ -11,11 +11,12 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
11
11
 
12
12
  from xiaozhi_sdk import XiaoZhiWebsocket
13
13
  from xiaozhi_sdk.utils import read_audio_file
14
-
14
+ sample_rate = 16000
15
+ frame_duration = 60
15
16
 
16
17
  async def assistant_audio_play(audio_queue, wait_time=5):
17
18
  # 创建一个持续播放的流
18
- stream = sd.OutputStream(samplerate=16000, channels=1, dtype=np.int16)
19
+ stream = sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.int16)
19
20
  stream.start()
20
21
  last_time = int(time.time())
21
22
  while True:
@@ -78,29 +79,32 @@ URL = None
78
79
  # URL = None
79
80
 
80
81
 
81
- @pytest.mark.asyncio
82
82
  async def test_main():
83
- xiaozhi = XiaoZhiWebsocket(message_handler_callback, url=URL, ota_url=ota_url)
83
+ xiaozhi = XiaoZhiWebsocket(message_handler_callback, url=URL, ota_url=ota_url, audio_sample_rate=sample_rate, audio_frame_duration=frame_duration)
84
84
 
85
85
  await xiaozhi.set_mcp_tool(mcp_tool_func())
86
86
  await xiaozhi.init_connection(MAC_ADDR)
87
87
 
88
88
  # # say hellow
89
- for pcm in read_audio_file("./file/audio/say_hello.wav"):
89
+ for pcm in read_audio_file("./file/audio/16k_say_hello.wav", sample_rate, frame_duration):
90
90
  await xiaozhi.send_audio(pcm)
91
91
  await xiaozhi.send_silence_audio()
92
92
  await assistant_audio_play(xiaozhi.output_audio_queue)
93
93
 
94
94
  # say take photo
95
- for pcm in read_audio_file("./file/audio/take_photo.wav"):
95
+ for pcm in read_audio_file("./file/audio/16k_take_photo.wav", sample_rate, frame_duration):
96
96
  await xiaozhi.send_audio(pcm)
97
97
  await xiaozhi.send_silence_audio()
98
98
  await assistant_audio_play(xiaozhi.output_audio_queue, 5)
99
99
 
100
100
  # play music
101
- # for pcm in read_audio_file("./file/audio/play_music.wav"):
102
- # await xiaozhi.send_audio(pcm)
103
- # await xiaozhi.send_silence_audio()
104
- # await assistant_audio_play(xiaozhi.output_audio_queue, 500)
101
+ for pcm in read_audio_file("./file/audio/16k_play_music.wav", sample_rate, frame_duration):
102
+ await xiaozhi.send_audio(pcm)
103
+ await xiaozhi.send_silence_audio()
104
+ await assistant_audio_play(xiaozhi.output_audio_queue, 500)
105
105
 
106
106
  await xiaozhi.close()
107
+
108
+
109
+ if __name__ == "__main__":
110
+ asyncio.run(test_main())
@@ -0,0 +1,88 @@
1
+ import asyncio
2
+ import os
3
+ import sys
4
+ import time
5
+
6
+ import numpy as np
7
+ import sounddevice as sd
8
+
9
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
10
+
11
+ from xiaozhi_sdk import XiaoZhiWebsocket
12
+ from xiaozhi_sdk.utils import read_audio_file
13
+
14
+ sample_rate = 48000
15
+ frame_duration = 60
16
+ MAC_ADDR = "00:22:44:66:88:00"
17
+
18
+
19
+ async def assistant_audio_play(audio_queue, wait_time=5):
20
+ # 创建一个持续播放的流
21
+ stream = sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.int16)
22
+ stream.start()
23
+ last_time = int(time.time())
24
+ while True:
25
+ if not audio_queue:
26
+ await asyncio.sleep(0.01)
27
+ if last_time and time.time() - last_time > wait_time:
28
+ break
29
+
30
+ continue
31
+
32
+ pcm_data = audio_queue.popleft()
33
+
34
+ # 将字节数据转换为 numpy int16 数组
35
+ audio_array = pcm_data
36
+
37
+ stream.write(audio_array)
38
+ last_time = time.time()
39
+
40
+ stream.stop()
41
+ stream.close()
42
+
43
+
44
+ async def message_handler_callback(message):
45
+ print("message received:", message)
46
+ if message["type"] == "music":
47
+ print("music:", message["text"])
48
+
49
+
50
+ async def test_main():
51
+ xiaozhi = XiaoZhiWebsocket(message_handler_callback, audio_sample_rate=sample_rate,
52
+ audio_frame_duration=frame_duration)
53
+
54
+ await xiaozhi.init_connection(MAC_ADDR)
55
+ current_dir = os.path.dirname(os.path.abspath(__file__))
56
+ test_audio_file = "../file/audio/test_16k.wav"
57
+
58
+ if sample_rate == 24000:
59
+ test_audio_file = "../file/audio/test_24k.wav"
60
+ elif sample_rate == 48000:
61
+ test_audio_file = "../file/audio/test_48k.wav"
62
+ wav_path = os.path.join(current_dir, test_audio_file)
63
+
64
+ for pcm in read_audio_file(wav_path, sample_rate, frame_duration):
65
+ await xiaozhi.send_audio(pcm)
66
+ await xiaozhi.send_silence_audio()
67
+
68
+ await assistant_audio_play(xiaozhi.output_audio_queue)
69
+
70
+ for pcm in read_audio_file(wav_path, sample_rate, frame_duration):
71
+ await xiaozhi.send_audio(pcm)
72
+ await xiaozhi.send_silence_audio()
73
+
74
+ await assistant_audio_play(xiaozhi.output_audio_queue)
75
+
76
+ for pcm in read_audio_file(wav_path, sample_rate, frame_duration):
77
+ await xiaozhi.send_audio(pcm)
78
+ await xiaozhi.send_silence_audio()
79
+
80
+ await assistant_audio_play(xiaozhi.output_audio_queue)
81
+
82
+ time.sleep(10)
83
+
84
+ await xiaozhi.close()
85
+
86
+
87
+ if __name__ == "__main__":
88
+ asyncio.run(test_main())
@@ -1,3 +1,3 @@
1
- __version__ = "0.2.5"
1
+ __version__ = "0.2.8"
2
2
 
3
3
  from xiaozhi_sdk.core import XiaoZhiWebsocket # noqa
@@ -2,7 +2,9 @@ import asyncio
2
2
  import io
3
3
  import logging
4
4
  import time
5
+ import uuid
5
6
  from collections import deque
7
+ from tkinter import NO
6
8
  from typing import Optional
7
9
 
8
10
  import click
@@ -12,7 +14,11 @@ import sounddevice as sd
12
14
  from PIL import ImageGrab
13
15
 
14
16
  from xiaozhi_sdk import XiaoZhiWebsocket
15
- from xiaozhi_sdk.config import INPUT_SERVER_AUDIO_SAMPLE_RATE
17
+ from xiaozhi_sdk.config import (
18
+ INPUT_AUDIO_CHANNELS,
19
+ INPUT_AUDIO_FRAME_DURATION,
20
+ INPUT_AUDIO_SAMPLE_RATE,
21
+ )
16
22
 
17
23
  # 定义自定义日志级别
18
24
  INFO1 = 21
@@ -50,7 +56,7 @@ logging.Logger.info3 = info3
50
56
  handler = colorlog.StreamHandler()
51
57
  handler.setFormatter(
52
58
  colorlog.ColoredFormatter(
53
- "%(log_color)s%(asctime)s - %(name)s - %(levelname)s - %(message)s",
59
+ "%(log_color)s%(asctime)s - %(name)s - %(levelname)-5s - %(message)s",
54
60
  datefmt="%Y-%m-%d %H:%M:%S",
55
61
  log_colors={
56
62
  "DEBUG": "white",
@@ -71,8 +77,10 @@ logger.setLevel(logging.DEBUG)
71
77
 
72
78
  # 全局状态
73
79
  input_audio_buffer: deque[bytes] = deque()
74
- is_playing_audio = False
80
+ device_stauts = "listen" # "speak" or "listen"
81
+
75
82
  is_end = False
83
+ human_speak_time = None
76
84
 
77
85
 
78
86
  def get_image_byte(data):
@@ -103,16 +111,26 @@ def get_image_byte(data):
103
111
 
104
112
 
105
113
  async def handle_message(message):
114
+ global device_stauts
115
+ global human_speak_time
116
+
106
117
  """处理接收到的消息"""
107
118
  global is_end
108
- if message["type"] == "stt": # 人类语音
119
+ if message["type"] == "tts" and message["state"] == "start": # start
120
+ pass
121
+
122
+ elif message["type"] == "stt": # 人类语音
123
+ human_speak_time = time.time()
109
124
  logger.info1("human: %s", message["text"])
125
+
110
126
  elif message["type"] == "tts" and message["state"] == "sentence_start": # AI语音
127
+ device_stauts = "speak" # 防止打断
111
128
  logger.info2("AI: %s", message["text"])
129
+
112
130
  elif message["type"] == "tts" and message["state"] == "stop":
113
- pass
131
+ device_stauts = "listen"
114
132
  # logger.info2("播放结束")
115
- # logger.info("聆听中...")
133
+ logger.info("聆听中...")
116
134
  elif message["type"] == "llm": # 表情
117
135
  logger.info3("emotion: %s", message["text"])
118
136
  else: # 其他消息
@@ -123,31 +141,42 @@ async def handle_message(message):
123
141
  is_end = True
124
142
 
125
143
 
126
- async def play_assistant_audio(audio_queue: deque[bytes], enable_audio):
144
+ async def play_assistant_audio(audio_queue: deque[bytes], enable_audio, audio_samplerate):
127
145
  """播放音频流"""
128
- global is_playing_audio
146
+ global device_stauts
147
+ global human_speak_time
129
148
 
130
149
  stream = None
131
150
  if enable_audio:
132
- stream = sd.OutputStream(samplerate=INPUT_SERVER_AUDIO_SAMPLE_RATE, channels=1, dtype=np.int16)
151
+ stream = sd.OutputStream(samplerate=audio_samplerate, channels=INPUT_AUDIO_CHANNELS, dtype=np.int16)
133
152
  stream.start()
153
+
134
154
  last_audio_time = None
135
155
 
136
156
  while True:
137
157
  if is_end:
138
158
  return
139
159
 
160
+ if device_stauts == "listen":
161
+ last_audio_time = None
162
+
140
163
  if not audio_queue:
164
+ # 空音频 超过 2s ,将device_stauts 设置为listen,代表聆听中
165
+ if device_stauts == "speak" and last_audio_time and time.time() - last_audio_time > 2:
166
+ device_stauts = "listen"
167
+
141
168
  await asyncio.sleep(0.01)
142
- if last_audio_time and time.time() - last_audio_time > 1:
143
- is_playing_audio = False
144
169
  continue
145
170
 
146
- is_playing_audio = True
171
+ last_audio_time = time.time()
172
+
173
+ if human_speak_time:
174
+ logger.debug("首个音频包响应时间:%s 秒", time.time() - human_speak_time)
175
+ human_speak_time = None
176
+
147
177
  pcm_data = audio_queue.popleft()
148
178
  if stream:
149
179
  stream.write(pcm_data)
150
- last_audio_time = time.time()
151
180
 
152
181
 
153
182
  class XiaoZhiClient:
@@ -165,10 +194,16 @@ class XiaoZhiClient:
165
194
  self.mac_address = ""
166
195
  self.wake_word = wake_word
167
196
 
168
- async def start(self, mac_address: str, serial_number: str, license_key: str, enable_audio):
197
+ async def start(self, mac_address: str, serial_number: str, license_key: str, enable_audio, audio_samplerate):
169
198
  """启动客户端连接"""
170
199
  self.mac_address = mac_address
171
- self.xiaozhi = XiaoZhiWebsocket(handle_message, url=self.url, ota_url=self.ota_url, wake_word=self.wake_word)
200
+ self.xiaozhi = XiaoZhiWebsocket(
201
+ handle_message,
202
+ url=self.url,
203
+ ota_url=self.ota_url,
204
+ wake_word=self.wake_word,
205
+ audio_sample_rate=audio_samplerate,
206
+ )
172
207
  from xiaozhi_sdk.utils.mcp_tool import take_photo
173
208
 
174
209
  take_photo["tool_func"] = get_image_byte
@@ -178,7 +213,7 @@ class XiaoZhiClient:
178
213
  self.mac_address, aec=False, serial_number=serial_number, license_key=license_key
179
214
  )
180
215
 
181
- asyncio.create_task(play_assistant_audio(self.xiaozhi.output_audio_queue, enable_audio))
216
+ asyncio.create_task(play_assistant_audio(self.xiaozhi.output_audio_queue, enable_audio, audio_samplerate))
182
217
 
183
218
  def audio_callback(self, indata, frames, time, status):
184
219
  """音频输入回调函数"""
@@ -193,28 +228,49 @@ class XiaoZhiClient:
193
228
  return
194
229
 
195
230
  if not input_audio_buffer:
196
- await asyncio.sleep(0.02)
231
+ await asyncio.sleep(0.01)
197
232
  continue
198
233
 
199
234
  pcm_data = input_audio_buffer.popleft()
200
- if not is_playing_audio:
235
+ if device_stauts == "listen":
236
+
201
237
  await self.xiaozhi.send_audio(pcm_data)
238
+ else:
239
+ input_audio_buffer.clear()
202
240
 
203
241
 
204
242
  async def run_client(
205
- mac_address: str, url: str, ota_url: str, serial_number: str, license_key: str, enable_audio: bool, wake_word: str
243
+ mac_address: str,
244
+ url: str,
245
+ ota_url: str,
246
+ serial_number: str,
247
+ license_key: str,
248
+ enable_audio: bool,
249
+ wake_word: str,
206
250
  ):
207
251
  """运行客户端的异步函数"""
208
252
  logger.debug("Recording... Press Ctrl+C to stop.")
209
253
  client = XiaoZhiClient(url, ota_url, wake_word)
210
- await client.start(mac_address, serial_number, license_key, enable_audio)
211
-
212
- with sd.InputStream(callback=client.audio_callback, channels=1, samplerate=16000, blocksize=960):
254
+ await client.start(mac_address, serial_number, license_key, enable_audio, INPUT_AUDIO_SAMPLE_RATE)
255
+ blocksize = INPUT_AUDIO_SAMPLE_RATE * INPUT_AUDIO_FRAME_DURATION // 1000
256
+ with sd.InputStream(
257
+ callback=client.audio_callback,
258
+ channels=INPUT_AUDIO_CHANNELS,
259
+ samplerate=INPUT_AUDIO_SAMPLE_RATE,
260
+ blocksize=blocksize,
261
+ ):
262
+ logger.info("聆听中...")
213
263
  await client.process_audio_input()
214
264
 
215
265
 
266
+ def get_mac_address():
267
+ mac = uuid.getnode()
268
+ mac_addr = ":".join(["%02x" % ((mac >> ele) & 0xFF) for ele in range(40, -8, -8)])
269
+ return mac_addr
270
+
271
+
216
272
  @click.command()
217
- @click.argument("mac_address")
273
+ @click.argument("mac_address", required=False)
218
274
  @click.option("--url", help="服务端websocket地址")
219
275
  @click.option("--ota_url", help="OTA地址")
220
276
  @click.option("--serial_number", default="", help="设备的序列号")
@@ -222,10 +278,17 @@ async def run_client(
222
278
  @click.option("--enable_audio", default=True, help="是否开启音频播放")
223
279
  @click.option("--wake_word", default="", help="唤醒词")
224
280
  def main(
225
- mac_address: str, url: str, ota_url: str, serial_number: str, license_key: str, enable_audio: bool, wake_word: str
281
+ mac_address: str,
282
+ url: str,
283
+ ota_url: str,
284
+ serial_number: str,
285
+ license_key: str,
286
+ enable_audio: bool,
287
+ wake_word: str,
226
288
  ):
227
289
  """小智SDK客户端
228
290
 
229
291
  MAC_ADDRESS: 设备的MAC地址 (格式: XX:XX:XX:XX:XX:XX)
230
292
  """
293
+ mac_address = mac_address or get_mac_address()
231
294
  asyncio.run(run_client(mac_address, url, ota_url, serial_number, license_key, enable_audio, wake_word))
@@ -0,0 +1,7 @@
1
+ XIAOZHI_SAMPLE_RATE = 16000 # 固定不变动
2
+
3
+ INPUT_AUDIO_SAMPLE_RATE = 24000
4
+ INPUT_AUDIO_CHANNELS = 1
5
+ INPUT_AUDIO_FRAME_DURATION = 60
6
+
7
+ OTA_URL = "https://api.tenclass.net/xiaozhi/ota"
@@ -1,7 +1,6 @@
1
1
  import asyncio
2
2
  import json
3
3
  import logging
4
- import os
5
4
  import re
6
5
  import uuid
7
6
  from collections import deque
@@ -9,10 +8,15 @@ from typing import Any, Callable, Deque, Dict, Optional
9
8
 
10
9
  import websockets
11
10
 
12
- from xiaozhi_sdk.config import INPUT_SERVER_AUDIO_SAMPLE_RATE
11
+ from xiaozhi_sdk.config import (
12
+ INPUT_AUDIO_CHANNELS,
13
+ INPUT_AUDIO_FRAME_DURATION,
14
+ INPUT_AUDIO_SAMPLE_RATE,
15
+ XIAOZHI_SAMPLE_RATE,
16
+ )
13
17
  from xiaozhi_sdk.iot import OtaDevice
14
18
  from xiaozhi_sdk.mcp import McpTool
15
- from xiaozhi_sdk.utils import get_wav_info, read_audio_file, setup_opus
19
+ from xiaozhi_sdk.utils import setup_opus
16
20
 
17
21
  setup_opus()
18
22
  from xiaozhi_sdk.opus import AudioOpus
@@ -27,15 +31,17 @@ class XiaoZhiWebsocket(McpTool):
27
31
  message_handler_callback: Optional[Callable] = None,
28
32
  url: Optional[str] = None,
29
33
  ota_url: Optional[str] = None,
30
- audio_sample_rate: int = 16000,
31
- audio_channels: int = 1,
34
+ audio_sample_rate: int = INPUT_AUDIO_SAMPLE_RATE,
35
+ audio_channels: int = INPUT_AUDIO_CHANNELS,
36
+ audio_frame_duration=INPUT_AUDIO_FRAME_DURATION,
32
37
  wake_word: str = "",
33
38
  ):
34
39
  super().__init__()
35
40
  self.url = url
36
41
  self.ota_url = ota_url
37
42
  self.audio_channels = audio_channels
38
- self.audio_opus = AudioOpus(audio_sample_rate, audio_channels)
43
+ self.audio_frame_duration = audio_frame_duration
44
+ self.audio_opus = AudioOpus(audio_sample_rate, audio_channels, audio_frame_duration)
39
45
  self.wake_word = wake_word
40
46
 
41
47
  # 客户端标识
@@ -70,13 +76,13 @@ class XiaoZhiWebsocket(McpTool):
70
76
  hello_message = {
71
77
  "type": "hello",
72
78
  "version": 1,
73
- "features": {"mcp": True, "aec": aec},
79
+ "features": {"mcp": True, "aec": aec, "consistent_sample_rate": False},
74
80
  "transport": "websocket",
75
81
  "audio_params": {
76
82
  "format": "opus",
77
- "sample_rate": 16000,
83
+ "sample_rate": XIAOZHI_SAMPLE_RATE,
78
84
  "channels": 1,
79
- "frame_duration": 60,
85
+ "frame_duration": self.audio_opus.input_frame_duration,
80
86
  },
81
87
  }
82
88
  await self.websocket.send(json.dumps(hello_message))
@@ -108,17 +114,17 @@ class XiaoZhiWebsocket(McpTool):
108
114
  break
109
115
  await asyncio.sleep(3)
110
116
 
111
- async def _send_demo_audio(self) -> None:
112
- """发送演示音频"""
113
- current_dir = os.path.dirname(os.path.abspath(__file__))
114
- wav_path = os.path.join(current_dir, "../file/audio/greet.wav")
115
- framerate, channels = get_wav_info(wav_path)
116
- audio_opus = AudioOpus(framerate, channels)
117
-
118
- for pcm_data in read_audio_file(wav_path):
119
- opus_data = await audio_opus.pcm_to_opus(pcm_data)
120
- await self.websocket.send(opus_data)
121
- await self.send_silence_audio()
117
+ # async def _send_demo_audio(self) -> None:
118
+ # """发送演示音频"""
119
+ # current_dir = os.path.dirname(os.path.abspath(__file__))
120
+ # wav_path = os.path.join(current_dir, "../file/audio/16k_greet.wav")
121
+ # framerate, channels = get_wav_info(wav_path)
122
+ # audio_opus = AudioOpus(framerate, channels, self.audio_frame_duration)
123
+ #
124
+ # for pcm_data in read_audio_file(wav_path, 16000, self.audio_frame_duration):
125
+ # opus_data = await audio_opus.pcm_to_opus(pcm_data)
126
+ # await self.websocket.send(opus_data)
127
+ # await self.send_silence_audio()
122
128
 
123
129
  async def send_wake_word(self, wake_word: str) -> bool:
124
130
  """发送唤醒词"""
@@ -137,8 +143,8 @@ class XiaoZhiWebsocket(McpTool):
137
143
 
138
144
  async def send_silence_audio(self, duration_seconds: float = 1.2) -> None:
139
145
  """发送静音音频"""
140
- frames_count = int(duration_seconds * 1000 / 60)
141
- pcm_frame = b"\x00\x00" * int(INPUT_SERVER_AUDIO_SAMPLE_RATE / 1000 * 60)
146
+ frames_count = int(duration_seconds * 1000 / self.audio_opus.input_frame_duration)
147
+ pcm_frame = b"\x00\x00" * int(self.audio_opus.input_sample_rate / 1000 * self.audio_opus.input_frame_duration)
142
148
 
143
149
  for _ in range(frames_count):
144
150
  await self.send_audio(pcm_frame)
@@ -159,6 +165,7 @@ class XiaoZhiWebsocket(McpTool):
159
165
  data = json.loads(message)
160
166
  message_type = data["type"]
161
167
  if message_type == "hello":
168
+ self.audio_opus.set_out_audio_frame(data["audio_params"])
162
169
  self.hello_received.set()
163
170
  self.session_id = data["session_id"]
164
171
  return
@@ -219,7 +226,7 @@ class XiaoZhiWebsocket(McpTool):
219
226
 
220
227
  await self._send_hello(self.aec)
221
228
  await self._start_listen()
222
- logger.debug("[websocket] Connection successful")
229
+ logger.debug("[websocket] Connection successful. mac_addr: %s", self.mac_addr)
223
230
  await asyncio.sleep(0.5)
224
231
 
225
232
  async def init_connection(
@@ -250,7 +257,9 @@ class XiaoZhiWebsocket(McpTool):
250
257
 
251
258
  if not await self.is_activate(ota_info):
252
259
  self.iot_task = asyncio.create_task(self._activate_iot_device(license_key, ota_info))
260
+ await self.send_wake_word("hi")
253
261
  logger.debug("[IOT] 设备未激活")
262
+ return
254
263
 
255
264
  if self.wake_word:
256
265
  await self.send_wake_word(self.wake_word)
@@ -0,0 +1,74 @@
1
+ import av
2
+ import numpy as np
3
+ import opuslib
4
+
5
+ from xiaozhi_sdk.config import XIAOZHI_SAMPLE_RATE
6
+
7
+
8
+ class AudioOpus:
9
+
10
+ def __init__(self, sample_rate, channels, frame_duration):
11
+ self.input_frame_duration = frame_duration
12
+ self.input_sample_rate = sample_rate
13
+ self.input_channels = channels
14
+ self.input_frame_size = self.input_sample_rate * self.input_frame_duration // 1000
15
+
16
+ # 创建 Opus 编码器
17
+ self.opus_encoder_16k = opuslib.Encoder(
18
+ fs=XIAOZHI_SAMPLE_RATE, channels=1, application=opuslib.APPLICATION_VOIP
19
+ )
20
+
21
+ self.resampler = av.AudioResampler(format="s16", layout="mono", rate=sample_rate)
22
+ self.resampler_16k = av.AudioResampler(format="s16", layout="mono", rate=16000)
23
+
24
+ def set_out_audio_frame(self, audio_params):
25
+ # 小智服务端 的 音频信息
26
+ self.out_sample_rate = audio_params["sample_rate"]
27
+ self.out_frame_size = self.out_sample_rate * audio_params["frame_duration"] // 1000
28
+
29
+ # 创建 Opus 解码器
30
+ self.opus_decoder = opuslib.Decoder(
31
+ fs=self.out_sample_rate, # 采样率
32
+ channels=audio_params["channels"], # 单声道
33
+ )
34
+
35
+ def to_16k_samplerate_pcm(self, pcm_array):
36
+ layout = "mono" if self.input_channels == 1 else "stereo"
37
+ frame = av.AudioFrame.from_ndarray(pcm_array.reshape(1, -1), format="s16", layout=layout)
38
+ frame.sample_rate = self.input_sample_rate
39
+ resampled_frames = self.resampler_16k.resample(frame)
40
+ samples = resampled_frames[0].to_ndarray().flatten()
41
+ return samples
42
+
43
+ async def pcm_to_opus(self, pcm):
44
+ pcm_array = np.frombuffer(pcm, dtype=np.int16)
45
+ pcm_bytes = pcm_array.tobytes()
46
+ if self.input_sample_rate != XIAOZHI_SAMPLE_RATE:
47
+ # 小智服务端仅支持 16000 采样率, 将 pcm_array 转 16k 采样率
48
+ pcm_array = self.to_16k_samplerate_pcm(pcm_array)
49
+ pcm_bytes = pcm_array.tobytes()
50
+
51
+ frame_size = XIAOZHI_SAMPLE_RATE * self.input_frame_duration // 1000
52
+ return self.opus_encoder_16k.encode(pcm_bytes, frame_size)
53
+
54
+ async def change_sample_rate(self, pcm_array) -> np.ndarray:
55
+ # 采样率 变更
56
+ frame = av.AudioFrame.from_ndarray(np.array(pcm_array).reshape(1, -1), format="s16", layout="mono")
57
+ frame.sample_rate = self.out_sample_rate
58
+ resampled_frames = self.resampler.resample(frame)
59
+ samples = resampled_frames[0].to_ndarray().flatten()
60
+ return samples
61
+
62
+ def padding(self, samples):
63
+ # 不足 self.frame_size 补 0
64
+ samples_padded = np.pad(samples, (0, self.input_frame_size - samples.size), mode="constant", constant_values=0)
65
+ return samples_padded.reshape(1, self.input_frame_size)
66
+
67
+ async def opus_to_pcm(self, opus) -> np.ndarray:
68
+ pcm_data = self.opus_decoder.decode(opus, frame_size=self.out_frame_size)
69
+ pcm_array = np.frombuffer(pcm_data, dtype=np.int16)
70
+ if self.input_sample_rate != self.out_sample_rate:
71
+ pcm_array = await self.change_sample_rate(pcm_array)
72
+
73
+ pcm_array = self.padding(pcm_array)
74
+ return pcm_array
@@ -9,7 +9,7 @@ def get_wav_info(file_path):
9
9
  return wav_file.getframerate(), wav_file.getnchannels()
10
10
 
11
11
 
12
- def read_audio_file(file_path):
12
+ def read_audio_file(file_path, sample_rate, frame_duration):
13
13
  """
14
14
  读取音频文件并通过yield返回PCM流
15
15
 
@@ -19,9 +19,10 @@ def read_audio_file(file_path):
19
19
  Yields:
20
20
  bytes: PCM音频数据块
21
21
  """
22
+ frame_size = sample_rate * frame_duration // 1000
22
23
  with wave.open(file_path, "rb") as wav_file:
23
24
  while True:
24
- pcm = wav_file.readframes(960) # 每次读取960帧(60ms的音频数据)
25
+ pcm = wav_file.readframes(frame_size)
25
26
  if not pcm:
26
27
  break
27
28
  yield pcm
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xiaozhi-sdk
3
- Version: 0.2.5
3
+ Version: 0.2.8
4
4
  Summary: 一个用于连接和控制小智智能设备的Python SDK,支持实时音频通信、MCP工具集成和设备管理功能。
5
5
  Author-email: dairoot <623815825@qq.com>
6
6
  License-Expression: MIT
@@ -43,7 +43,8 @@ Dynamic: license-file
43
43
  ## 📦 安装
44
44
 
45
45
  ```bash
46
- pip install xiaozhi-sdk
46
+ pip install uv
47
+ uv pip install xiaozhi-sdk -U
47
48
  ```
48
49
 
49
50
  ---
@@ -60,10 +61,21 @@ pip install xiaozhi-sdk
60
61
  python -m xiaozhi_sdk --help
61
62
  ```
62
63
 
63
- #### 连接设备(需要提供 MAC 地址)
64
+ #### 连接设备
64
65
 
65
66
  ```bash
67
+ # 默认本机 mac 地址
68
+ python -m xiaozhi_sdk
69
+
70
+ # 指定 mac 地址
66
71
  python -m xiaozhi_sdk 00:22:44:66:88:00
72
+
73
+ # 更多常用操作
74
+ ## --url 指定服务端 websocket 地址
75
+ ## --wake_word 指定唤醒词
76
+ python -m xiaozhi_sdk 00:22:44:66:88:00 \
77
+ --url ws://127.0.0.1:8180 \
78
+ --wake_word="你好啊"
67
79
  ```
68
80
 
69
81
  ### 2. 编程使用 (高阶用法)
@@ -2,10 +2,13 @@ LICENSE
2
2
  MANIFEST.in
3
3
  README.md
4
4
  pyproject.toml
5
- file/audio/greet.wav
6
- file/audio/play_music.wav
7
- file/audio/say_hello.wav
8
- file/audio/take_photo.wav
5
+ file/audio/16k_greet.wav
6
+ file/audio/16k_play_music.wav
7
+ file/audio/16k_say_hello.wav
8
+ file/audio/16k_take_photo.wav
9
+ file/audio/test_16k.wav
10
+ file/audio/test_24k.wav
11
+ file/audio/test_48k.wav
9
12
  file/image/leijun.jpg
10
13
  file/opus/linux-arm64-libopus.so
11
14
  file/opus/linux-x64-libopus.so
@@ -16,6 +19,7 @@ tests/test_iot.py
16
19
  tests/test_pic.py
17
20
  tests/test_wake_word.py
18
21
  tests/test_xiaozhi.py
22
+ tests/test_xiaozhi_opus.py
19
23
  xiaozhi_sdk/__init__.py
20
24
  xiaozhi_sdk/__main__.py
21
25
  xiaozhi_sdk/cli.py
@@ -29,10 +33,13 @@ xiaozhi_sdk.egg-info/SOURCES.txt
29
33
  xiaozhi_sdk.egg-info/dependency_links.txt
30
34
  xiaozhi_sdk.egg-info/requires.txt
31
35
  xiaozhi_sdk.egg-info/top_level.txt
32
- xiaozhi_sdk/../file/audio/greet.wav
33
- xiaozhi_sdk/../file/audio/play_music.wav
34
- xiaozhi_sdk/../file/audio/say_hello.wav
35
- xiaozhi_sdk/../file/audio/take_photo.wav
36
+ xiaozhi_sdk/../file/audio/16k_greet.wav
37
+ xiaozhi_sdk/../file/audio/16k_play_music.wav
38
+ xiaozhi_sdk/../file/audio/16k_say_hello.wav
39
+ xiaozhi_sdk/../file/audio/16k_take_photo.wav
40
+ xiaozhi_sdk/../file/audio/test_16k.wav
41
+ xiaozhi_sdk/../file/audio/test_24k.wav
42
+ xiaozhi_sdk/../file/audio/test_48k.wav
36
43
  xiaozhi_sdk/../file/image/leijun.jpg
37
44
  xiaozhi_sdk/../file/opus/linux-arm64-libopus.so
38
45
  xiaozhi_sdk/../file/opus/linux-x64-libopus.so
@@ -1,33 +0,0 @@
1
- import asyncio
2
- import os
3
- import sys
4
-
5
- import pytest
6
-
7
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
8
-
9
- from xiaozhi_sdk import XiaoZhiWebsocket
10
-
11
-
12
- MAC_ADDR = "00:22:44:66:88:00"
13
- ota_url = None
14
- URL = None
15
-
16
-
17
- @pytest.mark.asyncio
18
- async def test_main():
19
- is_end = asyncio.Event()
20
- async def message_handler_callback(message):
21
- if message.get("state") == "stop":
22
- is_end.set()
23
- print("message received:", message)
24
-
25
- xiaozhi = XiaoZhiWebsocket(message_handler_callback, url=URL, ota_url=ota_url)
26
- await xiaozhi.init_connection(MAC_ADDR)
27
-
28
- await xiaozhi.send_wake_word("退下,拜拜不聊了")
29
- await asyncio.wait_for(is_end.wait(), timeout=20.0)
30
- await xiaozhi.send_wake_word("你好")
31
-
32
- await asyncio.wait_for(is_end.wait(), timeout=20.0)
33
- await xiaozhi.close()
@@ -1,3 +0,0 @@
1
- INPUT_SERVER_AUDIO_SAMPLE_RATE = 16000
2
-
3
- OTA_URL = "https://api.tenclass.net/xiaozhi/ota"
@@ -1,61 +0,0 @@
1
- import math
2
-
3
- import av
4
- import numpy as np
5
- import opuslib
6
-
7
- from xiaozhi_sdk.config import INPUT_SERVER_AUDIO_SAMPLE_RATE
8
-
9
-
10
- class AudioOpus:
11
-
12
- def __init__(self, sample_rate, channels):
13
- self.sample_rate = sample_rate
14
- self.channels = channels
15
-
16
- # 创建 Opus 编码器
17
- self.opus_encoder = opuslib.Encoder(
18
- fs=sample_rate, channels=channels, application=opuslib.APPLICATION_VOIP # 采样率 # 单声道 # 语音应用
19
- )
20
-
21
- # 创建 Opus 解码器
22
- self.opus_decoder = opuslib.Decoder(
23
- fs=INPUT_SERVER_AUDIO_SAMPLE_RATE, # 采样率
24
- channels=1, # 单声道
25
- )
26
-
27
- self.resampler = av.AudioResampler(format="s16", layout="mono", rate=sample_rate)
28
-
29
- async def pcm_to_opus(self, pcm):
30
- pcm_array = np.frombuffer(pcm, dtype=np.int16)
31
- pcm_bytes = pcm_array.tobytes()
32
- return self.opus_encoder.encode(pcm_bytes, 960)
33
-
34
- @staticmethod
35
- def to_n_960(samples) -> np.ndarray:
36
- n = math.ceil(samples.shape[0] / 960)
37
- arr_padded = np.pad(samples, (0, 960 * n - samples.shape[0]), mode="constant", constant_values=0)
38
- return arr_padded.reshape(n, 960)
39
-
40
- async def change_sample_rate(self, pcm_array) -> np.ndarray:
41
- if self.sample_rate == INPUT_SERVER_AUDIO_SAMPLE_RATE:
42
- return self.to_n_960(pcm_array)
43
-
44
- frame = av.AudioFrame.from_ndarray(np.array(pcm_array).reshape(1, -1), format="s16", layout="mono")
45
- frame.sample_rate = INPUT_SERVER_AUDIO_SAMPLE_RATE # Assuming input is 16kHz
46
- resampled_frames = self.resampler.resample(frame)
47
- samples = resampled_frames[0].to_ndarray().flatten()
48
- new_frame = av.AudioFrame.from_ndarray(
49
- samples.reshape(1, -1),
50
- format="s16",
51
- layout="mono",
52
- )
53
- new_frame.sample_rate = self.sample_rate
54
- new_samples = new_frame.to_ndarray().flatten()
55
- return self.to_n_960(new_samples)
56
-
57
- async def opus_to_pcm(self, opus) -> np.ndarray:
58
- pcm_data = self.opus_decoder.decode(opus, 960)
59
- pcm_array = np.frombuffer(pcm_data, dtype=np.int16)
60
- samples = await self.change_sample_rate(pcm_array)
61
- return samples
File without changes
File without changes
File without changes