PyPI - xiaozhi-sdk - Versions diffs - 0.2.4__tar.gz → 0.2.7__tar.gz - Mend

xiaozhi-sdk 0.2.4 (tar.gz) → 0.2.7 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of xiaozhi-sdk might be problematic. Click here for more details.

Files changed (43)

{xiaozhi_sdk-0.2.4/xiaozhi_sdk.egg-info → xiaozhi_sdk-0.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xiaozhi-sdk
-Version: 0.2.4
+Version: 0.2.7
 Summary: 一个用于连接和控制小智智能设备的Python SDK，支持实时音频通信、MCP工具集成和设备管理功能。
 Author-email: dairoot <623815825@qq.com>
 License-Expression: MIT
@@ -43,7 +43,8 @@ Dynamic: license-file
 ## 📦 安装
 ```bash
-pip install xiaozhi-sdk
+pip install uv
+uv pip install xiaozhi-sdk -U
 ```
 ---
@@ -60,10 +61,21 @@ pip install xiaozhi-sdk
 python -m xiaozhi_sdk --help
 ```
-#### 连接设备（需要提供 MAC 地址）
+#### 连接设备
 ```bash
+# 默认本机 mac 地址
+python -m xiaozhi_sdk
+# 指定 mac 地址
 python -m xiaozhi_sdk 00:22:44:66:88:00
+# 更多常用操作
+## --url 指定服务端 websocket 地址
+## --wake_word 指定唤醒词
+python -m xiaozhi_sdk 00:22:44:66:88:00 \
+  --url ws://127.0.0.1:8180 \
+  --wake_word="你好啊"
 ```
 ### 2. 编程使用 (高阶用法)

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/README.md RENAMED Viewed

@@ -16,7 +16,8 @@
 ## 📦 安装
 ```bash
-pip install xiaozhi-sdk
+pip install uv
+uv pip install xiaozhi-sdk -U
 ```
 ---
@@ -33,10 +34,21 @@ pip install xiaozhi-sdk
 python -m xiaozhi_sdk --help
 ```
-#### 连接设备（需要提供 MAC 地址）
+#### 连接设备
 ```bash
+# 默认本机 mac 地址
+python -m xiaozhi_sdk
+# 指定 mac 地址
 python -m xiaozhi_sdk 00:22:44:66:88:00
+# 更多常用操作
+## --url 指定服务端 websocket 地址
+## --wake_word 指定唤醒词
+python -m xiaozhi_sdk 00:22:44:66:88:00 \
+  --url ws://127.0.0.1:8180 \
+  --wake_word="你好啊"
 ```
 ### 2. 编程使用 (高阶用法)

xiaozhi_sdk-0.2.7/file/audio/test_16k.wav ADDED Viewed

Binary file

xiaozhi_sdk-0.2.7/file/audio/test_24k.wav ADDED Viewed

Binary file

xiaozhi_sdk-0.2.7/file/audio/test_48k.wav ADDED Viewed

Binary file

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/pyproject.toml RENAMED Viewed

@@ -43,6 +43,7 @@ include = ["xiaozhi_sdk*"]
 xiaozhi_sdk = ["../file/**/*"]
 [tool.uv]
+index-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
 dev-dependencies = [
     "black>=24.8.0",
     "flake8>=5.0.4",
@@ -65,3 +66,5 @@ omit = [
     "xiaozhi_sdk/cli.py",
     "tests/*",
 ]

xiaozhi_sdk-0.2.7/tests/test_wake_word.py ADDED Viewed

@@ -0,0 +1,55 @@
+import asyncio
+import os
+import sys
+import pytest
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from xiaozhi_sdk import XiaoZhiWebsocket
+from xiaozhi_sdk.utils import read_audio_file
+sample_rate = 16000
+frame_duration = 60
+MAC_ADDR = "00:22:44:66:88:00"
+URL = None
+ota_url = None
+async def test_main():
+    is_end = asyncio.Event()
+    async def message_handler_callback(message):
+        if message.get("state") == "stop":
+            is_end.set()
+        print("message received:", message)
+    xiaozhi = XiaoZhiWebsocket(
+        message_handler_callback, url=URL, ota_url=ota_url,
+        audio_sample_rate=sample_rate, audio_frame_duration=frame_duration)
+    await xiaozhi.init_connection(MAC_ADDR)
+    await xiaozhi.send_wake_word("你好")
+    await asyncio.sleep(5)
+    # await xiaozhi.send_wake_word("1+1")
+    # await asyncio.sleep(5)
+    #
+    # await xiaozhi.send_wake_word("你是什么大语言模型")
+    # await asyncio.sleep(5)
+    # say hellow
+    for pcm in read_audio_file("./file/audio/16k_say_hello.wav", sample_rate, frame_duration):
+        await xiaozhi.send_audio(pcm)
+    await xiaozhi.send_silence_audio()
+    await asyncio.sleep(5)
+    await xiaozhi.close()
+if __name__ == "__main__":
+    asyncio.run(test_main())

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/tests/test_xiaozhi.py RENAMED Viewed

@@ -11,11 +11,12 @@ sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 from xiaozhi_sdk import XiaoZhiWebsocket
 from xiaozhi_sdk.utils import read_audio_file
+sample_rate = 16000
+frame_duration = 60
 async def assistant_audio_play(audio_queue, wait_time=5):
     # 创建一个持续播放的流
-    stream = sd.OutputStream(samplerate=16000, channels=1, dtype=np.int16)
+    stream = sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.int16)
     stream.start()
     last_time = int(time.time())
     while True:
@@ -78,29 +79,32 @@ URL = None
 # URL = None
-@pytest.mark.asyncio
 async def test_main():
-    xiaozhi = XiaoZhiWebsocket(message_handler_callback, url=URL, ota_url=ota_url)
+    xiaozhi = XiaoZhiWebsocket(message_handler_callback, url=URL, ota_url=ota_url, audio_sample_rate=sample_rate, audio_frame_duration=frame_duration)
     await xiaozhi.set_mcp_tool(mcp_tool_func())
     await xiaozhi.init_connection(MAC_ADDR)
     # # say hellow
-    for pcm in read_audio_file("./file/audio/say_hello.wav"):
+    for pcm in read_audio_file("./file/audio/16k_say_hello.wav", sample_rate, frame_duration):
         await xiaozhi.send_audio(pcm)
     await xiaozhi.send_silence_audio()
     await assistant_audio_play(xiaozhi.output_audio_queue)
     # say take photo
-    for pcm in read_audio_file("./file/audio/take_photo.wav"):
+    for pcm in read_audio_file("./file/audio/16k_take_photo.wav", sample_rate, frame_duration):
         await xiaozhi.send_audio(pcm)
     await xiaozhi.send_silence_audio()
     await assistant_audio_play(xiaozhi.output_audio_queue, 5)
     # play music
-    # for pcm in read_audio_file("./file/audio/play_music.wav"):
-    #     await xiaozhi.send_audio(pcm)
-    # await xiaozhi.send_silence_audio()
-    # await assistant_audio_play(xiaozhi.output_audio_queue, 500)
+    for pcm in read_audio_file("./file/audio/16k_play_music.wav", sample_rate, frame_duration):
+        await xiaozhi.send_audio(pcm)
+    await xiaozhi.send_silence_audio()
+    await assistant_audio_play(xiaozhi.output_audio_queue, 500)
     await xiaozhi.close()
+if __name__ == "__main__":
+    asyncio.run(test_main())

xiaozhi_sdk-0.2.7/tests/test_xiaozhi_opus.py ADDED Viewed

@@ -0,0 +1,88 @@
+import asyncio
+import os
+import sys
+import time
+import numpy as np
+import sounddevice as sd
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+from xiaozhi_sdk import XiaoZhiWebsocket
+from xiaozhi_sdk.utils import read_audio_file
+sample_rate = 48000
+frame_duration = 60
+MAC_ADDR = "00:22:44:66:88:00"
+async def assistant_audio_play(audio_queue, wait_time=5):
+    # 创建一个持续播放的流
+    stream = sd.OutputStream(samplerate=sample_rate, channels=1, dtype=np.int16)
+    stream.start()
+    last_time = int(time.time())
+    while True:
+        if not audio_queue:
+            await asyncio.sleep(0.01)
+            if last_time and time.time() - last_time > wait_time:
+                break
+            continue
+        pcm_data = audio_queue.popleft()
+        # 将字节数据转换为 numpy int16 数组
+        audio_array = pcm_data
+        stream.write(audio_array)
+        last_time = time.time()
+    stream.stop()
+    stream.close()
+async def message_handler_callback(message):
+    print("message received:", message)
+    if message["type"] == "music":
+        print("music:", message["text"])
+async def test_main():
+    xiaozhi = XiaoZhiWebsocket(message_handler_callback, audio_sample_rate=sample_rate,
+                               audio_frame_duration=frame_duration)
+    await xiaozhi.init_connection(MAC_ADDR)
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    test_audio_file = "../file/audio/test_16k.wav"
+    if sample_rate == 24000:
+        test_audio_file = "../file/audio/test_24k.wav"
+    elif sample_rate == 48000:
+        test_audio_file = "../file/audio/test_48k.wav"
+    wav_path = os.path.join(current_dir, test_audio_file)
+    for pcm in read_audio_file(wav_path, sample_rate, frame_duration):
+        await xiaozhi.send_audio(pcm)
+    await xiaozhi.send_silence_audio()
+    await assistant_audio_play(xiaozhi.output_audio_queue)
+    for pcm in read_audio_file(wav_path, sample_rate, frame_duration):
+        await xiaozhi.send_audio(pcm)
+    await xiaozhi.send_silence_audio()
+    await assistant_audio_play(xiaozhi.output_audio_queue)
+    for pcm in read_audio_file(wav_path, sample_rate, frame_duration):
+        await xiaozhi.send_audio(pcm)
+    await xiaozhi.send_silence_audio()
+    await assistant_audio_play(xiaozhi.output_audio_queue)
+    time.sleep(10)
+    await xiaozhi.close()
+if __name__ == "__main__":
+    asyncio.run(test_main())

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/xiaozhi_sdk/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
-__version__ = "0.2.4"
+__version__ = "0.2.7"
 from xiaozhi_sdk.core import XiaoZhiWebsocket  # noqa

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/xiaozhi_sdk/cli.py RENAMED Viewed

@@ -2,6 +2,7 @@ import asyncio
 import io
 import logging
 import time
+import uuid
 from collections import deque
 from typing import Optional
@@ -12,7 +13,11 @@ import sounddevice as sd
 from PIL import ImageGrab
 from xiaozhi_sdk import XiaoZhiWebsocket
-from xiaozhi_sdk.config import INPUT_SERVER_AUDIO_SAMPLE_RATE
+from xiaozhi_sdk.config import (
+    INPUT_AUDIO_CHANNELS,
+    INPUT_AUDIO_FRAME_DURATION,
+    INPUT_AUDIO_SAMPLE_RATE,
+)
 # 定义自定义日志级别
 INFO1 = 21
@@ -50,7 +55,7 @@ logging.Logger.info3 = info3
 handler = colorlog.StreamHandler()
 handler.setFormatter(
     colorlog.ColoredFormatter(
-        "%(log_color)s%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        "%(log_color)s%(asctime)s - %(name)s - %(levelname)-5s - %(message)s",
         datefmt="%Y-%m-%d %H:%M:%S",
         log_colors={
             "DEBUG": "white",
@@ -73,6 +78,7 @@ logger.setLevel(logging.DEBUG)
 input_audio_buffer: deque[bytes] = deque()
 is_playing_audio = False
 is_end = False
+human_speak_time = None
 def get_image_byte(data):
@@ -103,16 +109,26 @@ def get_image_byte(data):
 async def handle_message(message):
+    global is_playing_audio
+    global human_speak_time
     """处理接收到的消息"""
     global is_end
-    if message["type"] == "stt":  # 人类语音
+    if message["type"] == "tts" and message["state"] == "start":  # start
+        pass
+    elif message["type"] == "stt":  # 人类语音
+        human_speak_time = time.time()
         logger.info1("human: %s", message["text"])
     elif message["type"] == "tts" and message["state"] == "sentence_start":  # AI语音
+        is_playing_audio = True  # 防止打断
         logger.info2("AI: %s", message["text"])
     elif message["type"] == "tts" and message["state"] == "stop":
-        pass
+        is_playing_audio = False
         # logger.info2("播放结束")
-        # logger.info("聆听中...")
+        logger.info("聆听中...")
     elif message["type"] == "llm":  # 表情
         logger.info3("emotion: %s", message["text"])
     else:  # 其他消息
@@ -123,13 +139,14 @@ async def handle_message(message):
         is_end = True
-async def play_assistant_audio(audio_queue: deque[bytes], enable_audio):
+async def play_assistant_audio(audio_queue: deque[bytes], enable_audio, audio_samplerate):
     """播放音频流"""
     global is_playing_audio
+    global human_speak_time
     stream = None
     if enable_audio:
-        stream = sd.OutputStream(samplerate=INPUT_SERVER_AUDIO_SAMPLE_RATE, channels=1, dtype=np.int16)
+        stream = sd.OutputStream(samplerate=audio_samplerate, channels=INPUT_AUDIO_CHANNELS, dtype=np.int16)
         stream.start()
     last_audio_time = None
@@ -138,12 +155,17 @@ async def play_assistant_audio(audio_queue: deque[bytes], enable_audio):
             return
         if not audio_queue:
-            await asyncio.sleep(0.01)
-            if last_audio_time and time.time() - last_audio_time > 1:
+            if last_audio_time and time.time() - last_audio_time > 2:
+                last_audio_time = time.time()
                 is_playing_audio = False
+            await asyncio.sleep(0.01)
             continue
-        is_playing_audio = True
+        if human_speak_time:
+            logger.debug("首个音频包响应时间：%s 秒", time.time() - human_speak_time)
+            human_speak_time = None
         pcm_data = audio_queue.popleft()
         if stream:
             stream.write(pcm_data)
@@ -165,10 +187,16 @@ class XiaoZhiClient:
         self.mac_address = ""
         self.wake_word = wake_word
-    async def start(self, mac_address: str, serial_number: str, license_key: str, enable_audio):
+    async def start(self, mac_address: str, serial_number: str, license_key: str, enable_audio, audio_samplerate):
         """启动客户端连接"""
         self.mac_address = mac_address
-        self.xiaozhi = XiaoZhiWebsocket(handle_message, url=self.url, ota_url=self.ota_url, wake_word=self.wake_word)
+        self.xiaozhi = XiaoZhiWebsocket(
+            handle_message,
+            url=self.url,
+            ota_url=self.ota_url,
+            wake_word=self.wake_word,
+            audio_sample_rate=audio_samplerate,
+        )
         from xiaozhi_sdk.utils.mcp_tool import take_photo
         take_photo["tool_func"] = get_image_byte
@@ -178,7 +206,7 @@ class XiaoZhiClient:
             self.mac_address, aec=False, serial_number=serial_number, license_key=license_key
         )
-        asyncio.create_task(play_assistant_audio(self.xiaozhi.output_audio_queue, enable_audio))
+        asyncio.create_task(play_assistant_audio(self.xiaozhi.output_audio_queue, enable_audio, audio_samplerate))
     def audio_callback(self, indata, frames, time, status):
         """音频输入回调函数"""
@@ -193,28 +221,49 @@ class XiaoZhiClient:
                 return
             if not input_audio_buffer:
-                await asyncio.sleep(0.02)
+                await asyncio.sleep(0.01)
                 continue
             pcm_data = input_audio_buffer.popleft()
             if not is_playing_audio:
                 await self.xiaozhi.send_audio(pcm_data)
+            else:
+                input_audio_buffer.clear()
 async def run_client(
-    mac_address: str, url: str, ota_url: str, serial_number: str, license_key: str, enable_audio: bool, wake_word: str
+    mac_address: str,
+    url: str,
+    ota_url: str,
+    serial_number: str,
+    license_key: str,
+    enable_audio: bool,
+    wake_word: str,
 ):
     """运行客户端的异步函数"""
     logger.debug("Recording... Press Ctrl+C to stop.")
     client = XiaoZhiClient(url, ota_url, wake_word)
-    await client.start(mac_address, serial_number, license_key, enable_audio)
-    with sd.InputStream(callback=client.audio_callback, channels=1, samplerate=16000, blocksize=960):
+    await client.start(mac_address, serial_number, license_key, enable_audio, INPUT_AUDIO_SAMPLE_RATE)
+    blocksize = INPUT_AUDIO_SAMPLE_RATE * INPUT_AUDIO_FRAME_DURATION // 1000
+    with sd.InputStream(
+        callback=client.audio_callback,
+        channels=INPUT_AUDIO_CHANNELS,
+        samplerate=INPUT_AUDIO_SAMPLE_RATE,
+        blocksize=blocksize,
+    ):
+        logger.info("聆听中...")
         await client.process_audio_input()
+def get_mac_address():
+    mac = uuid.getnode()
+    mac_addr = ":".join(["%02x" % ((mac >> ele) & 0xFF) for ele in range(40, -8, -8)])
+    return mac_addr
 @click.command()
-@click.argument("mac_address")
+@click.argument("mac_address", required=False)
 @click.option("--url", help="服务端websocket地址")
 @click.option("--ota_url", help="OTA地址")
 @click.option("--serial_number", default="", help="设备的序列号")
@@ -222,10 +271,17 @@ async def run_client(
 @click.option("--enable_audio", default=True, help="是否开启音频播放")
 @click.option("--wake_word", default="", help="唤醒词")
 def main(
-    mac_address: str, url: str, ota_url: str, serial_number: str, license_key: str, enable_audio: bool, wake_word: str
+    mac_address: str,
+    url: str,
+    ota_url: str,
+    serial_number: str,
+    license_key: str,
+    enable_audio: bool,
+    wake_word: str,
 ):
     """小智SDK客户端
     MAC_ADDRESS: 设备的MAC地址 (格式: XX:XX:XX:XX:XX:XX)
     """
+    mac_address = mac_address or get_mac_address()
     asyncio.run(run_client(mac_address, url, ota_url, serial_number, license_key, enable_audio, wake_word))

xiaozhi_sdk-0.2.7/xiaozhi_sdk/config.py ADDED Viewed

@@ -0,0 +1,7 @@
+XIAOZHI_SAMPLE_RATE = 16000  # 固定不变动
+INPUT_AUDIO_SAMPLE_RATE = 24000
+INPUT_AUDIO_CHANNELS = 1
+INPUT_AUDIO_FRAME_DURATION = 60
+OTA_URL = "https://api.tenclass.net/xiaozhi/ota"

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/xiaozhi_sdk/core.py RENAMED Viewed

@@ -1,7 +1,6 @@
 import asyncio
 import json
 import logging
-import os
 import re
 import uuid
 from collections import deque
@@ -9,10 +8,15 @@ from typing import Any, Callable, Deque, Dict, Optional
 import websockets
-from xiaozhi_sdk.config import INPUT_SERVER_AUDIO_SAMPLE_RATE
+from xiaozhi_sdk.config import (
+    INPUT_AUDIO_CHANNELS,
+    INPUT_AUDIO_FRAME_DURATION,
+    INPUT_AUDIO_SAMPLE_RATE,
+    XIAOZHI_SAMPLE_RATE,
+)
 from xiaozhi_sdk.iot import OtaDevice
 from xiaozhi_sdk.mcp import McpTool
-from xiaozhi_sdk.utils import get_wav_info, read_audio_file, setup_opus
+from xiaozhi_sdk.utils import setup_opus
 setup_opus()
 from xiaozhi_sdk.opus import AudioOpus
@@ -27,15 +31,17 @@ class XiaoZhiWebsocket(McpTool):
         message_handler_callback: Optional[Callable] = None,
         url: Optional[str] = None,
         ota_url: Optional[str] = None,
-        audio_sample_rate: int = 16000,
-        audio_channels: int = 1,
+        audio_sample_rate: int = INPUT_AUDIO_SAMPLE_RATE,
+        audio_channels: int = INPUT_AUDIO_CHANNELS,
+        audio_frame_duration=INPUT_AUDIO_FRAME_DURATION,
         wake_word: str = "",
     ):
         super().__init__()
         self.url = url
         self.ota_url = ota_url
         self.audio_channels = audio_channels
-        self.audio_opus = AudioOpus(audio_sample_rate, audio_channels)
+        self.audio_frame_duration = audio_frame_duration
+        self.audio_opus = AudioOpus(audio_sample_rate, audio_channels, audio_frame_duration)
         self.wake_word = wake_word
         # 客户端标识
@@ -70,13 +76,13 @@ class XiaoZhiWebsocket(McpTool):
         hello_message = {
             "type": "hello",
             "version": 1,
-            "features": {"mcp": True, "aec": aec},
+            "features": {"mcp": True, "aec": aec, "consistent_sample_rate": False},
             "transport": "websocket",
             "audio_params": {
                 "format": "opus",
-                "sample_rate": 16000,
+                "sample_rate": XIAOZHI_SAMPLE_RATE,
                 "channels": 1,
-                "frame_duration": 60,
+                "frame_duration": self.audio_opus.input_frame_duration,
             },
         }
         await self.websocket.send(json.dumps(hello_message))
@@ -108,17 +114,17 @@ class XiaoZhiWebsocket(McpTool):
                 break
             await asyncio.sleep(3)
-    async def _send_demo_audio(self) -> None:
-        """发送演示音频"""
-        current_dir = os.path.dirname(os.path.abspath(__file__))
-        wav_path = os.path.join(current_dir, "../file/audio/greet.wav")
-        framerate, channels = get_wav_info(wav_path)
-        audio_opus = AudioOpus(framerate, channels)
-        for pcm_data in read_audio_file(wav_path):
-            opus_data = await audio_opus.pcm_to_opus(pcm_data)
-            await self.websocket.send(opus_data)
-        await self.send_silence_audio()
+    # async def _send_demo_audio(self) -> None:
+    #     """发送演示音频"""
+    #     current_dir = os.path.dirname(os.path.abspath(__file__))
+    #     wav_path = os.path.join(current_dir, "../file/audio/16k_greet.wav")
+    #     framerate, channels = get_wav_info(wav_path)
+    #     audio_opus = AudioOpus(framerate, channels, self.audio_frame_duration)
+    #
+    #     for pcm_data in read_audio_file(wav_path, 16000, self.audio_frame_duration):
+    #         opus_data = await audio_opus.pcm_to_opus(pcm_data)
+    #         await self.websocket.send(opus_data)
+    #     await self.send_silence_audio()
     async def send_wake_word(self, wake_word: str) -> bool:
         """发送唤醒词"""
@@ -137,8 +143,8 @@ class XiaoZhiWebsocket(McpTool):
     async def send_silence_audio(self, duration_seconds: float = 1.2) -> None:
         """发送静音音频"""
-        frames_count = int(duration_seconds * 1000 / 60)
-        pcm_frame = b"\x00\x00" * int(INPUT_SERVER_AUDIO_SAMPLE_RATE / 1000 * 60)
+        frames_count = int(duration_seconds * 1000 / self.audio_opus.input_frame_duration)
+        pcm_frame = b"\x00\x00" * int(self.audio_opus.input_sample_rate / 1000 * self.audio_opus.input_frame_duration)
         for _ in range(frames_count):
             await self.send_audio(pcm_frame)
@@ -159,6 +165,7 @@ class XiaoZhiWebsocket(McpTool):
         data = json.loads(message)
         message_type = data["type"]
         if message_type == "hello":
+            self.audio_opus.set_out_audio_frame(data["audio_params"])
             self.hello_received.set()
             self.session_id = data["session_id"]
             return
@@ -219,7 +226,7 @@ class XiaoZhiWebsocket(McpTool):
         await self._send_hello(self.aec)
         await self._start_listen()
-        logger.debug("[websocket] Connection successful")
+        logger.debug("[websocket] Connection successful. mac_addr: %s", self.mac_addr)
         await asyncio.sleep(0.5)
     async def init_connection(
@@ -250,7 +257,9 @@ class XiaoZhiWebsocket(McpTool):
         if not await self.is_activate(ota_info):
             self.iot_task = asyncio.create_task(self._activate_iot_device(license_key, ota_info))
+            await self.send_wake_word("hi")
             logger.debug("[IOT] 设备未激活")
+            return
         if self.wake_word:
             await self.send_wake_word(self.wake_word)

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/xiaozhi_sdk/mcp.py RENAMED Viewed

@@ -12,24 +12,23 @@ from xiaozhi_sdk.utils.tool_func import _get_random_music_info
 logger = logging.getLogger("xiaozhi_sdk")
-mcp_initialize_payload: Dict[str, Any] = {
-    "jsonrpc": "2.0",
-    "id": 1,
-    "result": {
-        "protocolVersion": "2024-11-05",
-        "capabilities": {"tools": {}},
-        "serverInfo": {"name": "", "version": "0.0.1"},
-    },
-}
-mcp_tools_payload: Dict[str, Any] = {
-    "jsonrpc": "2.0",
-    "id": 2,
-    "result": {"tools": []},
-}
 class McpTool(object):
+    mcp_initialize_payload: Dict[str, Any] = {
+        "jsonrpc": "2.0",
+        "id": 1,
+        "result": {
+            "protocolVersion": "2024-11-05",
+            "capabilities": {"tools": {}},
+            "serverInfo": {"name": "", "version": "0.0.1"},
+        },
+    }
+    mcp_tools_payload: Dict[str, Any] = {
+        "id": 2,
+        "jsonrpc": "2.0",
+        "result": {"tools": []},
+    }
     def __init__(self):
         self.session_id = ""
@@ -131,8 +130,8 @@ class McpTool(object):
             # self.explain_url = "http://82.157.143.133:8000/vision/explain"
             self.explain_token = payload["params"]["capabilities"]["vision"]["token"]
-            mcp_initialize_payload["id"] = payload["id"]
-            await self.websocket.send(self.get_mcp_json(mcp_initialize_payload))
+            self.mcp_initialize_payload["id"] = payload["id"]
+            await self.websocket.send(self.get_mcp_json(self.mcp_initialize_payload))
         elif method == "notifications/initialized":
             # print("\nMCP 工具初始化")
@@ -142,9 +141,9 @@ class McpTool(object):
             logger.error("[MCP] 工具加载失败")
         elif method == "tools/list":
-            mcp_tools_payload["id"] = payload["id"]
             tool_name_list = []
             mcp_tool_dict = copy.deepcopy(self.mcp_tool_dict)
+            mcp_tool_list = []
             for _, mcp_tool in mcp_tool_dict.items():
                 tool_name_list.append(mcp_tool["name"])
                 tool_func = mcp_tool.pop("tool_func", None)
@@ -152,9 +151,11 @@ class McpTool(object):
                     logger.error("[MCP] Tool %s has no tool_func", mcp_tool["name"])
                     return
                 mcp_tool.pop("is_async", None)
-                mcp_tools_payload["result"]["tools"].append(mcp_tool)
+                mcp_tool_list.append(mcp_tool)
-            await self.websocket.send(self.get_mcp_json(mcp_tools_payload))
+            self.mcp_tools_payload["id"] = payload["id"]
+            self.mcp_tools_payload["result"]["tools"] = mcp_tool_list
+            await self.websocket.send(self.get_mcp_json(self.mcp_tools_payload))
             logger.debug("[MCP] 加载成功，当前可用工具列表为：%s", tool_name_list)
         elif method == "tools/call":

xiaozhi_sdk-0.2.7/xiaozhi_sdk/opus.py ADDED Viewed

@@ -0,0 +1,74 @@
+import av
+import numpy as np
+import opuslib
+from xiaozhi_sdk.config import XIAOZHI_SAMPLE_RATE
+class AudioOpus:
+    def __init__(self, sample_rate, channels, frame_duration):
+        self.input_frame_duration = frame_duration
+        self.input_sample_rate = sample_rate
+        self.input_channels = channels
+        self.input_frame_size = self.input_sample_rate * self.input_frame_duration // 1000
+        # 创建 Opus 编码器
+        self.opus_encoder_16k = opuslib.Encoder(
+            fs=XIAOZHI_SAMPLE_RATE, channels=1, application=opuslib.APPLICATION_VOIP
+        )
+        self.resampler = av.AudioResampler(format="s16", layout="mono", rate=sample_rate)
+        self.resampler_16k = av.AudioResampler(format="s16", layout="mono", rate=16000)
+    def set_out_audio_frame(self, audio_params):
+        # 小智服务端 的 音频信息
+        self.out_sample_rate = audio_params["sample_rate"]
+        self.out_frame_size = self.out_sample_rate * audio_params["frame_duration"] // 1000
+        # 创建 Opus 解码器
+        self.opus_decoder = opuslib.Decoder(
+            fs=self.out_sample_rate,  # 采样率
+            channels=audio_params["channels"],  # 单声道
+        )
+    def to_16k_samplerate_pcm(self, pcm_array):
+        layout = "mono" if self.input_channels == 1 else "stereo"
+        frame = av.AudioFrame.from_ndarray(pcm_array.reshape(1, -1), format="s16", layout=layout)
+        frame.sample_rate = self.input_sample_rate
+        resampled_frames = self.resampler_16k.resample(frame)
+        samples = resampled_frames[0].to_ndarray().flatten()
+        return samples
+    async def pcm_to_opus(self, pcm):
+        pcm_array = np.frombuffer(pcm, dtype=np.int16)
+        pcm_bytes = pcm_array.tobytes()
+        if self.input_sample_rate != XIAOZHI_SAMPLE_RATE:
+            # 小智服务端仅支持 16000 采样率， 将 pcm_array 转 16k 采样率
+            pcm_array = self.to_16k_samplerate_pcm(pcm_array)
+            pcm_bytes = pcm_array.tobytes()
+        frame_size = XIAOZHI_SAMPLE_RATE * self.input_frame_duration // 1000
+        return self.opus_encoder_16k.encode(pcm_bytes, frame_size)
+    async def change_sample_rate(self, pcm_array) -> np.ndarray:
+        # 采样率 变更
+        frame = av.AudioFrame.from_ndarray(np.array(pcm_array).reshape(1, -1), format="s16", layout="mono")
+        frame.sample_rate = self.out_sample_rate
+        resampled_frames = self.resampler.resample(frame)
+        samples = resampled_frames[0].to_ndarray().flatten()
+        return samples
+    def padding(self, samples):
+        # 不足 self.frame_size 补 0
+        samples_padded = np.pad(samples, (0, self.input_frame_size - samples.size), mode="constant", constant_values=0)
+        return samples_padded.reshape(1, self.input_frame_size)
+    async def opus_to_pcm(self, opus) -> np.ndarray:
+        pcm_data = self.opus_decoder.decode(opus, frame_size=self.out_frame_size)
+        pcm_array = np.frombuffer(pcm_data, dtype=np.int16)
+        if self.input_sample_rate != self.out_sample_rate:
+            pcm_array = await self.change_sample_rate(pcm_array)
+        pcm_array = self.padding(pcm_array)
+        return pcm_array

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/xiaozhi_sdk/utils/__init__.py RENAMED Viewed

@@ -9,7 +9,7 @@ def get_wav_info(file_path):
         return wav_file.getframerate(), wav_file.getnchannels()
-def read_audio_file(file_path):
+def read_audio_file(file_path, sample_rate, frame_duration):
     """
     读取音频文件并通过yield返回PCM流
@@ -19,9 +19,10 @@ def read_audio_file(file_path):
     Yields:
         bytes: PCM音频数据块
     """
+    frame_size = sample_rate * frame_duration // 1000
     with wave.open(file_path, "rb") as wav_file:
         while True:
-            pcm = wav_file.readframes(960)  # 每次读取960帧（60ms的音频数据）
+            pcm = wav_file.readframes(frame_size)
             if not pcm:
                 break
             yield pcm

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7/xiaozhi_sdk.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xiaozhi-sdk
-Version: 0.2.4
+Version: 0.2.7
 Summary: 一个用于连接和控制小智智能设备的Python SDK，支持实时音频通信、MCP工具集成和设备管理功能。
 Author-email: dairoot <623815825@qq.com>
 License-Expression: MIT
@@ -43,7 +43,8 @@ Dynamic: license-file
 ## 📦 安装
 ```bash
-pip install xiaozhi-sdk
+pip install uv
+uv pip install xiaozhi-sdk -U
 ```
 ---
@@ -60,10 +61,21 @@ pip install xiaozhi-sdk
 python -m xiaozhi_sdk --help
 ```
-#### 连接设备（需要提供 MAC 地址）
+#### 连接设备
 ```bash
+# 默认本机 mac 地址
+python -m xiaozhi_sdk
+# 指定 mac 地址
 python -m xiaozhi_sdk 00:22:44:66:88:00
+# 更多常用操作
+## --url 指定服务端 websocket 地址
+## --wake_word 指定唤醒词
+python -m xiaozhi_sdk 00:22:44:66:88:00 \
+  --url ws://127.0.0.1:8180 \
+  --wake_word="你好啊"
 ```
 ### 2. 编程使用 (高阶用法)

{xiaozhi_sdk-0.2.4 → xiaozhi_sdk-0.2.7}/xiaozhi_sdk.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,10 +2,13 @@ LICENSE
 MANIFEST.in
 README.md
 pyproject.toml
-file/audio/greet.wav
-file/audio/play_music.wav
-file/audio/say_hello.wav
-file/audio/take_photo.wav
+file/audio/16k_greet.wav
+file/audio/16k_play_music.wav
+file/audio/16k_say_hello.wav
+file/audio/16k_take_photo.wav
+file/audio/test_16k.wav
+file/audio/test_24k.wav
+file/audio/test_48k.wav
 file/image/leijun.jpg
 file/opus/linux-arm64-libopus.so
 file/opus/linux-x64-libopus.so
@@ -16,6 +19,7 @@ tests/test_iot.py
 tests/test_pic.py
 tests/test_wake_word.py
 tests/test_xiaozhi.py
+tests/test_xiaozhi_opus.py
 xiaozhi_sdk/__init__.py
 xiaozhi_sdk/__main__.py
 xiaozhi_sdk/cli.py
@@ -29,10 +33,13 @@ xiaozhi_sdk.egg-info/SOURCES.txt
 xiaozhi_sdk.egg-info/dependency_links.txt
 xiaozhi_sdk.egg-info/requires.txt
 xiaozhi_sdk.egg-info/top_level.txt
-xiaozhi_sdk/../file/audio/greet.wav
-xiaozhi_sdk/../file/audio/play_music.wav
-xiaozhi_sdk/../file/audio/say_hello.wav
-xiaozhi_sdk/../file/audio/take_photo.wav
+xiaozhi_sdk/../file/audio/16k_greet.wav
+xiaozhi_sdk/../file/audio/16k_play_music.wav
+xiaozhi_sdk/../file/audio/16k_say_hello.wav
+xiaozhi_sdk/../file/audio/16k_take_photo.wav
+xiaozhi_sdk/../file/audio/test_16k.wav
+xiaozhi_sdk/../file/audio/test_24k.wav
+xiaozhi_sdk/../file/audio/test_48k.wav
 xiaozhi_sdk/../file/image/leijun.jpg
 xiaozhi_sdk/../file/opus/linux-arm64-libopus.so
 xiaozhi_sdk/../file/opus/linux-x64-libopus.so

xiaozhi_sdk-0.2.4/tests/test_wake_word.py DELETED Viewed

@@ -1,33 +0,0 @@
-import asyncio
-import os
-import sys
-import pytest
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
-from xiaozhi_sdk import XiaoZhiWebsocket
-MAC_ADDR = "00:22:44:66:88:00"
-ota_url = None
-URL = None
-@pytest.mark.asyncio
-async def test_main():
-    is_end = asyncio.Event()
-    async def message_handler_callback(message):
-        if message.get("state") == "stop":
-            is_end.set()
-        print("message received:", message)
-    xiaozhi = XiaoZhiWebsocket(message_handler_callback, url=URL, ota_url=ota_url)
-    await xiaozhi.init_connection(MAC_ADDR)
-    await xiaozhi.send_wake_word("退下，拜拜不聊了")
-    await asyncio.wait_for(is_end.wait(), timeout=20.0)
-    await xiaozhi.send_wake_word("你好")
-    await asyncio.wait_for(is_end.wait(), timeout=20.0)
-    await xiaozhi.close()

xiaozhi_sdk-0.2.4/xiaozhi_sdk/config.py DELETED Viewed

@@ -1,3 +0,0 @@
-INPUT_SERVER_AUDIO_SAMPLE_RATE = 16000
-OTA_URL = "https://api.tenclass.net/xiaozhi/ota"

xiaozhi_sdk-0.2.4/xiaozhi_sdk/opus.py DELETED Viewed

@@ -1,61 +0,0 @@
-import math
-import av
-import numpy as np
-import opuslib
-from xiaozhi_sdk.config import INPUT_SERVER_AUDIO_SAMPLE_RATE
-class AudioOpus:
-    def __init__(self, sample_rate, channels):
-        self.sample_rate = sample_rate
-        self.channels = channels
-        # 创建 Opus 编码器
-        self.opus_encoder = opuslib.Encoder(
-            fs=sample_rate, channels=channels, application=opuslib.APPLICATION_VOIP  # 采样率  # 单声道  # 语音应用
-        )
-        # 创建 Opus 解码器
-        self.opus_decoder = opuslib.Decoder(
-            fs=INPUT_SERVER_AUDIO_SAMPLE_RATE,  # 采样率
-            channels=1,  # 单声道
-        )
-        self.resampler = av.AudioResampler(format="s16", layout="mono", rate=sample_rate)
-    async def pcm_to_opus(self, pcm):
-        pcm_array = np.frombuffer(pcm, dtype=np.int16)
-        pcm_bytes = pcm_array.tobytes()
-        return self.opus_encoder.encode(pcm_bytes, 960)
-    @staticmethod
-    def to_n_960(samples) -> np.ndarray:
-        n = math.ceil(samples.shape[0] / 960)
-        arr_padded = np.pad(samples, (0, 960 * n - samples.shape[0]), mode="constant", constant_values=0)
-        return arr_padded.reshape(n, 960)
-    async def change_sample_rate(self, pcm_array) -> np.ndarray:
-        if self.sample_rate == INPUT_SERVER_AUDIO_SAMPLE_RATE:
-            return self.to_n_960(pcm_array)
-        frame = av.AudioFrame.from_ndarray(np.array(pcm_array).reshape(1, -1), format="s16", layout="mono")
-        frame.sample_rate = INPUT_SERVER_AUDIO_SAMPLE_RATE  # Assuming input is 16kHz
-        resampled_frames = self.resampler.resample(frame)
-        samples = resampled_frames[0].to_ndarray().flatten()
-        new_frame = av.AudioFrame.from_ndarray(
-            samples.reshape(1, -1),
-            format="s16",
-            layout="mono",
-        )
-        new_frame.sample_rate = self.sample_rate
-        new_samples = new_frame.to_ndarray().flatten()
-        return self.to_n_960(new_samples)
-    async def opus_to_pcm(self, opus) -> np.ndarray:
-        pcm_data = self.opus_decoder.decode(opus, 960)
-        pcm_array = np.frombuffer(pcm_data, dtype=np.int16)
-        samples = await self.change_sample_rate(pcm_array)
-        return samples