deeptalk-asd 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeptalk_asd-0.2.0/LICENSE +21 -0
- deeptalk_asd-0.2.0/PKG-INFO +17 -0
- deeptalk_asd-0.2.0/README.md +98 -0
- deeptalk_asd-0.2.0/deeptalk_asd.egg-info/PKG-INFO +17 -0
- deeptalk_asd-0.2.0/deeptalk_asd.egg-info/SOURCES.txt +36 -0
- deeptalk_asd-0.2.0/deeptalk_asd.egg-info/dependency_links.txt +1 -0
- deeptalk_asd-0.2.0/deeptalk_asd.egg-info/requires.txt +9 -0
- deeptalk_asd-0.2.0/deeptalk_asd.egg-info/top_level.txt +1 -0
- deeptalk_asd-0.2.0/pyproject.toml +20 -0
- deeptalk_asd-0.2.0/setup.cfg +4 -0
- deeptalk_asd-0.2.0/setup.py +29 -0
- deeptalk_asd-0.2.0/src/__init__.py +26 -0
- deeptalk_asd-0.2.0/src/asd.py +133 -0
- deeptalk_asd-0.2.0/src/asd_factory.py +89 -0
- deeptalk_asd-0.2.0/src/asd_interface.py +57 -0
- deeptalk_asd-0.2.0/src/audio_frame.py +250 -0
- deeptalk_asd-0.2.0/src/deeptalk_logger.py +65 -0
- deeptalk_asd-0.2.0/src/face_detector/__init__.py +0 -0
- deeptalk_asd-0.2.0/src/face_detector/face_compare_helper.py +121 -0
- deeptalk_asd-0.2.0/src/face_detector/face_info.py +87 -0
- deeptalk_asd-0.2.0/src/face_detector/factory.py +33 -0
- deeptalk_asd-0.2.0/src/face_detector/inspireface_detector.py +362 -0
- deeptalk_asd-0.2.0/src/face_detector/interface.py +18 -0
- deeptalk_asd-0.2.0/src/speaker_detector/__init__.py +0 -0
- deeptalk_asd-0.2.0/src/speaker_detector/factory.py +35 -0
- deeptalk_asd-0.2.0/src/speaker_detector/interface.py +56 -0
- deeptalk_asd-0.2.0/src/speaker_detector/lrasd_onnx.py +311 -0
- deeptalk_asd-0.2.0/src/turn_detector/__init__.py +0 -0
- deeptalk_asd-0.2.0/src/turn_detector/factory.py +40 -0
- deeptalk_asd-0.2.0/src/turn_detector/interface.py +18 -0
- deeptalk_asd-0.2.0/src/turn_detector/resampler/__init__.py +17 -0
- deeptalk_asd-0.2.0/src/turn_detector/resampler/audio_resampler.py +201 -0
- deeptalk_asd-0.2.0/src/turn_detector/resampler/audio_resampler_factory.py +131 -0
- deeptalk_asd-0.2.0/src/turn_detector/resampler/rosa_audio_resampler.py +300 -0
- deeptalk_asd-0.2.0/src/turn_detector/resampler/sox_audio_resampler.py +305 -0
- deeptalk_asd-0.2.0/src/turn_detector/silero_vad_turn_detector.py +193 -0
- deeptalk_asd-0.2.0/src/turn_detector/utterance.py +85 -0
- deeptalk_asd-0.2.0/src/video_frame.py +161 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 huyyxy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deeptalk-asd
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: DeepTalk Active Speaker Detection
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Dist: python-dotenv
|
|
8
|
+
Requires-Dist: termcolor
|
|
9
|
+
Requires-Dist: tornado
|
|
10
|
+
Requires-Dist: python_speech_features
|
|
11
|
+
Requires-Dist: numpy==1.26.4
|
|
12
|
+
Requires-Dist: opencv-python
|
|
13
|
+
Requires-Dist: scipy
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
Dynamic: requires-python
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# DeepTalk-ASD
|
|
2
|
+
|
|
3
|
+
[LR-ASD](https://github.com/Junhua-Liao/LR-ASD) is a SOTA Active Speaker Detection (ASD) model. While it offers exceptional performance, the official open-source project relies on GPUs and relatively older face detection models. This project aims to provide a production-ready, out-of-the-box ASD system.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
DeepTalk-ASD is an efficient Active Speaker Detection (ASD) system. By fusing audio and video features, it determines in real-time which face in a video frame is speaking.
|
|
8
|
+
|
|
9
|
+
## Key Features
|
|
10
|
+
|
|
11
|
+
- **Multimodal Fusion**: Combines face detection, voice activity detection (VAD), and speaker identification models.
|
|
12
|
+
- **Modular Design**:
|
|
13
|
+
- **FaceDetector**: Detects and tracks faces (supports InspireFace).
|
|
14
|
+
- **TurnDetector**: Audio VAD detection (supports Silero VAD).
|
|
15
|
+
- **SpeakerDetector**: Audio-visual feature fusion and decision making (based on LR-ASD).
|
|
16
|
+
- **High Performance**: Supports ONNX inference, suitable for real-time applications.
|
|
17
|
+
- **Easy to Use**: Provides command-line demos supporting real-time camera input and video file processing.
|
|
18
|
+
|
|
19
|
+
## System Architecture
|
|
20
|
+
|
|
21
|
+
The system is orchestrated through three main sub-components:
|
|
22
|
+
1. **FaceDetector**: Responsible for locating faces in each video frame.
|
|
23
|
+
2. **TurnDetector**: Responsible for determining if the current audio stream contains speech.
|
|
24
|
+
3. **SpeakerDetector**: The core decision layer that calculates speaking probability based on VAD results and face image sequences.
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
### 1. Environment Setup
|
|
29
|
+
|
|
30
|
+
Python 3.8 or higher is recommended.
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# Clone the repository
|
|
34
|
+
git clone <repository_url>
|
|
35
|
+
cd DeepTalk-ASD
|
|
36
|
+
|
|
37
|
+
# Install dependencies
|
|
38
|
+
python3 -m pip install -r requirements.txt
|
|
39
|
+
|
|
40
|
+
# Install the project in editable mode
|
|
41
|
+
pip3 install -e .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### 2. Verify Installation
|
|
45
|
+
|
|
46
|
+
Check the installation via the Python interactive environment:
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
python3
|
|
50
|
+
>>> import deeptalk_asd
|
|
51
|
+
>>> print(deeptalk_asd.__version__)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### 3. Model Weights
|
|
55
|
+
|
|
56
|
+
Ensure the `weights` directory contains the necessary model files. The project includes support for converting models to ONNX.
|
|
57
|
+
- `Pikachu`: InspireFace related models.
|
|
58
|
+
- `silero_vad.onnx`: Silero VAD model.
|
|
59
|
+
- LR-ASD related models: `audio_frontend.onnx`, `visual_frontend.onnx`, `av_backend.onnx`.
|
|
60
|
+
|
|
61
|
+
### 4. Running Demos
|
|
62
|
+
|
|
63
|
+
#### Video File Processing Demo
|
|
64
|
+
```bash
|
|
65
|
+
python3 demo/video_asd_demo.py --input demo/demo.mp4 --display
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
#### Real-time Camera Demo
|
|
69
|
+
```bash
|
|
70
|
+
python3 demo/realtime_asd_demo.py
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Configuration
|
|
74
|
+
|
|
75
|
+
The components can be flexibly configured using the factory method:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from deeptalk_asd import ASDDetectorFactory
|
|
79
|
+
|
|
80
|
+
config = {
|
|
81
|
+
"face_detector": {"type": "inspireface"},
|
|
82
|
+
"turn_detector": {"type": "silero-vad", "model_path": "weights/silero_vad.onnx"},
|
|
83
|
+
"speaker_detector": {"type": "LR-ASD-ONNX", "onnx_dir": "weights"}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
factory = ASDDetectorFactory(**config)
|
|
87
|
+
asd = factory.create()
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## License
|
|
91
|
+
|
|
92
|
+
The code in this project is licensed under the **MIT License**. However, please note that the integrated models and their related code are subject to their respective licenses:
|
|
93
|
+
|
|
94
|
+
1. **InspireFace**: Core code is MIT, but the provided pre-trained models are typically restricted to **non-commercial research use**. For commercial use, please refer to [InsightFace](https://github.com/deepinsight/insightface) documentation.
|
|
95
|
+
2. **Silero VAD**: Licensed under the **MIT License**. Please refer to [Silero VAD](https://github.com/snakers4/silero-vad).
|
|
96
|
+
3. **LR-ASD**: Licensed under the **MIT License**. Please refer to [LR-ASD](https://github.com/Junhua-Liao/LR-ASD).
|
|
97
|
+
|
|
98
|
+
While you may redistribute the code of this project under the MIT license, you must explicitly inform users in the documentation that: **When using specific pre-trained models (especially face detection models), users must comply with the non-commercial restrictions of the original authors.** If commercialization is required, users should replace them with commercially-friendly models or contact the original authors for authorization.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deeptalk-asd
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: DeepTalk Active Speaker Detection
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Dist: python-dotenv
|
|
8
|
+
Requires-Dist: termcolor
|
|
9
|
+
Requires-Dist: tornado
|
|
10
|
+
Requires-Dist: python_speech_features
|
|
11
|
+
Requires-Dist: numpy==1.26.4
|
|
12
|
+
Requires-Dist: opencv-python
|
|
13
|
+
Requires-Dist: scipy
|
|
14
|
+
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Dynamic: license-file
|
|
17
|
+
Dynamic: requires-python
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
deeptalk_asd.egg-info/PKG-INFO
|
|
6
|
+
deeptalk_asd.egg-info/SOURCES.txt
|
|
7
|
+
deeptalk_asd.egg-info/dependency_links.txt
|
|
8
|
+
deeptalk_asd.egg-info/requires.txt
|
|
9
|
+
deeptalk_asd.egg-info/top_level.txt
|
|
10
|
+
src/__init__.py
|
|
11
|
+
src/asd.py
|
|
12
|
+
src/asd_factory.py
|
|
13
|
+
src/asd_interface.py
|
|
14
|
+
src/audio_frame.py
|
|
15
|
+
src/deeptalk_logger.py
|
|
16
|
+
src/video_frame.py
|
|
17
|
+
src/face_detector/__init__.py
|
|
18
|
+
src/face_detector/face_compare_helper.py
|
|
19
|
+
src/face_detector/face_info.py
|
|
20
|
+
src/face_detector/factory.py
|
|
21
|
+
src/face_detector/inspireface_detector.py
|
|
22
|
+
src/face_detector/interface.py
|
|
23
|
+
src/speaker_detector/__init__.py
|
|
24
|
+
src/speaker_detector/factory.py
|
|
25
|
+
src/speaker_detector/interface.py
|
|
26
|
+
src/speaker_detector/lrasd_onnx.py
|
|
27
|
+
src/turn_detector/__init__.py
|
|
28
|
+
src/turn_detector/factory.py
|
|
29
|
+
src/turn_detector/interface.py
|
|
30
|
+
src/turn_detector/silero_vad_turn_detector.py
|
|
31
|
+
src/turn_detector/utterance.py
|
|
32
|
+
src/turn_detector/resampler/__init__.py
|
|
33
|
+
src/turn_detector/resampler/audio_resampler.py
|
|
34
|
+
src/turn_detector/resampler/audio_resampler_factory.py
|
|
35
|
+
src/turn_detector/resampler/rosa_audio_resampler.py
|
|
36
|
+
src/turn_detector/resampler/sox_audio_resampler.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
deeptalk_asd
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "deeptalk-asd"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "DeepTalk Active Speaker Detection"
|
|
9
|
+
requires-python = ">=3.8"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"python-dotenv",
|
|
12
|
+
"termcolor",
|
|
13
|
+
"tornado",
|
|
14
|
+
"python_speech_features",
|
|
15
|
+
"numpy==1.26.4",
|
|
16
|
+
"opencv-python",
|
|
17
|
+
"scipy",
|
|
18
|
+
"pandas",
|
|
19
|
+
"tqdm",
|
|
20
|
+
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Packaging script for deeptalk-asd.

Maps the ``src`` directory onto the ``deeptalk_asd`` package namespace so
that e.g. ``src/face_detector`` installs as ``deeptalk_asd.face_detector``.
"""
from setuptools import find_packages, setup

_SUBPACKAGES = find_packages(where="src")

# Every sub-package must be explicitly mapped onto its directory under
# ``src`` — e.g. "deeptalk_asd.face_detector" -> "src/face_detector".
_PACKAGE_DIR = {"deeptalk_asd": "src"}
_PACKAGE_DIR.update(
    {
        f"deeptalk_asd.{name}": f"src/{name.replace('.', '/')}"
        for name in _SUBPACKAGES
    }
)

setup(
    name="deeptalk_asd",
    version="0.2.0",
    description="DeepTalk Active Speaker Detection",
    package_dir=_PACKAGE_DIR,
    packages=["deeptalk_asd"] + [f"deeptalk_asd.{name}" for name in _SUBPACKAGES],
    python_requires=">=3.8",
    install_requires=[
        "python-dotenv",
        "termcolor",
        "tornado",
        "python_speech_features",
        "numpy==1.26.4",
        "opencv-python",
        "scipy",
        "pandas",
        "tqdm",
    ],
)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DeepTalk-ASD: 活动说话者检测模块
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .asd_factory import ASDDetectorFactory
|
|
6
|
+
from .asd_interface import ASDInterface
|
|
7
|
+
from .audio_frame import AudioFrame
|
|
8
|
+
from .video_frame import VideoFrame, VideoBufferType, VideoRotation, VideoCodec, VideoStreamType
|
|
9
|
+
from .face_detector.face_info import FaceProfile, FaceRectangle, HeadPose
|
|
10
|
+
from .turn_detector.utterance import Utterance, TurnState
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"ASDDetectorFactory",
|
|
14
|
+
"ASDInterface",
|
|
15
|
+
"AudioFrame",
|
|
16
|
+
"VideoFrame",
|
|
17
|
+
"VideoBufferType",
|
|
18
|
+
"VideoRotation",
|
|
19
|
+
"VideoCodec",
|
|
20
|
+
"VideoStreamType",
|
|
21
|
+
"FaceProfile",
|
|
22
|
+
"FaceRectangle",
|
|
23
|
+
"HeadPose",
|
|
24
|
+
"Utterance",
|
|
25
|
+
"TurnState",
|
|
26
|
+
]
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""
|
|
2
|
+
文件名: asd.py
|
|
3
|
+
描述:
|
|
4
|
+
ASDInterface 的实现,编排 FaceDetector、TurnDetector、SpeakerDetector 三个子组件
|
|
5
|
+
完成活动说话者检测的完整流程。
|
|
6
|
+
"""
|
|
7
|
+
import cv2
|
|
8
|
+
import numpy as np
|
|
9
|
+
from typing import List
|
|
10
|
+
|
|
11
|
+
from .asd_interface import ASDInterface
|
|
12
|
+
from .video_frame import VideoFrame
|
|
13
|
+
from .audio_frame import AudioFrame
|
|
14
|
+
from .face_detector.face_info import FaceProfile
|
|
15
|
+
from .turn_detector.interface import TurnDetectorInterface
|
|
16
|
+
from .turn_detector.utterance import Utterance, TurnState
|
|
17
|
+
from .face_detector.interface import FaceDetectorInterface
|
|
18
|
+
from .speaker_detector.interface import SpeakerDetectorInterface
|
|
19
|
+
|
|
20
|
+
from .deeptalk_logger import DeepTalkLogger
|
|
21
|
+
|
|
22
|
+
logger = DeepTalkLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ASD(ASDInterface):
    """Active speaker detection implementation.

    Orchestrates three sub-components:
    - FaceDetector: detects faces in video frames
    - TurnDetector: detects speech turns (VAD) in audio frames
    - SpeakerDetector: fuses audio/visual features to decide the active speaker
    """

    # Turn states worth logging; built once at class level instead of
    # rebuilding a list on every audio frame.
    _LOGGED_TURN_STATES = frozenset({
        TurnState.TURN_START,
        TurnState.TURN_CONFIRMED,
        TurnState.TURN_END,
        TurnState.TURN_REJECTED,
    })

    def __init__(
        self,
        face_detector: FaceDetectorInterface,
        turn_detector: TurnDetectorInterface,
        speaker_detector: SpeakerDetectorInterface,
        **kwargs,
    ):
        """Initialize the active speaker detection system.

        Args:
            face_detector: face detector instance.
            turn_detector: speech turn (VAD) detector instance.
            speaker_detector: speaker detector instance.
            kwargs: optional parameters, reserved for future extension.
        """
        super().__init__(**kwargs)
        self._face_detector = face_detector
        self._turn_detector = turn_detector
        self._speaker_detector = speaker_detector

    def append_video(self, video_frame: VideoFrame, create_time: float = None) -> List[FaceProfile]:
        """Feed one video frame (frame rate defaults to 25 fps).

        Pipeline:
            1. Detect faces in the frame with FaceDetector.
            2. Hand each face's 112x112 grayscale crop plus its track_id
               to the SpeakerDetector.

        Args:
            video_frame: the current video frame.
            create_time: creation time of this frame (used for A/V alignment).

        Returns:
            The face profiles detected in this frame.
        """
        # 1. Face detection
        face_profiles = self._face_detector.detect(video_frame)

        # 2. Convert to the SpeakerDetector input format:
        #    [{'id': track_id, 'image': face_gray}, ...]
        frame_faces = []
        for profile in face_profiles:
            face_image = profile.face_image
            if face_image is None:
                continue
            # Convert to grayscale (3-channel frames assumed BGR — cv2 default).
            if len(face_image.shape) == 3:
                face_gray = cv2.cvtColor(face_image, cv2.COLOR_BGR2GRAY)
            else:
                face_gray = face_image
            # Resize to 224x224 then center-crop the 112x112 patch the
            # speaker model consumes.
            face_gray = cv2.resize(face_gray, (224, 224))
            face_gray = face_gray[56:168, 56:168]
            frame_faces.append({
                'id': profile.track_id,
                'image': face_gray,
            })

        # 3. Forward to the SpeakerDetector
        if frame_faces:
            self._speaker_detector.append_video(frame_faces, create_time)
        return face_profiles

    def append_audio(self, audio_frame: AudioFrame, create_time: float = None) -> Utterance:
        """Append an audio chunk to the processing queue.

        Pipeline:
            1. Forward the raw PCM bytes to the SpeakerDetector.
            2. Run VAD turn detection with the TurnDetector.

        Args:
            audio_frame: the current audio frame.
            create_time: creation time of this chunk (used for A/V alignment).

        Returns:
            The turn-detection result (Utterance).
        """
        # 1. Forward the audio bytes to the SpeakerDetector
        audio_data = bytes(audio_frame.data)
        self._speaker_detector.append_audio(audio_data, create_time)

        # 2. VAD turn detection.  Turn-state transitions are routine events,
        # so log them at INFO — the previous code misused CRITICAL here.
        utterance = self._turn_detector.detect(audio_frame)
        if utterance.turn_state in self._LOGGED_TURN_STATES:
            logger.info(f"Utterance: {utterance.turn_state.name}")
        return utterance

    def evaluate(self, start_time: float = None, end_time: float = None):
        """Evaluate the current active speaker.

        Args:
            start_time: evaluation start (time.perf_counter timestamp), optional.
            end_time: evaluation end (time.perf_counter timestamp), optional.

        Returns:
            Track id and confidence score of the active speaker at the most
            recent time point, formatted as {track_id: score, ...}.
        """
        return self._speaker_detector.evaluate(start_time, end_time)

    def reset(self):
        """Reset system state."""
        self._speaker_detector.reset()
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
文件名: asd_factory.py
|
|
3
|
+
描述:
|
|
4
|
+
ASD 检测器工厂,通过配置信息分别创建 FaceDetector、TurnDetector、SpeakerDetector,
|
|
5
|
+
然后组装为 ASD 实例。
|
|
6
|
+
"""
|
|
7
|
+
from .deeptalk_logger import DeepTalkLogger
|
|
8
|
+
from .asd_interface import ASDInterface
|
|
9
|
+
import traceback
|
|
10
|
+
|
|
11
|
+
logger = DeepTalkLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ASDDetectorFactory:
    """Factory that assembles an ASD instance from configuration."""

    def __init__(self, **kwargs):
        """Store the configuration used to build the ASD instance.

        Args:
            kwargs: configuration for the three sub-components:
                face_detector: dict, e.g. {"type": "inspireface", ...}
                turn_detector: dict, e.g. {"type": "silero-vad", "model_path": "...", ...}
                speaker_detector: dict, e.g. {"type": "LR-ASD-ONNX", "onnx_dir": "...", ...}
        """
        self.kwargs = kwargs

    def _build_component(self, key, label, construct):
        """Build one sub-component from ``self.kwargs[key]``.

        Args:
            key: configuration key ('face_detector', 'turn_detector', ...).
            label: component class label used in log messages.
            construct: callable ``(type_name, config) -> component`` that
                instantiates the component (returning None on failure).

        Returns:
            The component instance, or None when the config lacks a 'type'
            field or construction fails (both cases are logged).
        """
        config = dict(self.kwargs.get(key, {}))
        type_name = config.pop('type', None)
        if type_name is None:
            logger.error(f"{key} 配置缺少 'type' 字段")
            return None
        component = construct(type_name, config)
        if component is None:
            logger.error(f"创建 {label}(type={type_name}) 失败")
            return None
        logger.info(f"{label} 创建成功: type={type_name}")
        return component

    def create(self) -> ASDInterface:
        """Create an ASD instance from the stored configuration.

        Returns:
            An ASDInterface instance, or None when creation fails.
        """
        # Constructors import their factories lazily so a missing 'type'
        # field is reported before any heavy sub-package import happens.
        def _make_face(type_name, config):
            from .face_detector.factory import FaceDetectorFactory
            return FaceDetectorFactory(type_name, **config).face_detector()

        def _make_turn(type_name, config):
            from .turn_detector.factory import TurnDetectorFactory
            return TurnDetectorFactory(type_name, **config).turn_detector()

        def _make_speaker(type_name, config):
            from .speaker_detector.factory import SpeakerDetectorFactory
            return SpeakerDetectorFactory(type_name, **config).speaker_detector()

        try:
            # 1. FaceDetector
            face_detector = self._build_component('face_detector', 'FaceDetector', _make_face)
            if face_detector is None:
                return None

            # 2. TurnDetector
            turn_detector = self._build_component('turn_detector', 'TurnDetector', _make_turn)
            if turn_detector is None:
                return None

            # 3. SpeakerDetector
            speaker_detector = self._build_component('speaker_detector', 'SpeakerDetector', _make_speaker)
            if speaker_detector is None:
                return None

            # 4. Assemble the ASD instance
            from .asd import ASD
            asd = ASD(
                face_detector=face_detector,
                turn_detector=turn_detector,
                speaker_detector=speaker_detector,
            )
            logger.info("ASD 实例创建成功")
            return asd

        except Exception as e:
            logger.error(f"创建 ASD 实例失败: {e}")
            traceback.print_exc()
            return None
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from .video_frame import VideoFrame
|
|
2
|
+
from .audio_frame import AudioFrame
|
|
3
|
+
from .turn_detector.utterance import Utterance
|
|
4
|
+
from .face_detector.interface import FaceProfile
|
|
5
|
+
from typing import List
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ASDInterface:
    """Core interface for active speaker detection.

    Concrete implementations orchestrate face detection, turn detection
    and speaker scoring; this base class only defines the contract and
    performs no work itself.
    """

    def __init__(self, **kwargs):
        """Initialize the active speaker detection system.

        Args:
            kwargs: optional parameters; currently unused, reserved for
                future extension.
        """

    def append_video(self, video_frame: VideoFrame, create_time: float = None) -> List[FaceProfile]:
        """Feed one video frame (frame rate defaults to 25 fps).

        Args:
            video_frame: the current video frame.
            create_time: creation time of this frame, normally obtained via
                time.perf_counter(), used to align with audio chunks.
        """

    def append_audio(self, audio_frame: AudioFrame, create_time: float = None) -> Utterance:
        """Append an audio chunk to the processing queue.

        Args:
            audio_frame: the current audio frame.
            create_time: creation time of this chunk, normally obtained via
                time.perf_counter(), used to align with video frames.
        """

    def evaluate(self, start_time: float = None, end_time: float = None):
        """Evaluate the current active speaker, typically called when an
        upstream VAD reports the end of speech.

        Args:
            start_time: evaluation start (time.perf_counter timestamp), optional.
            end_time: evaluation end (time.perf_counter timestamp), optional.
                When both are provided, inference only covers that time range;
                if the range exceeds the buffer length, the whole buffer is
                used as a fallback.  When omitted, the whole buffer is used.

        Returns:
            The tracker_id and confidence score of the active speaker at the
            most recent time point.
        """

    def reset(self):
        """Reset system state."""
|