praasper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- praasper-0.1.0/LICENSE +21 -0
- praasper-0.1.0/PKG-INFO +148 -0
- praasper-0.1.0/README.md +119 -0
- praasper-0.1.0/praasper/VAD/__init__.py +0 -0
- praasper-0.1.0/praasper/VAD/core_auto.py +394 -0
- praasper-0.1.0/praasper/VAD/tool_auto.py +199 -0
- praasper-0.1.0/praasper/__init__.py +41 -0
- praasper-0.1.0/praasper/process.py +310 -0
- praasper-0.1.0/praasper/tool.py +67 -0
- praasper-0.1.0/praasper.egg-info/PKG-INFO +148 -0
- praasper-0.1.0/praasper.egg-info/SOURCES.txt +14 -0
- praasper-0.1.0/praasper.egg-info/dependency_links.txt +1 -0
- praasper-0.1.0/praasper.egg-info/requires.txt +4 -0
- praasper-0.1.0/praasper.egg-info/top_level.txt +1 -0
- praasper-0.1.0/setup.cfg +4 -0
- praasper-0.1.0/setup.py +25 -0
praasper-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Tony Liu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
praasper-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: praasper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: VAD-Enhanced ASR with Word- and Phoneme-Level Timestamps
|
|
5
|
+
Home-page: https://github.com/ParadeLuxe/Praasper
|
|
6
|
+
Author: Tony Liu
|
|
7
|
+
Author-email: paradeluxe3726@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: openai-whisper
|
|
15
|
+
Requires-Dist: textgrid
|
|
16
|
+
Requires-Dist: pypinyin
|
|
17
|
+
Requires-Dist: librosa
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: author-email
|
|
20
|
+
Dynamic: classifier
|
|
21
|
+
Dynamic: description
|
|
22
|
+
Dynamic: description-content-type
|
|
23
|
+
Dynamic: home-page
|
|
24
|
+
Dynamic: license
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
Dynamic: requires-dist
|
|
27
|
+
Dynamic: requires-python
|
|
28
|
+
Dynamic: summary
|
|
29
|
+
|
|
30
|
+
# Praasper
|
|
31
|
+

|
|
32
|
+

|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
**Praasper** is an Automatic Speech Recognition (ASR) application designed to help researchers transcribe audio files to both **word-** and **phoneme-level** text.
|
|
36
|
+
|
|
37
|
+

|
|
38
|
+
|
|
39
|
+
In **Praasper**, we adopt a rather simple and straightforward pipeline to extract phoneme-level information from audio files. The pipeline includes [Whisper](https://github.com/openai/whisper) and [Praditor](https://github.com/Paradeluxe/Praditor).
|
|
40
|
+
|
|
41
|
+
**Praasper** currently supports **Mandarin**. In the near future we plan to add support for **Cantonese** and **English**.
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# How to use
|
|
46
|
+
|
|
47
|
+
The default model is `large-v3-turbo`.
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import praasper
|
|
51
|
+
|
|
52
|
+
model = praasper.init_model(model_name="large-v3-turbo")
|
|
53
|
+
model.annote(input_path="data") # The folder where you store .wav
|
|
54
|
+
|
|
55
|
+
# If you want to know what other models are available:
|
|
56
|
+
# import whisper
|
|
57
|
+
# print(whisper.available_models())
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Mechanism
|
|
62
|
+
|
|
63
|
+
**Whisper** is used to transcribe the audio file to **word-level text**. At this point, speech onsets and offsets exhibit time deviations in seconds.
|
|
64
|
+
|
|
65
|
+
**Praditor** is applied to perform **Voice Activity Detection (VAD)** and trim the existing word/character-level timestamps to **millisecond-level** precision. It is a Speech Onset Detection (SOT) algorithm we developed for language researchers.
|
|
66
|
+
|
|
67
|
+
To extract phoneme boundaries, we designed an **edge detection algorithm**.
|
|
68
|
+
- The audio file is first resampled to **16 kHz** so as to remove noise in the high-frequency domain.
|
|
69
|
+
- A kernel,`[-1, 0, 1]`, is then applied to the frequency domain to enhance the edge(s) between phonetic segments.
|
|
70
|
+
- The most prominent **n** peaks are then selected so as to match the wanted number of phonemes.
|
|
71
|
+
|
|
72
|
+
# Setup
|
|
73
|
+
## pip installation
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install praasper
|
|
77
|
+
```
|
|
78
|
+
> If the installation succeeded and you don't need GPU acceleration, you can stop right here.
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
## GPU Acceleration (Windows/Linux)
|
|
82
|
+
`Whisper` automatically detects the best currently available device to use. However, you still need to install a GPU-enabled build of `torch` first in order to enable CUDA acceleration.
|
|
83
|
+
|
|
84
|
+
- For **macOS** users, `Whisper` only detects CPU as the processing device.
|
|
85
|
+
- For **Windows/Linux** users, the priority order should be: `CUDA` -> `CPU`.
|
|
86
|
+
|
|
87
|
+
If you have no experience in installing `CUDA`, follow the steps below:
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
**First**, go to command line and check the latest CUDA version your system supports:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
nvidia-smi
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Results should pop up like this:
|
|
98
|
+
```bash
|
|
99
|
+
| NVIDIA-SMI 576.80 Driver Version: 576.80 CUDA Version: 12.9 |
|
|
100
|
+
```
|
|
101
|
+
It means that this device supports CUDA up to version 12.9.
|
|
102
|
+
|
|
103
|
+
**Next**, go to [**NVIDIA CUDA Toolkit**](https://developer.nvidia.com/cuda-toolkit) and download the latest version, or whichever version that fits your system/need.
|
|
104
|
+
|
|
105
|
+
**Next**, uninstall the default CPU-only torch:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pip uninstall torch
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Lastly**, install `torch` that fits your CUDA version. Find the correct `pip` command [**in this link**](https://pytorch.org/get-started/locally/).
|
|
112
|
+
|
|
113
|
+
Here is an example for CUDA 12.9:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
pip install torch --index-url https://download.pytorch.org/whl/cu129
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
## (Advanced) uv installation
|
|
121
|
+
`uv` is also highly recommended for way **FASTER** installation. First, make sure `uv` is installed to your default environment:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
pip install uv
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Then, create a virtual environment (e.g., .venv):
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
uv venv .venv
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
You should now see a new `.venv` folder in your project folder. (You might also want to restart the terminal.)
|
|
134
|
+
|
|
135
|
+
Lastly, install `praasper` (by adding `uv` before `pip`):
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
uv pip install praasper
|
|
140
|
+
```
|
|
141
|
+
For `CUDA` support,
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
uv pip uninstall torch
|
|
145
|
+
|
|
146
|
+
# Or whichever version that matches your CUDA version
|
|
147
|
+
uv pip install torch --index-url https://download.pytorch.org/whl/cu129
|
|
148
|
+
```
|
praasper-0.1.0/README.md
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
# Praasper
|
|
2
|
+

|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
**Praasper** is an Automatic Speech Recognition (ASR) application designed to help researchers transcribe audio files to both **word-** and **phoneme-level** text.
|
|
7
|
+
|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
In **Praasper**, we adopt a rather simple and straightforward pipeline to extract phoneme-level information from audio files. The pipeline includes [Whisper](https://github.com/openai/whisper) and [Praditor](https://github.com/Paradeluxe/Praditor).
|
|
11
|
+
|
|
12
|
+
**Praasper** currently supports **Mandarin**. In the near future we plan to add support for **Cantonese** and **English**.
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# How to use
|
|
17
|
+
|
|
18
|
+
The default model is `large-v3-turbo`.
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
import praasper
|
|
22
|
+
|
|
23
|
+
model = praasper.init_model(model_name="large-v3-turbo")
|
|
24
|
+
model.annote(input_path="data") # The folder where you store .wav
|
|
25
|
+
|
|
26
|
+
# If you want to know what other models are available:
|
|
27
|
+
# import whisper
|
|
28
|
+
# print(whisper.available_models())
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Mechanism
|
|
33
|
+
|
|
34
|
+
**Whisper** is used to transcribe the audio file to **word-level text**. At this point, speech onsets and offsets exhibit time deviations in seconds.
|
|
35
|
+
|
|
36
|
+
**Praditor** is applied to perform **Voice Activity Detection (VAD)** and trim the existing word/character-level timestamps to **millisecond-level** precision. It is a Speech Onset Detection (SOT) algorithm we developed for language researchers.
|
|
37
|
+
|
|
38
|
+
To extract phoneme boundaries, we designed an **edge detection algorithm**.
|
|
39
|
+
- The audio file is first resampled to **16 kHz** so as to remove noise in the high-frequency domain.
|
|
40
|
+
- A kernel,`[-1, 0, 1]`, is then applied to the frequency domain to enhance the edge(s) between phonetic segments.
|
|
41
|
+
- The most prominent **n** peaks are then selected so as to match the wanted number of phonemes.
|
|
42
|
+
|
|
43
|
+
# Setup
|
|
44
|
+
## pip installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install praasper
|
|
48
|
+
```
|
|
49
|
+
> If the installation succeeded and you don't need GPU acceleration, you can stop right here.
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
## GPU Acceleration (Windows/Linux)
|
|
53
|
+
`Whisper` automatically detects the best currently available device to use. However, you still need to install a GPU-enabled build of `torch` first in order to enable CUDA acceleration.
|
|
54
|
+
|
|
55
|
+
- For **macOS** users, `Whisper` only detects CPU as the processing device.
|
|
56
|
+
- For **Windows/Linux** users, the priority order should be: `CUDA` -> `CPU`.
|
|
57
|
+
|
|
58
|
+
If you have no experience in installing `CUDA`, follow the steps below:
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
**First**, go to command line and check the latest CUDA version your system supports:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
nvidia-smi
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Results should pop up like this:
|
|
69
|
+
```bash
|
|
70
|
+
| NVIDIA-SMI 576.80 Driver Version: 576.80 CUDA Version: 12.9 |
|
|
71
|
+
```
|
|
72
|
+
It means that this device supports CUDA up to version 12.9.
|
|
73
|
+
|
|
74
|
+
**Next**, go to [**NVIDIA CUDA Toolkit**](https://developer.nvidia.com/cuda-toolkit) and download the latest version, or whichever version that fits your system/need.
|
|
75
|
+
|
|
76
|
+
**Next**, uninstall the default CPU-only torch:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip uninstall torch
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Lastly**, install `torch` that fits your CUDA version. Find the correct `pip` command [**in this link**](https://pytorch.org/get-started/locally/).
|
|
83
|
+
|
|
84
|
+
Here is an example for CUDA 12.9:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
pip install torch --index-url https://download.pytorch.org/whl/cu129
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
## (Advanced) uv installation
|
|
92
|
+
`uv` is also highly recommended for way **FASTER** installation. First, make sure `uv` is installed to your default environment:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
pip install uv
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Then, create a virtual environment (e.g., .venv):
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
uv venv .venv
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
You should now see a new `.venv` folder in your project folder. (You might also want to restart the terminal.)
|
|
105
|
+
|
|
106
|
+
Lastly, install `praasper` (by adding `uv` before `pip`):
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
uv pip install praasper
|
|
111
|
+
```
|
|
112
|
+
For `CUDA` support,
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
uv pip uninstall torch
|
|
116
|
+
|
|
117
|
+
# Or whichever version that matches your CUDA version
|
|
118
|
+
uv pip install torch --index-url https://download.pytorch.org/whl/cu129
|
|
119
|
+
```
|
|
File without changes
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
import gc
|
|
2
|
+
import math
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from sklearn.cluster import DBSCAN
|
|
7
|
+
# from dbscan import DBSCAN
|
|
8
|
+
|
|
9
|
+
from textgrid import TextGrid, PointTier, Point, IntervalTier, Interval
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
try:
|
|
13
|
+
from .tool_auto import bandpass_filter, get_current_time, ReadSound
|
|
14
|
+
except ImportError:
|
|
15
|
+
from tool_auto import bandpass_filter, get_current_time, ReadSound
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from ..tool import show_elapsed_time
|
|
19
|
+
except ImportError:
|
|
20
|
+
# from tool_auto import bandpass_filter, get_current_time, ReadSound
|
|
21
|
+
from tool import show_elapsed_time
|
|
22
|
+
|
|
23
|
+
# plat = os.name.lower()
|
|
24
|
+
# check if ffmpeg exists in the system path or the pydub package can find it
|
|
25
|
+
# os.environ["PATH"] += os.pathsep + resource_path(f".\\ffmpeg\\{plat}")
|
|
26
|
+
# print(resource_path(f".\\ffmpeg\\{plat}"))
|
|
27
|
+
|
|
28
|
+
def autoPraditorWithTimeRange(params, audio_obj, which_set, stime=0, etime=-1):
    """Run ``autoPraditor`` on the whole audio or on a time-limited slice of it.

    Args:
        params: Parameter mapping forwarded unchanged to ``autoPraditor``.
        audio_obj: Sliceable audio object. The slice bounds are ``stime`` and
            ``etime`` multiplied by 1000 (presumably seconds converted to a
            millisecond-indexed slice -- confirm against the audio type).
        which_set: Forwarded unchanged to ``autoPraditor``.
        stime: Start of the range; also added to every returned time point.
        etime: End of the range. The sentinel ``-1`` means "no range": the
            whole ``audio_obj`` is analysed and the result is returned as-is.

    Returns:
        List of detected time points. When a range is given, only points
        strictly between 5 and (last detected point - 5) are kept, each
        shifted by ``stime``.
    """
    # No range requested: analyse the full recording, no filtering or shift.
    if etime == -1:
        return autoPraditor(params, audio_obj, which_set)

    clip = audio_obj[stime*1000:etime*1000]
    local_tps = autoPraditor(params, clip, which_set)

    # The upper bound is derived from the last detected point, so that point
    # itself never passes the filter.
    shifted_tps = []
    for tp in local_tps:
        if 5 < tp < local_tps[-1] - 5:
            shifted_tps.append(tp + stime)
    return shifted_tps
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def autoPraditor(params, audio_obj, which_set):
    """Detect speech onset or offset time points in ``audio_obj``.

    The visible pipeline is:
      1. band-pass filter the samples and down-sample them by taking the
         maximum of each of ~40 bins per second;
      2. cluster pairs of consecutive bin maxima with DBSCAN and pick the
         cluster closest to the origin as "noise/silence";
      3. turn the noise bins into contiguous silence runs and drop gaps
         between runs that are shorter than 0.1 s;
      4. for each silence run, derive a reference level from its flattest
         part and scan forward sample by sample until the windowed mean
         absolute first difference stays above ``amp`` times that level.

    For ``which_set == "offset"`` the filtered signal is reversed before
    step 1's down-sampling, and the found positions are mapped back at the end.

    Args:
        params: Mapping of set name -> parameter mapping. Every value is
            passed through ``eval`` in place (see note below). The entry
            ``params[which_set]`` must provide ``cutoff0``, ``cutoff1``,
            ``eps_ratio``, ``amp`` and ``numValid``; ``ratio``, ``win_size``,
            ``ref_len`` and ``penalty`` are overwritten with fixed values.
        audio_obj: Audio object exposing ``frame_rate`` and
            ``get_array_of_samples()`` (presumably a pydub ``AudioSegment``
            -- confirm against ``ReadSound``).
        which_set: Key into ``params``; the value ``"offset"`` additionally
            switches on the reversed-signal handling.

    Returns:
        List of time points, each a sample index divided by the frame rate.
        An empty list is returned if DBSCAN raises ``MemoryError``.
    """
    # Convert the incoming parameter values and make them follow one format.
    # NOTE(review): this rewrites the caller's ``params`` dict in place.
    for xset in params:
        for item in params[xset]:
            try:
                # NOTE(review): eval() executes arbitrary expressions; only
                # safe if ``params`` never comes from an untrusted source.
                params[xset][item] = eval(params[xset][item])
            except:
                # Bare except: any value eval() cannot process is left as-is.
                pass

    params = params[which_set]  # choose the onset or the offset parameter set

    # Pin some values regardless of what the caller supplied.
    params["ratio"] = 0.9
    params["win_size"] = 50
    params["ref_len"] = params["win_size"] * 5
    params["penalty"] = 10

    _answer_frames = []

    _audio_obj = audio_obj

    _audio_samplerate = audio_obj.frame_rate

    _audio_arr_filtered = bandpass_filter(
        np.array(audio_obj.get_array_of_samples()),
        lowcut=params["cutoff0"],
        highcut=params["cutoff1"],
        fs=_audio_obj.frame_rate
    )

    # The audio is ready.
    # Step 1: DBSCAN clustering to find the noise segments.

    # 1.1 Down-sampling: split one second of audio into n=40 parts.
    _dsFactor = _audio_obj.frame_rate // 40

    # Slicing with the bare remainder could yield [:-0], i.e. an empty array,
    # so the trim is only applied when the remainder is non-zero.
    if len(_audio_arr_filtered) % _dsFactor != 0:
        _audio_arr_filtered = _audio_arr_filtered[:-(len(_audio_arr_filtered) % _dsFactor)]

    # Offsets are searched on the reversed signal; positions are mapped back
    # to the original direction when an answer is recorded further below.
    if which_set == "offset":
        _audio_arr_filtered = np.flip(_audio_arr_filtered)

    # One row per bin of ``_dsFactor`` samples.
    _audio_arr_ds = _audio_arr_filtered.reshape((len(_audio_arr_filtered) // _dsFactor, _dsFactor))
    _audio_arr_ds = np.max(_audio_arr_ds, axis=1)  # down-sample by taking each bin's maximum

    # Each row is (value of bin i, value of bin i + 1).
    _points_array = np.array([
        [_audio_arr_ds[i] for i in range(len(_audio_arr_ds) - 1)],
        [_audio_arr_ds[i + 1] for i in range(len(_audio_arr_ds) - 1)]
    ]).T

    # Neighbourhood radius: eps_ratio times the largest of the lowest 80 % of
    # bin values, so that outliers do not inflate the radius.
    _eps = params["eps_ratio"] * float(np.max(np.sort(_audio_arr_ds)[:int(.8 * len(_audio_arr_ds))]))

    del _audio_arr_ds
    gc.collect()

    # frame_rate / _dsFactor is about 40 bins per second, so this is roughly
    # the number of bins in 0.3 s.
    _min_samples = math.ceil(0.3/_dsFactor * _audio_obj.frame_rate)
    try:
        _cluster = DBSCAN(eps=_eps, min_samples=_min_samples, metric="manhattan").fit(_points_array)
    except MemoryError:
        print("not enough memory")
        return []

    # Look for the label whose points come closest to the origin:
    # smallest x + y -> closest to zero.
    # NOTE(review): the ``- 1`` presumably discounts DBSCAN's outlier label
    # (-1); if no point is labelled -1 the highest cluster label is never
    # examined -- confirm.
    noise_label = 0
    for i in range(0, len(set(_cluster.labels_))-1):
        if np.min(np.sum(_points_array[_cluster.labels_ == i], axis=1)) < np.min(np.sum(_points_array[_cluster.labels_ == noise_label], axis=1)):
            noise_label = i
    _points_confirmed = _points_array[_cluster.labels_ == noise_label]

    # Also include every point that lies below the noise cluster: indices of
    # points whose squared norm does not exceed the cluster's mean squared norm.
    _points_compensation = np.array(range(len(_points_array)))[np.sum(np.square(_points_array), axis=1) <= np.mean(np.sum(np.square(_points_confirmed), axis=1))]

    # No copy is taken here, so the assignments below also modify
    # ``_cluster.labels_``.
    _labels = _cluster.labels_

    for i in _points_compensation:
        _labels[int(i)] = noise_label

    # Pad three noise labels on both ends; the ``i-3`` below shifts indices
    # back, so positions -3..-1 and the three past the end always count as
    # noise. (The original author's comment asks what this line is for.)
    _labels = [noise_label] * 3 + [i for i in _labels] + [noise_label] * 3
    _indices_confirmed = [i-3 for i in range(len(_labels)) if _labels[i] == noise_label]

    # Gather the sampled (noise) area: extend each noise index i to cover
    # i, i+1 and i+2 (the original comment notes this used to be 4 steps).
    _indices_completed = []
    for i in _indices_confirmed:
        for j in range(3):
            if (i + j) not in _indices_completed:
                _indices_completed.append(i + j)

    # Start and end index of every contiguous run in ``_indices_completed``
    # (i.e. onsets/offsets of the silence/noise ranges).
    _onsets = []
    _offsets = []
    # max + 1 is never in the list, so it closes the last run; + 2 is the
    # exclusive range bound.
    for i in range(min(_indices_completed), max(_indices_completed) + 2):
        if i in _indices_completed and (i - 1) not in _indices_completed:
            _onsets.append(i)
        elif i not in _indices_completed and (i - 1) in _indices_completed:
            _offsets.append(i - 1)

    # The numbers of onsets and offsets should match (the original comment
    # adds "? not really"). They are paired up below after removing gaps
    # between runs that are too short.
    while True:
        _bad_onoffsets = []
        for i in range(len(_offsets)-1):
            # Gap between the end of run i and the start of run i+1, in seconds.
            if (_onsets[i+1] - _offsets[i]) * _dsFactor /_audio_samplerate < .1:
                _bad_onoffsets.append(_onsets[i+1])
                _bad_onoffsets.append(_offsets[i])
        if len(_bad_onoffsets) == 0:
            break

        # Dropping both boundaries merges the two neighbouring runs.
        _onsets = [i for i in _onsets if i not in _bad_onoffsets]
        _offsets = [i for i in _offsets if i not in _bad_onoffsets]
    _onoffsets = [(_onsets[i], _offsets[i]) for i in range(len(_onsets))]

    # From here on the names refer to sound, not silence: the start of a
    # silence run is a sound offset and its end is a sound onset.
    total_num = len(_onoffsets)
    for i, (__offset, __onset) in enumerate(_onoffsets):
        # Progress line, rewritten in place via the leading carriage return.
        print(f"\r[{show_elapsed_time()}] ({i+1}/{total_num}, {(i+1)/total_num*100:.1f}%)", end="")

        # -------------------------------------------------
        # Forced skip conditions

        if __onset <= 0 - 3:
            continue

        if __offset >= len(_cluster.labels_) + 3:
            continue

        # -----------------------------------------------

        # Clamp the run start into the array (it can be negative because of
        # the padding above).
        __offset = 0 if __offset <= 0 else __offset
        try:
            # Absolute first difference of the filtered samples inside the run.
            __candidate_y1_area = abs(np.array(
                _audio_arr_filtered[__offset * _dsFactor+1:__onset * _dsFactor] -
                _audio_arr_filtered[__offset * _dsFactor:__onset * _dsFactor-1]
            ))

        except ValueError:
            # Presumably the two slices differ in length at the end of the
            # array and cannot be subtracted -- confirm.
            continue  # hit the bottom with no more frames

        # Reference window: ends at the sample with the smallest absolute
        # difference in the run and reaches back ``ref_len`` samples,
        # clamped at the start of the array.
        __sample_startpoint = int(np.argmin(__candidate_y1_area) + __offset * _dsFactor)
        __sample_endpoint = __sample_startpoint - params["ref_len"]
        if __sample_endpoint < 0:
            __sample_endpoint = 0

        try:
            __candidate_y1_area = abs(np.array(
                _audio_arr_filtered[__sample_endpoint+1:__sample_startpoint] -
                _audio_arr_filtered[__sample_endpoint:__sample_startpoint - 1]
            ))
        except ValueError:
            # Retry with the window placed forward from the (clamped) end point.
            __sample_startpoint = __sample_endpoint + params["ref_len"]
            __candidate_y1_area = abs(np.array(
                _audio_arr_filtered[__sample_endpoint+1:__sample_startpoint] -
                _audio_arr_filtered[__sample_endpoint:__sample_startpoint - 1]
            ))

        # Keep only the smallest ``ratio`` fraction of the differences.
        __candidate_y1_area = np.sort(__candidate_y1_area)[:int(len(__candidate_y1_area) * params["ratio"])]

        # Detection threshold: reference level scaled by ``amp``.
        __y1_threshold = float(np.sum(__candidate_y1_area) / (__sample_startpoint - __sample_endpoint) * params["amp"])
        # Scan start, 80 % of the way into the run (the original comment says
        # "3/4 offset", but the factor in the code is 0.8).
        __ref_midpoint = int(__offset*_dsFactor + (__onset-__offset) * _dsFactor * 0.8)

        if i < len(_onoffsets) - 1:
            __ref_midpoint_next = int(_onoffsets[i+1][0]*_dsFactor + (_onoffsets[i+1][1]-_onoffsets[i+1][0]) * _dsFactor * 0.8)
        else:
            __ref_midpoint_next = len(_audio_arr_filtered)  # position of the last audio frame

        # NOTE(review): ``__ref_midpoint_prev`` is assigned but not read
        # anywhere in this function.
        if i > 0:
            __ref_midpoint_prev = int(_onoffsets[i-1][0]*_dsFactor + (_onoffsets[i-1][1]-_onoffsets[i-1][0]) * _dsFactor * 0.8)
        else:
            __ref_midpoint_prev = 0  # position of the first audio frame

        # Never start scanning before the end of the reference window.
        if __ref_midpoint < __sample_startpoint:
            __ref_midpoint = __sample_startpoint

        # ----------------- Processing onset area --------------

        __countValidPiece = 0
        __countBadPiece = 0
        __countDSTime = -1

        _final_answer = None
        # Visit every frame from this midpoint up to the next run's midpoint.
        while __ref_midpoint + __countDSTime < __ref_midpoint_next:
            __countDSTime += 1

            # Window of ``win_size`` samples ending at the current frame.
            __left_boundary = __ref_midpoint + __countDSTime - params["win_size"]
            __right_boundary = __ref_midpoint + __countDSTime

            try:
                __raw_value = abs(_audio_arr_filtered[__left_boundary:__right_boundary] - _audio_arr_filtered[__left_boundary-1:__right_boundary-1])
            except ValueError:
                break
            # Mean of the smallest ``ratio`` fraction of absolute differences.
            __raw_value.sort()
            __raw_value = __raw_value[:int(len(__raw_value) * params["ratio"])]

            __y1_value = sum(__raw_value)/len(__raw_value)

            if __y1_value > __y1_threshold:

                __countValidPiece += 1
            else:
                __countBadPiece += 1

            # Below-threshold frames weigh ``penalty`` times as much: once
            # they outweigh the valid ones, the streak restarts.
            if __countValidPiece - __countBadPiece * params["penalty"] <= 0:
                __countValidPiece = 0
                __countBadPiece = 0

            elif __countValidPiece - __countBadPiece >= params["numValid"]:
                # Step back to the frame where the current streak began.
                _final_answer = __ref_midpoint + __countDSTime - __countValidPiece - __countBadPiece

                if which_set == "offset":
                    # Map the index on the reversed array back to the original
                    # direction. The array was already trimmed to a multiple
                    # of ``_dsFactor``, so the modulo term evaluates to 0 here.
                    _final_answer = len(_audio_arr_filtered) - (_final_answer + len(_audio_arr_filtered) % _dsFactor)
                _answer_frames.append(_final_answer)
                break

    # Terminate the in-place progress line.
    print()
    # Offsets were found on the reversed signal, so they were collected in
    # reverse order.
    if which_set == "offset":
        _answer_frames.reverse()

    # Convert sample indices to time using the frame rate.
    return [frm/_audio_samplerate for frm in _answer_frames]
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def create_textgrid_with_time_point(audio_file_path, onsets=None, offsets=None):
    """Write onset/offset time points for an audio file as two TextGrid files.

    Two files are created next to the audio file:
      * ``<name>.TextGrid`` with one point tier ``"onset"`` and one point
        tier ``"offset"``;
      * ``<name>_VAD.TextGrid`` with a single interval tier ``"interval"``
        holding one ``"sound"`` interval per (onset, offset) pair.

    Changes from the previous implementation: the ``"offset"`` tier is
    written once instead of twice, and the caller's lists are left untouched
    (pairs containing ``None`` are filtered into new lists).

    Args:
        audio_file_path: Path of the audio file; it is read with ``ReadSound``
            to obtain the duration used as the tiers' ``maxTime``.
        onsets: Onset times; ``None`` entries mark missing values. Defaults
            to no onsets.
        offsets: Offset times, paired index-wise with ``onsets``. Defaults to
            no offsets.

    Raises:
        ValueError: If ``onsets`` and ``offsets`` differ in length.
    """
    # Work on private copies so the caller's sequences are never modified.
    onsets = [] if onsets is None else list(onsets)
    offsets = [] if offsets is None else list(offsets)

    # Onsets and offsets must come in pairs.
    if len(onsets) != len(offsets):
        raise ValueError(f"The number of onsets ({len(onsets)}) and offsets ({len(offsets)}) does not match. ")

    # Drop every pair in which either member is missing.
    valid_pairs = [
        (onset, offset)
        for onset, offset in zip(onsets, offsets)
        if onset is not None and offset is not None
    ]
    onsets = [onset for onset, _ in valid_pairs]
    offsets = [offset for _, offset in valid_pairs]

    # Directory and file name (without extension) of the audio file.
    audio_dir = os.path.dirname(os.path.abspath(audio_file_path))
    audio_basename = os.path.basename(audio_file_path)
    audio_filename = os.path.splitext(audio_basename)[0]
    audio_obj = ReadSound(os.path.join(audio_dir, audio_basename))
    audio_duration = audio_obj.duration_seconds

    # ---- Point tiers ----
    tg_filename = os.path.join(audio_dir, audio_filename + ".TextGrid")
    tg = TextGrid()

    for set_mode, xsets in (("onset", onsets), ("offset", offsets)):
        point_tier = PointTier(name=set_mode, minTime=0., maxTime=audio_duration)
        for time_point in xsets:
            try:
                point_tier.addPoint(Point(time_point, set_mode))
            except ValueError:
                # Best effort: points the tier rejects are skipped
                # (presumably out-of-range or duplicate times -- see the
                # textgrid library documentation).
                continue
        # Each tier is appended exactly once.
        tg.append(point_tier)

    tg.write(tg_filename)  # write the TextGrid object to disk

    # ---- Interval tier ----
    tg_filename = os.path.join(audio_dir, audio_filename + "_VAD.TextGrid")
    tg = TextGrid()

    interval_tier = IntervalTier(name="interval", minTime=0., maxTime=audio_duration)
    for onset, offset in zip(onsets, offsets):
        try:
            interval_tier.addInterval(Interval(onset, offset, "sound"))
        except ValueError:
            # Best effort: intervals the tier rejects are skipped.
            continue
    tg.append(interval_tier)
    tg.write(tg_filename)  # write the TextGrid object to disk

    # Only the path of the "_VAD" file is reported.
    print(f"{audio_filename}\t|\t{get_current_time()}\t|\tTextGrid created at: {tg_filename}")
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
|