praasper 0.1.0.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- praasper-0.1.0.dev1/LICENSE +21 -0
- praasper-0.1.0.dev1/PKG-INFO +73 -0
- praasper-0.1.0.dev1/README.md +44 -0
- praasper-0.1.0.dev1/praasper/__init__.py +0 -0
- praasper-0.1.0.dev1/praasper/process.py +272 -0
- praasper-0.1.0.dev1/praasper/tool.py +54 -0
- praasper-0.1.0.dev1/praasper.egg-info/PKG-INFO +73 -0
- praasper-0.1.0.dev1/praasper.egg-info/SOURCES.txt +11 -0
- praasper-0.1.0.dev1/praasper.egg-info/dependency_links.txt +1 -0
- praasper-0.1.0.dev1/praasper.egg-info/requires.txt +4 -0
- praasper-0.1.0.dev1/praasper.egg-info/top_level.txt +1 -0
- praasper-0.1.0.dev1/setup.cfg +4 -0
- praasper-0.1.0.dev1/setup.py +25 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Tony Liu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: praasper
|
|
3
|
+
Version: 0.1.0.dev1
|
|
4
|
+
Summary: VAD-Enhanced ASR with Word- and Phoneme-Level Timestamps
|
|
5
|
+
Home-page: https://github.com/ParadeLuxe/Praasper
|
|
6
|
+
Author: Tony Liu
|
|
7
|
+
Author-email: paradeluxe3726@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: openai-whisper
|
|
15
|
+
Requires-Dist: textgrid
|
|
16
|
+
Requires-Dist: pypinyin
|
|
17
|
+
Requires-Dist: librosa
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: author-email
|
|
20
|
+
Dynamic: classifier
|
|
21
|
+
Dynamic: description
|
|
22
|
+
Dynamic: description-content-type
|
|
23
|
+
Dynamic: home-page
|
|
24
|
+
Dynamic: license
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
Dynamic: requires-dist
|
|
27
|
+
Dynamic: requires-python
|
|
28
|
+
Dynamic: summary
|
|
29
|
+
|
|
30
|
+
# Praasper
|
|
31
|
+

|
|
32
|
+

|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
**Praasper** is an Automatic Speech Recognition (ASR) application designed to help researchers transcribe audio files to both **word-** and **phoneme-level** text.
|
|
36
|
+
|
|
37
|
+

|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Mechanism
|
|
42
|
+
In **Praasper**, we adopt a rather simple and straightforward pipeline to extract phoneme-level information from audio files.
|
|
43
|
+
|
|
44
|
+
**Whisper** ([repo](https://github.com/openai/whisper)) is used to transcribe the audio file to **word-level text**. At this point, speech onsets and offsets exhibit time deviations in seconds.
|
|
45
|
+
|
|
46
|
+
```Python
|
|
47
|
+
model = whisper.load_model("large-v3-turbo", device="cuda")
|
|
48
|
+
result = model.transcribe(wav, word_timestamps=True)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
[**Praditor**](https://github.com/Paradeluxe/Praditor) is applied to perform **Voice Activity Detection (VAD)** algorithm to trim the currently existing word/character-level timestamps (at millisecond level). It is a Speech Onset Detection (SOT) algorithm we developed for language researchers.
|
|
52
|
+
|
|
53
|
+
To extract phoneme boundaries, we designed an **edge detection algorithm**.
|
|
54
|
+
- The audio file is first resampled to **16 kHz** as to remove noise in the high-frequency domain.
|
|
55
|
+
- A kernel,`[-1, 0, 1]`, is then applied to the frequency domain to enhance the edge(s) between phonetic segments.
|
|
56
|
+
- The most prominent **n** peaks are then selected so as to match the wanted number of phonemes.
|
|
57
|
+
|
|
58
|
+
# Support
|
|
59
|
+
|
|
60
|
+
| Precision | Completed | Developing |
|
|
61
|
+
| :---: | :---: | :---: |
|
|
62
|
+
| Word | Mandarin | Cantonese, English |
|
|
63
|
+
| Phoneme | Mandarin | Cantonese, English |
|
|
64
|
+
|
|
65
|
+
# Setup
|
|
66
|
+
|
|
67
|
+
## pip installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install praasper
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
`uv` is also highly recommended
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Praasper
|
|
2
|
+

|
|
3
|
+

|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
**Praasper** is an Automatic Speech Recognition (ASR) application designed to help researchers transcribe audio files to both **word-** and **phoneme-level** text.
|
|
7
|
+
|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Mechanism
|
|
13
|
+
In **Praasper**, we adopt a rather simple and straightforward pipeline to extract phoneme-level information from audio files.
|
|
14
|
+
|
|
15
|
+
**Whisper** ([repo](https://github.com/openai/whisper)) is used to transcribe the audio file to **word-level text**. At this point, speech onsets and offsets exhibit time deviations in seconds.
|
|
16
|
+
|
|
17
|
+
```Python
|
|
18
|
+
model = whisper.load_model("large-v3-turbo", device="cuda")
|
|
19
|
+
result = model.transcribe(wav, word_timestamps=True)
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
[**Praditor**](https://github.com/Paradeluxe/Praditor) is applied to perform **Voice Activity Detection (VAD)** algorithm to trim the currently existing word/character-level timestamps (at millisecond level). It is a Speech Onset Detection (SOT) algorithm we developed for language researchers.
|
|
23
|
+
|
|
24
|
+
To extract phoneme boundaries, we designed an **edge detection algorithm**.
|
|
25
|
+
- The audio file is first resampled to **16 kHz** as to remove noise in the high-frequency domain.
|
|
26
|
+
- A kernel,`[-1, 0, 1]`, is then applied to the frequency domain to enhance the edge(s) between phonetic segments.
|
|
27
|
+
- The most prominent **n** peaks are then selected so as to match the wanted number of phonemes.
|
|
28
|
+
|
|
29
|
+
# Support
|
|
30
|
+
|
|
31
|
+
| Precision | Completed | Developing |
|
|
32
|
+
| :---: | :---: | :---: |
|
|
33
|
+
| Word | Mandarin | Cantonese, English |
|
|
34
|
+
| Phoneme | Mandarin | Cantonese, English |
|
|
35
|
+
|
|
36
|
+
# Setup
|
|
37
|
+
|
|
38
|
+
## pip installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install praasper
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
`uv` is also highly recommended
|
|
File without changes
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import whisper
|
|
3
|
+
from textgrid import TextGrid, IntervalTier
|
|
4
|
+
import librosa
|
|
5
|
+
import numpy as np
|
|
6
|
+
from scipy.signal import convolve2d, find_peaks
|
|
7
|
+
from tool import *
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# defs
def transcribe_wav_file(wav, vad):
    """Transcribe a .wav file with the Whisper model and snap word
    boundaries to the speech regions of a companion VAD TextGrid.

    :param wav: path to the .wav audio file to transcribe
    :param vad: path to the VAD TextGrid file for the same audio
    :return: None; writes "<wav>_whisper.TextGrid" next to the audio file
    :raises FileNotFoundError: if the VAD TextGrid file does not exist
    """
    # Load the best available model (large-v3-turbo); device selection is
    # left to whisper's default behavior.
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("large-v3-turbo")
    print(f"Model loaded successfully. Current device in use: {model.device if hasattr(model, 'device') else 'Unknown'}")

    # Transcribe the audio file with per-word timestamps enabled.
    result = model.transcribe(wav, word_timestamps=True)
    language = result["language"]  # NOTE(review): assigned but never used below

    print(result)

    # Load the TextGrid corresponding to the VAD path.
    try:
        vad_tg = TextGrid.fromFile(vad)
    except FileNotFoundError:
        print(f"错误:未找到文件 {vad}")
        raise

    # Collect (minTime, maxTime) pairs from every tier: intervals whose mark
    # is the empty string are treated as silence, all others as speech.
    vad_intervals = []
    empty_mark_intervals = []
    for tier in vad_tg:
        for interval in tier:
            if interval.mark == "":
                empty_mark_intervals.append((interval.minTime, interval.maxTime))
            else:
                vad_intervals.append((interval.minTime, interval.maxTime))

    tg = TextGrid()
    tier = IntervalTier(name='word', minTime=0.0, maxTime=vad_tg.tiers[0].maxTime)

    for segment in result["segments"]:
        for idx, word in enumerate(segment["words"]):
            start_time = word["start"]
            end_time = word["end"]

            text = word["word"]

            # Push word edges out of silent regions: an end that falls inside
            # silence is pulled back to the silence onset; a start inside
            # silence is pushed forward to the silence offset.
            for empty_mark_interval in empty_mark_intervals:
                if empty_mark_interval[0] <= end_time <= empty_mark_interval[1]:
                    end_time = empty_mark_interval[0]

                if empty_mark_interval[0] <= start_time <= empty_mark_interval[1]:
                    start_time = empty_mark_interval[1]

                # A word that fully spans a silent gap is deliberately left
                # untouched (no split is performed here).
                if start_time < empty_mark_interval[0] < empty_mark_interval[1] < end_time:
                    pass

            print(start_time, end_time, text)
            tier.add(start_time, end_time, text)

    # Snap the nearest word boundary to each VAD speech boundary so the
    # Whisper timestamps inherit the VAD's millisecond-level precision.
    for vad_interval in vad_intervals:
        # Find the interval whose minTime is closest to the VAD onset.
        closest_interval = min(tier.intervals, key=lambda x: abs(x.minTime - vad_interval[0]))

        if closest_interval.minTime - vad_interval[0] != 0:
            closest_interval.minTime = vad_interval[0]

        # Find the interval whose maxTime is closest to the VAD offset.
        closest_interval = min(tier.intervals, key=lambda x: abs(x.maxTime - vad_interval[1]))

        if closest_interval.maxTime - vad_interval[1] != 0:
            closest_interval.maxTime = vad_interval[1]

    tg.append(tier)
    tg.write(wav.replace(".wav", "_whisper.TextGrid"))
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def word_timestamp(wav, tg_path):
    """Refine word boundaries in a Whisper-produced TextGrid via spectral
    edge detection, then derive a phoneme tier from pinyin decomposition.

    :param wav: path to the .wav audio file (resampled to 16 kHz on load)
    :param tg_path: path to the "_whisper.TextGrid" produced by
        ``transcribe_wav_file``
    :return: None; writes "output/<basename>.TextGrid"
        (NOTE(review): assumes the "output" directory already exists)
    """
    # Load the audio file at 16 kHz to suppress high-frequency noise.
    y, sr = librosa.load(wav, sr=16000)

    # Total duration of the recording, in seconds.
    max_time = librosa.core.get_duration(y=y, sr=sr)
    print(max_time)

    # Load the TextGrid file and locate the word tier.
    tg = TextGrid.fromFile(tg_path)
    word_tier = [tier for tier in tg if tier.name == 'word'][0]

    # Average duration of non-empty intervals = a rough speech-rate estimate,
    # later used to bound the search window around each word boundary.
    non_empty_intervals = [interval.maxTime - interval.minTime for tier in tg for interval in tier if interval.mark != ""]
    average_word_duration = np.mean(non_empty_intervals) if non_empty_intervals else 0
    print(f"Speech rate (word dur) is {average_word_duration:.4f} seconds")

    # adjacent_pairs = []

    word_intervals = [interval for interval in word_tier.intervals if interval.mark != ""]
    for i in range(len(word_intervals) - 1):
        current_interval = word_intervals[i]
        next_interval = word_intervals[i + 1]
        # Only adjust boundaries between two adjoining words (the end of the
        # first coincides exactly with the start of the second).

        if current_interval.maxTime == next_interval.minTime:
            # Boundary position, expressed relative to the window start.
            target_boundary = current_interval.maxTime - current_interval.minTime

            start_sample = int(current_interval.minTime * sr)
            end_sample = int(next_interval.maxTime * sr)
            y_vad = y[start_sample:end_sample]

            # Compute the spectrogram of the two-word window.
            spectrogram = librosa.stft(y_vad, n_fft=2048, win_length=1024, center=True)
            spectrogram_db = librosa.amplitude_to_db(abs(spectrogram), ref=1.0)  # convert amplitude to a dB (log) scale

            # Horizontal [-1, 0, 1] kernel = time-axis edge detector; small
            # responses (|x| < 15 dB) are zeroed as noise.
            kernel = np.array([[-1, 0, 1]])
            convolved_spectrogram = convolve2d(spectrogram_db, kernel, mode='same', boundary='symm')
            convolved_spectrogram = np.where(np.abs(convolved_spectrogram) < 15, 0, convolved_spectrogram)

            # Collapse the frequency axis into a 1-D edge-strength curve.
            convolved_spectrogram = np.sum(np.abs(convolved_spectrogram), axis=0, keepdims=False)
            # (First derivative of the curve was tried and abandoned.)
            # convolved_spectrogram = np.gradient(convolved_spectrogram)
            time_axis = np.linspace(0, len(convolved_spectrogram) * librosa.core.get_duration(y=y_vad, sr=sr) / len(convolved_spectrogram), len(convolved_spectrogram))

            # Locate all peaks and valleys with prominence >= 10.
            peaks, _ = find_peaks(convolved_spectrogram, prominence=(10, None))
            valleys, _ = find_peaks(-convolved_spectrogram, prominence=(10, None))

            # Keep only peaks/valleys with a non-zero edge response.
            valid_peaks = peaks[np.abs(convolved_spectrogram[peaks]) > 0]
            valid_valleys = valleys[np.abs(convolved_spectrogram[valleys]) > 0]

            # Times and values of the surviving peaks.
            peak_times = time_axis[valid_peaks]
            peak_values = convolved_spectrogram[valid_peaks]  # NOTE(review): unused

            valley_times = time_axis[valid_valleys]  # NOTE(review): unused
            valley_values = convolved_spectrogram[valid_valleys]  # NOTE(review): unused

            # Keep peaks past the first 0.05 s and within an asymmetric window
            # around the original boundary (half a word before, 3/4 after).
            valid_peak_times = [t for t in peak_times if t >= 0.05 and (target_boundary - average_word_duration/2 <= t <= target_boundary + average_word_duration * 3/4)]

            if valid_peak_times:
                # Among candidates, pick the strongest edge:
                # look up each candidate's edge strength ...
                peak_values_nearby = [convolved_spectrogram[int((t / librosa.core.get_duration(y=y_vad, sr=sr)) * len(convolved_spectrogram))] for t in valid_peak_times]
                # ... and take the time of the maximum.
                closest_peak_time = valid_peak_times[np.argmax(peak_values_nearby)]
            else:
                closest_peak_time = target_boundary

            # Convert back to absolute time and write the refined boundary.
            target_boundary = closest_peak_time + current_interval.minTime

            current_interval.maxTime = target_boundary
            next_interval.minTime = target_boundary

    phon_tier = IntervalTier(name="phoneme", minTime=0, maxTime=word_tier.maxTime)

    for interval in word_intervals:
        print()
        # Decompose the (Mandarin) word into initial consonant, finals, tone.
        con, vow, tone = get_pinyin_info(interval.mark)
        expected_num = len(vow) + 1 if con else len(vow)
        phon_series = [con] + vow if con else vow
        print(expected_num)

        start_sample = int(interval.minTime * sr)
        end_sample = int(interval.maxTime * sr)
        print(interval.mark, interval.minTime, interval.maxTime)

        y_vad = y[start_sample:end_sample]

        # Same edge-detection pipeline as above, with a stricter 20 dB gate.
        spectrogram = librosa.stft(y_vad, n_fft=2048, win_length=1024, center=True)
        spectrogram_db = librosa.amplitude_to_db(abs(spectrogram), ref=1.0)

        kernel = np.array([[-1, 0, 1]])
        convolved_spectrogram = convolve2d(spectrogram_db, kernel, mode='same', boundary='symm')
        convolved_spectrogram = np.where(np.abs(convolved_spectrogram) < 20, 0, convolved_spectrogram)

        # Collapse the frequency axis into a 1-D edge-strength curve.
        convolved_spectrogram = np.sum(np.abs(convolved_spectrogram), axis=0, keepdims=False)
        # convolved_spectrogram = np.gradient(convolved_spectrogram)
        time_axis = np.linspace(0, len(convolved_spectrogram) * librosa.core.get_duration(y=y_vad, sr=sr) / len(convolved_spectrogram), len(convolved_spectrogram))

        # Find all peaks without a height threshold; the strongest ones are
        # selected below to match the expected phoneme count.
        peaks, _ = find_peaks(convolved_spectrogram)

        if con in ["k", 'b', 't', 'p', 'd']:
            # Plosive initials: keep peaks near the very start (no 0.05 s head
            # margin), only trim the final 0.05 s.
            valid_peaks = [p for p in peaks if time_axis[p] <= len(y_vad)/sr - 0.05]

            # if "i" == vow[0]:
            #     vow = vow[0] + vow
            #     expected_num -= 1
            #     phon_series = [con] + vow if con else vow

        else:
            # Otherwise ignore peaks in the first and last 0.05 s.
            valid_peaks = [p for p in peaks if time_axis[p] >= 0.05 and time_axis[p] <= len(y_vad)/sr - 0.05]

        peaks = np.array(valid_peaks)

        # Sort peak indices by edge strength, strongest first, and keep the
        # top (expected_num - 1) as internal phoneme boundaries.
        sorted_peaks = sorted(peaks, key=lambda x: convolved_spectrogram[x], reverse=True)
        peaks = sorted_peaks[:expected_num-1]

        # Map peak indices back to timestamps.
        peak_times = time_axis[peaks]

        peak_timestamps = [interval.minTime] + [pt + interval.minTime for pt in peak_times] + [interval.maxTime]

        peak_timestamps.sort()

        print(peak_timestamps)
        # Consecutive timestamp pairs delimit one phoneme each.
        for t, time_stamp in enumerate(peak_timestamps):
            if t == 0:
                continue
            phon_tier.add(peak_timestamps[t-1], peak_timestamps[t], phon_series[t-1])
    print(tg.maxTime)
    tg.append(phon_tier)

    # Save the modified TextGrid under the "output" directory.
    new_tg_path = os.path.join("output", os.path.basename(wav).replace(".wav", ".TextGrid"))
    tg.write(new_tg_path)
|
246
|
+
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
if __name__ == "__main__":
    # Batch-process every .wav file found in the local "data" directory.
    data_path = os.path.abspath("data")
    # input_dir = os.path.abspath("input")
    # output_dir = os.path.abspath("output")

    stems = [os.path.splitext(entry)[0] for entry in os.listdir(data_path) if entry.endswith('.wav')]

    for stem in stems:
        wav_path = os.path.join(data_path, stem + ".wav")
        tg_path = wav_path.replace(".wav", "_whisper.TextGrid")
        vad_path = wav_path.replace(".wav", "_VAD.TextGrid")

        # Pass 1: Whisper transcription, trimmed against the VAD TextGrid.
        transcribe_wav_file(wav_path, vad=vad_path)
        # Pass 2: refine word boundaries and build the phoneme tier.
        word_timestamp(wav_path, tg_path)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import pypinyin
|
|
2
|
+
|
|
3
|
+
def get_pinyin_info(character):
    """Decompose a single Chinese character into its pinyin components.

    :param character: exactly one Chinese character
    :return: a tuple ``(initial, finals, tone)`` where ``initial`` is the
        onset consonant ('' if none), ``finals`` is a list of the final's
        letters with adjacent 'n' + 'g' merged into 'ng', and ``tone`` is
        the tone digit as a string ('' if absent/neutral tone)
    :raises ValueError: if ``character`` is not exactly one character
    """
    if len(character) != 1:
        raise ValueError("只能输入单个中文字符")

    # Extract the initial (onset) and the final with its tone digit attached
    # (FINALS_TONE3 appends the digit, e.g. "ian1").
    # NOTE: a previously unused pypinyin.TONE3 lookup was removed here.
    initial = pypinyin.pinyin(character, style=pypinyin.INITIALS, heteronym=False)[0][0]
    final_with_tone = pypinyin.pinyin(character, style=pypinyin.FINALS_TONE3, heteronym=False)[0][0]

    # Split the tone digit off the final; neutral tone carries no digit.
    tone = ''
    final = final_with_tone
    for char in final_with_tone:
        if char.isdigit():
            tone = char
            final = final_with_tone.replace(char, '')
            break

    # Split the final into letters, merging adjacent 'n' + 'g' into 'ng'
    # so that e.g. "iang" becomes ['i', 'a', 'ng'].
    final = list(final)
    new_final = []
    i = 0
    while i < len(final):
        if i < len(final) - 1 and final[i] == 'n' and final[i+1] == 'g':
            new_final.append('ng')
            i += 2
        else:
            new_final.append(final[i])
            i += 1
    final = new_final

    # pypinyin already yields '' for a missing initial/tone, so the values
    # can be returned directly (the old `x if x else ''` was redundant).
    return (initial, final, tone)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
if __name__ == '__main__':
    # Quick manual check: decompose a sample Mandarin character.
    demo = get_pinyin_info('边')
    print(demo)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: praasper
|
|
3
|
+
Version: 0.1.0.dev1
|
|
4
|
+
Summary: VAD-Enhanced ASR with Word- and Phoneme-Level Timestamps
|
|
5
|
+
Home-page: https://github.com/ParadeLuxe/Praasper
|
|
6
|
+
Author: Tony Liu
|
|
7
|
+
Author-email: paradeluxe3726@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: openai-whisper
|
|
15
|
+
Requires-Dist: textgrid
|
|
16
|
+
Requires-Dist: pypinyin
|
|
17
|
+
Requires-Dist: librosa
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: author-email
|
|
20
|
+
Dynamic: classifier
|
|
21
|
+
Dynamic: description
|
|
22
|
+
Dynamic: description-content-type
|
|
23
|
+
Dynamic: home-page
|
|
24
|
+
Dynamic: license
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
Dynamic: requires-dist
|
|
27
|
+
Dynamic: requires-python
|
|
28
|
+
Dynamic: summary
|
|
29
|
+
|
|
30
|
+
# Praasper
|
|
31
|
+

|
|
32
|
+

|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
**Praasper** is an Automatic Speech Recognition (ASR) application designed to help researchers transcribe audio files to both **word-** and **phoneme-level** text.
|
|
36
|
+
|
|
37
|
+

|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Mechanism
|
|
42
|
+
In **Praasper**, we adopt a rather simple and straightforward pipeline to extract phoneme-level information from audio files.
|
|
43
|
+
|
|
44
|
+
**Whisper** ([repo](https://github.com/openai/whisper)) is used to transcribe the audio file to **word-level text**. At this point, speech onsets and offsets exhibit time deviations in seconds.
|
|
45
|
+
|
|
46
|
+
```Python
|
|
47
|
+
model = whisper.load_model("large-v3-turbo", device="cuda")
|
|
48
|
+
result = model.transcribe(wav, word_timestamps=True)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
[**Praditor**](https://github.com/Paradeluxe/Praditor) is applied to perform **Voice Activity Detection (VAD)** algorithm to trim the currently existing word/character-level timestamps (at millisecond level). It is a Speech Onset Detection (SOT) algorithm we developed for language researchers.
|
|
52
|
+
|
|
53
|
+
To extract phoneme boundaries, we designed an **edge detection algorithm**.
|
|
54
|
+
- The audio file is first resampled to **16 kHz** as to remove noise in the high-frequency domain.
|
|
55
|
+
- A kernel,`[-1, 0, 1]`, is then applied to the frequency domain to enhance the edge(s) between phonetic segments.
|
|
56
|
+
- The most prominent **n** peaks are then selected so as to match the wanted number of phonemes.
|
|
57
|
+
|
|
58
|
+
# Support
|
|
59
|
+
|
|
60
|
+
| Precision | Completed | Developing |
|
|
61
|
+
| :---: | :---: | :---: |
|
|
62
|
+
| Word | Mandarin | Cantonese, English |
|
|
63
|
+
| Phoneme | Mandarin | Cantonese, English |
|
|
64
|
+
|
|
65
|
+
# Setup
|
|
66
|
+
|
|
67
|
+
## pip installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install praasper
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
`uv` is also highly recommended
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
praasper/__init__.py
|
|
5
|
+
praasper/process.py
|
|
6
|
+
praasper/tool.py
|
|
7
|
+
praasper.egg-info/PKG-INFO
|
|
8
|
+
praasper.egg-info/SOURCES.txt
|
|
9
|
+
praasper.egg-info/dependency_links.txt
|
|
10
|
+
praasper.egg-info/requires.txt
|
|
11
|
+
praasper.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
praasper
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Read the long description up front with an explicit encoding: README.md
# contains non-ASCII content (badges, typographic text), and the previous
# bare open().read() both leaked the file handle and relied on the platform
# default encoding, which breaks builds on e.g. Windows (cp1252).
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(
    name='praasper',  # unique package name on PyPI
    version='0.1.0.dev1',  # follows semantic-versioning conventions
    description='VAD-Enhanced ASR with Word- and Phoneme-Level Timestamps',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Tony Liu',
    author_email='paradeluxe3726@gmail.com',
    url='https://github.com/ParadeLuxe/Praasper',
    packages=find_packages(),  # auto-discover the praasper package
    install_requires=[  # third-party runtime dependencies
        'openai-whisper',
        'textgrid',
        'pypinyin',
        'librosa',
    ],
    license='MIT',
    classifiers=[
        'Programming Language :: Python :: 3',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.8',  # minimum supported Python version
)
|