dora-distil-whisper 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dora_distil_whisper/__init__.py +1 -1
- dora_distil_whisper/main.py +64 -35
- dora_distil_whisper-0.3.9.dist-info/METADATA +47 -0
- dora_distil_whisper-0.3.9.dist-info/RECORD +7 -0
- {dora_distil_whisper-0.3.8.dist-info → dora_distil_whisper-0.3.9.dist-info}/WHEEL +2 -1
- dora_distil_whisper-0.3.9.dist-info/entry_points.txt +2 -0
- dora_distil_whisper-0.3.9.dist-info/top_level.txt +1 -0
- dora_distil_whisper-0.3.8.dist-info/METADATA +0 -32
- dora_distil_whisper-0.3.8.dist-info/RECORD +0 -6
- dora_distil_whisper-0.3.8.dist-info/entry_points.txt +0 -3
dora_distil_whisper/__init__.py
CHANGED
@@ -5,7 +5,7 @@ readme_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "README.m
|
|
5
5
|
|
6
6
|
# Read the content of the README file
|
7
7
|
try:
|
8
|
-
with open(readme_path, "r", encoding="utf-8") as f:
|
8
|
+
with open(readme_path, encoding="utf-8") as f:
|
9
9
|
__doc__ = f.read()
|
10
10
|
except FileNotFoundError:
|
11
11
|
__doc__ = "README file not found."
|
dora_distil_whisper/main.py
CHANGED
@@ -1,52 +1,62 @@
|
|
1
|
-
import torch
|
2
|
-
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
3
|
-
from dora import Node
|
4
|
-
import pyarrow as pa
|
5
1
|
import os
|
2
|
+
import sys
|
6
3
|
from pathlib import Path
|
7
4
|
|
5
|
+
import pyarrow as pa
|
6
|
+
import torch
|
7
|
+
from dora import Node
|
8
|
+
|
8
9
|
DEFAULT_PATH = "openai/whisper-large-v3-turbo"
|
9
|
-
TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "
|
10
|
+
TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
|
10
11
|
TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])
|
11
12
|
|
12
13
|
|
13
|
-
|
14
|
+
def load_model():
|
15
|
+
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
16
|
+
|
17
|
+
MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
|
14
18
|
|
15
|
-
if bool(os.getenv("USE_MODELSCOPE_HUB") in ["True", "true"]):
|
16
|
-
|
19
|
+
if bool(os.getenv("USE_MODELSCOPE_HUB") in ["True", "true"]):
|
20
|
+
from modelscope import snapshot_download
|
17
21
|
|
18
|
-
|
19
|
-
|
22
|
+
if not Path(MODEL_NAME_OR_PATH).exists():
|
23
|
+
MODEL_NAME_OR_PATH = snapshot_download(MODEL_NAME_OR_PATH)
|
20
24
|
|
21
|
-
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
22
|
-
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
25
|
+
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
26
|
+
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
23
27
|
|
28
|
+
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
29
|
+
MODEL_NAME_OR_PATH,
|
30
|
+
torch_dtype=torch_dtype,
|
31
|
+
low_cpu_mem_usage=True,
|
32
|
+
use_safetensors=True,
|
33
|
+
)
|
34
|
+
model.to(device)
|
24
35
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
36
|
+
processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH)
|
37
|
+
pipe = pipeline(
|
38
|
+
"automatic-speech-recognition",
|
39
|
+
model=model,
|
40
|
+
tokenizer=processor.tokenizer,
|
41
|
+
feature_extractor=processor.feature_extractor,
|
42
|
+
max_new_tokens=400,
|
43
|
+
torch_dtype=torch_dtype,
|
44
|
+
device=device,
|
45
|
+
)
|
46
|
+
return pipe
|
32
47
|
|
33
|
-
processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH)
|
34
|
-
pipe = pipeline(
|
35
|
-
"automatic-speech-recognition",
|
36
|
-
model=model,
|
37
|
-
tokenizer=processor.tokenizer,
|
38
|
-
feature_extractor=processor.feature_extractor,
|
39
|
-
max_new_tokens=400,
|
40
|
-
torch_dtype=torch_dtype,
|
41
|
-
device=device,
|
42
|
-
)
|
43
48
|
|
44
49
|
BAD_SENTENCES = [
|
50
|
+
"",
|
51
|
+
" so",
|
52
|
+
" so so",
|
45
53
|
"字幕",
|
46
54
|
"字幕志愿",
|
47
55
|
"中文字幕",
|
48
56
|
"我",
|
49
57
|
"你",
|
58
|
+
" you",
|
59
|
+
"!",
|
50
60
|
"THANK YOU",
|
51
61
|
" Thank you.",
|
52
62
|
" www.microsoft.com",
|
@@ -60,11 +70,14 @@ BAD_SENTENCES = [
|
|
60
70
|
|
61
71
|
|
62
72
|
def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
|
73
|
+
if len(text) == 0:
|
74
|
+
return text
|
63
75
|
# Check if the text is primarily Chinese (you may need to adjust this threshold)
|
64
76
|
if sum(1 for char in text if "\u4e00" <= char <= "\u9fff") / len(text) > 0.5:
|
65
77
|
# Chinese text processing
|
66
78
|
for repeat_length in range(
|
67
|
-
min_repeat_length,
|
79
|
+
min_repeat_length,
|
80
|
+
min(max_repeat_length, len(text) // 2),
|
68
81
|
):
|
69
82
|
for i in range(len(text) - repeat_length * 2 + 1):
|
70
83
|
chunk1 = text[i : i + repeat_length]
|
@@ -76,7 +89,8 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
|
|
76
89
|
# Non-Chinese (space-separated) text processing
|
77
90
|
words = text.split()
|
78
91
|
for repeat_length in range(
|
79
|
-
min_repeat_length,
|
92
|
+
min_repeat_length,
|
93
|
+
min(max_repeat_length, len(words) // 2),
|
80
94
|
):
|
81
95
|
for i in range(len(words) - repeat_length * 2 + 1):
|
82
96
|
chunk1 = " ".join(words[i : i + repeat_length])
|
@@ -90,6 +104,11 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
|
|
90
104
|
|
91
105
|
def main():
|
92
106
|
node = Node()
|
107
|
+
|
108
|
+
# For macos use mlx:
|
109
|
+
if sys.platform != "darwin":
|
110
|
+
pipe = load_model()
|
111
|
+
|
93
112
|
for event in node:
|
94
113
|
if event["type"] == "INPUT":
|
95
114
|
audio = event["value"].to_numpy()
|
@@ -100,10 +119,20 @@ def main():
|
|
100
119
|
"language": TARGET_LANGUAGE,
|
101
120
|
}
|
102
121
|
)
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
122
|
+
if sys.platform == "darwin":
|
123
|
+
import mlx_whisper
|
124
|
+
|
125
|
+
result = mlx_whisper.transcribe(
|
126
|
+
audio,
|
127
|
+
path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
|
128
|
+
append_punctuations=".",
|
129
|
+
)
|
130
|
+
|
131
|
+
else:
|
132
|
+
result = pipe(
|
133
|
+
audio,
|
134
|
+
generate_kwargs=confg,
|
135
|
+
)
|
107
136
|
if result["text"] in BAD_SENTENCES:
|
108
137
|
continue
|
109
138
|
text = cut_repetition(result["text"])
|
@@ -0,0 +1,47 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: dora-distil-whisper
|
3
|
+
Version: 0.3.9
|
4
|
+
Summary: Dora dora-distil-whisper
|
5
|
+
Author-email: Haixuan Xavier Tao <tao.xavier@outlook.com>, Enzo Le Van <dev@enzo-le-van.fr>
|
6
|
+
License: MIT
|
7
|
+
Requires-Python: >=3.8
|
8
|
+
Description-Content-Type: text/markdown
|
9
|
+
Requires-Dist: dora-rs>=0.3.6
|
10
|
+
Requires-Dist: numpy<2.0.0
|
11
|
+
Requires-Dist: pyarrow>=5.0.0
|
12
|
+
Requires-Dist: transformers>=4.0.0
|
13
|
+
Requires-Dist: accelerate>=0.29.2
|
14
|
+
Requires-Dist: torch>=2.2.0
|
15
|
+
Requires-Dist: modelscope>=1.18.1
|
16
|
+
Requires-Dist: mlx-whisper>=0.4.1; sys_platform == "darwin"
|
17
|
+
|
18
|
+
# Dora Whisper Node for transforming speech to text
|
19
|
+
|
20
|
+
## YAML Specification
|
21
|
+
|
22
|
+
This node is supposed to be used as follows:
|
23
|
+
|
24
|
+
```yaml
|
25
|
+
- id: dora-distil-whisper
|
26
|
+
build: pip install dora-distil-whisper
|
27
|
+
path: dora-distil-whisper
|
28
|
+
inputs:
|
29
|
+
input: dora-vad/audio
|
30
|
+
outputs:
|
31
|
+
- text
|
32
|
+
env:
|
33
|
+
TARGET_LANGUAGE: english
|
34
|
+
```
|
35
|
+
|
36
|
+
## Examples
|
37
|
+
|
38
|
+
- speech to text
|
39
|
+
- github: https://github.com/dora-rs/dora/blob/main/examples/speech-to-text
|
40
|
+
- website: https://dora-rs.ai/docs/examples/stt
|
41
|
+
- vision language model
|
42
|
+
- github: https://github.com/dora-rs/dora/blob/main/examples/vlm
|
43
|
+
- website: https://dora-rs.ai/docs/examples/vlm
|
44
|
+
|
45
|
+
## License
|
46
|
+
|
47
|
+
Dora-whisper's code and model weights are released under the MIT License
|
@@ -0,0 +1,7 @@
|
|
1
|
+
dora_distil_whisper/__init__.py,sha256=HuSK3dnyI9Pb5QAuaKFwQQ3J5SIZnLcKHPJO0norGzc,353
|
2
|
+
dora_distil_whisper/main.py,sha256=MbT9nsEHxpyzcFzkDe4FjITzUeemh8LzCMmQSRL4xqo,4083
|
3
|
+
dora_distil_whisper-0.3.9.dist-info/METADATA,sha256=Gxiyc_5VSjaQDduwawrH3VN2d0pKreJRC-_qaw7KKb4,1253
|
4
|
+
dora_distil_whisper-0.3.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
5
|
+
dora_distil_whisper-0.3.9.dist-info/entry_points.txt,sha256=c6QYCZs0YVR2uasYHES67JhOOvTm5QbcwGk-9IrG9oM,70
|
6
|
+
dora_distil_whisper-0.3.9.dist-info/top_level.txt,sha256=h5QH64SWnqZA83bx740-NTxfQKdeiKTLAdGqhnwKhuQ,20
|
7
|
+
dora_distil_whisper-0.3.9.dist-info/RECORD,,
|
@@ -0,0 +1 @@
|
|
1
|
+
dora_distil_whisper
|
@@ -1,32 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.1
|
2
|
-
Name: dora-distil-whisper
|
3
|
-
Version: 0.3.8
|
4
|
-
Summary: Dora dora-distil-whisper
|
5
|
-
Home-page: https://github.com/dora-rs/dora.git
|
6
|
-
License: MIT
|
7
|
-
Author: Haixuan Xavier Tao
|
8
|
-
Author-email: tao.xavier@outlook.com
|
9
|
-
Requires-Python: >=3.7,<4.0
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
12
|
-
Classifier: Programming Language :: Python :: 3.7
|
13
|
-
Classifier: Programming Language :: Python :: 3.8
|
14
|
-
Classifier: Programming Language :: Python :: 3.9
|
15
|
-
Classifier: Programming Language :: Python :: 3.10
|
16
|
-
Classifier: Programming Language :: Python :: 3.11
|
17
|
-
Classifier: Programming Language :: Python :: 3.12
|
18
|
-
Classifier: Programming Language :: Python :: 3.13
|
19
|
-
Requires-Dist: accelerate (>=0.29.2,<0.30.0)
|
20
|
-
Requires-Dist: dora-rs (>=0.3.6,<0.4.0)
|
21
|
-
Requires-Dist: modelscope (>=1.18.1,<2.0.0)
|
22
|
-
Requires-Dist: numpy (<2.0.0)
|
23
|
-
Requires-Dist: pyarrow (>=5.0.0)
|
24
|
-
Requires-Dist: torch (>=2.2.0,<3.0.0)
|
25
|
-
Requires-Dist: transformers (>=4.0.0,<5.0.0)
|
26
|
-
Project-URL: Documentation, https://github.com/dora-rs/dora/blob/main/node-hub/dora-distil-whisper/README.md
|
27
|
-
Description-Content-Type: text/markdown
|
28
|
-
|
29
|
-
# Dora Node for transforming speech to text (English only)
|
30
|
-
|
31
|
-
Check example at [examples/speech-to-text](examples/speech-to-text)
|
32
|
-
|
@@ -1,6 +0,0 @@
|
|
1
|
-
dora_distil_whisper/__init__.py,sha256=Gy4qL4vCeTyA5HR1Yp3ioL4-ClJyW8oi_38CzMuMsBM,358
|
2
|
-
dora_distil_whisper/main.py,sha256=-lMXHjnBw0tWnQXyeoKkrbSC4w6F6UyHjzY0GT1EENs,3398
|
3
|
-
dora_distil_whisper-0.3.8.dist-info/METADATA,sha256=Hpv9jDKCjy9vesIZLCwnth1Iqf8uUvo2yKfbib6yG7g,1256
|
4
|
-
dora_distil_whisper-0.3.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
5
|
-
dora_distil_whisper-0.3.8.dist-info/entry_points.txt,sha256=Q_8wNgkDYxgoKETJjM6ewXWcr_yzRUgsSeBd0uetuRs,69
|
6
|
-
dora_distil_whisper-0.3.8.dist-info/RECORD,,
|