dora-distil-whisper 0.3.10rc0__tar.gz → 0.3.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,12 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: dora-distil-whisper
- Version: 0.3.10rc0
+ Version: 0.3.11
  Summary: Dora dora-distil-whisper
  Author-email: Haixuan Xavier Tao <tao.xavier@outlook.com>, Enzo Le Van <dev@enzo-le-van.fr>
  License: MIT
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
- Requires-Dist: dora-rs>=0.3.6
+ Requires-Dist: dora-rs>=0.3.9
  Requires-Dist: numpy<2.0.0
  Requires-Dist: pyarrow>=5.0.0
  Requires-Dist: transformers>=4.0.0
@@ -1,3 +1,5 @@
+ """TODO: Add docstring."""
+
  import os

  # Define the path to the README file relative to the package directory
@@ -0,0 +1,240 @@
+ """TODO: Add docstring."""
+
+ import os
+ import re
+ import sys
+ import time
+ from pathlib import Path
+
+ import pyarrow as pa
+ import torch
+ from dora import Node
+
+ DEFAULT_PATH = "openai/whisper-large-v3-turbo"
+ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
+ TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])
+
+
+ def remove_text_noise(text: str, text_noise="") -> str:
+     """Remove noise from text.
+
+     Args:
+         text (str): Original text
+         text_noise (str): text to remove from the original text
+
+     Returns:
+         str: Cleaned text
+
+     """
+     # Handle the case where text_noise is empty
+     if not text_noise.strip():
+         return (
+             text  # Return the original text if text_noise is empty or just whitespace
+         )
+
+     # Helper function to normalize text (remove punctuation, make lowercase, and handle hyphens)
+     def normalize(s):
+         # Replace hyphens with spaces to treat "Notre-Dame" and "notre dame" as equivalent
+         s = re.sub(r"-", " ", s)
+         # Remove other punctuation and convert to lowercase
+         return re.sub(r"[^\w\s]", "", s).lower()
+
+     # Normalize both text and text_noise
+     normalized_text = normalize(text)
+     normalized_noise = normalize(text_noise)
+
+     # Split into words
+     text_words = normalized_text.split()
+     noise_words = normalized_noise.split()
+
+     # Function to find and remove noise sequence flexibly
+     def remove_flexible(text_list, noise_list):
+         i = 0
+         while i <= len(text_list) - len(noise_list):
+             match = True
+             extra_words = 0
+             for j, noise_word in enumerate(noise_list):
+                 if i + j + extra_words >= len(text_list):
+                     match = False
+                     break
+                 # Allow skipping extra words in text_list
+                 while (
+                     i + j + extra_words < len(text_list)
+                     and text_list[i + j + extra_words] != noise_word
+                 ):
+                     extra_words += 1
+                     if i + j + extra_words >= len(text_list):
+                         match = False
+                         break
+                 if not match:
+                     break
+             if match:
+                 # Remove matched part
+                 del text_list[i : i + len(noise_list) + extra_words]
+                 i = max(0, i - len(noise_list))  # Adjust index after removal
+             else:
+                 i += 1
+         return text_list
+
+     # Only remove parts of text_noise that are found in text
+     cleaned_words = text_words[:]
+     for noise_word in noise_words:
+         if noise_word in cleaned_words:
+             cleaned_words.remove(noise_word)
+
+     # Reconstruct the cleaned text
+     return " ".join(cleaned_words)
+
+
+ def load_model():
+     """TODO: Add docstring."""
+     from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+     model_name_or_path = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
+
+     if bool(os.getenv("USE_MODELSCOPE_HUB") in ["True", "true"]):
+         from modelscope import snapshot_download
+
+         if not Path(model_name_or_path).exists():
+             model_name_or_path = snapshot_download(model_name_or_path)
+
+     device = "cuda:0" if torch.cuda.is_available() else "cpu"
+     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+         model_name_or_path,
+         torch_dtype=torch_dtype,
+         low_cpu_mem_usage=True,
+         use_safetensors=True,
+     )
+     model.to(device)
+
+     processor = AutoProcessor.from_pretrained(model_name_or_path)
+     return pipeline(
+         "automatic-speech-recognition",
+         model=model,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+         max_new_tokens=400,
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+
+ BAD_SENTENCES = [
+     "",
+     " so",
+     " so so",
+     "字幕",
+     "字幕志愿",
+     "中文字幕",
+     "我",
+     "你",
+     " you",
+     "!",
+     "THANK YOU",
+     " Thank you.",
+     " www.microsoft.com",
+     " The",
+     " BANG",
+     " Silence.",
+     " Sous-titrage Société Radio-Canada",
+     " Sous",
+     " Sous-",
+     " i'm going to go to the next one.",
+ ]
+
+
+ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
+     """TODO: Add docstring."""
+     if len(text) == 0:
+         return text
+     # Check if the text is primarily Chinese (you may need to adjust this threshold)
+     if sum(1 for char in text if "\u4e00" <= char <= "\u9fff") / len(text) > 0.5:
+         # Chinese text processing
+         for repeat_length in range(
+             min_repeat_length,
+             min(max_repeat_length, len(text) // 2),
+         ):
+             for i in range(len(text) - repeat_length * 2 + 1):
+                 chunk1 = text[i : i + repeat_length]
+                 chunk2 = text[i + repeat_length : i + repeat_length * 2]
+
+                 if chunk1 == chunk2:
+                     return text[: i + repeat_length]
+     else:
+         # Non-Chinese (space-separated) text processing
+         words = text.split()
+         for repeat_length in range(
+             min_repeat_length,
+             min(max_repeat_length, len(words) // 2),
+         ):
+             for i in range(len(words) - repeat_length * 2 + 1):
+                 chunk1 = " ".join(words[i : i + repeat_length])
+                 chunk2 = " ".join(words[i + repeat_length : i + repeat_length * 2])
+
+                 if chunk1 == chunk2:
+                     return " ".join(words[: i + repeat_length])
+
+     return text
+
+
+ def main():
+     """TODO: Add docstring."""
+     node = Node()
+     text_noise = ""
+     noise_timestamp = time.time()
+     # For macos use mlx:
+     if sys.platform != "darwin":
+         pipe = load_model()
+
+     for event in node:
+         if event["type"] == "INPUT":
+             if "text_noise" in event["id"]:
+                 text_noise = event["value"][0].as_py()
+                 text_noise = (
+                     text_noise.replace("(", "")
+                     .replace(")", "")
+                     .replace("[", "")
+                     .replace("]", "")
+                 )
+                 noise_timestamp = time.time()
+             else:
+                 audio = event["value"].to_numpy()
+                 confg = (
+                     {"language": TARGET_LANGUAGE, "task": "translate"}
+                     if TRANSLATE
+                     else {
+                         "language": TARGET_LANGUAGE,
+                     }
+                 )
+                 if sys.platform == "darwin":
+                     import mlx_whisper
+
+                     result = mlx_whisper.transcribe(
+                         audio,
+                         path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
+                         append_punctuations=".",
+                     )
+
+                 else:
+                     result = pipe(
+                         audio,
+                         generate_kwargs=confg,
+                     )
+                 if result["text"] in BAD_SENTENCES:
+                     continue
+                 text = cut_repetition(result["text"])
+
+                 # Remove noise filter after some time
+                 if time.time() - noise_timestamp > (len(text_noise.split()) / 2):  # WPS
+                     text_noise = ""
+
+                 ## Remove text noise independently of casing
+                 text = remove_text_noise(text, text_noise)
+
+                 if text.strip() == "" or text.strip() == ".":
+                     continue
+                 node.send_output(
+                     "text", pa.array([text]), {"language": TARGET_LANGUAGE},
+                 )
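
The new main.py above adds two text-cleanup helpers, remove_text_noise and cut_repetition, around the transcription loop. Below is a minimal sketch (not part of the package) of how they behave, assuming the released wheel is installed so that dora_distil_whisper.main is importable:

    # Example only; the input strings are illustrative, not taken from the diff.
    from dora_distil_whisper.main import cut_repetition, remove_text_noise

    # A back-to-back repeated chunk of at least min_repeat_length (4) words
    # is truncated at its first occurrence.
    print(cut_repetition("I want to go I want to go home now please"))
    # -> "I want to go"

    # Noise words (e.g. the robot's own utterance fed back through a
    # "text_noise" input) are removed word by word, ignoring case and punctuation.
    print(remove_text_noise("Hello robot, please stop!", "please stop"))
    # -> "hello robot"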
@@ -1,12 +1,12 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.4
  Name: dora-distil-whisper
- Version: 0.3.10rc0
+ Version: 0.3.11
  Summary: Dora dora-distil-whisper
  Author-email: Haixuan Xavier Tao <tao.xavier@outlook.com>, Enzo Le Van <dev@enzo-le-van.fr>
  License: MIT
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
- Requires-Dist: dora-rs>=0.3.6
+ Requires-Dist: dora-rs>=0.3.9
  Requires-Dist: numpy<2.0.0
  Requires-Dist: pyarrow>=5.0.0
  Requires-Dist: transformers>=4.0.0
@@ -1,4 +1,4 @@
- dora-rs>=0.3.6
+ dora-rs>=0.3.9
  numpy<2.0.0
  pyarrow>=5.0.0
  transformers>=4.0.0
@@ -0,0 +1,40 @@
+ [project]
+ name = "dora-distil-whisper"
+ version = "0.3.11"
+ authors = [
+     { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
+     { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
+ ]
+ description = "Dora dora-distil-whisper"
+ license = { text = "MIT" }
+ readme = "README.md"
+ requires-python = ">=3.8"
+
+ dependencies = [
+     "dora-rs >= 0.3.9",
+     "numpy < 2.0.0",
+     "pyarrow >= 5.0.0",
+     "transformers >= 4.0.0",
+     "accelerate >= 0.29.2",
+     "torch >= 2.2.0",
+     "modelscope >= 1.18.1",
+     "mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
+ ]
+
+ [dependency-groups]
+ dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
+
+ [project.scripts]
+ dora-distil-whisper = "dora_distil_whisper.main:main"
+
+ [tool.ruff.lint]
+ extend-select = [
+     "D",    # pydocstyle
+     "UP",   # Ruff's UP rule
+     "PERF", # Ruff's PERF rule
+     "RET",  # Ruff's RET rule
+     "RSE",  # Ruff's RSE rule
+     "NPY",  # Ruff's NPY rule
+     "N",    # Ruff's N rule
+     "I",    # Ruff's I rule
+ ]
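
The new pyproject.toml keeps the dora-distil-whisper console script pointed at dora_distil_whisper.main:main; the node itself is configured through environment variables that main.py reads at import time. A small sketch of those variables with example values (only the defaults noted in the comments come from the code above; the assigned values are illustrative):

    import os

    # Model checkpoint; falls back to "openai/whisper-large-v3-turbo" when unset.
    os.environ["MODEL_NAME_OR_PATH"] = "openai/whisper-large-v3-turbo"
    # Language passed to the transcription pipeline (default "english").
    os.environ["TARGET_LANGUAGE"] = "french"
    # "True"/"true" adds task="translate" to generate_kwargs; anything else transcribes.
    os.environ["TRANSLATE"] = "true"
    # "True"/"true" resolves the model path via modelscope's snapshot_download.
    os.environ["USE_MODELSCOPE_HUB"] = "false"

    from dora_distil_whisper.main import main  # main() expects to run inside a dora dataflow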
@@ -1,7 +1,10 @@
+ """TODO: Add docstring."""
+
  import pytest


  def test_import_main():
+     """TODO: Add docstring."""
      from dora_distil_whisper.main import main

      # Check that everything is working, and catch dora Runtime Exception as we're not running in a dora dataflow.
@@ -1,139 +0,0 @@
- import os
- import sys
- from pathlib import Path
-
- import pyarrow as pa
- import torch
- from dora import Node
-
- DEFAULT_PATH = "openai/whisper-large-v3-turbo"
- TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
- TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])
-
-
- def load_model():
-     from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-
-     MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
-
-     if bool(os.getenv("USE_MODELSCOPE_HUB") in ["True", "true"]):
-         from modelscope import snapshot_download
-
-         if not Path(MODEL_NAME_OR_PATH).exists():
-             MODEL_NAME_OR_PATH = snapshot_download(MODEL_NAME_OR_PATH)
-
-     device = "cuda:0" if torch.cuda.is_available() else "cpu"
-     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
-     model = AutoModelForSpeechSeq2Seq.from_pretrained(
-         MODEL_NAME_OR_PATH,
-         torch_dtype=torch_dtype,
-         low_cpu_mem_usage=True,
-         use_safetensors=True,
-     )
-     model.to(device)
-
-     processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH)
-     pipe = pipeline(
-         "automatic-speech-recognition",
-         model=model,
-         tokenizer=processor.tokenizer,
-         feature_extractor=processor.feature_extractor,
-         max_new_tokens=400,
-         torch_dtype=torch_dtype,
-         device=device,
-     )
-     return pipe
-
-
- BAD_SENTENCES = [
-     "",
-     " so",
-     " so so",
-     "字幕",
-     "字幕志愿",
-     "中文字幕",
-     "我",
-     "你",
-     " you",
-     "!",
-     "THANK YOU",
-     " Thank you.",
-     " www.microsoft.com",
-     " The",
-     " BANG",
-     " Silence.",
-     " Sous-titrage Société Radio-Canada",
-     " Sous",
-     " Sous-",
- ]
-
-
- def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
-     if len(text) == 0:
-         return text
-     # Check if the text is primarily Chinese (you may need to adjust this threshold)
-     if sum(1 for char in text if "\u4e00" <= char <= "\u9fff") / len(text) > 0.5:
-         # Chinese text processing
-         for repeat_length in range(
-             min_repeat_length,
-             min(max_repeat_length, len(text) // 2),
-         ):
-             for i in range(len(text) - repeat_length * 2 + 1):
-                 chunk1 = text[i : i + repeat_length]
-                 chunk2 = text[i + repeat_length : i + repeat_length * 2]
-
-                 if chunk1 == chunk2:
-                     return text[: i + repeat_length]
-     else:
-         # Non-Chinese (space-separated) text processing
-         words = text.split()
-         for repeat_length in range(
-             min_repeat_length,
-             min(max_repeat_length, len(words) // 2),
-         ):
-             for i in range(len(words) - repeat_length * 2 + 1):
-                 chunk1 = " ".join(words[i : i + repeat_length])
-                 chunk2 = " ".join(words[i + repeat_length : i + repeat_length * 2])
-
-                 if chunk1 == chunk2:
-                     return " ".join(words[: i + repeat_length])
-
-     return text
-
-
- def main():
-     node = Node()
-
-     # For macos use mlx:
-     if sys.platform != "darwin":
-         pipe = load_model()
-
-     for event in node:
-         if event["type"] == "INPUT":
-             audio = event["value"].to_numpy()
-             confg = (
-                 {"language": TARGET_LANGUAGE, "task": "translate"}
-                 if TRANSLATE
-                 else {
-                     "language": TARGET_LANGUAGE,
-                 }
-             )
-             if sys.platform == "darwin":
-                 import mlx_whisper
-
-                 result = mlx_whisper.transcribe(
-                     audio,
-                     path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
-                     append_punctuations=".",
-                 )
-
-             else:
-                 result = pipe(
-                     audio,
-                     generate_kwargs=confg,
-                 )
-             if result["text"] in BAD_SENTENCES:
-                 continue
-             text = cut_repetition(result["text"])
-             node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})
@@ -1,29 +0,0 @@
1
- [project]
2
- name = "dora-distil-whisper"
3
- version = "0.3.10-rc0"
4
- authors = [
5
- { name = "Haixuan Xavier Tao", email = "tao.xavier@outlook.com" },
6
- { name = "Enzo Le Van", email = "dev@enzo-le-van.fr" },
7
- ]
8
- description = "Dora dora-distil-whisper"
9
- license = { text = "MIT" }
10
- readme = "README.md"
11
- requires-python = ">=3.8"
12
-
13
- dependencies = [
14
- "dora-rs >= 0.3.6",
15
- "numpy < 2.0.0",
16
- "pyarrow >= 5.0.0",
17
- "transformers >= 4.0.0",
18
- "accelerate >= 0.29.2",
19
- "torch >= 2.2.0",
20
- "modelscope >= 1.18.1",
21
- "mlx-whisper >= 0.4.1; sys_platform == 'darwin'",
22
- ]
23
-
24
-
25
- [dependency-groups]
26
- dev = ["pytest >=8.1.1", "ruff >=0.9.1"]
27
-
28
- [project.scripts]
29
- dora-distil-whisper = "dora_distil_whisper.main:main"