dora-distil-whisper 0.3.10rc0__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ """TODO: Add docstring."""
2
+
1
3
  import os
2
4
 
3
5
  # Define the path to the README file relative to the package directory
@@ -1,5 +1,9 @@
1
+ """TODO: Add docstring."""
2
+
1
3
  import os
4
+ import re
2
5
  import sys
6
+ import time
3
7
  from pathlib import Path
4
8
 
5
9
  import pyarrow as pa
@@ -11,30 +15,102 @@ TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
11
15
  TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])
12
16
 
13
17
 
18
def remove_text_noise(text: str, text_noise: str = "") -> str:
    """Remove the words of ``text_noise`` from ``text``.

    Both strings are normalized before comparison: hyphens become spaces,
    every other punctuation character is dropped, and everything is
    lowercased.  Matching is therefore independent of casing and
    punctuation ("Notre-Dame" matches "notre dame").

    For each word in the normalized noise, only its first remaining
    occurrence in the normalized text is removed; noise words that do not
    appear in the text are ignored.  Noise words are matched individually,
    so they do not need to be contiguous or in the same order in ``text``.

    Note that when ``text_noise`` is non-empty the returned string is built
    from the *normalized* words, i.e. it is lowercased, has no punctuation
    and is joined with single spaces.

    Args:
        text (str): Original text.
        text_noise (str): Text whose words should be removed from ``text``.

    Returns:
        str: ``text`` unchanged when ``text_noise`` is empty, whitespace-only
        or ``None``; otherwise the normalized text with the noise words
        removed.

    """
    # Nothing to filter: hand back the original text untouched (original
    # casing and punctuation are preserved on this path only).
    if not text_noise or not text_noise.strip():
        return text

    def normalize(s):
        # Treat hyphenated and space-separated spellings as equivalent, then
        # drop remaining punctuation and fold case.
        s = re.sub(r"-", " ", s)
        return re.sub(r"[^\w\s]", "", s).lower()

    cleaned_words = normalize(text).split()

    # Remove at most one occurrence per noise word, so repeated words in the
    # text are only stripped as many times as they occur in the noise.
    for noise_word in normalize(text_noise).split():
        if noise_word in cleaned_words:
            cleaned_words.remove(noise_word)

    return " ".join(cleaned_words)
87
+
88
+
14
89
  def load_model():
90
+ """TODO: Add docstring."""
15
91
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
16
92
 
17
- MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
93
+ model_name_or_path = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
18
94
 
19
95
  if bool(os.getenv("USE_MODELSCOPE_HUB") in ["True", "true"]):
20
96
  from modelscope import snapshot_download
21
97
 
22
- if not Path(MODEL_NAME_OR_PATH).exists():
23
- MODEL_NAME_OR_PATH = snapshot_download(MODEL_NAME_OR_PATH)
98
+ if not Path(model_name_or_path).exists():
99
+ model_name_or_path = snapshot_download(model_name_or_path)
24
100
 
25
101
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
26
102
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
27
103
 
28
104
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
29
- MODEL_NAME_OR_PATH,
105
+ model_name_or_path,
30
106
  torch_dtype=torch_dtype,
31
107
  low_cpu_mem_usage=True,
32
108
  use_safetensors=True,
33
109
  )
34
110
  model.to(device)
35
111
 
36
- processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH)
37
- pipe = pipeline(
112
+ processor = AutoProcessor.from_pretrained(model_name_or_path)
113
+ return pipeline(
38
114
  "automatic-speech-recognition",
39
115
  model=model,
40
116
  tokenizer=processor.tokenizer,
@@ -43,7 +119,6 @@ def load_model():
43
119
  torch_dtype=torch_dtype,
44
120
  device=device,
45
121
  )
46
- return pipe
47
122
 
48
123
 
49
124
  BAD_SENTENCES = [
@@ -66,10 +141,12 @@ BAD_SENTENCES = [
66
141
  " Sous-titrage Société Radio-Canada",
67
142
  " Sous",
68
143
  " Sous-",
144
+ " i'm going to go to the next one.",
69
145
  ]
70
146
 
71
147
 
72
148
  def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
149
+ """TODO: Add docstring."""
73
150
  if len(text) == 0:
74
151
  return text
75
152
  # Check if the text is primarily Chinese (you may need to adjust this threshold)
@@ -103,37 +180,61 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
103
180
 
104
181
 
105
182
  def main():
183
+ """TODO: Add docstring."""
106
184
  node = Node()
107
-
185
+ text_noise = ""
186
+ noise_timestamp = time.time()
108
187
  # For macos use mlx:
109
188
  if sys.platform != "darwin":
110
189
  pipe = load_model()
111
190
 
112
191
  for event in node:
113
192
  if event["type"] == "INPUT":
114
- audio = event["value"].to_numpy()
115
- confg = (
116
- {"language": TARGET_LANGUAGE, "task": "translate"}
117
- if TRANSLATE
118
- else {
119
- "language": TARGET_LANGUAGE,
120
- }
121
- )
122
- if sys.platform == "darwin":
123
- import mlx_whisper
124
-
125
- result = mlx_whisper.transcribe(
126
- audio,
127
- path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
128
- append_punctuations=".",
193
+ if "text_noise" in event["id"]:
194
+ text_noise = event["value"][0].as_py()
195
+ text_noise = (
196
+ text_noise.replace("(", "")
197
+ .replace(")", "")
198
+ .replace("[", "")
199
+ .replace("]", "")
129
200
  )
130
-
201
+ noise_timestamp = time.time()
131
202
  else:
132
- result = pipe(
133
- audio,
134
- generate_kwargs=confg,
203
+ audio = event["value"].to_numpy()
204
+ confg = (
205
+ {"language": TARGET_LANGUAGE, "task": "translate"}
206
+ if TRANSLATE
207
+ else {
208
+ "language": TARGET_LANGUAGE,
209
+ }
210
+ )
211
+ if sys.platform == "darwin":
212
+ import mlx_whisper
213
+
214
+ result = mlx_whisper.transcribe(
215
+ audio,
216
+ path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
217
+ append_punctuations=".",
218
+ )
219
+
220
+ else:
221
+ result = pipe(
222
+ audio,
223
+ generate_kwargs=confg,
224
+ )
225
+ if result["text"] in BAD_SENTENCES:
226
+ continue
227
+ text = cut_repetition(result["text"])
228
+
229
+ # Remove noise filter after some time
230
+ if time.time() - noise_timestamp > (len(text_noise.split()) / 2): # WPS
231
+ text_noise = ""
232
+
233
+ ## Remove text noise independently of casing
234
+ text = remove_text_noise(text, text_noise)
235
+
236
+ if text.strip() == "" or text.strip() == ".":
237
+ continue
238
+ node.send_output(
239
+ "text", pa.array([text]), {"language": TARGET_LANGUAGE},
135
240
  )
136
- if result["text"] in BAD_SENTENCES:
137
- continue
138
- text = cut_repetition(result["text"])
139
- node.send_output("text", pa.array([text]), {"language": TARGET_LANGUAGE})
@@ -1,12 +1,12 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: dora-distil-whisper
3
- Version: 0.3.10rc0
3
+ Version: 0.3.11
4
4
  Summary: Dora dora-distil-whisper
5
5
  Author-email: Haixuan Xavier Tao <tao.xavier@outlook.com>, Enzo Le Van <dev@enzo-le-van.fr>
6
6
  License: MIT
7
7
  Requires-Python: >=3.8
8
8
  Description-Content-Type: text/markdown
9
- Requires-Dist: dora-rs>=0.3.6
9
+ Requires-Dist: dora-rs>=0.3.9
10
10
  Requires-Dist: numpy<2.0.0
11
11
  Requires-Dist: pyarrow>=5.0.0
12
12
  Requires-Dist: transformers>=4.0.0
@@ -0,0 +1,7 @@
1
+ dora_distil_whisper/__init__.py,sha256=tF7WHhHiDweUUzyHsbmFe_ktphE08aA5j33E4ja1udA,381
2
+ dora_distil_whisper/main.py,sha256=wOOJ9TeigWm-byZ0wGVJbgS15fLRSMH2cGFRHXcj9eg,7643
3
+ dora_distil_whisper-0.3.11.dist-info/METADATA,sha256=FwdwRV5UY0Wd4dDswj1tdiO5nXQf9asKyLHFhgcemFQ,1254
4
+ dora_distil_whisper-0.3.11.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
5
+ dora_distil_whisper-0.3.11.dist-info/entry_points.txt,sha256=c6QYCZs0YVR2uasYHES67JhOOvTm5QbcwGk-9IrG9oM,70
6
+ dora_distil_whisper-0.3.11.dist-info/top_level.txt,sha256=h5QH64SWnqZA83bx740-NTxfQKdeiKTLAdGqhnwKhuQ,20
7
+ dora_distil_whisper-0.3.11.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,7 +0,0 @@
1
- dora_distil_whisper/__init__.py,sha256=HuSK3dnyI9Pb5QAuaKFwQQ3J5SIZnLcKHPJO0norGzc,353
2
- dora_distil_whisper/main.py,sha256=MbT9nsEHxpyzcFzkDe4FjITzUeemh8LzCMmQSRL4xqo,4083
3
- dora_distil_whisper-0.3.10rc0.dist-info/METADATA,sha256=0elfHye3_gyyVg5dWnWcaDaPddXLZUUO2SRJxoDIwGA,1257
4
- dora_distil_whisper-0.3.10rc0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
5
- dora_distil_whisper-0.3.10rc0.dist-info/entry_points.txt,sha256=c6QYCZs0YVR2uasYHES67JhOOvTm5QbcwGk-9IrG9oM,70
6
- dora_distil_whisper-0.3.10rc0.dist-info/top_level.txt,sha256=h5QH64SWnqZA83bx740-NTxfQKdeiKTLAdGqhnwKhuQ,20
7
- dora_distil_whisper-0.3.10rc0.dist-info/RECORD,,