dora-distil-whisper 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- dora_distil_whisper/__init__.py (0.3.8)
+++ dora_distil_whisper/__init__.py (0.3.9)
@@ -5,7 +5,7 @@ readme_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "README.m
 
 # Read the content of the README file
 try:
-    with open(readme_path, "r", encoding="utf-8") as f:
+    with open(readme_path, encoding="utf-8") as f:
         __doc__ = f.read()
 except FileNotFoundError:
     __doc__ = "README file not found."
--- dora_distil_whisper/main.py (0.3.8)
+++ dora_distil_whisper/main.py (0.3.9)
@@ -1,52 +1,62 @@
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from dora import Node
-import pyarrow as pa
 import os
+import sys
 from pathlib import Path
 
+import pyarrow as pa
+import torch
+from dora import Node
+
 DEFAULT_PATH = "openai/whisper-large-v3-turbo"
-TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "chinese")
+TARGET_LANGUAGE = os.getenv("TARGET_LANGUAGE", "english")
 TRANSLATE = bool(os.getenv("TRANSLATE", "False") in ["True", "true"])
 
 
-MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
+def load_model():
+    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+    MODEL_NAME_OR_PATH = os.getenv("MODEL_NAME_OR_PATH", DEFAULT_PATH)
 
-if bool(os.getenv("USE_MODELSCOPE_HUB") in ["True", "true"]):
-    from modelscope import snapshot_download
+    if bool(os.getenv("USE_MODELSCOPE_HUB") in ["True", "true"]):
+        from modelscope import snapshot_download
 
-    if not Path(MODEL_NAME_OR_PATH).exists():
-        MODEL_NAME_OR_PATH = snapshot_download(MODEL_NAME_OR_PATH)
+        if not Path(MODEL_NAME_OR_PATH).exists():
+            MODEL_NAME_OR_PATH = snapshot_download(MODEL_NAME_OR_PATH)
 
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        MODEL_NAME_OR_PATH,
+        torch_dtype=torch_dtype,
+        low_cpu_mem_usage=True,
+        use_safetensors=True,
+    )
+    model.to(device)
 
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    MODEL_NAME_OR_PATH,
-    torch_dtype=torch_dtype,
-    low_cpu_mem_usage=True,
-    use_safetensors=True,
-)
-model.to(device)
+    processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH)
+    pipe = pipeline(
+        "automatic-speech-recognition",
+        model=model,
+        tokenizer=processor.tokenizer,
+        feature_extractor=processor.feature_extractor,
+        max_new_tokens=400,
+        torch_dtype=torch_dtype,
+        device=device,
+    )
+    return pipe
 
-processor = AutoProcessor.from_pretrained(MODEL_NAME_OR_PATH)
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=400,
-    torch_dtype=torch_dtype,
-    device=device,
-)
 
 BAD_SENTENCES = [
+    "",
+    " so",
+    " so so",
     "字幕",
     "字幕志愿",
     "中文字幕",
     "我",
     "你",
+    " you",
+    "!",
     "THANK YOU",
     " Thank you.",
     " www.microsoft.com",
@@ -60,11 +70,14 @@ BAD_SENTENCES = [
 
 
 def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
+    if len(text) == 0:
+        return text
     # Check if the text is primarily Chinese (you may need to adjust this threshold)
     if sum(1 for char in text if "\u4e00" <= char <= "\u9fff") / len(text) > 0.5:
         # Chinese text processing
         for repeat_length in range(
-            min_repeat_length, min(max_repeat_length, len(text) // 2)
+            min_repeat_length,
+            min(max_repeat_length, len(text) // 2),
         ):
             for i in range(len(text) - repeat_length * 2 + 1):
                 chunk1 = text[i : i + repeat_length]
@@ -76,7 +89,8 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
         # Non-Chinese (space-separated) text processing
         words = text.split()
         for repeat_length in range(
-            min_repeat_length, min(max_repeat_length, len(words) // 2)
+            min_repeat_length,
+            min(max_repeat_length, len(words) // 2),
        ):
             for i in range(len(words) - repeat_length * 2 + 1):
                 chunk1 = " ".join(words[i : i + repeat_length])
@@ -90,6 +104,11 @@ def cut_repetition(text, min_repeat_length=4, max_repeat_length=50):
 
 def main():
     node = Node()
+
+    # For macos use mlx:
+    if sys.platform != "darwin":
+        pipe = load_model()
+
     for event in node:
         if event["type"] == "INPUT":
             audio = event["value"].to_numpy()
@@ -100,10 +119,20 @@ def main():
                     "language": TARGET_LANGUAGE,
                 }
             )
-            result = pipe(
-                audio,
-                generate_kwargs=confg,
-            )
+            if sys.platform == "darwin":
+                import mlx_whisper
+
+                result = mlx_whisper.transcribe(
+                    audio,
+                    path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
+                    append_punctuations=".",
+                )
+
+            else:
+                result = pipe(
+                    audio,
+                    generate_kwargs=confg,
+                )
             if result["text"] in BAD_SENTENCES:
                 continue
             text = cut_repetition(result["text"])
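
On macOS the node now bypasses the `transformers` pipeline and transcribes each chunk with `mlx_whisper`, matching the new `mlx-whisper>=0.4.1; sys_platform == "darwin"` requirement in the 0.3.9 metadata below. A standalone sketch of that branch, run outside dora; the file name is a placeholder, and `mlx_whisper.transcribe` also accepts a 16 kHz float array as used above:

```python
# Hedged sketch of the darwin-only path (requires `pip install mlx-whisper`).
import sys

if sys.platform == "darwin":
    import mlx_whisper

    result = mlx_whisper.transcribe(
        "sample.wav",  # placeholder input file
        path_or_hf_repo="mlx-community/whisper-large-v3-turbo",
    )
    print(result["text"])
```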
--- /dev/null
+++ dora_distil_whisper-0.3.9.dist-info/METADATA
@@ -0,0 +1,47 @@
+Metadata-Version: 2.2
+Name: dora-distil-whisper
+Version: 0.3.9
+Summary: Dora dora-distil-whisper
+Author-email: Haixuan Xavier Tao <tao.xavier@outlook.com>, Enzo Le Van <dev@enzo-le-van.fr>
+License: MIT
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: dora-rs>=0.3.6
+Requires-Dist: numpy<2.0.0
+Requires-Dist: pyarrow>=5.0.0
+Requires-Dist: transformers>=4.0.0
+Requires-Dist: accelerate>=0.29.2
+Requires-Dist: torch>=2.2.0
+Requires-Dist: modelscope>=1.18.1
+Requires-Dist: mlx-whisper>=0.4.1; sys_platform == "darwin"
+
+# Dora Whisper Node for transforming speech to text
+
+## YAML Specification
+
+This node is supposed to be used as follows:
+
+```yaml
+- id: dora-distil-whisper
+  build: pip install dora-distil-whisper
+  path: dora-distil-whisper
+  inputs:
+    input: dora-vad/audio
+  outputs:
+    - text
+  env:
+    TARGET_LANGUAGE: english
+```
+
+## Examples
+
+- speech to text
+  - github: https://github.com/dora-rs/dora/blob/main/examples/speech-to-text
+  - website: https://dora-rs.ai/docs/examples/stt
+- vision language model
+  - github: https://github.com/dora-rs/dora/blob/main/examples/vlm
+  - website: https://dora-rs.ai/docs/examples/vlm
+
+## License
+
+Dora-whisper's code and model weights are released under the MIT License
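
The rewritten README wires the node's `text` output into a dataflow via the YAML above. A hedged sketch of a minimal downstream consumer that prints each transcription; the wiring and the assumption that `text` arrives as a one-element pyarrow string array (the sending side is not shown in this diff) are illustrative only:

```python
# Hypothetical listener node for the `text` output of dora-distil-whisper.
from dora import Node


def main():
    node = Node()
    for event in node:
        if event["type"] == "INPUT" and event["id"] == "text":
            # Assumes the whisper node sends pa.array([text]).
            print(event["value"][0].as_py())


if __name__ == "__main__":
    main()
```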
--- /dev/null
+++ dora_distil_whisper-0.3.9.dist-info/RECORD
@@ -0,0 +1,7 @@
+dora_distil_whisper/__init__.py,sha256=HuSK3dnyI9Pb5QAuaKFwQQ3J5SIZnLcKHPJO0norGzc,353
+dora_distil_whisper/main.py,sha256=MbT9nsEHxpyzcFzkDe4FjITzUeemh8LzCMmQSRL4xqo,4083
+dora_distil_whisper-0.3.9.dist-info/METADATA,sha256=Gxiyc_5VSjaQDduwawrH3VN2d0pKreJRC-_qaw7KKb4,1253
+dora_distil_whisper-0.3.9.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+dora_distil_whisper-0.3.9.dist-info/entry_points.txt,sha256=c6QYCZs0YVR2uasYHES67JhOOvTm5QbcwGk-9IrG9oM,70
+dora_distil_whisper-0.3.9.dist-info/top_level.txt,sha256=h5QH64SWnqZA83bx740-NTxfQKdeiKTLAdGqhnwKhuQ,20
+dora_distil_whisper-0.3.9.dist-info/RECORD,,
--- dora_distil_whisper-0.3.8.dist-info/WHEEL
+++ dora_distil_whisper-0.3.9.dist-info/WHEEL
@@ -1,4 +1,5 @@
 Wheel-Version: 1.0
-Generator: poetry-core 1.9.1
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
+
--- /dev/null
+++ dora_distil_whisper-0.3.9.dist-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+dora-distil-whisper = dora_distil_whisper.main:main
--- /dev/null
+++ dora_distil_whisper-0.3.9.dist-info/top_level.txt
@@ -0,0 +1 @@
+dora_distil_whisper
--- dora_distil_whisper-0.3.8.dist-info/METADATA
+++ /dev/null
@@ -1,32 +0,0 @@
-Metadata-Version: 2.1
-Name: dora-distil-whisper
-Version: 0.3.8
-Summary: Dora dora-distil-whisper
-Home-page: https://github.com/dora-rs/dora.git
-License: MIT
-Author: Haixuan Xavier Tao
-Author-email: tao.xavier@outlook.com
-Requires-Python: >=3.7,<4.0
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.7
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Dist: accelerate (>=0.29.2,<0.30.0)
-Requires-Dist: dora-rs (>=0.3.6,<0.4.0)
-Requires-Dist: modelscope (>=1.18.1,<2.0.0)
-Requires-Dist: numpy (<2.0.0)
-Requires-Dist: pyarrow (>=5.0.0)
-Requires-Dist: torch (>=2.2.0,<3.0.0)
-Requires-Dist: transformers (>=4.0.0,<5.0.0)
-Project-URL: Documentation, https://github.com/dora-rs/dora/blob/main/node-hub/dora-distil-whisper/README.md
-Description-Content-Type: text/markdown
-
-# Dora Node for transforming speech to text (English only)
-
-Check example at [examples/speech-to-text](examples/speech-to-text)
-
--- dora_distil_whisper-0.3.8.dist-info/RECORD
+++ /dev/null
@@ -1,6 +0,0 @@
-dora_distil_whisper/__init__.py,sha256=Gy4qL4vCeTyA5HR1Yp3ioL4-ClJyW8oi_38CzMuMsBM,358
-dora_distil_whisper/main.py,sha256=-lMXHjnBw0tWnQXyeoKkrbSC4w6F6UyHjzY0GT1EENs,3398
-dora_distil_whisper-0.3.8.dist-info/METADATA,sha256=Hpv9jDKCjy9vesIZLCwnth1Iqf8uUvo2yKfbib6yG7g,1256
-dora_distil_whisper-0.3.8.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-dora_distil_whisper-0.3.8.dist-info/entry_points.txt,sha256=Q_8wNgkDYxgoKETJjM6ewXWcr_yzRUgsSeBd0uetuRs,69
-dora_distil_whisper-0.3.8.dist-info/RECORD,,
--- dora_distil_whisper-0.3.8.dist-info/entry_points.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-[console_scripts]
-dora-distil-whisper=dora_distil_whisper.main:main
-