phonexia-enhanced-speech-to-text-built-on-whisper-client 1.10.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info → phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info}/METADATA +12 -21
- phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/RECORD +6 -0
- {phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info → phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info}/WHEEL +2 -1
- phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/entry_points.txt +2 -0
- phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/top_level.txt +1 -0
- phonexia_enhanced_speech_to_text_built_on_whisper_client.py +405 -178
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/RECORD +0 -5
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/entry_points.txt +0 -3
|
@@ -1,27 +1,18 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: phonexia-enhanced-speech-to-text-built-on-whisper-client
|
|
3
|
-
Version:
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Audio Quality Estimation Client
|
|
5
|
+
Author-email: Phonexia <info@phonexia.com>
|
|
5
6
|
Keywords: grpc,transcription,STT,ASR,speech to text,speech,language,microservice
|
|
6
|
-
|
|
7
|
-
Author-email: info@phonexia.com
|
|
8
|
-
Requires-Python: >=3.9,<4.0
|
|
9
|
-
Classifier: Programming Language :: Python :: 3
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
-
Requires-Dist: grpcio (>=1.54.0,<2.0.0)
|
|
16
|
-
Requires-Dist: numpy (<2.0.0) ; python_version < "3.12"
|
|
17
|
-
Requires-Dist: numpy (>=2.0.0) ; python_version >= "3.12"
|
|
18
|
-
Requires-Dist: phonexia-grpc (>=2.0.0,<3.0.0)
|
|
19
|
-
Requires-Dist: protobuf (>=5.0.0,<6.0.0)
|
|
20
|
-
Requires-Dist: soundfile (>=0.13.0,<0.14.0)
|
|
21
|
-
Project-URL: Homepage, https://phonexia.com
|
|
22
|
-
Project-URL: Issues, https://phonexia.atlassian.net/servicedesk/customer/portal/15/group/20/create/40
|
|
23
|
-
Project-URL: protofiles, https://github.com/phonexia/protofiles
|
|
7
|
+
Requires-Python: >=3.9
|
|
24
8
|
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: more-itertools>=10.6.0
|
|
10
|
+
Requires-Dist: phonexia-grpc>=2.26.0
|
|
11
|
+
Requires-Dist: numpy>=2.0.0; python_version >= "3.12"
|
|
12
|
+
Requires-Dist: numpy<2.0.0; python_version < "3.12"
|
|
13
|
+
Requires-Dist: typer>=0.16.0
|
|
14
|
+
Requires-Dist: soundfile>=0.13.0
|
|
15
|
+
Requires-Dist: py-ubjson>=0.16.1
|
|
25
16
|
|
|
26
17
|
|
|
27
18
|

|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
phonexia_enhanced_speech_to_text_built_on_whisper_client.py,sha256=xkhDL3LCyMw0QuZL3-ZeVa9Fx2AxKjgVlOtsZvKqujE,17202
|
|
2
|
+
phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/METADATA,sha256=isJHGgPYELCcAi1HYO0GGDsHE0veXaXs5KHRf43nXZU,1748
|
|
3
|
+
phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
4
|
+
phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/entry_points.txt,sha256=3t-DG5W0VDbM-4oqWylIKsJ5f-IDo8eJtvB8fpF9tOk,129
|
|
5
|
+
phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/top_level.txt,sha256=UrRc-bXR5jArOtIsymBnXsH9Z1wEWmGu8hB0n1A32Q8,57
|
|
6
|
+
phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
phonexia_enhanced_speech_to_text_built_on_whisper_client
|
|
@@ -1,15 +1,17 @@
|
|
|
1
|
-
import argparse
|
|
2
1
|
import json
|
|
3
2
|
import logging
|
|
4
|
-
import
|
|
3
|
+
import re
|
|
4
|
+
from collections.abc import Iterator
|
|
5
5
|
from datetime import datetime
|
|
6
6
|
from enum import Enum
|
|
7
|
-
from typing import
|
|
7
|
+
from typing import Annotated, BinaryIO, Optional, TextIO
|
|
8
8
|
|
|
9
|
-
import google.protobuf.duration_pb2
|
|
10
9
|
import grpc
|
|
10
|
+
import numpy as np
|
|
11
11
|
import phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2_grpc as stt_grpc
|
|
12
12
|
import soundfile
|
|
13
|
+
import typer
|
|
14
|
+
from google.protobuf.duration_pb2 import Duration
|
|
13
15
|
from google.protobuf.json_format import MessageToDict
|
|
14
16
|
from phonexia.grpc.common.core_pb2 import Audio, RawAudioConfig, TimeRange
|
|
15
17
|
from phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2 import (
|
|
@@ -22,25 +24,25 @@ from phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enha
|
|
|
22
24
|
CHUNK_SIZE = 32000
|
|
23
25
|
|
|
24
26
|
|
|
25
|
-
class
|
|
26
|
-
|
|
27
|
-
|
|
27
|
+
class LogLevel(str, Enum):
|
|
28
|
+
CRITICAL = "critical"
|
|
29
|
+
ERROR = "error"
|
|
30
|
+
WARNING = "warning"
|
|
31
|
+
INFO = "info"
|
|
32
|
+
DEBUG = "debug"
|
|
28
33
|
|
|
29
|
-
def __str__(self):
|
|
30
|
-
return self.value
|
|
31
34
|
|
|
32
|
-
|
|
33
|
-
def time_to_duration(time: float) -> Optional[google.protobuf.duration_pb2.Duration]:
|
|
35
|
+
def time_to_duration(time: Optional[float]) -> Optional[Duration]:
|
|
34
36
|
if time is None:
|
|
35
37
|
return None
|
|
36
|
-
duration =
|
|
38
|
+
duration = Duration()
|
|
37
39
|
duration.seconds = int(time)
|
|
38
40
|
duration.nanos = int((time - duration.seconds) * 1e9)
|
|
39
41
|
return duration
|
|
40
42
|
|
|
41
43
|
|
|
42
44
|
def transcribe_request_iterator(
|
|
43
|
-
file:
|
|
45
|
+
file: BinaryIO,
|
|
44
46
|
specified_language: Optional[str],
|
|
45
47
|
start: Optional[float],
|
|
46
48
|
end: Optional[float],
|
|
@@ -63,11 +65,15 @@ def transcribe_request_iterator(
|
|
|
63
65
|
encoding=RawAudioConfig.AudioEncoding.PCM16,
|
|
64
66
|
)
|
|
65
67
|
|
|
66
|
-
for data in r.blocks(blocksize=r.samplerate, dtype="
|
|
68
|
+
for data in r.blocks(blocksize=r.samplerate, dtype="float32"):
|
|
67
69
|
logging.debug("Sending chunk of size %d samples", len(data))
|
|
70
|
+
int16_info = np.iinfo(np.int16)
|
|
71
|
+
data_scaled = np.clip(
|
|
72
|
+
data * (int16_info.max + 1), int16_info.min, int16_info.max
|
|
73
|
+
).astype("int16")
|
|
68
74
|
yield TranscribeRequest(
|
|
69
75
|
audio=Audio(
|
|
70
|
-
content=
|
|
76
|
+
content=data_scaled.flatten().tobytes(),
|
|
71
77
|
time_range=time_range,
|
|
72
78
|
raw_audio_config=raw_audio_config,
|
|
73
79
|
),
|
|
@@ -77,22 +83,22 @@ def transcribe_request_iterator(
|
|
|
77
83
|
raw_audio_config = None
|
|
78
84
|
config = None
|
|
79
85
|
else:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
config = None
|
|
86
|
+
while chunk := file.read(CHUNK_SIZE):
|
|
87
|
+
yield TranscribeRequest(
|
|
88
|
+
audio=Audio(content=chunk, time_range=time_range), config=config
|
|
89
|
+
)
|
|
90
|
+
time_range = None
|
|
91
|
+
config = None
|
|
87
92
|
|
|
88
93
|
|
|
89
94
|
def translate_request_iterator(
|
|
90
|
-
file:
|
|
95
|
+
file: BinaryIO,
|
|
91
96
|
specified_language: Optional[str],
|
|
92
97
|
start: Optional[float],
|
|
93
98
|
end: Optional[float],
|
|
94
99
|
enable_language_switching: bool = False,
|
|
95
100
|
enable_word_segmentation: bool = False,
|
|
101
|
+
use_raw_audio: bool = False,
|
|
96
102
|
) -> Iterator[TranslateRequest]:
|
|
97
103
|
time_range = TimeRange(start=time_to_duration(start), end=time_to_duration(end))
|
|
98
104
|
config = TranslateConfig(
|
|
@@ -101,197 +107,418 @@ def translate_request_iterator(
|
|
|
101
107
|
enable_word_segmentation=enable_word_segmentation,
|
|
102
108
|
)
|
|
103
109
|
|
|
104
|
-
|
|
105
|
-
|
|
110
|
+
if use_raw_audio:
|
|
111
|
+
with soundfile.SoundFile(file) as r:
|
|
112
|
+
raw_audio_config = RawAudioConfig(
|
|
113
|
+
channels=r.channels,
|
|
114
|
+
sample_rate_hertz=r.samplerate,
|
|
115
|
+
encoding=RawAudioConfig.AudioEncoding.PCM16,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
for data in r.blocks(blocksize=r.samplerate, dtype="float32"):
|
|
119
|
+
logging.debug("Sending chunk of size %d samples", len(data))
|
|
120
|
+
int16_info = np.iinfo(np.int16)
|
|
121
|
+
data_scaled = np.clip(
|
|
122
|
+
data * (int16_info.max + 1), int16_info.min, int16_info.max
|
|
123
|
+
).astype("int16")
|
|
124
|
+
yield TranslateRequest(
|
|
125
|
+
audio=Audio(
|
|
126
|
+
content=data_scaled.flatten().tobytes(),
|
|
127
|
+
time_range=time_range,
|
|
128
|
+
raw_audio_config=raw_audio_config,
|
|
129
|
+
),
|
|
130
|
+
config=config,
|
|
131
|
+
)
|
|
132
|
+
time_range = None
|
|
133
|
+
raw_audio_config = None
|
|
134
|
+
config = None
|
|
135
|
+
else:
|
|
136
|
+
while chunk := file.read(CHUNK_SIZE):
|
|
106
137
|
yield TranslateRequest(audio=Audio(content=chunk, time_range=time_range), config=config)
|
|
107
138
|
time_range = None
|
|
108
139
|
config = None
|
|
109
140
|
|
|
110
141
|
|
|
111
|
-
def
|
|
142
|
+
def write_result(
|
|
143
|
+
audio_path: str,
|
|
144
|
+
responses: list,
|
|
145
|
+
output: TextIO,
|
|
146
|
+
language: Optional[str],
|
|
147
|
+
):
|
|
148
|
+
logging.info(f"{audio_path!s} -> {output.name}")
|
|
149
|
+
|
|
150
|
+
# Aggregate all responses
|
|
151
|
+
response_dict = None
|
|
152
|
+
|
|
153
|
+
for _response in responses:
|
|
154
|
+
if not response_dict:
|
|
155
|
+
response_dict = MessageToDict(
|
|
156
|
+
message=_response,
|
|
157
|
+
always_print_fields_with_no_presence=True,
|
|
158
|
+
preserving_proto_field_name=True,
|
|
159
|
+
)
|
|
160
|
+
else:
|
|
161
|
+
response_dict["result"]["one_best"]["segments"] += \
|
|
162
|
+
MessageToDict(
|
|
163
|
+
message=_response,
|
|
164
|
+
always_print_fields_with_no_presence=True,
|
|
165
|
+
preserving_proto_field_name=True,
|
|
166
|
+
)["result"]["one_best"]["segments"] # fmt: skip
|
|
167
|
+
|
|
168
|
+
json.dump(response_dict, output, indent=2, ensure_ascii=False)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def translate_impl(
|
|
112
172
|
channel: grpc.Channel,
|
|
113
|
-
file:
|
|
173
|
+
file: BinaryIO,
|
|
174
|
+
output: TextIO,
|
|
114
175
|
language: Optional[str],
|
|
115
176
|
start: Optional[float],
|
|
116
177
|
end: Optional[float],
|
|
117
178
|
metadata: Optional[list],
|
|
118
|
-
task: Task,
|
|
119
179
|
enable_language_switching: bool = False,
|
|
120
180
|
enable_word_segmentation: bool = False,
|
|
121
181
|
use_raw_audio: bool = False,
|
|
122
182
|
):
|
|
183
|
+
logging.info("Processing audio file with translate")
|
|
123
184
|
stub = stt_grpc.SpeechToTextStub(channel)
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
translate_request_iterator(
|
|
140
|
-
file=file,
|
|
141
|
-
specified_language=language,
|
|
142
|
-
start=start,
|
|
143
|
-
end=end,
|
|
144
|
-
enable_language_switching=enable_language_switching,
|
|
145
|
-
enable_word_segmentation=enable_word_segmentation,
|
|
146
|
-
),
|
|
147
|
-
metadata=metadata,
|
|
148
|
-
)
|
|
149
|
-
else:
|
|
150
|
-
raise RuntimeError("Unknown task")
|
|
185
|
+
response = stub.Translate(
|
|
186
|
+
translate_request_iterator(
|
|
187
|
+
file=file,
|
|
188
|
+
specified_language=language,
|
|
189
|
+
start=start,
|
|
190
|
+
end=end,
|
|
191
|
+
enable_language_switching=enable_language_switching,
|
|
192
|
+
enable_word_segmentation=enable_word_segmentation,
|
|
193
|
+
use_raw_audio=use_raw_audio,
|
|
194
|
+
),
|
|
195
|
+
metadata=metadata,
|
|
196
|
+
)
|
|
197
|
+
# Collect all responses
|
|
198
|
+
responses = list(response)
|
|
199
|
+
write_result(file.name, responses, output, language)
|
|
151
200
|
|
|
152
|
-
info_message = []
|
|
153
|
-
response_dict = None
|
|
154
|
-
for _response in response:
|
|
155
|
-
if not response_dict:
|
|
156
|
-
response_dict = MessageToDict(_response)
|
|
157
|
-
else:
|
|
158
|
-
response_dict["result"]["oneBest"]["segments"] += \
|
|
159
|
-
MessageToDict(_response)["result"]["oneBest"]["segments"] # fmt: skip
|
|
160
|
-
|
|
161
|
-
for segment in _response.result.one_best.segments:
|
|
162
|
-
if segment.source_language != segment.detected_source_language:
|
|
163
|
-
info_message.append(
|
|
164
|
-
f"Language '{segment.detected_source_language}' was detected in the audio, but instead "
|
|
165
|
-
f"the segment was {'transcribed' if task == Task.transcribe else 'translated'} with the "
|
|
166
|
-
+ (
|
|
167
|
-
f"closest available source language '{segment.source_language}'"
|
|
168
|
-
if language is None
|
|
169
|
-
else f"language '{language}' that was enforced by the '--language' argument"
|
|
170
|
-
)
|
|
171
|
-
)
|
|
172
201
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
202
|
+
def transcribe_impl(
|
|
203
|
+
channel: grpc.Channel,
|
|
204
|
+
file: BinaryIO,
|
|
205
|
+
output: TextIO,
|
|
206
|
+
language: Optional[str],
|
|
207
|
+
start: Optional[float],
|
|
208
|
+
end: Optional[float],
|
|
209
|
+
metadata: Optional[list],
|
|
210
|
+
enable_language_switching: bool = False,
|
|
211
|
+
enable_word_segmentation: bool = False,
|
|
212
|
+
use_raw_audio: bool = False,
|
|
213
|
+
):
|
|
214
|
+
logging.info("Processing audio file with transcribe")
|
|
215
|
+
stub = stt_grpc.SpeechToTextStub(channel)
|
|
216
|
+
response = stub.Transcribe(
|
|
217
|
+
transcribe_request_iterator(
|
|
218
|
+
file=file,
|
|
219
|
+
specified_language=language,
|
|
220
|
+
start=start,
|
|
221
|
+
end=end,
|
|
222
|
+
enable_language_switching=enable_language_switching,
|
|
223
|
+
enable_word_segmentation=enable_word_segmentation,
|
|
224
|
+
use_raw_audio=use_raw_audio,
|
|
225
|
+
),
|
|
226
|
+
metadata=metadata,
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
# Collect all responses
|
|
230
|
+
responses = list(response)
|
|
231
|
+
write_result(file.name, responses, output, language)
|
|
178
232
|
|
|
179
233
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
234
|
+
# Helper functions
|
|
235
|
+
def _parse_time_range(time_range: str) -> tuple[Optional[float], Optional[float]]:
|
|
236
|
+
if time_range is None:
|
|
237
|
+
return None, None
|
|
238
|
+
|
|
239
|
+
if len(time_range) == 0:
|
|
240
|
+
raise typer.BadParameter("Parameter 'time_range' must be of the form '[START]:[END]'.")
|
|
241
|
+
|
|
242
|
+
# Regex pattern to match [START]:[END] format where START and END are positive floats
|
|
243
|
+
pattern = r"^(\d+(?:\.\d+)?)?:(\d+(?:\.\d+)?)?$"
|
|
244
|
+
match = re.match(pattern, time_range.strip())
|
|
245
|
+
|
|
246
|
+
if not match:
|
|
247
|
+
raise typer.BadParameter(
|
|
248
|
+
"Parameter 'time_range' must be of the form '[START]:[END]' where START and END are positive float numbers."
|
|
185
249
|
)
|
|
186
|
-
)
|
|
187
250
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
type=str,
|
|
192
|
-
default="localhost:8080",
|
|
193
|
-
help="Server address, default: localhost:8080",
|
|
194
|
-
)
|
|
195
|
-
parser.add_argument(
|
|
196
|
-
"-l",
|
|
197
|
-
"--log_level",
|
|
198
|
-
type=str,
|
|
199
|
-
default="error",
|
|
200
|
-
choices=["critical", "error", "warning", "info", "debug"],
|
|
201
|
-
)
|
|
202
|
-
parser.add_argument(
|
|
203
|
-
"--metadata",
|
|
204
|
-
metavar="key=value",
|
|
205
|
-
nargs="+",
|
|
206
|
-
type=lambda x: tuple(x.split("=")),
|
|
207
|
-
help="Custom client metadata",
|
|
208
|
-
)
|
|
209
|
-
parser.add_argument("--use_ssl", action="store_true", help="Use SSL connection")
|
|
210
|
-
parser.add_argument("--start", type=float, help="Audio start time")
|
|
211
|
-
parser.add_argument("--end", type=float, help="Audio end time")
|
|
212
|
-
|
|
213
|
-
parser.add_argument(
|
|
214
|
-
"--language",
|
|
215
|
-
type=str,
|
|
216
|
-
default=None,
|
|
217
|
-
help=(
|
|
218
|
-
"Force transcription to specified language, if not set, language is detected"
|
|
219
|
-
" automatically"
|
|
220
|
-
),
|
|
221
|
-
)
|
|
222
|
-
parser.add_argument(
|
|
223
|
-
"--task",
|
|
224
|
-
type=Task,
|
|
225
|
-
default=Task.transcribe,
|
|
226
|
-
choices=list(Task),
|
|
227
|
-
help="Select whether to transcribe or translate the recording",
|
|
228
|
-
)
|
|
229
|
-
parser.add_argument(
|
|
230
|
-
"--enable-language-switching",
|
|
231
|
-
action="store_true",
|
|
232
|
-
help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds",
|
|
233
|
-
)
|
|
234
|
-
parser.add_argument(
|
|
235
|
-
"--enable-word-segmentation",
|
|
236
|
-
action="store_true",
|
|
237
|
-
help="Enable word-level transcription. Note: Enabling this option may increase processing time",
|
|
238
|
-
)
|
|
239
|
-
parser.add_argument("file", type=str, help="Path to input file")
|
|
240
|
-
parser.add_argument("--use_raw_audio", action="store_true", help="Send a raw audio in")
|
|
251
|
+
# Parse START and END from regex groups
|
|
252
|
+
start_str = match.group(1)
|
|
253
|
+
end_str = match.group(2)
|
|
241
254
|
|
|
242
|
-
|
|
255
|
+
start = float(start_str) if start_str is not None else None
|
|
256
|
+
end = float(end_str) if end_str is not None else None
|
|
243
257
|
|
|
244
|
-
if
|
|
245
|
-
raise
|
|
258
|
+
if start is not None and end is not None and start >= end:
|
|
259
|
+
raise typer.BadParameter("Parameter 'end' must be larger than 'start'.")
|
|
246
260
|
|
|
247
|
-
if
|
|
248
|
-
raise ValueError("Parameter 'end' must be a positive float.")
|
|
261
|
+
return (None if start == 0.0 else start, end)
|
|
249
262
|
|
|
250
|
-
if args.start is not None and args.end is not None and args.start >= args.end:
|
|
251
|
-
raise ValueError("Parameter 'end' must be larger than 'start'.")
|
|
252
263
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
264
|
+
def _parse_metadata_callback(
|
|
265
|
+
ctx: typer.Context, metadata_list: Optional[list[str]]
|
|
266
|
+
) -> list[tuple[str, str]]:
|
|
267
|
+
if ctx.resilient_parsing or metadata_list is None:
|
|
268
|
+
return []
|
|
269
|
+
|
|
270
|
+
params = []
|
|
271
|
+
for item in metadata_list:
|
|
272
|
+
t = tuple(item.split("=", 1))
|
|
273
|
+
if len(t) != 2:
|
|
274
|
+
raise typer.BadParameter(f"Metadata must be in format 'KEY=VALUE': {item}")
|
|
275
|
+
params.append(t)
|
|
276
|
+
return params
|
|
258
277
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
278
|
+
|
|
279
|
+
app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True)
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
@app.command()
|
|
283
|
+
def translate(
|
|
284
|
+
ctx: typer.Context,
|
|
285
|
+
input_file: Annotated[
|
|
286
|
+
typer.FileBinaryRead,
|
|
287
|
+
typer.Argument(
|
|
288
|
+
help="Input audio file path.",
|
|
289
|
+
),
|
|
290
|
+
] = "-",
|
|
291
|
+
time_range: Annotated[
|
|
292
|
+
Optional[str],
|
|
293
|
+
typer.Option(
|
|
294
|
+
"-t",
|
|
295
|
+
"--time-range",
|
|
296
|
+
callback=_parse_time_range,
|
|
297
|
+
metavar="[START]:[END]",
|
|
298
|
+
help=(
|
|
299
|
+
"Time range in seconds using format [START]:[END] where START and END are positive float numbers. "
|
|
300
|
+
"START can be omitted to process from beginning, END can be omitted to process to the end of the recording. "
|
|
301
|
+
"Examples: --time-range :10 (0 to 10), --time-range 10.1: (10.1 to end), --time-range 5:10 (5 to 10)."
|
|
302
|
+
),
|
|
303
|
+
),
|
|
304
|
+
] = None,
|
|
305
|
+
language: Annotated[
|
|
306
|
+
Optional[str],
|
|
307
|
+
typer.Option(
|
|
308
|
+
"--language",
|
|
309
|
+
help=(
|
|
310
|
+
"Force transcription to specified language, if not set, language is detected "
|
|
311
|
+
"automatically."
|
|
312
|
+
),
|
|
313
|
+
),
|
|
314
|
+
] = None,
|
|
315
|
+
enable_language_switching: Annotated[
|
|
316
|
+
bool,
|
|
317
|
+
typer.Option(
|
|
318
|
+
"--enable-language-switching",
|
|
319
|
+
help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds.",
|
|
320
|
+
),
|
|
321
|
+
] = False,
|
|
322
|
+
enable_word_segmentation: Annotated[
|
|
323
|
+
bool,
|
|
324
|
+
typer.Option(
|
|
325
|
+
"--enable-word-segmentation",
|
|
326
|
+
help="Enable word-level transcription. Note: Enabling this option may increase processing time.",
|
|
327
|
+
),
|
|
328
|
+
] = False,
|
|
329
|
+
use_raw_audio: Annotated[
|
|
330
|
+
bool,
|
|
331
|
+
typer.Option(
|
|
332
|
+
"--use-raw-audio",
|
|
333
|
+
help="Send raw audio in chunks. Enables continuous audio processing with less server memory usage.",
|
|
334
|
+
),
|
|
335
|
+
] = False,
|
|
336
|
+
output: Annotated[
|
|
337
|
+
typer.FileTextWrite,
|
|
338
|
+
typer.Option(
|
|
339
|
+
"--output", "-o", help="Output file path. If omitted, prints to stdout.", lazy=False
|
|
340
|
+
),
|
|
341
|
+
] = "-",
|
|
342
|
+
) -> None:
|
|
343
|
+
"""Translates input audio into segments with timestamps."""
|
|
262
344
|
|
|
263
345
|
try:
|
|
264
|
-
logging.info(f"Connecting to {
|
|
265
|
-
|
|
266
|
-
grpc.
|
|
267
|
-
if
|
|
268
|
-
else grpc.
|
|
269
|
-
|
|
346
|
+
logging.info(f"Connecting to {ctx.obj['host']}")
|
|
347
|
+
with (
|
|
348
|
+
grpc.insecure_channel(target=ctx.obj["host"])
|
|
349
|
+
if ctx.obj["plaintext"]
|
|
350
|
+
else grpc.secure_channel(
|
|
351
|
+
target=ctx.obj["host"], credentials=grpc.ssl_channel_credentials()
|
|
352
|
+
)
|
|
353
|
+
) as channel:
|
|
354
|
+
start_time = datetime.now()
|
|
355
|
+
|
|
356
|
+
translate_impl(
|
|
357
|
+
channel=channel,
|
|
358
|
+
file=input_file,
|
|
359
|
+
output=output,
|
|
360
|
+
language=language,
|
|
361
|
+
start=time_range[0],
|
|
362
|
+
end=time_range[1],
|
|
363
|
+
metadata=ctx.obj["metadata"],
|
|
364
|
+
enable_language_switching=enable_language_switching,
|
|
365
|
+
enable_word_segmentation=enable_word_segmentation,
|
|
366
|
+
use_raw_audio=use_raw_audio,
|
|
367
|
+
)
|
|
270
368
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
369
|
+
logging.debug(f"Elapsed time {(datetime.now() - start_time)}")
|
|
370
|
+
|
|
371
|
+
except grpc.RpcError:
|
|
372
|
+
logging.exception("RPC failed")
|
|
373
|
+
raise typer.Exit(code=1) from None
|
|
374
|
+
except (typer.Exit, typer.BadParameter):
|
|
375
|
+
raise
|
|
376
|
+
except Exception:
|
|
377
|
+
logging.exception("Unknown error")
|
|
378
|
+
raise typer.Exit(code=2) from None
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
@app.command()
|
|
382
|
+
def transcribe(
|
|
383
|
+
ctx: typer.Context,
|
|
384
|
+
input_file: Annotated[
|
|
385
|
+
typer.FileBinaryRead,
|
|
386
|
+
typer.Argument(
|
|
387
|
+
help="Input audio file path.",
|
|
388
|
+
),
|
|
389
|
+
] = "-",
|
|
390
|
+
time_range: Annotated[
|
|
391
|
+
Optional[str],
|
|
392
|
+
typer.Option(
|
|
393
|
+
"-t",
|
|
394
|
+
"--time-range",
|
|
395
|
+
callback=_parse_time_range,
|
|
396
|
+
metavar="[START]:[END]",
|
|
397
|
+
help=(
|
|
398
|
+
"Time range in seconds using format [START]:[END] where START and END are positive float numbers. "
|
|
399
|
+
"START can be omitted to process from beginning, END can be omitted to process to the end of the recording. "
|
|
400
|
+
"Examples: --time-range :10 (0 to 10), --time-range 10.1: (10.1 to end), --time-range 5:10 (5 to 10)."
|
|
401
|
+
),
|
|
402
|
+
),
|
|
403
|
+
] = None,
|
|
404
|
+
language: Annotated[
|
|
405
|
+
Optional[str],
|
|
406
|
+
typer.Option(
|
|
407
|
+
"--language",
|
|
408
|
+
help=(
|
|
409
|
+
"Force transcription to specified language, if not set, language is detected "
|
|
410
|
+
"automatically."
|
|
411
|
+
),
|
|
412
|
+
),
|
|
413
|
+
] = None,
|
|
414
|
+
enable_language_switching: Annotated[
|
|
415
|
+
bool,
|
|
416
|
+
typer.Option(
|
|
417
|
+
"--enable-language-switching",
|
|
418
|
+
help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds.",
|
|
419
|
+
),
|
|
420
|
+
] = False,
|
|
421
|
+
enable_word_segmentation: Annotated[
|
|
422
|
+
bool,
|
|
423
|
+
typer.Option(
|
|
424
|
+
"--enable-word-segmentation",
|
|
425
|
+
help="Enable word-level transcription. Note: Enabling this option may increase processing time.",
|
|
426
|
+
),
|
|
427
|
+
] = False,
|
|
428
|
+
use_raw_audio: Annotated[
|
|
429
|
+
bool,
|
|
430
|
+
typer.Option(
|
|
431
|
+
"--use-raw-audio",
|
|
432
|
+
help="Send raw audio in chunks. Enables continuous audio processing with less server memory usage.",
|
|
433
|
+
),
|
|
434
|
+
] = False,
|
|
435
|
+
output: Annotated[
|
|
436
|
+
typer.FileTextWrite,
|
|
437
|
+
typer.Option(
|
|
438
|
+
"--output", "-o", help="Output file path. If omitted, prints to stdout.", lazy=False
|
|
439
|
+
),
|
|
440
|
+
] = "-",
|
|
441
|
+
) -> None:
|
|
442
|
+
"""Transcribes input audio into segments with timestamps."""
|
|
443
|
+
|
|
444
|
+
try:
|
|
445
|
+
logging.info(f"Connecting to {ctx.obj['host']}")
|
|
446
|
+
with (
|
|
447
|
+
grpc.insecure_channel(target=ctx.obj["host"])
|
|
448
|
+
if ctx.obj["plaintext"]
|
|
449
|
+
else grpc.secure_channel(
|
|
450
|
+
target=ctx.obj["host"], credentials=grpc.ssl_channel_credentials()
|
|
451
|
+
)
|
|
452
|
+
) as channel:
|
|
453
|
+
start_time = datetime.now()
|
|
454
|
+
|
|
455
|
+
transcribe_impl(
|
|
456
|
+
channel=channel,
|
|
457
|
+
file=input_file,
|
|
458
|
+
output=output,
|
|
459
|
+
language=language,
|
|
460
|
+
start=time_range[0],
|
|
461
|
+
end=time_range[1],
|
|
462
|
+
metadata=ctx.obj["metadata"],
|
|
463
|
+
enable_language_switching=enable_language_switching,
|
|
464
|
+
enable_word_segmentation=enable_word_segmentation,
|
|
465
|
+
use_raw_audio=use_raw_audio,
|
|
466
|
+
)
|
|
285
467
|
|
|
286
|
-
|
|
468
|
+
logging.debug(f"Elapsed time {(datetime.now() - start_time)}")
|
|
287
469
|
|
|
288
470
|
except grpc.RpcError:
|
|
289
471
|
logging.exception("RPC failed")
|
|
290
|
-
|
|
472
|
+
raise typer.Exit(code=1) from None
|
|
473
|
+
except (typer.Exit, typer.BadParameter):
|
|
474
|
+
raise
|
|
291
475
|
except Exception:
|
|
292
476
|
logging.exception("Unknown error")
|
|
293
|
-
|
|
477
|
+
raise typer.Exit(code=2) from None
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
@app.callback()
|
|
481
|
+
def cli(
|
|
482
|
+
ctx: typer.Context,
|
|
483
|
+
host: Annotated[
|
|
484
|
+
str,
|
|
485
|
+
typer.Option("--host", "-H", help="Server address (host:port)."),
|
|
486
|
+
] = "localhost:8080",
|
|
487
|
+
log_level: Annotated[
|
|
488
|
+
LogLevel, typer.Option("--log-level", "-l", help="Logging level.")
|
|
489
|
+
] = LogLevel.ERROR,
|
|
490
|
+
metadata: Annotated[
|
|
491
|
+
list[str],
|
|
492
|
+
typer.Option(
|
|
493
|
+
"--metadata",
|
|
494
|
+
metavar="key=value",
|
|
495
|
+
help="Custom client metadata.",
|
|
496
|
+
show_default=False,
|
|
497
|
+
callback=_parse_metadata_callback,
|
|
498
|
+
),
|
|
499
|
+
] = [],
|
|
500
|
+
plaintext: Annotated[
|
|
501
|
+
bool,
|
|
502
|
+
typer.Option(
|
|
503
|
+
"--plaintext", help="Use plain-text HTTP/2 when connecting to server (no TLS)."
|
|
504
|
+
),
|
|
505
|
+
] = False,
|
|
506
|
+
) -> None:
|
|
507
|
+
"""Enhanced Speech to Text Built on Whisper gRPC client."""
|
|
508
|
+
|
|
509
|
+
ctx.obj = {
|
|
510
|
+
"host": host,
|
|
511
|
+
"metadata": metadata,
|
|
512
|
+
"log_level": log_level,
|
|
513
|
+
"plaintext": plaintext,
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
logging.basicConfig(
|
|
517
|
+
level=log_level.value.upper(),
|
|
518
|
+
format="[%(asctime)s.%(msecs)03d] [%(levelname)s] %(message)s",
|
|
519
|
+
datefmt="%Y-%m-%d %H:%M:%S",
|
|
520
|
+
)
|
|
294
521
|
|
|
295
522
|
|
|
296
523
|
if __name__ == "__main__":
|
|
297
|
-
|
|
524
|
+
app()
|
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
phonexia_enhanced_speech_to_text_built_on_whisper_client.py,sha256=acZwljE4vRaKtf71f6Cm75c81iYhrmq59FoQIz5k0kI,9928
|
|
2
|
-
phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/METADATA,sha256=c9vqgI_gYaz7EFROo2Sq_36MUEp0rxDKRSRnRiKhsVs,2343
|
|
3
|
-
phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
|
4
|
-
phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/entry_points.txt,sha256=RZ7mWDaVGagDYxjXloW7ndadXlJVwg9Xov0gqvPTqHs,129
|
|
5
|
-
phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/RECORD,,
|