phonexia-enhanced-speech-to-text-built-on-whisper-client 1.10.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,27 +1,18 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: phonexia-enhanced-speech-to-text-built-on-whisper-client
3
- Version: 1.10.0
4
- Summary: Client for communication with Phonexia Enhanced Speech To Text Built On Whisper microservice.
3
+ Version: 2.0.0
4
+ Summary: Audio Quality Estimation Client
5
+ Author-email: Phonexia <info@phonexia.com>
5
6
  Keywords: grpc,transcription,STT,ASR,speech to text,speech,language,microservice
6
- Author: Phonexia
7
- Author-email: info@phonexia.com
8
- Requires-Python: >=3.9,<4.0
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Programming Language :: Python :: 3.9
11
- Classifier: Programming Language :: Python :: 3.10
12
- Classifier: Programming Language :: Python :: 3.11
13
- Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
15
- Requires-Dist: grpcio (>=1.54.0,<2.0.0)
16
- Requires-Dist: numpy (<2.0.0) ; python_version < "3.12"
17
- Requires-Dist: numpy (>=2.0.0) ; python_version >= "3.12"
18
- Requires-Dist: phonexia-grpc (>=2.0.0,<3.0.0)
19
- Requires-Dist: protobuf (>=5.0.0,<6.0.0)
20
- Requires-Dist: soundfile (>=0.13.0,<0.14.0)
21
- Project-URL: Homepage, https://phonexia.com
22
- Project-URL: Issues, https://phonexia.atlassian.net/servicedesk/customer/portal/15/group/20/create/40
23
- Project-URL: protofiles, https://github.com/phonexia/protofiles
7
+ Requires-Python: >=3.9
24
8
  Description-Content-Type: text/markdown
9
+ Requires-Dist: more-itertools>=10.6.0
10
+ Requires-Dist: phonexia-grpc>=2.26.0
11
+ Requires-Dist: numpy>=2.0.0; python_version >= "3.12"
12
+ Requires-Dist: numpy<2.0.0; python_version < "3.12"
13
+ Requires-Dist: typer>=0.16.0
14
+ Requires-Dist: soundfile>=0.13.0
15
+ Requires-Dist: py-ubjson>=0.16.1
25
16
 
26
17
 
27
18
  ![](https://www.phonexia.com/wp-content/uploads/PHX_logotype_basic_2016_positive_transparent_RGB.png)
@@ -0,0 +1,6 @@
1
+ phonexia_enhanced_speech_to_text_built_on_whisper_client.py,sha256=xkhDL3LCyMw0QuZL3-ZeVa9Fx2AxKjgVlOtsZvKqujE,17202
2
+ phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/METADATA,sha256=isJHGgPYELCcAi1HYO0GGDsHE0veXaXs5KHRf43nXZU,1748
3
+ phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
4
+ phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/entry_points.txt,sha256=3t-DG5W0VDbM-4oqWylIKsJ5f-IDo8eJtvB8fpF9tOk,129
5
+ phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/top_level.txt,sha256=UrRc-bXR5jArOtIsymBnXsH9Z1wEWmGu8hB0n1A32Q8,57
6
+ phonexia_enhanced_speech_to_text_built_on_whisper_client-2.0.0.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.2
2
+ Generator: setuptools (80.9.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ enhanced_speech_to_text_built_on_whisper_client = phonexia_enhanced_speech_to_text_built_on_whisper_client:app
@@ -0,0 +1 @@
1
+ phonexia_enhanced_speech_to_text_built_on_whisper_client
@@ -1,15 +1,17 @@
1
- import argparse
2
1
  import json
3
2
  import logging
4
- import os
3
+ import re
4
+ from collections.abc import Iterator
5
5
  from datetime import datetime
6
6
  from enum import Enum
7
- from typing import Iterator, Optional
7
+ from typing import Annotated, BinaryIO, Optional, TextIO
8
8
 
9
- import google.protobuf.duration_pb2
10
9
  import grpc
10
+ import numpy as np
11
11
  import phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2_grpc as stt_grpc
12
12
  import soundfile
13
+ import typer
14
+ from google.protobuf.duration_pb2 import Duration
13
15
  from google.protobuf.json_format import MessageToDict
14
16
  from phonexia.grpc.common.core_pb2 import Audio, RawAudioConfig, TimeRange
15
17
  from phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2 import (
@@ -22,25 +24,25 @@ from phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enha
22
24
  CHUNK_SIZE = 32000
23
25
 
24
26
 
25
- class Task(Enum):
26
- transcribe = "transcribe"
27
- translate = "translate"
27
+ class LogLevel(str, Enum):
28
+ CRITICAL = "critical"
29
+ ERROR = "error"
30
+ WARNING = "warning"
31
+ INFO = "info"
32
+ DEBUG = "debug"
28
33
 
29
- def __str__(self):
30
- return self.value
31
34
 
32
-
33
- def time_to_duration(time: float) -> Optional[google.protobuf.duration_pb2.Duration]:
35
+ def time_to_duration(time: Optional[float]) -> Optional[Duration]:
34
36
  if time is None:
35
37
  return None
36
- duration = google.protobuf.duration_pb2.Duration()
38
+ duration = Duration()
37
39
  duration.seconds = int(time)
38
40
  duration.nanos = int((time - duration.seconds) * 1e9)
39
41
  return duration
40
42
 
41
43
 
42
44
  def transcribe_request_iterator(
43
- file: str,
45
+ file: BinaryIO,
44
46
  specified_language: Optional[str],
45
47
  start: Optional[float],
46
48
  end: Optional[float],
@@ -63,11 +65,15 @@ def transcribe_request_iterator(
63
65
  encoding=RawAudioConfig.AudioEncoding.PCM16,
64
66
  )
65
67
 
66
- for data in r.blocks(blocksize=r.samplerate, dtype="int16"):
68
+ for data in r.blocks(blocksize=r.samplerate, dtype="float32"):
67
69
  logging.debug("Sending chunk of size %d samples", len(data))
70
+ int16_info = np.iinfo(np.int16)
71
+ data_scaled = np.clip(
72
+ data * (int16_info.max + 1), int16_info.min, int16_info.max
73
+ ).astype("int16")
68
74
  yield TranscribeRequest(
69
75
  audio=Audio(
70
- content=data.flatten().tobytes(),
76
+ content=data_scaled.flatten().tobytes(),
71
77
  time_range=time_range,
72
78
  raw_audio_config=raw_audio_config,
73
79
  ),
@@ -77,22 +83,22 @@ def transcribe_request_iterator(
77
83
  raw_audio_config = None
78
84
  config = None
79
85
  else:
80
- with open(file, "rb") as f:
81
- while chunk := f.read(CHUNK_SIZE):
82
- yield TranscribeRequest(
83
- audio=Audio(content=chunk, time_range=time_range), config=config
84
- )
85
- time_range = None
86
- config = None
86
+ while chunk := file.read(CHUNK_SIZE):
87
+ yield TranscribeRequest(
88
+ audio=Audio(content=chunk, time_range=time_range), config=config
89
+ )
90
+ time_range = None
91
+ config = None
87
92
 
88
93
 
89
94
  def translate_request_iterator(
90
- file: str,
95
+ file: BinaryIO,
91
96
  specified_language: Optional[str],
92
97
  start: Optional[float],
93
98
  end: Optional[float],
94
99
  enable_language_switching: bool = False,
95
100
  enable_word_segmentation: bool = False,
101
+ use_raw_audio: bool = False,
96
102
  ) -> Iterator[TranslateRequest]:
97
103
  time_range = TimeRange(start=time_to_duration(start), end=time_to_duration(end))
98
104
  config = TranslateConfig(
@@ -101,197 +107,418 @@ def translate_request_iterator(
101
107
  enable_word_segmentation=enable_word_segmentation,
102
108
  )
103
109
 
104
- with open(file, "rb") as f:
105
- while chunk := f.read(CHUNK_SIZE):
110
+ if use_raw_audio:
111
+ with soundfile.SoundFile(file) as r:
112
+ raw_audio_config = RawAudioConfig(
113
+ channels=r.channels,
114
+ sample_rate_hertz=r.samplerate,
115
+ encoding=RawAudioConfig.AudioEncoding.PCM16,
116
+ )
117
+
118
+ for data in r.blocks(blocksize=r.samplerate, dtype="float32"):
119
+ logging.debug("Sending chunk of size %d samples", len(data))
120
+ int16_info = np.iinfo(np.int16)
121
+ data_scaled = np.clip(
122
+ data * (int16_info.max + 1), int16_info.min, int16_info.max
123
+ ).astype("int16")
124
+ yield TranslateRequest(
125
+ audio=Audio(
126
+ content=data_scaled.flatten().tobytes(),
127
+ time_range=time_range,
128
+ raw_audio_config=raw_audio_config,
129
+ ),
130
+ config=config,
131
+ )
132
+ time_range = None
133
+ raw_audio_config = None
134
+ config = None
135
+ else:
136
+ while chunk := file.read(CHUNK_SIZE):
106
137
  yield TranslateRequest(audio=Audio(content=chunk, time_range=time_range), config=config)
107
138
  time_range = None
108
139
  config = None
109
140
 
110
141
 
111
- def transcribe(
142
+ def write_result(
143
+ audio_path: str,
144
+ responses: list,
145
+ output: TextIO,
146
+ language: Optional[str],
147
+ ):
148
+ logging.info(f"{audio_path!s} -> {output.name}")
149
+
150
+ # Aggregate all responses
151
+ response_dict = None
152
+
153
+ for _response in responses:
154
+ if not response_dict:
155
+ response_dict = MessageToDict(
156
+ message=_response,
157
+ always_print_fields_with_no_presence=True,
158
+ preserving_proto_field_name=True,
159
+ )
160
+ else:
161
+ response_dict["result"]["one_best"]["segments"] += \
162
+ MessageToDict(
163
+ message=_response,
164
+ always_print_fields_with_no_presence=True,
165
+ preserving_proto_field_name=True,
166
+ )["result"]["one_best"]["segments"] # fmt: skip
167
+
168
+ json.dump(response_dict, output, indent=2, ensure_ascii=False)
169
+
170
+
171
+ def translate_impl(
112
172
  channel: grpc.Channel,
113
- file: str,
173
+ file: BinaryIO,
174
+ output: TextIO,
114
175
  language: Optional[str],
115
176
  start: Optional[float],
116
177
  end: Optional[float],
117
178
  metadata: Optional[list],
118
- task: Task,
119
179
  enable_language_switching: bool = False,
120
180
  enable_word_segmentation: bool = False,
121
181
  use_raw_audio: bool = False,
122
182
  ):
183
+ logging.info("Processing audio file with translate")
123
184
  stub = stt_grpc.SpeechToTextStub(channel)
124
- if task == Task.transcribe:
125
- response = stub.Transcribe(
126
- transcribe_request_iterator(
127
- file=file,
128
- specified_language=language,
129
- start=start,
130
- end=end,
131
- enable_language_switching=enable_language_switching,
132
- enable_word_segmentation=enable_word_segmentation,
133
- use_raw_audio=use_raw_audio,
134
- ),
135
- metadata=metadata,
136
- )
137
- elif task == Task.translate:
138
- response = stub.Translate(
139
- translate_request_iterator(
140
- file=file,
141
- specified_language=language,
142
- start=start,
143
- end=end,
144
- enable_language_switching=enable_language_switching,
145
- enable_word_segmentation=enable_word_segmentation,
146
- ),
147
- metadata=metadata,
148
- )
149
- else:
150
- raise RuntimeError("Unknown task")
185
+ response = stub.Translate(
186
+ translate_request_iterator(
187
+ file=file,
188
+ specified_language=language,
189
+ start=start,
190
+ end=end,
191
+ enable_language_switching=enable_language_switching,
192
+ enable_word_segmentation=enable_word_segmentation,
193
+ use_raw_audio=use_raw_audio,
194
+ ),
195
+ metadata=metadata,
196
+ )
197
+ # Collect all responses
198
+ responses = list(response)
199
+ write_result(file.name, responses, output, language)
151
200
 
152
- info_message = []
153
- response_dict = None
154
- for _response in response:
155
- if not response_dict:
156
- response_dict = MessageToDict(_response)
157
- else:
158
- response_dict["result"]["oneBest"]["segments"] += \
159
- MessageToDict(_response)["result"]["oneBest"]["segments"] # fmt: skip
160
-
161
- for segment in _response.result.one_best.segments:
162
- if segment.source_language != segment.detected_source_language:
163
- info_message.append(
164
- f"Language '{segment.detected_source_language}' was detected in the audio, but instead "
165
- f"the segment was {'transcribed' if task == Task.transcribe else 'translated'} with the "
166
- + (
167
- f"closest available source language '{segment.source_language}'"
168
- if language is None
169
- else f"language '{language}' that was enforced by the '--language' argument"
170
- )
171
- )
172
201
 
173
- print(json.dumps(response_dict, indent=2, ensure_ascii=False))
174
- info_message = set(info_message)
175
- if len(info_message) > 0:
176
- for msg in info_message:
177
- logging.info(msg)
202
+ def transcribe_impl(
203
+ channel: grpc.Channel,
204
+ file: BinaryIO,
205
+ output: TextIO,
206
+ language: Optional[str],
207
+ start: Optional[float],
208
+ end: Optional[float],
209
+ metadata: Optional[list],
210
+ enable_language_switching: bool = False,
211
+ enable_word_segmentation: bool = False,
212
+ use_raw_audio: bool = False,
213
+ ):
214
+ logging.info("Processing audio file with transcribe")
215
+ stub = stt_grpc.SpeechToTextStub(channel)
216
+ response = stub.Transcribe(
217
+ transcribe_request_iterator(
218
+ file=file,
219
+ specified_language=language,
220
+ start=start,
221
+ end=end,
222
+ enable_language_switching=enable_language_switching,
223
+ enable_word_segmentation=enable_word_segmentation,
224
+ use_raw_audio=use_raw_audio,
225
+ ),
226
+ metadata=metadata,
227
+ )
228
+
229
+ # Collect all responses
230
+ responses = list(response)
231
+ write_result(file.name, responses, output, language)
178
232
 
179
233
 
180
- def main():
181
- parser = argparse.ArgumentParser(
182
- description=(
183
- "Enhanced Speech to Text Built on Whisper gRPC client. Transcribes input audio into segments"
184
- " with timestamps."
234
+ # Helper functions
235
+ def _parse_time_range(time_range: str) -> tuple[Optional[float], Optional[float]]:
236
+ if time_range is None:
237
+ return None, None
238
+
239
+ if len(time_range) == 0:
240
+ raise typer.BadParameter("Parameter 'time_range' must be of the form '[START]:[END]'.")
241
+
242
+ # Regex pattern to match [START]:[END] format where START and END are positive floats
243
+ pattern = r"^(\d+(?:\.\d+)?)?:(\d+(?:\.\d+)?)?$"
244
+ match = re.match(pattern, time_range.strip())
245
+
246
+ if not match:
247
+ raise typer.BadParameter(
248
+ "Parameter 'time_range' must be of the form '[START]:[END]' where START and END are positive float numbers."
185
249
  )
186
- )
187
250
 
188
- parser.add_argument(
189
- "-H",
190
- "--host",
191
- type=str,
192
- default="localhost:8080",
193
- help="Server address, default: localhost:8080",
194
- )
195
- parser.add_argument(
196
- "-l",
197
- "--log_level",
198
- type=str,
199
- default="error",
200
- choices=["critical", "error", "warning", "info", "debug"],
201
- )
202
- parser.add_argument(
203
- "--metadata",
204
- metavar="key=value",
205
- nargs="+",
206
- type=lambda x: tuple(x.split("=")),
207
- help="Custom client metadata",
208
- )
209
- parser.add_argument("--use_ssl", action="store_true", help="Use SSL connection")
210
- parser.add_argument("--start", type=float, help="Audio start time")
211
- parser.add_argument("--end", type=float, help="Audio end time")
212
-
213
- parser.add_argument(
214
- "--language",
215
- type=str,
216
- default=None,
217
- help=(
218
- "Force transcription to specified language, if not set, language is detected"
219
- " automatically"
220
- ),
221
- )
222
- parser.add_argument(
223
- "--task",
224
- type=Task,
225
- default=Task.transcribe,
226
- choices=list(Task),
227
- help="Select whether to transcribe or translate the recording",
228
- )
229
- parser.add_argument(
230
- "--enable-language-switching",
231
- action="store_true",
232
- help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds",
233
- )
234
- parser.add_argument(
235
- "--enable-word-segmentation",
236
- action="store_true",
237
- help="Enable word-level transcription. Note: Enabling this option may increase processing time",
238
- )
239
- parser.add_argument("file", type=str, help="Path to input file")
240
- parser.add_argument("--use_raw_audio", action="store_true", help="Send a raw audio in")
251
+ # Parse START and END from regex groups
252
+ start_str = match.group(1)
253
+ end_str = match.group(2)
241
254
 
242
- args = parser.parse_args()
255
+ start = float(start_str) if start_str is not None else None
256
+ end = float(end_str) if end_str is not None else None
243
257
 
244
- if args.start is not None and args.start < 0:
245
- raise ValueError("Parameter 'start' must be a non-negative float.")
258
+ if start is not None and end is not None and start >= end:
259
+ raise typer.BadParameter("Parameter 'end' must be larger than 'start'.")
246
260
 
247
- if args.end is not None and args.end <= 0:
248
- raise ValueError("Parameter 'end' must be a positive float.")
261
+ return (None if start == 0.0 else start, end)
249
262
 
250
- if args.start is not None and args.end is not None and args.start >= args.end:
251
- raise ValueError("Parameter 'end' must be larger than 'start'.")
252
263
 
253
- logging.basicConfig(
254
- level=args.log_level.upper(),
255
- format="[%(asctime)s.%(msecs)03d] [%(levelname)s] %(message)s",
256
- datefmt="%Y-%m-%d %H:%M:%S",
257
- )
264
+ def _parse_metadata_callback(
265
+ ctx: typer.Context, metadata_list: Optional[list[str]]
266
+ ) -> list[tuple[str, str]]:
267
+ if ctx.resilient_parsing or metadata_list is None:
268
+ return []
269
+
270
+ params = []
271
+ for item in metadata_list:
272
+ t = tuple(item.split("=", 1))
273
+ if len(t) != 2:
274
+ raise typer.BadParameter(f"Metadata must be in format 'KEY=VALUE': {item}")
275
+ params.append(t)
276
+ return params
258
277
 
259
- if not os.path.isfile(args.file):
260
- logging.error(f"no such file {args.file}")
261
- exit(1)
278
+
279
+ app = typer.Typer(context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True)
280
+
281
+
282
+ @app.command()
283
+ def translate(
284
+ ctx: typer.Context,
285
+ input_file: Annotated[
286
+ typer.FileBinaryRead,
287
+ typer.Argument(
288
+ help="Input audio file path.",
289
+ ),
290
+ ] = "-",
291
+ time_range: Annotated[
292
+ Optional[str],
293
+ typer.Option(
294
+ "-t",
295
+ "--time-range",
296
+ callback=_parse_time_range,
297
+ metavar="[START]:[END]",
298
+ help=(
299
+ "Time range in seconds using format [START]:[END] where START and END are positive float numbers. "
300
+ "START can be omitted to process from beginning, END can be omitted to process to the end of the recording. "
301
+ "Examples: --time-range :10 (0 to 10), --time-range 10.1: (10.1 to end), --time-range 5:10 (5 to 10)."
302
+ ),
303
+ ),
304
+ ] = None,
305
+ language: Annotated[
306
+ Optional[str],
307
+ typer.Option(
308
+ "--language",
309
+ help=(
310
+ "Force transcription to specified language, if not set, language is detected "
311
+ "automatically."
312
+ ),
313
+ ),
314
+ ] = None,
315
+ enable_language_switching: Annotated[
316
+ bool,
317
+ typer.Option(
318
+ "--enable-language-switching",
319
+ help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds.",
320
+ ),
321
+ ] = False,
322
+ enable_word_segmentation: Annotated[
323
+ bool,
324
+ typer.Option(
325
+ "--enable-word-segmentation",
326
+ help="Enable word-level transcription. Note: Enabling this option may increase processing time.",
327
+ ),
328
+ ] = False,
329
+ use_raw_audio: Annotated[
330
+ bool,
331
+ typer.Option(
332
+ "--use-raw-audio",
333
+ help="Send raw audio in chunks. Enables continuous audio processing with less server memory usage.",
334
+ ),
335
+ ] = False,
336
+ output: Annotated[
337
+ typer.FileTextWrite,
338
+ typer.Option(
339
+ "--output", "-o", help="Output file path. If omitted, prints to stdout.", lazy=False
340
+ ),
341
+ ] = "-",
342
+ ) -> None:
343
+ """Translates input audio into segments with timestamps."""
262
344
 
263
345
  try:
264
- logging.info(f"Connecting to {args.host}")
265
- channel = (
266
- grpc.secure_channel(target=args.host, credentials=grpc.ssl_channel_credentials())
267
- if args.use_ssl
268
- else grpc.insecure_channel(target=args.host)
269
- )
346
+ logging.info(f"Connecting to {ctx.obj['host']}")
347
+ with (
348
+ grpc.insecure_channel(target=ctx.obj["host"])
349
+ if ctx.obj["plaintext"]
350
+ else grpc.secure_channel(
351
+ target=ctx.obj["host"], credentials=grpc.ssl_channel_credentials()
352
+ )
353
+ ) as channel:
354
+ start_time = datetime.now()
355
+
356
+ translate_impl(
357
+ channel=channel,
358
+ file=input_file,
359
+ output=output,
360
+ language=language,
361
+ start=time_range[0],
362
+ end=time_range[1],
363
+ metadata=ctx.obj["metadata"],
364
+ enable_language_switching=enable_language_switching,
365
+ enable_word_segmentation=enable_word_segmentation,
366
+ use_raw_audio=use_raw_audio,
367
+ )
270
368
 
271
- start_time = datetime.now()
272
-
273
- transcribe(
274
- channel=channel,
275
- file=args.file,
276
- language=args.language,
277
- start=args.start,
278
- end=args.end,
279
- metadata=args.metadata,
280
- task=args.task,
281
- enable_language_switching=args.enable_language_switching,
282
- enable_word_segmentation=args.enable_word_segmentation,
283
- use_raw_audio=args.use_raw_audio,
284
- )
369
+ logging.debug(f"Elapsed time {(datetime.now() - start_time)}")
370
+
371
+ except grpc.RpcError:
372
+ logging.exception("RPC failed")
373
+ raise typer.Exit(code=1) from None
374
+ except (typer.Exit, typer.BadParameter):
375
+ raise
376
+ except Exception:
377
+ logging.exception("Unknown error")
378
+ raise typer.Exit(code=2) from None
379
+
380
+
381
+ @app.command()
382
+ def transcribe(
383
+ ctx: typer.Context,
384
+ input_file: Annotated[
385
+ typer.FileBinaryRead,
386
+ typer.Argument(
387
+ help="Input audio file path.",
388
+ ),
389
+ ] = "-",
390
+ time_range: Annotated[
391
+ Optional[str],
392
+ typer.Option(
393
+ "-t",
394
+ "--time-range",
395
+ callback=_parse_time_range,
396
+ metavar="[START]:[END]",
397
+ help=(
398
+ "Time range in seconds using format [START]:[END] where START and END are positive float numbers. "
399
+ "START can be omitted to process from beginning, END can be omitted to process to the end of the recording. "
400
+ "Examples: --time-range :10 (0 to 10), --time-range 10.1: (10.1 to end), --time-range 5:10 (5 to 10)."
401
+ ),
402
+ ),
403
+ ] = None,
404
+ language: Annotated[
405
+ Optional[str],
406
+ typer.Option(
407
+ "--language",
408
+ help=(
409
+ "Force transcription to specified language, if not set, language is detected "
410
+ "automatically."
411
+ ),
412
+ ),
413
+ ] = None,
414
+ enable_language_switching: Annotated[
415
+ bool,
416
+ typer.Option(
417
+ "--enable-language-switching",
418
+ help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds.",
419
+ ),
420
+ ] = False,
421
+ enable_word_segmentation: Annotated[
422
+ bool,
423
+ typer.Option(
424
+ "--enable-word-segmentation",
425
+ help="Enable word-level transcription. Note: Enabling this option may increase processing time.",
426
+ ),
427
+ ] = False,
428
+ use_raw_audio: Annotated[
429
+ bool,
430
+ typer.Option(
431
+ "--use-raw-audio",
432
+ help="Send raw audio in chunks. Enables continuous audio processing with less server memory usage.",
433
+ ),
434
+ ] = False,
435
+ output: Annotated[
436
+ typer.FileTextWrite,
437
+ typer.Option(
438
+ "--output", "-o", help="Output file path. If omitted, prints to stdout.", lazy=False
439
+ ),
440
+ ] = "-",
441
+ ) -> None:
442
+ """Transcribes input audio into segments with timestamps."""
443
+
444
+ try:
445
+ logging.info(f"Connecting to {ctx.obj['host']}")
446
+ with (
447
+ grpc.insecure_channel(target=ctx.obj["host"])
448
+ if ctx.obj["plaintext"]
449
+ else grpc.secure_channel(
450
+ target=ctx.obj["host"], credentials=grpc.ssl_channel_credentials()
451
+ )
452
+ ) as channel:
453
+ start_time = datetime.now()
454
+
455
+ transcribe_impl(
456
+ channel=channel,
457
+ file=input_file,
458
+ output=output,
459
+ language=language,
460
+ start=time_range[0],
461
+ end=time_range[1],
462
+ metadata=ctx.obj["metadata"],
463
+ enable_language_switching=enable_language_switching,
464
+ enable_word_segmentation=enable_word_segmentation,
465
+ use_raw_audio=use_raw_audio,
466
+ )
285
467
 
286
- logging.debug(f"Elapsed time {(datetime.now() - start_time)}")
468
+ logging.debug(f"Elapsed time {(datetime.now() - start_time)}")
287
469
 
288
470
  except grpc.RpcError:
289
471
  logging.exception("RPC failed")
290
- exit(1)
472
+ raise typer.Exit(code=1) from None
473
+ except (typer.Exit, typer.BadParameter):
474
+ raise
291
475
  except Exception:
292
476
  logging.exception("Unknown error")
293
- exit(1)
477
+ raise typer.Exit(code=2) from None
478
+
479
+
480
+ @app.callback()
481
+ def cli(
482
+ ctx: typer.Context,
483
+ host: Annotated[
484
+ str,
485
+ typer.Option("--host", "-H", help="Server address (host:port)."),
486
+ ] = "localhost:8080",
487
+ log_level: Annotated[
488
+ LogLevel, typer.Option("--log-level", "-l", help="Logging level.")
489
+ ] = LogLevel.ERROR,
490
+ metadata: Annotated[
491
+ list[str],
492
+ typer.Option(
493
+ "--metadata",
494
+ metavar="key=value",
495
+ help="Custom client metadata.",
496
+ show_default=False,
497
+ callback=_parse_metadata_callback,
498
+ ),
499
+ ] = [],
500
+ plaintext: Annotated[
501
+ bool,
502
+ typer.Option(
503
+ "--plaintext", help="Use plain-text HTTP/2 when connecting to server (no TLS)."
504
+ ),
505
+ ] = False,
506
+ ) -> None:
507
+ """Enhanced Speech to Text Built on Whisper gRPC client."""
508
+
509
+ ctx.obj = {
510
+ "host": host,
511
+ "metadata": metadata,
512
+ "log_level": log_level,
513
+ "plaintext": plaintext,
514
+ }
515
+
516
+ logging.basicConfig(
517
+ level=log_level.value.upper(),
518
+ format="[%(asctime)s.%(msecs)03d] [%(levelname)s] %(message)s",
519
+ datefmt="%Y-%m-%d %H:%M:%S",
520
+ )
294
521
 
295
522
 
296
523
  if __name__ == "__main__":
297
- main()
524
+ app()
@@ -1,5 +0,0 @@
1
- phonexia_enhanced_speech_to_text_built_on_whisper_client.py,sha256=acZwljE4vRaKtf71f6Cm75c81iYhrmq59FoQIz5k0kI,9928
2
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/METADATA,sha256=c9vqgI_gYaz7EFROo2Sq_36MUEp0rxDKRSRnRiKhsVs,2343
3
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
4
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/entry_points.txt,sha256=RZ7mWDaVGagDYxjXloW7ndadXlJVwg9Xov0gqvPTqHs,129
5
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.10.0.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- enhanced_speech_to_text_built_on_whisper_client=phonexia_enhanced_speech_to_text_built_on_whisper_client:main
3
-