phonexia-enhanced-speech-to-text-built-on-whisper-client 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.2.1/PKG-INFO +33 -0
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.2.1/phonexia_enhanced_speech_to_text_built_on_whisper_client.py +252 -0
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.2.1/pypi-README.md +12 -0
- phonexia_enhanced_speech_to_text_built_on_whisper_client-1.2.1/pyproject.toml +92 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: phonexia-enhanced-speech-to-text-built-on-whisper-client
|
|
3
|
+
Version: 1.2.1
|
|
4
|
+
Summary: Client for communication with Phonexia Enhanced Speech To Text Built On Whisper microservice.
|
|
5
|
+
Keywords: grpc,transcription,STT,ASR,speech to text,speech,language,microservice
|
|
6
|
+
Author: Phonexia
|
|
7
|
+
Author-email: info@phonexia.com
|
|
8
|
+
Requires-Python: >=3.8,<4.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Dist: grpcio (>=1.54.0,<2.0.0)
|
|
16
|
+
Requires-Dist: phonexia-grpc (>=1.0.0,<2.0.0)
|
|
17
|
+
Project-URL: Homepage, https://phonexia.com
|
|
18
|
+
Project-URL: Issues, https://phonexia.atlassian.net/servicedesk/customer/portal/15/group/20/create/40
|
|
19
|
+
Project-URL: protofiles, https://github.com/phonexia/protofiles
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+

|
|
24
|
+
|
|
25
|
+
# Phonexia enhanced speech to text built on whisper client
|
|
26
|
+
|
|
27
|
+
This module contains a client for communicating with the [enhanced speech to text built on whisper](https://hub.docker.com/r/phonexia/enhanced-speech-to-text-built-on-whisper/) microservice developed by [Phonexia](https://phonexia.com).
|
|
28
|
+
|
|
29
|
+
To use this client, you will first need a running instance of any *Phonexia enhanced speech to text built on whisper microservice*. If you don't have a running instance yet, don't hesitate to [contact our sales department](mailto:info@phonexia.com).
|
|
30
|
+
|
|
31
|
+
You can learn more about the enhanced speech to text built on whisper technology [here](https://docs.cloud.phonexia.com/docs/category/enhanced-speech-to-text-built-on-whisper).
|
|
32
|
+
|
|
33
|
+
On [this page](https://docs.cloud.phonexia.com/docs/products/speech-platform-4/grpc/api/phonexia/grpc/technologies/enhanced_speech_to_text_built_on_whisper/v1/enhanced_speech_to_text_built_on_whisper.proto), you will find a *gRPC API* reference for *enhanced speech to text built on whisper microservice*.
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Iterator, Optional
|
|
6
|
+
|
|
7
|
+
import google.protobuf.duration_pb2
|
|
8
|
+
import grpc
|
|
9
|
+
import phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2_grpc as stt_grpc
|
|
10
|
+
from phonexia.grpc.common.core_pb2 import Audio, TimeRange
|
|
11
|
+
from phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2 import (
|
|
12
|
+
TranscribeConfig,
|
|
13
|
+
TranscribeRequest,
|
|
14
|
+
TranslateConfig,
|
|
15
|
+
TranslateRequest,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Number of bytes read from the input file per streamed request message.
CHUNK_SIZE = 32000
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Task(Enum):
    """Operation the client requests for a recording: transcription or translation."""

    transcribe = "transcribe"
    translate = "translate"

    def __str__(self) -> str:
        # Show the bare value ("transcribe") rather than the default "Task.transcribe".
        text: str = self.value
        return text
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def time_to_duration(time: Optional[float]) -> Optional[google.protobuf.duration_pb2.Duration]:
    """Convert a time offset in seconds into a protobuf ``Duration``.

    Args:
        time: Offset in seconds, or ``None`` when no offset was given.

    Returns:
        A ``Duration`` holding the whole seconds and the fractional part as
        nanoseconds, or ``None`` when ``time`` is ``None``.
    """
    # NOTE: the annotations use ``Optional[...]`` instead of ``X | None`` because
    # annotations are evaluated when the function is defined, and ``type | None``
    # raises TypeError before Python 3.10.
    if time is None:
        return None

    seconds = int(time)
    # Round instead of truncating: a value such as 2.3 has a binary fractional
    # part of 0.29999..., which truncation would turn into 299999999 ns.
    nanos = round((time - seconds) * 1e9)
    if abs(nanos) == 1_000_000_000:
        # Rounding reached a full second; carry it so nanos stays below one second.
        seconds += 1 if nanos > 0 else -1
        nanos = 0

    return google.protobuf.duration_pb2.Duration(seconds=seconds, nanos=nanos)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def transcribe_request_iterator(
    file: str,
    specified_language: Optional[str],
    start: Optional[float],
    end: Optional[float],
    enable_language_switching: bool = False,
) -> Iterator[TranscribeRequest]:
    """Yield ``TranscribeRequest`` messages that stream ``file`` in ``CHUNK_SIZE`` pieces.

    The time range (built from ``start``/``end``) and the transcription config
    are attached to the first message only; every following message is built
    with ``time_range=None`` and ``config=None``.

    Args:
        file: Path of the audio file, read in binary mode.
        specified_language: Value passed as ``language`` in the config.
        start: Start offset in seconds, or ``None``.
        end: End offset in seconds, or ``None``.
        enable_language_switching: Value passed through to the config.
    """
    first_time_range = TimeRange(start=time_to_duration(start), end=time_to_duration(end))
    first_config = TranscribeConfig(
        language=specified_language, enable_language_switching=enable_language_switching
    )

    with open(file, "rb") as audio_file:
        # iter() with a sentinel stops at the first empty read, i.e. at end of file.
        chunks = iter(lambda: audio_file.read(CHUNK_SIZE), b"")
        for index, chunk in enumerate(chunks):
            is_first = index == 0
            yield TranscribeRequest(
                audio=Audio(content=chunk, time_range=first_time_range if is_first else None),
                config=first_config if is_first else None,
            )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def translate_request_iterator(
    file: str,
    specified_language: Optional[str],
    start: Optional[float],
    end: Optional[float],
    enable_language_switching: bool = False,
) -> Iterator[TranslateRequest]:
    """Yield ``TranslateRequest`` messages that stream ``file`` in ``CHUNK_SIZE`` pieces.

    The time range (built from ``start``/``end``) and the translation config are
    attached to the first message only; every following message is built with
    ``time_range=None`` and ``config=None``.

    Args:
        file: Path of the audio file, read in binary mode.
        specified_language: Value passed as ``source_language`` in the config.
        start: Start offset in seconds, or ``None``.
        end: End offset in seconds, or ``None``.
        enable_language_switching: Value passed through to the config.
    """
    # Header fields still waiting to be sent; cleared after the first message.
    pending_header = (
        TimeRange(start=time_to_duration(start), end=time_to_duration(end)),
        TranslateConfig(
            source_language=specified_language,
            enable_language_switching=enable_language_switching,
        ),
    )

    with open(file, "rb") as audio_file:
        # iter() with a sentinel stops at the first empty read, i.e. at end of file.
        for chunk in iter(lambda: audio_file.read(CHUNK_SIZE), b""):
            header_time_range, header_config = pending_header
            yield TranslateRequest(
                audio=Audio(content=chunk, time_range=header_time_range),
                config=header_config,
            )
            pending_header = (None, None)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _format_segment(segment, task: Task) -> str:
    """Render one result segment as ``[start -> end language(s)] text``."""
    languages = (
        f"{segment.language}"
        if task == Task.transcribe
        else f"{segment.source_language} -> {segment.language}"
    )
    return (
        f"[{segment.start_time.ToJsonString()} -> {segment.end_time.ToJsonString()} "
        + languages
        + f"] {segment.text}"
    )


def _language_warning(segment, task: Task, language: Optional[str]) -> str:
    """Build the warning text for a segment whose detected language was not the one used."""
    return (
        f"Language '{segment.detected_source_language}' was detected, but the license does not support this language. "
        f"Instead the segment was {'transcribed' if task == Task.transcribe else 'translated'} with the "
        + (
            f"closest available source language '{segment.source_language}'"
            if language is None
            else f"language '{language}' that was enforced by argument '--language'"
        )
        + ". For more info on this problem don't hesitate to contact Phonexia."
    )


def transcribe(
    channel: grpc.Channel,
    file: str,
    language: Optional[str],
    start: Optional[float],
    end: Optional[float],
    metadata: Optional[list],
    task: Task,
    enable_language_switching: bool = False,
):
    """Stream ``file`` to the service and print the returned segments to stdout.

    Each segment is printed as it arrives. When a response has the
    ``processed_audio_length`` field set, that length is printed as well. After
    all responses, one warning per distinct message is printed for segments
    whose ``source_language`` differs from ``detected_source_language``.

    Args:
        channel: gRPC channel used to create the ``SpeechToTextStub``.
        file: Path of the audio file to send.
        language: Language passed to the request config, or ``None``.
        start: Start offset in seconds, or ``None``.
        end: End offset in seconds, or ``None``.
        metadata: Metadata passed to the RPC call, or ``None``.
        task: Whether to call ``Transcribe`` or ``Translate``.
        enable_language_switching: Value passed through to the request config.

    Raises:
        RuntimeError: If ``task`` is neither ``Task.transcribe`` nor ``Task.translate``.
    """
    stub = stt_grpc.SpeechToTextStub(channel)
    if task == Task.transcribe:
        responses = stub.Transcribe(
            transcribe_request_iterator(
                file=file,
                specified_language=language,
                start=start,
                end=end,
                enable_language_switching=enable_language_switching,
            ),
            metadata=metadata,
        )
    elif task == Task.translate:
        responses = stub.Translate(
            translate_request_iterator(
                file=file,
                specified_language=language,
                start=start,
                end=end,
                enable_language_switching=enable_language_switching,
            ),
            metadata=metadata,
        )
    else:
        raise RuntimeError("Unknown task")

    # Dict keys drop duplicates while keeping first-seen order, so warnings are
    # printed in a stable order (a set would make the order vary between runs).
    distinct_warnings = {}
    for response in responses:
        for segment in response.result.one_best.segments:
            print(_format_segment(segment, task))
            if segment.source_language != segment.detected_source_language:
                distinct_warnings.setdefault(_language_warning(segment, task, language))
        if response.HasField("processed_audio_length"):
            print(f"Processed audio length: {response.processed_audio_length.ToJsonString()}")

    if distinct_warnings:
        print()
        for warning in distinct_warnings:
            print(f"WARNING: {warning}")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _parse_metadata_item(item: str) -> tuple:
    """Split one ``key=value`` command-line token into a ``(key, value)`` pair.

    Only the first ``=`` separates key from value, so the value itself may
    contain ``=`` characters.

    Raises:
        argparse.ArgumentTypeError: If ``item`` contains no ``=``.
    """
    key, separator, value = item.partition("=")
    if not separator:
        raise argparse.ArgumentTypeError(f"expected key=value, got '{item}'")
    return key, value


def main():
    """Command-line entry point: parse arguments, connect and run the requested task.

    Exits with status 1 when the input file does not exist, when the RPC fails,
    or when any other error occurs while talking to the service.

    Raises:
        ValueError: If ``--start`` is negative, ``--end`` is not positive, or
            ``--start`` is not smaller than ``--end``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Enhanced Speech to Text Built on Whisper gRPC client. Transcribes input audio into segments"
            " with timestamps."
        )
    )

    parser.add_argument(
        "-H",
        "--host",
        type=str,
        default="localhost:8080",
        help="Server address, default: localhost:8080",
    )
    parser.add_argument(
        "-l",
        "--log_level",
        type=str,
        default="error",
        choices=["critical", "error", "warning", "info", "debug"],
    )
    parser.add_argument(
        "--metadata",
        metavar="key=value",
        nargs="+",
        type=_parse_metadata_item,
        help="Custom client metadata",
    )
    parser.add_argument("--use_ssl", action="store_true", help="Use SSL connection")
    parser.add_argument("--start", type=float, help="Audio start time")
    parser.add_argument("--end", type=float, help="Audio end time")

    parser.add_argument(
        "--language",
        type=str,
        default=None,
        help=(
            "Force transcription to specified language, if not set, language is detected"
            " automatically"
        ),
    )
    parser.add_argument(
        "--task",
        type=Task,
        default=Task.transcribe,
        choices=list(Task),
        help="Select whether to transcribe or translate the recording",
    )
    parser.add_argument(
        "--enable-language-switching",
        action="store_true",
        help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds",
    )
    parser.add_argument("file", type=str, help="Path to input file")

    args = parser.parse_args()

    if args.start is not None and args.start < 0:
        raise ValueError("Parameter 'start' must be a non-negative float.")

    if args.end is not None and args.end <= 0:
        raise ValueError("Parameter 'end' must be a positive float.")

    if args.start is not None and args.end is not None and args.start >= args.end:
        raise ValueError("Parameter 'end' must be larger than 'start'.")

    logging.basicConfig(
        level=args.log_level.upper(),
        format="[%(asctime)s.%(msecs)03d] [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    if not os.path.isfile(args.file):
        logging.error("no such file %s", args.file)
        # SystemExit is raised directly because the ``exit`` helper is injected
        # by the ``site`` module and is not guaranteed to exist.
        raise SystemExit(1)

    try:
        logging.info("Connecting to %s", args.host)
        channel = (
            grpc.secure_channel(target=args.host, credentials=grpc.ssl_channel_credentials())
            if args.use_ssl
            else grpc.insecure_channel(target=args.host)
        )

        # The context manager closes the channel once the call has finished or failed.
        with channel:
            transcribe(
                channel=channel,
                file=args.file,
                language=args.language,
                start=args.start,
                end=args.end,
                metadata=args.metadata,
                task=args.task,
                enable_language_switching=args.enable_language_switching,
            )

    except grpc.RpcError:
        logging.exception("RPC failed")
        raise SystemExit(1) from None
    except Exception:
        logging.exception("Unknown error")
        raise SystemExit(1) from None
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# Run the command-line client only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
|
|
2
|
+

|
|
3
|
+
|
|
4
|
+
# Phonexia enhanced speech to text built on whisper client
|
|
5
|
+
|
|
6
|
+
This module contains a client for communicating with the [enhanced speech to text built on whisper](https://hub.docker.com/r/phonexia/enhanced-speech-to-text-built-on-whisper/) microservice developed by [Phonexia](https://phonexia.com).
|
|
7
|
+
|
|
8
|
+
To use this client, you will first need a running instance of any *Phonexia enhanced speech to text built on whisper microservice*. If you don't have a running instance yet, don't hesitate to [contact our sales department](mailto:info@phonexia.com).
|
|
9
|
+
|
|
10
|
+
You can learn more about the enhanced speech to text built on whisper technology [here](https://docs.cloud.phonexia.com/docs/category/enhanced-speech-to-text-built-on-whisper).
|
|
11
|
+
|
|
12
|
+
On [this page](https://docs.cloud.phonexia.com/docs/products/speech-platform-4/grpc/api/phonexia/grpc/technologies/enhanced_speech_to_text_built_on_whisper/v1/enhanced_speech_to_text_built_on_whisper.proto), you will find a *gRPC API* reference for *enhanced speech to text built on whisper microservice*.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "phonexia-enhanced-speech-to-text-built-on-whisper-client"
|
|
3
|
+
version = "1.2.1"
|
|
4
|
+
description = "Client for communication with Phonexia Enhanced Speech To Text Built On Whisper microservice."
|
|
5
|
+
readme = "pypi-README.md"
|
|
6
|
+
keywords = ["grpc", "transcription", "STT", "ASR", "speech to text", "speech", "language", "microservice"]
|
|
7
|
+
authors = ["Phonexia <info@phonexia.com>"]
|
|
8
|
+
|
|
9
|
+
[tool.poetry.urls]
|
|
10
|
+
Homepage = "https://phonexia.com"
|
|
11
|
+
Issues = "https://phonexia.atlassian.net/servicedesk/customer/portal/15/group/20/create/40"
|
|
12
|
+
protofiles = "https://github.com/phonexia/protofiles"
|
|
13
|
+
|
|
14
|
+
[tool.poetry.scripts]
|
|
15
|
+
enhanced_speech_to_text_built_on_whisper_client = 'phonexia_enhanced_speech_to_text_built_on_whisper_client:main'
|
|
16
|
+
|
|
17
|
+
[tool.poetry.dependencies]
|
|
18
|
+
python = ">=3.8,<4.0"
|
|
19
|
+
grpcio = "^1.54.0"
|
|
20
|
+
phonexia-grpc = {version="^1.0.0", source="pypi"}
|
|
21
|
+
|
|
22
|
+
[tool.poetry.group.dev.dependencies]
|
|
23
|
+
pytest = "^8.0.0"
|
|
24
|
+
pytest-cov = "^5.0.0"
|
|
25
|
+
pytest-env = "^1.0.0"
|
|
26
|
+
pytest-random-order = "^1.1.0"
|
|
27
|
+
black = "^24.0.0"
|
|
28
|
+
ruff = "^0.4.0"
|
|
29
|
+
|
|
30
|
+
[[tool.poetry.source]]
|
|
31
|
+
name = "gitlab"
|
|
32
|
+
url = "https://gitlab.cloud.phonexia.com/api/v4/groups/39/-/packages/pypi/simple"
|
|
33
|
+
priority = "primary"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
[[tool.poetry.source]]
|
|
37
|
+
name = "PyPI"
|
|
38
|
+
priority = "default"
|
|
39
|
+
|
|
40
|
+
[build-system]
|
|
41
|
+
requires = ["poetry-core>=1.0.0"]
|
|
42
|
+
build-backend = "poetry.core.masonry.api"
|
|
43
|
+
|
|
44
|
+
[tool.black]
|
|
45
|
+
line-length = 100
|
|
46
|
+
target-version = ['py38']
|
|
47
|
+
preview = true
|
|
48
|
+
|
|
49
|
+
[tool.ruff]
|
|
50
|
+
target-version = "py38"
|
|
51
|
+
line-length = 100
|
|
52
|
+
fix = true
|
|
53
|
+
select = [
|
|
54
|
+
# flake8-2020
|
|
55
|
+
"YTT",
|
|
56
|
+
# flake8-bandit
|
|
57
|
+
"S",
|
|
58
|
+
# flake8-bugbear
|
|
59
|
+
"B",
|
|
60
|
+
# flake8-builtins
|
|
61
|
+
"A",
|
|
62
|
+
# flake8-comprehensions
|
|
63
|
+
"C4",
|
|
64
|
+
# flake8-debugger
|
|
65
|
+
"T10",
|
|
66
|
+
# flake8-simplify
|
|
67
|
+
"SIM",
|
|
68
|
+
# isort
|
|
69
|
+
"I",
|
|
70
|
+
# mccabe
|
|
71
|
+
"C90",
|
|
72
|
+
# pycodestyle
|
|
73
|
+
"E", "W",
|
|
74
|
+
# pyflakes
|
|
75
|
+
"F",
|
|
76
|
+
# pygrep-hooks
|
|
77
|
+
"PGH",
|
|
78
|
+
# pyupgrade
|
|
79
|
+
"UP",
|
|
80
|
+
# ruff
|
|
81
|
+
"RUF",
|
|
82
|
+
# tryceratops
|
|
83
|
+
"TRY",
|
|
84
|
+
]
|
|
85
|
+
ignore = [
|
|
86
|
+
# LineTooLong
|
|
87
|
+
"E501",
|
|
88
|
+
# DoNotAssignLambda
|
|
89
|
+
"E731",
|
|
90
|
+
# RaiseVanillaArgs aka Avoid specifying long messages outside the exception class
|
|
91
|
+
"TRY003",
|
|
92
|
+
]
|