phonexia-enhanced-speech-to-text-built-on-whisper-client 1.3.0__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {phonexia_enhanced_speech_to_text_built_on_whisper_client-1.3.0 → phonexia_enhanced_speech_to_text_built_on_whisper_client-1.5.0}/PKG-INFO +4 -1
- {phonexia_enhanced_speech_to_text_built_on_whisper_client-1.3.0 → phonexia_enhanced_speech_to_text_built_on_whisper_client-1.5.0}/phonexia_enhanced_speech_to_text_built_on_whisper_client.py +41 -9
- {phonexia_enhanced_speech_to_text_built_on_whisper_client-1.3.0 → phonexia_enhanced_speech_to_text_built_on_whisper_client-1.5.0}/pyproject.toml +9 -6
- {phonexia_enhanced_speech_to_text_built_on_whisper_client-1.3.0 → phonexia_enhanced_speech_to_text_built_on_whisper_client-1.5.0}/pypi-README.md +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: phonexia-enhanced-speech-to-text-built-on-whisper-client
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Client for communication with Phonexia Enhanced Speech To Text Built On Whisper microservice.
|
|
5
5
|
Keywords: grpc,transcription,STT,ASR,speech to text,speech,language,microservice
|
|
6
6
|
Author: Phonexia
|
|
@@ -13,7 +13,10 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.11
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
15
|
Requires-Dist: grpcio (>=1.54.0,<2.0.0)
|
|
16
|
+
Requires-Dist: numpy (<2.0.0) ; python_version < "3.12"
|
|
17
|
+
Requires-Dist: numpy (>=2.0.0) ; python_version >= "3.12"
|
|
16
18
|
Requires-Dist: phonexia-grpc (>=2.0.0,<3.0.0)
|
|
19
|
+
Requires-Dist: soundfile (>=0.12.1,<0.13.0)
|
|
17
20
|
Project-URL: Homepage, https://phonexia.com
|
|
18
21
|
Project-URL: Issues, https://phonexia.atlassian.net/servicedesk/customer/portal/15/group/20/create/40
|
|
19
22
|
Project-URL: protofiles, https://github.com/phonexia/protofiles
|
|
@@ -2,15 +2,16 @@ import argparse
|
|
|
2
2
|
import json
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
|
+
from datetime import datetime
|
|
5
6
|
from enum import Enum
|
|
6
7
|
from typing import Iterator, Optional
|
|
7
8
|
|
|
8
9
|
import google.protobuf.duration_pb2
|
|
9
10
|
import grpc
|
|
10
|
-
from google.protobuf.json_format import MessageToDict
|
|
11
|
-
|
|
12
11
|
import phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2_grpc as stt_grpc
|
|
13
|
-
|
|
12
|
+
import soundfile
|
|
13
|
+
from google.protobuf.json_format import MessageToDict
|
|
14
|
+
from phonexia.grpc.common.core_pb2 import Audio, RawAudioConfig, TimeRange
|
|
14
15
|
from phonexia.grpc.technologies.enhanced_speech_to_text_built_on_whisper.v1.enhanced_speech_to_text_built_on_whisper_pb2 import (
|
|
15
16
|
TranscribeConfig,
|
|
16
17
|
TranscribeRequest,
|
|
@@ -44,19 +45,42 @@ def transcribe_request_iterator(
|
|
|
44
45
|
start: Optional[float],
|
|
45
46
|
end: Optional[float],
|
|
46
47
|
enable_language_switching: bool = False,
|
|
48
|
+
use_raw_audio: bool = False,
|
|
47
49
|
) -> Iterator[TranscribeRequest]:
|
|
48
50
|
time_range = TimeRange(start=time_to_duration(start), end=time_to_duration(end))
|
|
49
51
|
config = TranscribeConfig(
|
|
50
52
|
language=specified_language, enable_language_switching=enable_language_switching
|
|
51
53
|
)
|
|
52
54
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
55
|
+
if use_raw_audio:
|
|
56
|
+
with soundfile.SoundFile(file) as r:
|
|
57
|
+
raw_audio_config = RawAudioConfig(
|
|
58
|
+
channels=r.channels,
|
|
59
|
+
sample_rate_hertz=r.samplerate,
|
|
60
|
+
encoding=RawAudioConfig.AudioEncoding.PCM16,
|
|
57
61
|
)
|
|
58
|
-
|
|
59
|
-
|
|
62
|
+
|
|
63
|
+
for data in r.blocks(blocksize=r.samplerate, dtype="int16"):
|
|
64
|
+
logging.debug("Sending chunk of size %d samples", len(data))
|
|
65
|
+
yield TranscribeRequest(
|
|
66
|
+
audio=Audio(
|
|
67
|
+
content=data.flatten().tobytes(),
|
|
68
|
+
time_range=time_range,
|
|
69
|
+
raw_audio_config=raw_audio_config,
|
|
70
|
+
),
|
|
71
|
+
config=config,
|
|
72
|
+
)
|
|
73
|
+
time_range = None
|
|
74
|
+
raw_audio_config = None
|
|
75
|
+
config = None
|
|
76
|
+
else:
|
|
77
|
+
with open(file, "rb") as f:
|
|
78
|
+
while chunk := f.read(CHUNK_SIZE):
|
|
79
|
+
yield TranscribeRequest(
|
|
80
|
+
audio=Audio(content=chunk, time_range=time_range), config=config
|
|
81
|
+
)
|
|
82
|
+
time_range = None
|
|
83
|
+
config = None
|
|
60
84
|
|
|
61
85
|
|
|
62
86
|
def translate_request_iterator(
|
|
@@ -87,6 +111,7 @@ def transcribe(
|
|
|
87
111
|
metadata: Optional[list],
|
|
88
112
|
task: Task,
|
|
89
113
|
enable_language_switching: bool = False,
|
|
114
|
+
use_raw_audio: bool = False,
|
|
90
115
|
):
|
|
91
116
|
stub = stt_grpc.SpeechToTextStub(channel)
|
|
92
117
|
if task == Task.transcribe:
|
|
@@ -97,6 +122,7 @@ def transcribe(
|
|
|
97
122
|
start=start,
|
|
98
123
|
end=end,
|
|
99
124
|
enable_language_switching=enable_language_switching,
|
|
125
|
+
use_raw_audio=use_raw_audio,
|
|
100
126
|
),
|
|
101
127
|
metadata=metadata,
|
|
102
128
|
)
|
|
@@ -197,6 +223,7 @@ def main():
|
|
|
197
223
|
help="Enable dynamic language switching during transcription, with the language being detected approximately every 30 seconds",
|
|
198
224
|
)
|
|
199
225
|
parser.add_argument("file", type=str, help="Path to input file")
|
|
226
|
+
parser.add_argument("--use_raw_audio", action="store_true", help="Send a raw audio in")
|
|
200
227
|
|
|
201
228
|
args = parser.parse_args()
|
|
202
229
|
|
|
@@ -227,6 +254,8 @@ def main():
|
|
|
227
254
|
else grpc.insecure_channel(target=args.host)
|
|
228
255
|
)
|
|
229
256
|
|
|
257
|
+
start_time = datetime.now()
|
|
258
|
+
|
|
230
259
|
transcribe(
|
|
231
260
|
channel=channel,
|
|
232
261
|
file=args.file,
|
|
@@ -236,8 +265,11 @@ def main():
|
|
|
236
265
|
metadata=args.metadata,
|
|
237
266
|
task=args.task,
|
|
238
267
|
enable_language_switching=args.enable_language_switching,
|
|
268
|
+
use_raw_audio=args.use_raw_audio,
|
|
239
269
|
)
|
|
240
270
|
|
|
271
|
+
logging.debug(f"Elapsed time {(datetime.now() - start_time)}")
|
|
272
|
+
|
|
241
273
|
except grpc.RpcError:
|
|
242
274
|
logging.exception("RPC failed")
|
|
243
275
|
exit(1)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "phonexia-enhanced-speech-to-text-built-on-whisper-client"
|
|
3
|
-
version = "1.3.0"
|
|
3
|
+
version = "1.5.0"
|
|
4
4
|
description = "Client for communication with Phonexia Enhanced Speech To Text Built On Whisper microservice."
|
|
5
5
|
readme = "pypi-README.md"
|
|
6
6
|
keywords = ["grpc", "transcription", "STT", "ASR", "speech to text", "speech", "language", "microservice"]
|
|
@@ -18,6 +18,11 @@ enhanced_speech_to_text_built_on_whisper_client = 'phonexia_enhanced_speech_to_t
|
|
|
18
18
|
python = ">=3.8,<4.0"
|
|
19
19
|
grpcio = "^1.54.0"
|
|
20
20
|
phonexia-grpc = {version="^2.0.0", source="pypi"}
|
|
21
|
+
soundfile = "^0.12.1"
|
|
22
|
+
numpy = [
|
|
23
|
+
{ version = "<2.0.0", markers = "python_version < '3.12'" },
|
|
24
|
+
{ version = ">=2.0.0", markers = "python_version >= '3.12'" }
|
|
25
|
+
]
|
|
21
26
|
|
|
22
27
|
[tool.poetry.group.dev.dependencies]
|
|
23
28
|
pytest = "^8.0.0"
|
|
@@ -28,14 +33,12 @@ black = "^24.0.0"
|
|
|
28
33
|
ruff = "^0.4.0"
|
|
29
34
|
|
|
30
35
|
[[tool.poetry.source]]
|
|
31
|
-
name = "
|
|
32
|
-
url = "https://gitlab.cloud.phonexia.com/api/v4/groups/39/-/packages/pypi/simple"
|
|
36
|
+
name = "PyPI"
|
|
33
37
|
priority = "primary"
|
|
34
38
|
|
|
35
|
-
|
|
36
39
|
[[tool.poetry.source]]
|
|
37
|
-
name = "
|
|
38
|
-
|
|
40
|
+
name = "gitlab"
|
|
41
|
+
url = "https://gitlab.cloud.phonexia.com/api/v4/groups/39/-/packages/pypi/simple"
|
|
39
42
|
|
|
40
43
|
[build-system]
|
|
41
44
|
requires = ["poetry-core>=1.0.0"]
|