audex 1.0.7a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- audex/__init__.py +9 -0
- audex/__main__.py +7 -0
- audex/cli/__init__.py +189 -0
- audex/cli/apis/__init__.py +12 -0
- audex/cli/apis/init/__init__.py +34 -0
- audex/cli/apis/init/gencfg.py +130 -0
- audex/cli/apis/init/setup.py +330 -0
- audex/cli/apis/init/vprgroup.py +125 -0
- audex/cli/apis/serve.py +141 -0
- audex/cli/args.py +356 -0
- audex/cli/exceptions.py +44 -0
- audex/cli/helper/__init__.py +0 -0
- audex/cli/helper/ansi.py +193 -0
- audex/cli/helper/display.py +288 -0
- audex/config/__init__.py +64 -0
- audex/config/core/__init__.py +30 -0
- audex/config/core/app.py +29 -0
- audex/config/core/audio.py +45 -0
- audex/config/core/logging.py +163 -0
- audex/config/core/session.py +11 -0
- audex/config/helper/__init__.py +1 -0
- audex/config/helper/client/__init__.py +1 -0
- audex/config/helper/client/http.py +28 -0
- audex/config/helper/client/websocket.py +21 -0
- audex/config/helper/provider/__init__.py +1 -0
- audex/config/helper/provider/dashscope.py +13 -0
- audex/config/helper/provider/unisound.py +18 -0
- audex/config/helper/provider/xfyun.py +23 -0
- audex/config/infrastructure/__init__.py +31 -0
- audex/config/infrastructure/cache.py +51 -0
- audex/config/infrastructure/database.py +48 -0
- audex/config/infrastructure/recorder.py +32 -0
- audex/config/infrastructure/store.py +19 -0
- audex/config/provider/__init__.py +18 -0
- audex/config/provider/transcription.py +109 -0
- audex/config/provider/vpr.py +99 -0
- audex/container.py +40 -0
- audex/entity/__init__.py +468 -0
- audex/entity/doctor.py +109 -0
- audex/entity/doctor.pyi +51 -0
- audex/entity/fields.py +401 -0
- audex/entity/segment.py +115 -0
- audex/entity/segment.pyi +38 -0
- audex/entity/session.py +133 -0
- audex/entity/session.pyi +47 -0
- audex/entity/utterance.py +142 -0
- audex/entity/utterance.pyi +48 -0
- audex/entity/vp.py +68 -0
- audex/entity/vp.pyi +35 -0
- audex/exceptions.py +157 -0
- audex/filters/__init__.py +692 -0
- audex/filters/generated/__init__.py +21 -0
- audex/filters/generated/doctor.py +987 -0
- audex/filters/generated/segment.py +723 -0
- audex/filters/generated/session.py +978 -0
- audex/filters/generated/utterance.py +939 -0
- audex/filters/generated/vp.py +815 -0
- audex/helper/__init__.py +1 -0
- audex/helper/hash.py +33 -0
- audex/helper/mixin.py +65 -0
- audex/helper/net.py +19 -0
- audex/helper/settings/__init__.py +830 -0
- audex/helper/settings/fields.py +317 -0
- audex/helper/stream.py +153 -0
- audex/injectors/__init__.py +1 -0
- audex/injectors/config.py +12 -0
- audex/injectors/lifespan.py +7 -0
- audex/lib/__init__.py +1 -0
- audex/lib/cache/__init__.py +383 -0
- audex/lib/cache/inmemory.py +513 -0
- audex/lib/database/__init__.py +83 -0
- audex/lib/database/sqlite.py +406 -0
- audex/lib/exporter.py +189 -0
- audex/lib/injectors/__init__.py +1 -0
- audex/lib/injectors/cache.py +25 -0
- audex/lib/injectors/container.py +47 -0
- audex/lib/injectors/exporter.py +26 -0
- audex/lib/injectors/recorder.py +33 -0
- audex/lib/injectors/server.py +17 -0
- audex/lib/injectors/session.py +18 -0
- audex/lib/injectors/sqlite.py +24 -0
- audex/lib/injectors/store.py +13 -0
- audex/lib/injectors/transcription.py +42 -0
- audex/lib/injectors/usb.py +12 -0
- audex/lib/injectors/vpr.py +65 -0
- audex/lib/injectors/wifi.py +7 -0
- audex/lib/recorder.py +844 -0
- audex/lib/repos/__init__.py +149 -0
- audex/lib/repos/container.py +23 -0
- audex/lib/repos/database/__init__.py +1 -0
- audex/lib/repos/database/sqlite.py +672 -0
- audex/lib/repos/decorators.py +74 -0
- audex/lib/repos/doctor.py +286 -0
- audex/lib/repos/segment.py +302 -0
- audex/lib/repos/session.py +285 -0
- audex/lib/repos/tables/__init__.py +70 -0
- audex/lib/repos/tables/doctor.py +137 -0
- audex/lib/repos/tables/segment.py +113 -0
- audex/lib/repos/tables/session.py +140 -0
- audex/lib/repos/tables/utterance.py +131 -0
- audex/lib/repos/tables/vp.py +102 -0
- audex/lib/repos/utterance.py +288 -0
- audex/lib/repos/vp.py +286 -0
- audex/lib/restful.py +251 -0
- audex/lib/server/__init__.py +97 -0
- audex/lib/server/auth.py +98 -0
- audex/lib/server/handlers.py +248 -0
- audex/lib/server/templates/index.html.j2 +226 -0
- audex/lib/server/templates/login.html.j2 +111 -0
- audex/lib/server/templates/static/script.js +68 -0
- audex/lib/server/templates/static/style.css +579 -0
- audex/lib/server/types.py +123 -0
- audex/lib/session.py +503 -0
- audex/lib/store/__init__.py +238 -0
- audex/lib/store/localfile.py +411 -0
- audex/lib/transcription/__init__.py +33 -0
- audex/lib/transcription/dashscope.py +525 -0
- audex/lib/transcription/events.py +62 -0
- audex/lib/usb.py +554 -0
- audex/lib/vpr/__init__.py +38 -0
- audex/lib/vpr/unisound/__init__.py +185 -0
- audex/lib/vpr/unisound/types.py +469 -0
- audex/lib/vpr/xfyun/__init__.py +483 -0
- audex/lib/vpr/xfyun/types.py +679 -0
- audex/lib/websocket/__init__.py +8 -0
- audex/lib/websocket/connection.py +485 -0
- audex/lib/websocket/pool.py +991 -0
- audex/lib/wifi.py +1146 -0
- audex/lifespan.py +75 -0
- audex/service/__init__.py +27 -0
- audex/service/decorators.py +73 -0
- audex/service/doctor/__init__.py +652 -0
- audex/service/doctor/const.py +36 -0
- audex/service/doctor/exceptions.py +96 -0
- audex/service/doctor/types.py +54 -0
- audex/service/export/__init__.py +236 -0
- audex/service/export/const.py +17 -0
- audex/service/export/exceptions.py +34 -0
- audex/service/export/types.py +21 -0
- audex/service/injectors/__init__.py +1 -0
- audex/service/injectors/container.py +53 -0
- audex/service/injectors/doctor.py +34 -0
- audex/service/injectors/export.py +27 -0
- audex/service/injectors/session.py +49 -0
- audex/service/session/__init__.py +754 -0
- audex/service/session/const.py +34 -0
- audex/service/session/exceptions.py +67 -0
- audex/service/session/types.py +91 -0
- audex/types.py +39 -0
- audex/utils.py +287 -0
- audex/valueobj/__init__.py +81 -0
- audex/valueobj/common/__init__.py +1 -0
- audex/valueobj/common/auth.py +84 -0
- audex/valueobj/common/email.py +16 -0
- audex/valueobj/common/ops.py +22 -0
- audex/valueobj/common/phone.py +84 -0
- audex/valueobj/common/version.py +72 -0
- audex/valueobj/session.py +19 -0
- audex/valueobj/utterance.py +15 -0
- audex/view/__init__.py +51 -0
- audex/view/container.py +17 -0
- audex/view/decorators.py +303 -0
- audex/view/pages/__init__.py +1 -0
- audex/view/pages/dashboard/__init__.py +286 -0
- audex/view/pages/dashboard/wifi.py +407 -0
- audex/view/pages/login.py +110 -0
- audex/view/pages/recording.py +348 -0
- audex/view/pages/register.py +202 -0
- audex/view/pages/sessions/__init__.py +196 -0
- audex/view/pages/sessions/details.py +224 -0
- audex/view/pages/sessions/export.py +443 -0
- audex/view/pages/settings.py +374 -0
- audex/view/pages/voiceprint/__init__.py +1 -0
- audex/view/pages/voiceprint/enroll.py +195 -0
- audex/view/pages/voiceprint/update.py +195 -0
- audex/view/static/css/dashboard.css +452 -0
- audex/view/static/css/glass.css +22 -0
- audex/view/static/css/global.css +541 -0
- audex/view/static/css/login.css +386 -0
- audex/view/static/css/recording.css +439 -0
- audex/view/static/css/register.css +293 -0
- audex/view/static/css/sessions/styles.css +501 -0
- audex/view/static/css/settings.css +186 -0
- audex/view/static/css/voiceprint/enroll.css +43 -0
- audex/view/static/css/voiceprint/styles.css +209 -0
- audex/view/static/css/voiceprint/update.css +44 -0
- audex/view/static/images/logo.svg +95 -0
- audex/view/static/js/recording.js +42 -0
- audex-1.0.7a3.dist-info/METADATA +361 -0
- audex-1.0.7a3.dist-info/RECORD +192 -0
- audex-1.0.7a3.dist-info/WHEEL +4 -0
- audex-1.0.7a3.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import contextlib
|
|
5
|
+
import typing as t
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
from pydantic import TypeAdapter
|
|
9
|
+
|
|
10
|
+
from audex import utils
|
|
11
|
+
from audex.helper.mixin import LoggingMixin
|
|
12
|
+
from audex.helper.stream import AsyncStream
|
|
13
|
+
from audex.lib.transcription import ReceiveType
|
|
14
|
+
from audex.lib.transcription import Transcription
|
|
15
|
+
from audex.lib.transcription import TranscriptionError
|
|
16
|
+
from audex.lib.transcription import TranscriptSession
|
|
17
|
+
from audex.lib.transcription.events import Delta
|
|
18
|
+
from audex.lib.transcription.events import Done
|
|
19
|
+
from audex.lib.transcription.events import Start
|
|
20
|
+
from audex.lib.websocket.connection import WebsocketConnection
|
|
21
|
+
from audex.lib.websocket.pool import WebsocketConnectionPool
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class InvalidParamError(TranscriptionError):
    """Raised when transcription parameters are rejected.

    Attributes:
        params: The offending parameters, or an empty dict when none
            were supplied.
    """

    default_message = "Invalid transcription parameters"

    def __init__(
        self,
        message: str | None = None,
        params: dict[str, t.Any] | None = None,
    ):
        # A falsy message (None or "") falls back to the class default.
        text = message if message else self.default_message
        super().__init__(text)
        # A falsy mapping (None or {}) is replaced by a fresh empty dict.
        self.params = params if params else {}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class RunTaskPayloadParams(BaseModel):
    """Recognition parameters carried in ``RunTaskPayload.parameters``.

    Every optional field defaults to ``None``; the session serializes the
    message with ``exclude_none=True``, so unset options are omitted.
    """

    # Audio container/codec of the streamed data.
    format: t.Literal[
        "pcm",
        "wav",
        "mp3",
        "opus",
        "speex",
        "aac",
        "amr",
    ] = "pcm"
    sample_rate: int = 16000
    vocabulary_id: str | None = None
    disfluency_removal_enabled: bool | None = None
    language_hints: (
        list[t.Literal["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]] | None
    ) = None
    semantic_punctuation_enabled: bool | None = None
    max_sentence_silence: int | None = None
    multi_threshold_mode_enabled: bool | None = None
    punctuation_prediction_enabled: bool | None = None
    heartbeat: bool | None = None
    inverse_text_normalization_enabled: bool | None = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class RunTaskPayloadResource(BaseModel):
    """One entry of ``RunTaskPayload.resources``."""

    resource_id: str
    # Only the "asr_phrase" resource type is modelled here.
    resource_type: t.Literal["asr_phrase"] = "asr_phrase"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class RunTaskPayload(BaseModel):
    """Payload section of a ``run-task`` message."""

    # Fixed routing fields: each literal admits exactly one value.
    task_group: t.Literal["audio"] = "audio"
    task: t.Literal["asr"] = "asr"
    function: t.Literal["recognition"] = "recognition"

    model: str = "paraformer-realtime-v2"
    parameters: RunTaskPayloadParams = RunTaskPayloadParams()
    resources: list[RunTaskPayloadResource] = []
    # Defaults to an empty object.
    input: dict[str, object] = {}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class RunTaskHeader(BaseModel):
    """Header section of a ``run-task`` message."""

    # Identifier chosen by the client for this task.
    task_id: str
    action: t.Literal["run-task"] = "run-task"
    streaming: t.Literal["duplex"] = "duplex"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class RunTask(BaseModel):
    """Complete ``run-task`` message: header plus payload."""

    header: RunTaskHeader
    payload: RunTaskPayload
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class FinishTaskHeader(BaseModel):
    """Header section of a ``finish-task`` message."""

    # Identifier of the task being finished.
    task_id: str
    action: t.Literal["finish-task"] = "finish-task"
    streaming: t.Literal["duplex"] = "duplex"
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class FinishTaskPayload(BaseModel):
    """Payload section of a ``finish-task`` message."""

    # Defaults to an empty object.
    input: dict[str, object] = {}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class FinishTask(BaseModel):
    """Complete ``finish-task`` message; the payload defaults to empty."""

    header: FinishTaskHeader
    payload: FinishTaskPayload = FinishTaskPayload()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class BaseServerHeader(BaseModel):
    """Header fields shared by every server event modelled in this module."""

    task_id: str | None = None
    # Event-specific subclasses narrow this literal to a single value.
    event: t.Literal[
        "task-started",
        "result-generated",
        "task-finished",
        "task-failed",
    ]
    attributes: dict[str, t.Any]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class BaseServerMessage(BaseModel):
    """Base class for server messages; subclasses narrow ``header``."""

    header: BaseServerHeader
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class TaskStartedHeader(BaseServerHeader):
    """Server header whose ``event`` must be ``"task-started"``."""

    event: t.Literal["task-started"]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class TaskStarted(BaseServerMessage):
    """Server message carrying a ``task-started`` header."""

    header: TaskStartedHeader
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class ResultGeneratedHeader(BaseServerHeader):
    """Server header whose ``event`` must be ``"result-generated"``."""

    event: t.Literal["result-generated"]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class ResultGeneratedPayloadOutputSentence(BaseModel):
    """Sentence-level recognition result inside a ``result-generated`` event.

    ``end_time`` and ``words`` are nullable but have no default, so both
    keys must be present in the incoming JSON.
    """

    # Offset in milliseconds.
    begin_time: int
    # The session treats a None end_time as an interim (non-final) result.
    end_time: int | None
    text: str
    words: list[dict[str, object]] | None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class ResultGeneratedPayloadOutput(BaseModel):
    """``output`` object of a ``result-generated`` payload."""

    sentence: ResultGeneratedPayloadOutputSentence
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class ResultGeneratedPayload(BaseModel):
    """Payload section of a ``result-generated`` event."""

    output: ResultGeneratedPayloadOutput
    # Accepted as-is without validation; defaults to None when absent.
    usage: t.Any = None
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class ResultGenerated(BaseServerMessage):
    """Server message carrying a recognition result."""

    header: ResultGeneratedHeader
    payload: ResultGeneratedPayload
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TaskFinishedHeader(BaseServerHeader):
    """Server header whose ``event`` must be ``"task-finished"``."""

    event: t.Literal["task-finished"]
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class TaskFinished(BaseServerMessage):
    """Server message carrying a ``task-finished`` header."""

    header: TaskFinishedHeader
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class TaskFailedHeader(BaseServerHeader):
    """Server header for a ``task-failed`` event, with error details."""

    event: t.Literal["task-failed"]
    # Both error fields are required for this event.
    error_code: str
    error_message: str
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class TaskFailed(BaseServerMessage):
    """Server message carrying a ``task-failed`` header."""

    header: TaskFailedHeader
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
# Union of every server message shape this module understands.
ServerMessage = t.Union[TaskStarted, ResultGenerated, TaskFinished, TaskFailed]
# Built once at import time and reused by parse_server_message().
adapter: TypeAdapter[ServerMessage] = TypeAdapter(ServerMessage)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def parse_server_message(
    data: str | bytes,
) -> TaskStarted | ResultGenerated | TaskFinished | TaskFailed:
    """Validate a raw JSON frame against the known server message shapes.

    Args:
        data: JSON text or bytes as received from the websocket.

    Returns:
        The matching message model produced by the module-level ``adapter``.

    Raises:
        Exception: Whatever ``adapter.validate_json`` raises is re-raised
            unchanged after the failure has been logged.
    """
    import logging

    try:
        return adapter.validate_json(data)
    except Exception:
        # Report through logging instead of print(): library code should not
        # write raw payloads to stdout. The traceback is included.
        logging.getLogger(__name__).exception(
            "Failed to parse server message: %r", data
        )
        raise
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class DashscopeParaformer(LoggingMixin, Transcription):
    """Transcription provider that creates DashScope Paraformer sessions.

    Holds a ``WebsocketConnectionPool`` configured with the authorization
    headers, plus the default recognition options that :meth:`session`
    passes to every ``DashscopeParaformerSession`` it creates.
    """

    __logtag__ = "audex.lib.transcript.dashscope"

    def __init__(
        self,
        *,
        model: str = "paraformer-realtime-v2",
        url: str = "wss://dashscope.aliyuncs.com/api-ws/v1/inference",
        api_key: str,
        user_agent: str | None = None,
        workspace: str | None = None,
        max_connections: int = 1000,
        idle_timeout: int = 60,
        drain_timeout: float = 5.0,
        fmt: t.Literal["pcm", "wav", "mp3", "opus", "speex", "aac", "amr"] = "pcm",
        sample_rate: int = 8000,
        silence_duration_ms: int | None = None,
        vocabulary_id: str | None = None,
        disfluency_removal_enabled: bool | None = None,
        lang_hints: list[t.Literal["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]] | None = None,
        semantic_punctuation: bool | None = None,
        multi_thres_mode: bool | None = None,
        punctuation_pred: bool | None = None,
        heartbeat: bool | None = None,
        itn: bool | None = None,
        resources: list[str] | None = None,
        **kwargs: t.Any,
    ):
        """Store the defaults and build the websocket connection pool.

        Extra keyword arguments are forwarded to ``WebsocketConnectionPool``.
        """
        super().__init__()
        self.model = model
        self.api_key = api_key
        self.user_agent = user_agent

        # Mandatory headers first, then the optional ones that were supplied.
        request_headers = {
            "Authorization": f"Bearer {self.api_key}",
            "X-DashScope-DataInspection": "enable",
        }
        optional_headers = (
            ("User-Agent", self.user_agent),
            ("X-DashScope-Workspace", workspace),
        )
        for header_name, header_value in optional_headers:
            if header_value:
                request_headers[header_name] = header_value
        self.headers = request_headers

        self.pool = WebsocketConnectionPool(
            uri=url,
            headers=self.headers,
            idle_timeout=idle_timeout,
            max_connections=max_connections,
            check_server_data_on_release=True,
            drain_timeout=drain_timeout,
            **kwargs,
        )

        # Defaults handed to each new session.
        self.fmt = fmt
        self.sample_rate = sample_rate
        self.silence_duration_ms = silence_duration_ms
        self.vocabulary_id = vocabulary_id
        self.disfluency_removal_enabled = disfluency_removal_enabled
        self.lang_hints = lang_hints
        self.semantic_punctuation = semantic_punctuation
        self.multi_thres_mode = multi_thres_mode
        self.punctuation_pred = punctuation_pred
        self.heartbeat = heartbeat
        self.itn = itn
        self.resources = resources

    def verify(
        self,
        model: str,
        sr: int,
        has_lang_hints: bool,
        semantic_punctuation: bool | None = None,
        multi_thres_mode: bool | None = None,
        punctuation_pred: bool | None = None,
        heartbeat: bool | None = None,
        itn: bool | None = None,
    ) -> None:
        """Reject model/parameter combinations this client does not allow.

        Raises:
            InvalidParamError: When a v2-only option is set for a non-v2
                model, when the sample rate does not match the model, or
                when language hints are used with a model other than
                ``paraformer-realtime-v2``.
        """
        v2_only_values = (
            semantic_punctuation,
            multi_thres_mode,
            punctuation_pred,
            heartbeat,
            itn,
        )
        uses_v2_option = any(value is not None for value in v2_only_values)
        is_v2_model = model in ("paraformer-realtime-v2", "paraformer-realtime-8k-v2")
        if uses_v2_option and not is_v2_model:
            error_msg = (
                'Only "paraformer-realtime-v2" and "paraformer-realtime-8k-v2" support v2 parameters ('
                "`semantic_punctuation`, `multi_thres_mode`, `punctuation_pred`, `heartbeat`, `itn`)."
            )
            self.logger.error(f"Invalid parameter combination: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"model": model})

        is_8k_model = model in ("paraformer-realtime-8k-v1", "paraformer-realtime-8k-v2")
        if is_8k_model and sr != 8000:
            error_msg = "The sample rate for 8k models must be 8000 Hz."
            self.logger.error(f"Invalid sample rate: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"sample_rate": sr})

        if sr != 16000 and model == "paraformer-realtime-v1":
            error_msg = 'The sample rate for "paraformer-realtime-v1" must be 16000 Hz.'
            self.logger.error(f"Invalid sample rate: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"sample_rate": sr})

        if has_lang_hints and model != "paraformer-realtime-v2":
            error_msg = 'Only "paraformer-realtime-v2" supports `lang_hints` parameter.'
            self.logger.error(f"Invalid parameter combination: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"lang_hints": ""})

    def session(
        self,
        *,
        fmt: t.Literal["pcm", "mp3"] | None = None,
        sample_rate: int | None = None,
        silence_duration_ms: int | None = None,
        vocabulary_id: str | None = None,
    ) -> TranscriptSession:
        """Validate the effective settings and create a new session.

        Each falsy argument falls back to the value given at construction.

        Raises:
            InvalidParamError: Propagated from :meth:`verify`.
        """
        effective_rate = sample_rate or self.sample_rate

        self.verify(
            model=self.model,
            sr=effective_rate,
            has_lang_hints=self.lang_hints is not None,
            semantic_punctuation=self.semantic_punctuation,
            multi_thres_mode=self.multi_thres_mode,
            punctuation_pred=self.punctuation_pred,
            heartbeat=self.heartbeat,
            itn=self.itn,
        )

        session_options = {
            "pool": self.pool,
            "model": self.model,
            "fmt": fmt or self.fmt,
            "sample_rate": effective_rate,
            "vocabulary_id": vocabulary_id or self.vocabulary_id,
            "disfluency_removal_enabled": self.disfluency_removal_enabled,
            "lang_hints": self.lang_hints,
            "semantic_punctuation": self.semantic_punctuation,
            "max_sentence_silence": silence_duration_ms or self.silence_duration_ms,
            "multi_thres_mode": self.multi_thres_mode,
            "punctuation_pred": self.punctuation_pred,
            "heartbeat": self.heartbeat,
            "itn": self.itn,
            "resources": self.resources,
        }
        return DashscopeParaformerSession(**session_options)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class DashscopeParaformerSession(LoggingMixin, TranscriptSession):
    """One recognition task running over a pooled websocket connection.

    Lifecycle: :meth:`start` acquires a connection and sends ``run-task``;
    :meth:`send` streams audio; :meth:`receive` yields ``Start`` / ``Delta``
    / ``Done`` events; :meth:`finish` sends ``finish-task``; :meth:`close`
    returns the connection to the pool.
    """

    __logtag__ = "audex.lib.transcript.dashscope.session"

    def __init__(
        self,
        *,
        pool: WebsocketConnectionPool,
        model: str,
        fmt: t.Literal["pcm", "wav", "mp3", "opus", "speex", "aac", "amr"],
        sample_rate: int,
        vocabulary_id: str | None = None,
        disfluency_removal_enabled: bool | None = None,
        lang_hints: list[t.Literal["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]] | None = None,
        semantic_punctuation: bool | None = None,
        max_sentence_silence: int | None = None,
        multi_thres_mode: bool | None = None,
        punctuation_pred: bool | None = None,
        heartbeat: bool | None = None,
        itn: bool | None = None,
        resources: list[str] | None = None,
    ):
        """Store the recognition options; no network activity happens here."""
        super().__init__()
        self.pool = pool
        self.model = model
        self.format = fmt
        self.sample_rate = sample_rate
        self.vocabulary_id = vocabulary_id
        self.disfluency_removal_enabled = disfluency_removal_enabled
        self.lang_hints = lang_hints
        self.semantic_punctuation = semantic_punctuation
        self.max_sentence_silence = max_sentence_silence
        self.multi_thres_mode = multi_thres_mode
        self.punctuation_pred = punctuation_pred
        self.heartbeat = heartbeat
        self.itn = itn
        self.resources = resources

        self.task_id: str | None = None
        self.connection: WebsocketConnection | None = None
        self.lock = asyncio.Lock()

        # Track utterances to prevent memory leaks
        self._utterance_start_times: dict[str, float] = {}

    async def start(self) -> None:
        """Acquire a connection, send ``run-task`` and await ``task-started``.

        Raises:
            TranscriptionError: If the first server message is not
                ``TaskStarted`` or carries a different task id.
        """
        async with self.lock:
            self.logger.debug("Starting DashscopeParaformerSession")
            self.connection = await self.pool.acquire()
            self.task_id = utils.gen_id()

            run_task = RunTask(
                header=RunTaskHeader(task_id=self.task_id),
                payload=RunTaskPayload(
                    model=self.model,
                    parameters=RunTaskPayloadParams(
                        format=self.format,
                        sample_rate=self.sample_rate,
                        vocabulary_id=self.vocabulary_id,
                        disfluency_removal_enabled=self.disfluency_removal_enabled,
                        language_hints=self.lang_hints,
                        semantic_punctuation_enabled=self.semantic_punctuation,
                        max_sentence_silence=self.max_sentence_silence,
                        multi_threshold_mode_enabled=self.multi_thres_mode,
                        punctuation_prediction_enabled=self.punctuation_pred,
                        heartbeat=self.heartbeat,
                        inverse_text_normalization_enabled=self.itn,
                    ),
                    # self.resources may be None; treat that as "no resources".
                    resources=[
                        RunTaskPayloadResource(resource_id=res_id)
                        for res_id in (self.resources or [])
                    ],
                ),
            )
            with self.logger.catch(reraise=True, level="ERROR", message="Failed to start session"):
                # Send the request and wait for the reply concurrently.
                _, server_msg = await asyncio.gather(
                    self.connection.send(run_task.model_dump_json(exclude_none=True)),
                    self.connection.recv(),
                )

            msg = parse_server_message(server_msg)

            if not isinstance(msg, TaskStarted):
                raise TranscriptionError(f"Unexpected server message: {server_msg!s}")

            if msg.header.task_id != self.task_id:
                raise TranscriptionError(
                    f"Task ID mismatch: expected {self.task_id}, got {msg.header.task_id}"
                )

    async def finish(self) -> None:
        """Send ``finish-task``; a no-op if the session was never started."""
        if not self.connection or not self.task_id:
            return

        async with self.lock:
            self.logger.debug("Finishing DashscopeParaformerSession")
            header = FinishTaskHeader(task_id=self.task_id)
            finish_task = FinishTask(header=header)

            with self.logger.catch(reraise=True, level="ERROR", message="Failed to finish session"):
                await self.connection.send(finish_task.model_dump_json(exclude_none=True))

    async def close(self) -> None:
        """Release the connection back to the pool and reset session state."""
        async with self.lock:
            self.logger.debug("Closing DashscopeParaformerSession")
            if self.connection:
                await self.pool.release(self.connection)
                self.connection = None
            self.task_id = None
            self._utterance_start_times.clear()

    async def send(self, message: bytes) -> None:
        """Forward one chunk of audio bytes to the server.

        Raises:
            TranscriptionError: If the session has not been started.
        """
        if not self.connection or not self.task_id:
            raise TranscriptionError("Session not started")

        with self.logger.catch(reraise=True, level="ERROR", message="Failed to send audio data"):
            await self.connection.send(message)

    def receive(self) -> AsyncStream[ReceiveType]:
        """Return an ``AsyncStream`` over the events of this task."""
        return AsyncStream(self._receive_iter())

    async def _receive_iter(self) -> t.AsyncGenerator[ReceiveType, None]:
        """Yield ``Start``, ``Delta`` and ``Done`` events until the task ends.

        A message with ``end_time`` set to None is treated as interim; the
        first message with an ``end_time`` closes the current utterance.

        Raises:
            TranscriptionError: If the session is not started, a message
                carries a foreign task id, the server reports
                ``task-failed``, or an unexpected message type arrives.
        """
        await asyncio.sleep(0.0)

        if not self.connection or not self.task_id:
            raise TranscriptionError("Session not started")

        current_utterance_id: str | None = None

        while True:
            self.logger.debug("Waiting for server message")
            try:
                server_msg = await asyncio.wait_for(self.connection.recv(), timeout=30.0)
            except asyncio.TimeoutError:
                # Nothing arrived in this window: wait again. Falling through
                # would reuse the previous message, or hit an unbound variable
                # on the first pass.
                continue

            msg = parse_server_message(server_msg)
            if msg.header.task_id != self.task_id:
                raise TranscriptionError(
                    f"Task ID mismatch: expected {self.task_id}, got {msg.header.task_id}"
                )

            if isinstance(msg, ResultGenerated):
                sentence = msg.payload.output.sentence
                interim = sentence.end_time is None

                # New utterance started
                if current_utterance_id is None:
                    current_utterance_id = utils.gen_id(prefix="utt")
                    started_at = utils.utcnow().timestamp()
                    self._utterance_start_times[current_utterance_id] = started_at

                    self.logger.debug(
                        f"New utterance started: id={current_utterance_id}, started_at={started_at}"
                    )
                    yield Start(utterance_id=current_utterance_id, started_at=started_at)

                # Convert relative offsets (ms) to seconds. Use the same
                # None test as `interim` so an end_time of 0 is kept.
                offset_begin = sentence.begin_time / 1000.0
                offset_end = None if interim else sentence.end_time / 1000.0

                self.logger.debug(
                    f"Transcription delta: "
                    f"utterance_id={current_utterance_id}, "
                    f"text='{sentence.text}', "
                    f"offset_begin={offset_begin}s, "
                    f"offset_end={offset_end if offset_end is not None else 'None'}s, "
                    f"interim={interim}",
                )

                yield Delta(
                    utterance_id=current_utterance_id,
                    offset_begin=offset_begin,
                    offset_end=offset_end,
                    text=sentence.text,
                    interim=interim,
                )

                # Final result for this utterance
                if not interim:
                    ended_at = utils.utcnow().timestamp()

                    self.logger.debug(
                        f"Utterance completed: id={current_utterance_id}, ended_at={ended_at}"
                    )

                    yield Done(utterance_id=current_utterance_id, ended_at=ended_at)

                    # Clean up to prevent memory leak
                    self._utterance_start_times.pop(current_utterance_id, None)

                    # Reset for next utterance
                    current_utterance_id = None

            elif isinstance(msg, TaskFinished):
                # The task id was already checked above for every message.
                self.logger.debug("Transcription task finished by server")
                break

            elif isinstance(msg, TaskFailed):
                self.logger.error(
                    f"Transcription task failed: "
                    f"error_code={msg.header.error_code}, "
                    f"error_message={msg.header.error_message}"
                )
                raise TranscriptionError(
                    f"Transcription task failed: {msg.header.error_message} "
                    f"(code: {msg.header.error_code})"
                )

            else:
                self.logger.error(f"Unexpected server message: {server_msg!s}")
                raise TranscriptionError(f"Unexpected server message: {server_msg!s}")
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class Start:
    """Indicates the start of a transcription utterance.

    Attributes:
        utterance_id: Unique ID for this utterance.
        started_at: Absolute timestamp when transcription started (UNIX
            timestamp).
    """

    __slots__ = ("started_at", "utterance_id")

    def __init__(self, *, utterance_id: str, started_at: float) -> None:
        self.utterance_id = utterance_id
        self.started_at = started_at

    def __repr__(self) -> str:
        """Return a readable representation for logs and debugging."""
        return (
            f"{type(self).__name__}("
            f"utterance_id={self.utterance_id!r}, "
            f"started_at={self.started_at!r})"
        )
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Delta:
    """Delta event representing a partial transcription update.

    Attributes:
        utterance_id: ID of the utterance this delta belongs to.
        offset_begin: Start offset from utterance start (in seconds).
        offset_end: End offset from utterance start (in seconds, None for
            interim).
        text: The transcribed text for the segment.
        interim: Whether this is interim (True) or final (False).
    """

    __slots__ = ("interim", "offset_begin", "offset_end", "text", "utterance_id")

    def __init__(
        self,
        *,
        utterance_id: str,
        offset_begin: float,
        offset_end: float | None,
        text: str,
        interim: bool,
    ) -> None:
        self.utterance_id = utterance_id
        self.offset_begin = offset_begin
        self.offset_end = offset_end
        self.text = text
        self.interim = interim

    def __repr__(self) -> str:
        """Return a readable representation for logs and debugging."""
        return (
            f"{type(self).__name__}("
            f"utterance_id={self.utterance_id!r}, "
            f"offset_begin={self.offset_begin!r}, "
            f"offset_end={self.offset_end!r}, "
            f"text={self.text!r}, "
            f"interim={self.interim!r})"
        )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Done:
    """Indicates the completion of a transcription utterance.

    Attributes:
        utterance_id: ID of the completed utterance.
        ended_at: Absolute timestamp when transcription ended (UNIX timestamp).
    """

    __slots__ = ("ended_at", "utterance_id")

    def __init__(self, *, utterance_id: str, ended_at: float) -> None:
        self.utterance_id = utterance_id
        self.ended_at = ended_at

    def __repr__(self) -> str:
        """Return a readable representation for logs and debugging."""
        return (
            f"{type(self).__name__}("
            f"utterance_id={self.utterance_id!r}, "
            f"ended_at={self.ended_at!r})"
        )
|