audex 1.0.7a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. audex/__init__.py +9 -0
  2. audex/__main__.py +7 -0
  3. audex/cli/__init__.py +189 -0
  4. audex/cli/apis/__init__.py +12 -0
  5. audex/cli/apis/init/__init__.py +34 -0
  6. audex/cli/apis/init/gencfg.py +130 -0
  7. audex/cli/apis/init/setup.py +330 -0
  8. audex/cli/apis/init/vprgroup.py +125 -0
  9. audex/cli/apis/serve.py +141 -0
  10. audex/cli/args.py +356 -0
  11. audex/cli/exceptions.py +44 -0
  12. audex/cli/helper/__init__.py +0 -0
  13. audex/cli/helper/ansi.py +193 -0
  14. audex/cli/helper/display.py +288 -0
  15. audex/config/__init__.py +64 -0
  16. audex/config/core/__init__.py +30 -0
  17. audex/config/core/app.py +29 -0
  18. audex/config/core/audio.py +45 -0
  19. audex/config/core/logging.py +163 -0
  20. audex/config/core/session.py +11 -0
  21. audex/config/helper/__init__.py +1 -0
  22. audex/config/helper/client/__init__.py +1 -0
  23. audex/config/helper/client/http.py +28 -0
  24. audex/config/helper/client/websocket.py +21 -0
  25. audex/config/helper/provider/__init__.py +1 -0
  26. audex/config/helper/provider/dashscope.py +13 -0
  27. audex/config/helper/provider/unisound.py +18 -0
  28. audex/config/helper/provider/xfyun.py +23 -0
  29. audex/config/infrastructure/__init__.py +31 -0
  30. audex/config/infrastructure/cache.py +51 -0
  31. audex/config/infrastructure/database.py +48 -0
  32. audex/config/infrastructure/recorder.py +32 -0
  33. audex/config/infrastructure/store.py +19 -0
  34. audex/config/provider/__init__.py +18 -0
  35. audex/config/provider/transcription.py +109 -0
  36. audex/config/provider/vpr.py +99 -0
  37. audex/container.py +40 -0
  38. audex/entity/__init__.py +468 -0
  39. audex/entity/doctor.py +109 -0
  40. audex/entity/doctor.pyi +51 -0
  41. audex/entity/fields.py +401 -0
  42. audex/entity/segment.py +115 -0
  43. audex/entity/segment.pyi +38 -0
  44. audex/entity/session.py +133 -0
  45. audex/entity/session.pyi +47 -0
  46. audex/entity/utterance.py +142 -0
  47. audex/entity/utterance.pyi +48 -0
  48. audex/entity/vp.py +68 -0
  49. audex/entity/vp.pyi +35 -0
  50. audex/exceptions.py +157 -0
  51. audex/filters/__init__.py +692 -0
  52. audex/filters/generated/__init__.py +21 -0
  53. audex/filters/generated/doctor.py +987 -0
  54. audex/filters/generated/segment.py +723 -0
  55. audex/filters/generated/session.py +978 -0
  56. audex/filters/generated/utterance.py +939 -0
  57. audex/filters/generated/vp.py +815 -0
  58. audex/helper/__init__.py +1 -0
  59. audex/helper/hash.py +33 -0
  60. audex/helper/mixin.py +65 -0
  61. audex/helper/net.py +19 -0
  62. audex/helper/settings/__init__.py +830 -0
  63. audex/helper/settings/fields.py +317 -0
  64. audex/helper/stream.py +153 -0
  65. audex/injectors/__init__.py +1 -0
  66. audex/injectors/config.py +12 -0
  67. audex/injectors/lifespan.py +7 -0
  68. audex/lib/__init__.py +1 -0
  69. audex/lib/cache/__init__.py +383 -0
  70. audex/lib/cache/inmemory.py +513 -0
  71. audex/lib/database/__init__.py +83 -0
  72. audex/lib/database/sqlite.py +406 -0
  73. audex/lib/exporter.py +189 -0
  74. audex/lib/injectors/__init__.py +1 -0
  75. audex/lib/injectors/cache.py +25 -0
  76. audex/lib/injectors/container.py +47 -0
  77. audex/lib/injectors/exporter.py +26 -0
  78. audex/lib/injectors/recorder.py +33 -0
  79. audex/lib/injectors/server.py +17 -0
  80. audex/lib/injectors/session.py +18 -0
  81. audex/lib/injectors/sqlite.py +24 -0
  82. audex/lib/injectors/store.py +13 -0
  83. audex/lib/injectors/transcription.py +42 -0
  84. audex/lib/injectors/usb.py +12 -0
  85. audex/lib/injectors/vpr.py +65 -0
  86. audex/lib/injectors/wifi.py +7 -0
  87. audex/lib/recorder.py +844 -0
  88. audex/lib/repos/__init__.py +149 -0
  89. audex/lib/repos/container.py +23 -0
  90. audex/lib/repos/database/__init__.py +1 -0
  91. audex/lib/repos/database/sqlite.py +672 -0
  92. audex/lib/repos/decorators.py +74 -0
  93. audex/lib/repos/doctor.py +286 -0
  94. audex/lib/repos/segment.py +302 -0
  95. audex/lib/repos/session.py +285 -0
  96. audex/lib/repos/tables/__init__.py +70 -0
  97. audex/lib/repos/tables/doctor.py +137 -0
  98. audex/lib/repos/tables/segment.py +113 -0
  99. audex/lib/repos/tables/session.py +140 -0
  100. audex/lib/repos/tables/utterance.py +131 -0
  101. audex/lib/repos/tables/vp.py +102 -0
  102. audex/lib/repos/utterance.py +288 -0
  103. audex/lib/repos/vp.py +286 -0
  104. audex/lib/restful.py +251 -0
  105. audex/lib/server/__init__.py +97 -0
  106. audex/lib/server/auth.py +98 -0
  107. audex/lib/server/handlers.py +248 -0
  108. audex/lib/server/templates/index.html.j2 +226 -0
  109. audex/lib/server/templates/login.html.j2 +111 -0
  110. audex/lib/server/templates/static/script.js +68 -0
  111. audex/lib/server/templates/static/style.css +579 -0
  112. audex/lib/server/types.py +123 -0
  113. audex/lib/session.py +503 -0
  114. audex/lib/store/__init__.py +238 -0
  115. audex/lib/store/localfile.py +411 -0
  116. audex/lib/transcription/__init__.py +33 -0
  117. audex/lib/transcription/dashscope.py +525 -0
  118. audex/lib/transcription/events.py +62 -0
  119. audex/lib/usb.py +554 -0
  120. audex/lib/vpr/__init__.py +38 -0
  121. audex/lib/vpr/unisound/__init__.py +185 -0
  122. audex/lib/vpr/unisound/types.py +469 -0
  123. audex/lib/vpr/xfyun/__init__.py +483 -0
  124. audex/lib/vpr/xfyun/types.py +679 -0
  125. audex/lib/websocket/__init__.py +8 -0
  126. audex/lib/websocket/connection.py +485 -0
  127. audex/lib/websocket/pool.py +991 -0
  128. audex/lib/wifi.py +1146 -0
  129. audex/lifespan.py +75 -0
  130. audex/service/__init__.py +27 -0
  131. audex/service/decorators.py +73 -0
  132. audex/service/doctor/__init__.py +652 -0
  133. audex/service/doctor/const.py +36 -0
  134. audex/service/doctor/exceptions.py +96 -0
  135. audex/service/doctor/types.py +54 -0
  136. audex/service/export/__init__.py +236 -0
  137. audex/service/export/const.py +17 -0
  138. audex/service/export/exceptions.py +34 -0
  139. audex/service/export/types.py +21 -0
  140. audex/service/injectors/__init__.py +1 -0
  141. audex/service/injectors/container.py +53 -0
  142. audex/service/injectors/doctor.py +34 -0
  143. audex/service/injectors/export.py +27 -0
  144. audex/service/injectors/session.py +49 -0
  145. audex/service/session/__init__.py +754 -0
  146. audex/service/session/const.py +34 -0
  147. audex/service/session/exceptions.py +67 -0
  148. audex/service/session/types.py +91 -0
  149. audex/types.py +39 -0
  150. audex/utils.py +287 -0
  151. audex/valueobj/__init__.py +81 -0
  152. audex/valueobj/common/__init__.py +1 -0
  153. audex/valueobj/common/auth.py +84 -0
  154. audex/valueobj/common/email.py +16 -0
  155. audex/valueobj/common/ops.py +22 -0
  156. audex/valueobj/common/phone.py +84 -0
  157. audex/valueobj/common/version.py +72 -0
  158. audex/valueobj/session.py +19 -0
  159. audex/valueobj/utterance.py +15 -0
  160. audex/view/__init__.py +51 -0
  161. audex/view/container.py +17 -0
  162. audex/view/decorators.py +303 -0
  163. audex/view/pages/__init__.py +1 -0
  164. audex/view/pages/dashboard/__init__.py +286 -0
  165. audex/view/pages/dashboard/wifi.py +407 -0
  166. audex/view/pages/login.py +110 -0
  167. audex/view/pages/recording.py +348 -0
  168. audex/view/pages/register.py +202 -0
  169. audex/view/pages/sessions/__init__.py +196 -0
  170. audex/view/pages/sessions/details.py +224 -0
  171. audex/view/pages/sessions/export.py +443 -0
  172. audex/view/pages/settings.py +374 -0
  173. audex/view/pages/voiceprint/__init__.py +1 -0
  174. audex/view/pages/voiceprint/enroll.py +195 -0
  175. audex/view/pages/voiceprint/update.py +195 -0
  176. audex/view/static/css/dashboard.css +452 -0
  177. audex/view/static/css/glass.css +22 -0
  178. audex/view/static/css/global.css +541 -0
  179. audex/view/static/css/login.css +386 -0
  180. audex/view/static/css/recording.css +439 -0
  181. audex/view/static/css/register.css +293 -0
  182. audex/view/static/css/sessions/styles.css +501 -0
  183. audex/view/static/css/settings.css +186 -0
  184. audex/view/static/css/voiceprint/enroll.css +43 -0
  185. audex/view/static/css/voiceprint/styles.css +209 -0
  186. audex/view/static/css/voiceprint/update.css +44 -0
  187. audex/view/static/images/logo.svg +95 -0
  188. audex/view/static/js/recording.js +42 -0
  189. audex-1.0.7a3.dist-info/METADATA +361 -0
  190. audex-1.0.7a3.dist-info/RECORD +192 -0
  191. audex-1.0.7a3.dist-info/WHEEL +4 -0
  192. audex-1.0.7a3.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,525 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import contextlib
5
+ import typing as t
6
+
7
+ from pydantic import BaseModel
8
+ from pydantic import TypeAdapter
9
+
10
+ from audex import utils
11
+ from audex.helper.mixin import LoggingMixin
12
+ from audex.helper.stream import AsyncStream
13
+ from audex.lib.transcription import ReceiveType
14
+ from audex.lib.transcription import Transcription
15
+ from audex.lib.transcription import TranscriptionError
16
+ from audex.lib.transcription import TranscriptSession
17
+ from audex.lib.transcription.events import Delta
18
+ from audex.lib.transcription.events import Done
19
+ from audex.lib.transcription.events import Start
20
+ from audex.lib.websocket.connection import WebsocketConnection
21
+ from audex.lib.websocket.pool import WebsocketConnectionPool
22
+
23
+
24
class InvalidParamError(TranscriptionError):
    """Raised when a model/parameter combination is rejected before a session starts.

    Attributes:
        params: The offending parameter values, keyed by parameter name.
            Always a dict; empty when the caller supplied none.
    """

    default_message = "Invalid transcription parameters"

    def __init__(
        self,
        message: str | None = None,
        params: dict[str, t.Any] | None = None,
    ):
        # Fall back to the class-level message when none (or an empty one) is given.
        text = message if message else self.default_message
        super().__init__(text)
        self.params = params if params else {}
34
+
35
+
36
class RunTaskPayloadParams(BaseModel):
    """Recognition options sent as ``payload.parameters`` of a ``run-task`` message.

    All tuning switches default to ``None``. The session serialises the
    message with ``exclude_none=True``, so options left unset are omitted
    from the JSON instead of being sent as ``null``.
    """

    # Audio container/codec of the streamed frames.
    format: t.Literal["pcm", "wav", "mp3", "opus", "speex", "aac", "amr"] = "pcm"
    # Sampling rate of the streamed audio.
    sample_rate: int = 16000
    vocabulary_id: str | None = None
    disfluency_removal_enabled: bool | None = None
    language_hints: list[t.Literal["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]] | None = None
    semantic_punctuation_enabled: bool | None = None
    # Fed from the session's ``silence_duration_ms`` setting.
    max_sentence_silence: int | None = None
    multi_threshold_mode_enabled: bool | None = None
    punctuation_prediction_enabled: bool | None = None
    heartbeat: bool | None = None
    inverse_text_normalization_enabled: bool | None = None
48
+
49
+
50
class RunTaskPayloadResource(BaseModel):
    """One entry of the ``resources`` list attached to a ``run-task`` payload."""

    # Identifier of the resource to attach to the task.
    resource_id: str
    # Only a single resource kind is modelled.
    resource_type: t.Literal["asr_phrase"] = "asr_phrase"
53
+
54
+
55
class RunTaskPayload(BaseModel):
    """Payload section of the ``run-task`` message that opens a recognition task.

    The three routing fields are fixed literals; only ``model``,
    ``parameters`` and ``resources`` vary between sessions.
    """

    task_group: t.Literal["audio"] = "audio"
    task: t.Literal["asr"] = "asr"
    function: t.Literal["recognition"] = "recognition"
    model: str = "paraformer-realtime-v2"
    # Pydantic copies mutable defaults per instance, so the literal defaults
    # below are not shared between payload objects.
    parameters: RunTaskPayloadParams = RunTaskPayloadParams()
    resources: list[RunTaskPayloadResource] = []
    # Always sent as an empty object by this client.
    input: dict[str, object] = {}
63
+
64
+
65
class RunTaskHeader(BaseModel):
    """Header section of a ``run-task`` message."""

    # Client-generated task identifier; the session checks that server
    # messages carry the same value.
    task_id: str
    action: t.Literal["run-task"] = "run-task"
    streaming: t.Literal["duplex"] = "duplex"
69
+
70
+
71
class RunTask(BaseModel):
    """Complete ``run-task`` message: header plus payload."""

    header: RunTaskHeader
    payload: RunTaskPayload
74
+
75
+
76
class FinishTaskHeader(BaseModel):
    """Header section of a ``finish-task`` message."""

    # Must name the task that was opened with the matching ``run-task``.
    task_id: str
    action: t.Literal["finish-task"] = "finish-task"
    streaming: t.Literal["duplex"] = "duplex"
80
+
81
+
82
class FinishTaskPayload(BaseModel):
    """Payload section of a ``finish-task`` message; carries only an empty ``input``."""

    input: dict[str, object] = {}
84
+
85
+
86
class FinishTask(BaseModel):
    """Complete ``finish-task`` message; the payload defaults to an empty one."""

    header: FinishTaskHeader
    payload: FinishTaskPayload = FinishTaskPayload()
89
+
90
+
91
class BaseServerHeader(BaseModel):
    """Header fields common to every server-to-client message.

    Subclasses narrow ``event`` to a single literal so the message union
    can be told apart during validation.
    """

    task_id: str | None = None
    event: t.Literal["task-started", "result-generated", "task-finished", "task-failed"]
    # Required free-form mapping; its contents are not interpreted here.
    attributes: dict[str, t.Any]
95
+
96
+
97
class BaseServerMessage(BaseModel):
    """Base shape of a server message: every message has a header."""

    header: BaseServerHeader
99
+
100
+
101
class TaskStartedHeader(BaseServerHeader):
    """Server header whose ``event`` is restricted to ``task-started``."""

    event: t.Literal["task-started"]
103
+
104
+
105
class TaskStarted(BaseServerMessage):
    """Server message with a ``task-started`` header; expected in reply to ``run-task``."""

    header: TaskStartedHeader
107
+
108
+
109
class ResultGeneratedHeader(BaseServerHeader):
    """Server header whose ``event`` is restricted to ``result-generated``."""

    event: t.Literal["result-generated"]
111
+
112
+
113
class ResultGeneratedPayloadOutputSentence(BaseModel):
    """A single recognised sentence inside a ``result-generated`` payload.

    ``end_time`` and ``words`` have no default, so both keys must be present
    in the server JSON even though their value may be ``null``.
    """

    # Start of the sentence, in milliseconds.
    begin_time: int
    # The receive loop treats a ``None`` end time as an interim result.
    end_time: int | None
    text: str
    # Per-word details; kept as raw dicts and not interpreted by this module.
    words: list[dict[str, object]] | None
118
+
119
+
120
class ResultGeneratedPayloadOutput(BaseModel):
    """``output`` object of a ``result-generated`` payload; wraps one sentence."""

    sentence: ResultGeneratedPayloadOutputSentence
122
+
123
+
124
class ResultGeneratedPayload(BaseModel):
    """Payload of a ``result-generated`` message."""

    output: ResultGeneratedPayloadOutput
    # Accepted in any shape and never read by this module.
    usage: t.Any = None
127
+
128
+
129
class ResultGenerated(BaseServerMessage):
    """Server message carrying a recognition result (interim or final sentence)."""

    header: ResultGeneratedHeader
    payload: ResultGeneratedPayload
132
+
133
+
134
class TaskFinishedHeader(BaseServerHeader):
    """Server header whose ``event`` is restricted to ``task-finished``."""

    event: t.Literal["task-finished"]
136
+
137
+
138
class TaskFinished(BaseServerMessage):
    """Server message with a ``task-finished`` header; ends the receive loop."""

    header: TaskFinishedHeader
140
+
141
+
142
class TaskFailedHeader(BaseServerHeader):
    """Server header for a ``task-failed`` event, including the failure details."""

    event: t.Literal["task-failed"]
    # Both fields are required and are surfaced in the raised TranscriptionError.
    error_code: str
    error_message: str
146
+
147
+
148
class TaskFailed(BaseServerMessage):
    """Server message with a ``task-failed`` header; turned into a TranscriptionError."""

    header: TaskFailedHeader
150
+
151
+
152
# Every message shape the server may send; each member pins ``header.event``
# to a different literal, which is what lets validation pick the right model.
ServerMessage = TaskStarted | ResultGenerated | TaskFinished | TaskFailed
# Built once at import time and reused by ``parse_server_message``.
adapter: TypeAdapter[ServerMessage] = TypeAdapter(ServerMessage)
154
+
155
+
156
def parse_server_message(
    data: str | bytes,
) -> TaskStarted | ResultGenerated | TaskFinished | TaskFailed:
    """Validate a raw websocket frame into one of the typed server messages.

    Args:
        data: JSON text (or its encoded bytes) as received from the server.

    Returns:
        The member of ``ServerMessage`` that ``data`` validates as.

    Raises:
        Exception: Whatever ``adapter.validate_json`` raises for malformed or
            unrecognised input; the error is logged and re-raised unchanged.
    """
    try:
        return adapter.validate_json(data)
    except Exception as exc:
        # Local import: keeps this block self-contained without touching the
        # module's import section.
        import logging

        log = logging.getLogger("audex.lib.transcript.dashscope")
        # Report through logging instead of print(): library code should not
        # write to stdout. The raw payload may contain transcribed speech, so
        # it is only emitted at DEBUG level.
        log.error("Failed to parse server message: %s", exc)
        log.debug("Unparseable server payload: %r", data)
        raise
168
+
169
+
170
class DashscopeParaformer(LoggingMixin, Transcription):
    """Transcription backend for the DashScope Paraformer websocket API.

    The instance owns the websocket connection pool and the default
    recognition options. :meth:`session` validates the option combination
    and returns a :class:`DashscopeParaformerSession` bound to the pool.
    """

    __logtag__ = "audex.lib.transcript.dashscope"

    def __init__(
        self,
        *,
        model: str = "paraformer-realtime-v2",
        url: str = "wss://dashscope.aliyuncs.com/api-ws/v1/inference",
        api_key: str,
        user_agent: str | None = None,
        workspace: str | None = None,
        max_connections: int = 1000,
        idle_timeout: int = 60,
        drain_timeout: float = 5.0,
        fmt: t.Literal["pcm", "wav", "mp3", "opus", "speex", "aac", "amr"] = "pcm",
        sample_rate: int = 8000,
        silence_duration_ms: int | None = None,
        vocabulary_id: str | None = None,
        disfluency_removal_enabled: bool | None = None,
        lang_hints: list[t.Literal["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]] | None = None,
        semantic_punctuation: bool | None = None,
        multi_thres_mode: bool | None = None,
        punctuation_pred: bool | None = None,
        heartbeat: bool | None = None,
        itn: bool | None = None,
        resources: list[str] | None = None,
        **kwargs: t.Any,
    ):
        """Store the defaults and build the websocket connection pool.

        Args:
            model: Model name sent in every ``run-task`` payload.
            url: Websocket endpoint the pool connects to.
            api_key: Key sent as a bearer token in the ``Authorization`` header.
            user_agent: Optional ``User-Agent`` header value.
            workspace: Optional ``X-DashScope-Workspace`` header value.
            max_connections: Forwarded to the connection pool.
            idle_timeout: Forwarded to the connection pool.
            drain_timeout: Forwarded to the connection pool.
            fmt: Default audio format for sessions.
            sample_rate: Default sample rate for sessions.
            silence_duration_ms: Default ``max_sentence_silence`` for sessions.
            vocabulary_id: Default vocabulary id for sessions.
            disfluency_removal_enabled: Passed through to each session.
            lang_hints: Passed through to each session as language hints.
            semantic_punctuation: Passed through to each session.
            multi_thres_mode: Passed through to each session.
            punctuation_pred: Passed through to each session.
            heartbeat: Passed through to each session.
            itn: Passed through to each session.
            resources: Resource ids attached to every session's task.
            **kwargs: Extra keyword arguments forwarded to the connection pool.
        """
        super().__init__()
        self.model = model
        self.api_key = api_key
        self.user_agent = user_agent

        # Mandatory headers first, optional ones only when a value was given.
        request_headers = {
            "Authorization": f"Bearer {api_key}",
            "X-DashScope-DataInspection": "enable",
        }
        if user_agent:
            request_headers["User-Agent"] = user_agent
        if workspace:
            request_headers["X-DashScope-Workspace"] = workspace
        self.headers = request_headers

        self.pool = WebsocketConnectionPool(
            uri=url,
            headers=request_headers,
            idle_timeout=idle_timeout,
            max_connections=max_connections,
            check_server_data_on_release=True,
            drain_timeout=drain_timeout,
            **kwargs,
        )

        # Per-session defaults; `session()` may override the first four.
        self.fmt = fmt
        self.sample_rate = sample_rate
        self.silence_duration_ms = silence_duration_ms
        self.vocabulary_id = vocabulary_id
        self.disfluency_removal_enabled = disfluency_removal_enabled
        self.lang_hints = lang_hints
        self.semantic_punctuation = semantic_punctuation
        self.multi_thres_mode = multi_thres_mode
        self.punctuation_pred = punctuation_pred
        self.heartbeat = heartbeat
        self.itn = itn
        self.resources = resources

    def verify(
        self,
        model: str,
        sr: int,
        has_lang_hints: bool,
        semantic_punctuation: bool | None = None,
        multi_thres_mode: bool | None = None,
        punctuation_pred: bool | None = None,
        heartbeat: bool | None = None,
        itn: bool | None = None,
    ) -> None:
        """Reject model/parameter combinations this client considers invalid.

        Checks run in a fixed order and the first failure is logged and raised.

        Args:
            model: Model name to validate against.
            sr: Sample rate that will be used for the session.
            has_lang_hints: Whether language hints were configured.
            semantic_punctuation: v2-only option; ``None`` means unset.
            multi_thres_mode: v2-only option; ``None`` means unset.
            punctuation_pred: v2-only option; ``None`` means unset.
            heartbeat: v2-only option; ``None`` means unset.
            itn: v2-only option; ``None`` means unset.

        Raises:
            InvalidParamError: If any of the four checks fails.
        """
        v2_only_values = (semantic_punctuation, multi_thres_mode, punctuation_pred, heartbeat, itn)
        uses_v2_only_option = any(value is not None for value in v2_only_values)
        is_v2_model = model in {"paraformer-realtime-v2", "paraformer-realtime-8k-v2"}

        # 1. v2-only switches require a v2 model.
        if uses_v2_only_option and not is_v2_model:
            error_msg = (
                'Only "paraformer-realtime-v2" and "paraformer-realtime-8k-v2" support v2 parameters ('
                "`semantic_punctuation`, `multi_thres_mode`, `punctuation_pred`, `heartbeat`, `itn`)."
            )
            self.logger.error(f"Invalid parameter combination: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"model": model})

        # 2. The 8k models only accept 8000 Hz audio.
        is_8k_model = model in {"paraformer-realtime-8k-v1", "paraformer-realtime-8k-v2"}
        if is_8k_model and sr != 8000:
            error_msg = "The sample rate for 8k models must be 8000 Hz."
            self.logger.error(f"Invalid sample rate: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"sample_rate": sr})

        # 3. The v1 realtime model only accepts 16000 Hz audio.
        if model == "paraformer-realtime-v1" and sr != 16000:
            error_msg = 'The sample rate for "paraformer-realtime-v1" must be 16000 Hz.'
            self.logger.error(f"Invalid sample rate: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"sample_rate": sr})

        # 4. Language hints are limited to the non-8k v2 model.
        if has_lang_hints and model != "paraformer-realtime-v2":
            error_msg = 'Only "paraformer-realtime-v2" supports `lang_hints` parameter.'
            self.logger.error(f"Invalid parameter combination: {error_msg}")
            raise InvalidParamError(message=error_msg, params={"lang_hints": ""})

    def session(
        self,
        *,
        fmt: t.Literal["pcm", "mp3"] | None = None,
        sample_rate: int | None = None,
        silence_duration_ms: int | None = None,
        vocabulary_id: str | None = None,
    ) -> TranscriptSession:
        """Create a new, not-yet-started transcription session.

        Each argument overrides the corresponding instance default when it is
        truthy; otherwise the default given to ``__init__`` is used.

        Args:
            fmt: Audio format override.
            sample_rate: Sample rate override.
            silence_duration_ms: ``max_sentence_silence`` override.
            vocabulary_id: Vocabulary id override.

        Returns:
            A ``DashscopeParaformerSession`` sharing this instance's pool.

        Raises:
            InvalidParamError: If :meth:`verify` rejects the combination.
        """
        effective_rate = sample_rate or self.sample_rate
        effective_format = fmt or self.fmt
        effective_vocabulary = vocabulary_id or self.vocabulary_id
        effective_silence = silence_duration_ms or self.silence_duration_ms

        self.verify(
            model=self.model,
            sr=effective_rate,
            has_lang_hints=self.lang_hints is not None,
            semantic_punctuation=self.semantic_punctuation,
            multi_thres_mode=self.multi_thres_mode,
            punctuation_pred=self.punctuation_pred,
            heartbeat=self.heartbeat,
            itn=self.itn,
        )

        return DashscopeParaformerSession(
            pool=self.pool,
            model=self.model,
            fmt=effective_format,
            sample_rate=effective_rate,
            vocabulary_id=effective_vocabulary,
            disfluency_removal_enabled=self.disfluency_removal_enabled,
            lang_hints=self.lang_hints,
            semantic_punctuation=self.semantic_punctuation,
            max_sentence_silence=effective_silence,
            multi_thres_mode=self.multi_thres_mode,
            punctuation_pred=self.punctuation_pred,
            heartbeat=self.heartbeat,
            itn=self.itn,
            resources=self.resources,
        )
307
+
308
+
309
class DashscopeParaformerSession(LoggingMixin, TranscriptSession):
    """One recognition task on a pooled DashScope websocket connection.

    Lifecycle: :meth:`start` acquires a connection and opens the task,
    :meth:`send` streams audio, :meth:`receive` yields ``Start`` / ``Delta`` /
    ``Done`` events, :meth:`finish` asks the server to end the task and
    :meth:`close` returns the connection to the pool.
    """

    __logtag__ = "audex.lib.transcript.dashscope.session"

    def __init__(
        self,
        *,
        pool: WebsocketConnectionPool,
        model: str,
        fmt: t.Literal["pcm", "wav", "mp3", "opus", "speex", "aac", "amr"],
        sample_rate: int,
        vocabulary_id: str | None = None,
        disfluency_removal_enabled: bool | None = None,
        lang_hints: list[t.Literal["zh", "en", "ja", "yue", "ko", "de", "fr", "ru"]] | None = None,
        semantic_punctuation: bool | None = None,
        max_sentence_silence: int | None = None,
        multi_thres_mode: bool | None = None,
        punctuation_pred: bool | None = None,
        heartbeat: bool | None = None,
        itn: bool | None = None,
        resources: list[str] | None = None,
    ):
        """Record the recognition options; no network activity happens here.

        Args:
            pool: Connection pool to acquire the websocket from.
            model: Model name placed in the ``run-task`` payload.
            fmt: Audio format of the frames passed to :meth:`send`.
            sample_rate: Sample rate of the audio.
            vocabulary_id: Optional vocabulary id.
            disfluency_removal_enabled: Optional recognition switch.
            lang_hints: Optional language hints.
            semantic_punctuation: Optional recognition switch.
            max_sentence_silence: Optional silence threshold.
            multi_thres_mode: Optional recognition switch.
            punctuation_pred: Optional recognition switch.
            heartbeat: Optional recognition switch.
            itn: Optional inverse-text-normalisation switch.
            resources: Optional resource ids attached to the task.
        """
        super().__init__()
        self.pool = pool
        self.model = model
        self.format = fmt
        self.sample_rate = sample_rate
        self.vocabulary_id = vocabulary_id
        self.disfluency_removal_enabled = disfluency_removal_enabled
        self.lang_hints = lang_hints
        self.semantic_punctuation = semantic_punctuation
        self.max_sentence_silence = max_sentence_silence
        self.multi_thres_mode = multi_thres_mode
        self.punctuation_pred = punctuation_pred
        self.heartbeat = heartbeat
        self.itn = itn
        self.resources = resources

        self.task_id: str | None = None
        self.connection: WebsocketConnection | None = None
        # Serialises start/finish/close against each other.
        self.lock = asyncio.Lock()

        # Start timestamps of utterances still in flight; entries are removed
        # when the utterance completes and the dict is cleared on close.
        self._utterance_start_times: dict[str, float] = {}

    async def start(self) -> None:
        """Acquire a connection, send ``run-task`` and wait for ``task-started``.

        Raises:
            TranscriptionError: If the first server message is not
                ``task-started`` or carries a different task id.
        """
        async with self.lock:
            self.logger.debug("Starting DashscopeParaformerSession")
            self.connection = await self.pool.acquire()
            self.task_id = utils.gen_id()

            resource_objs: list[RunTaskPayloadResource] = [
                RunTaskPayloadResource(resource_id=res_id) for res_id in self.resources or []
            ]

            payload_params = RunTaskPayloadParams(
                format=self.format,
                sample_rate=self.sample_rate,
                vocabulary_id=self.vocabulary_id,
                disfluency_removal_enabled=self.disfluency_removal_enabled,
                language_hints=self.lang_hints,
                semantic_punctuation_enabled=self.semantic_punctuation,
                max_sentence_silence=self.max_sentence_silence,
                multi_threshold_mode_enabled=self.multi_thres_mode,
                punctuation_prediction_enabled=self.punctuation_pred,
                heartbeat=self.heartbeat,
                inverse_text_normalization_enabled=self.itn,
            )
            payload = RunTaskPayload(
                model=self.model,
                parameters=payload_params,
                resources=resource_objs,
            )
            header = RunTaskHeader(task_id=self.task_id)
            run_task = RunTask(header=header, payload=payload)
            with self.logger.catch(reraise=True, level="ERROR", message="Failed to start session"):
                # Send the request and wait for the reply concurrently;
                # unset options are dropped from the JSON via exclude_none.
                _, server_msg = await asyncio.gather(
                    self.connection.send(run_task.model_dump_json(exclude_none=True)),
                    self.connection.recv(),
                )

            msg = parse_server_message(server_msg)

            if not isinstance(msg, TaskStarted):
                raise TranscriptionError(f"Unexpected server message: {server_msg!s}")

            if msg.header.task_id != self.task_id:
                raise TranscriptionError(
                    f"Task ID mismatch: expected {self.task_id}, got {msg.header.task_id}"
                )

    async def finish(self) -> None:
        """Send ``finish-task`` for the current task; a no-op if not started."""
        if not self.connection or not self.task_id:
            return

        async with self.lock:
            self.logger.debug("Finishing DashscopeParaformerSession")
            header = FinishTaskHeader(task_id=self.task_id)
            finish_task = FinishTask(header=header)

            with self.logger.catch(reraise=True, level="ERROR", message="Failed to finish session"):
                await self.connection.send(finish_task.model_dump_json(exclude_none=True))

    async def close(self) -> None:
        """Release the connection back to the pool and reset session state."""
        async with self.lock:
            self.logger.debug("Closing DashscopeParaformerSession")
            if self.connection:
                await self.pool.release(self.connection)
                self.connection = None
            self.task_id = None
            self._utterance_start_times.clear()

    async def send(self, message: bytes) -> None:
        """Forward one chunk of audio bytes to the server.

        Raises:
            TranscriptionError: If the session has not been started.
        """
        if not self.connection or not self.task_id:
            raise TranscriptionError("Session not started")

        with self.logger.catch(reraise=True, level="ERROR", message="Failed to send audio data"):
            await self.connection.send(message)

    def receive(self) -> AsyncStream[ReceiveType]:
        """Return an async stream of transcription events for this session."""
        return AsyncStream(self._receive_iter())

    async def _receive_iter(self) -> t.AsyncGenerator[ReceiveType, None]:
        """Yield ``Start`` / ``Delta`` / ``Done`` events until the task finishes.

        Raises:
            TranscriptionError: If the session is not started, a message
                carries a foreign task id, the server reports ``task-failed``
                or an unexpected message type arrives.
        """
        # Give other tasks a chance to run before the first blocking receive.
        await asyncio.sleep(0.0)

        if not self.connection or not self.task_id:
            raise TranscriptionError("Session not started")

        current_utterance_id: str | None = None

        while True:
            self.logger.debug("Waiting for server message")
            try:
                server_msg = await asyncio.wait_for(self.connection.recv(), timeout=30.0)
            except asyncio.TimeoutError:
                # Nothing arrived within the window: keep waiting. Falling
                # through here would either hit an unbound `server_msg` or
                # re-process the previous message and emit duplicate events.
                continue

            msg = parse_server_message(server_msg)
            if msg.header.task_id != self.task_id:
                raise TranscriptionError(
                    f"Task ID mismatch: expected {self.task_id}, got {msg.header.task_id}"
                )

            if isinstance(msg, ResultGenerated):
                sentence = msg.payload.output.sentence
                interim = sentence.end_time is None

                # New utterance started
                if current_utterance_id is None:
                    current_utterance_id = utils.gen_id(prefix="utt")
                    started_at = utils.utcnow().timestamp()
                    self._utterance_start_times[current_utterance_id] = started_at

                    self.logger.debug(
                        f"New utterance started: id={current_utterance_id}, started_at={started_at}"
                    )
                    yield Start(utterance_id=current_utterance_id, started_at=started_at)

                # Convert relative offsets (ms) to seconds. Use an explicit
                # None test so an end time of 0 ms is kept, consistent with
                # how `interim` is derived above.
                offset_begin = sentence.begin_time / 1000.0
                offset_end = (
                    sentence.end_time / 1000.0 if sentence.end_time is not None else None
                )

                self.logger.debug(
                    f"Transcription delta: "
                    f"utterance_id={current_utterance_id}, "
                    f"text='{sentence.text}', "
                    f"offset_begin={offset_begin}s, "
                    f"offset_end={offset_end}s, "
                    f"interim={interim}",
                )

                yield Delta(
                    utterance_id=current_utterance_id,
                    offset_begin=offset_begin,
                    offset_end=offset_end,
                    text=sentence.text,
                    interim=interim,
                )

                # Final result for this utterance
                if not interim:
                    ended_at = utils.utcnow().timestamp()

                    self.logger.debug(
                        f"Utterance completed: id={current_utterance_id}, ended_at={ended_at}"
                    )

                    yield Done(utterance_id=current_utterance_id, ended_at=ended_at)

                    # Drop the bookkeeping entry so the dict does not grow.
                    self._utterance_start_times.pop(current_utterance_id, None)

                    # Reset for next utterance
                    current_utterance_id = None

            elif isinstance(msg, TaskFinished):
                # The task id was already verified above for every message.
                self.logger.debug("Transcription task finished by server")
                break

            elif isinstance(msg, TaskFailed):
                self.logger.error(
                    f"Transcription task failed: "
                    f"error_code={msg.header.error_code}, "
                    f"error_message={msg.header.error_message}"
                )
                raise TranscriptionError(
                    f"Transcription task failed: {msg.header.error_message} "
                    f"(code: {msg.header.error_code})"
                )

            else:
                self.logger.error(f"Unexpected server message: {server_msg!s}")
                raise TranscriptionError(f"Unexpected server message: {server_msg!s}")
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+
4
class Start:
    """Indicates the start of a transcription utterance.

    Attributes:
        utterance_id: Unique ID for this utterance.
        started_at: Absolute timestamp when transcription started (UNIX
            timestamp).
    """

    __slots__ = ("started_at", "utterance_id")

    def __init__(self, *, utterance_id: str, started_at: float) -> None:
        self.utterance_id = utterance_id
        self.started_at = started_at

    def __repr__(self) -> str:
        # Readable form for logs and debugging; the default object repr
        # shows only the memory address.
        return (
            f"{type(self).__name__}("
            f"utterance_id={self.utterance_id!r}, "
            f"started_at={self.started_at!r})"
        )
18
+
19
+
20
class Delta:
    """Delta event representing a partial transcription update.

    Attributes:
        utterance_id: ID of the utterance this delta belongs to.
        offset_begin: Start offset from utterance start (in seconds).
        offset_end: End offset from utterance start (in seconds, None for
            interim).
        text: The transcribed text for the segment.
        interim: Whether this is interim (True) or final (False).
    """

    __slots__ = ("interim", "offset_begin", "offset_end", "text", "utterance_id")

    def __init__(
        self,
        *,
        utterance_id: str,
        offset_begin: float,
        offset_end: float | None,
        text: str,
        interim: bool,
    ) -> None:
        self.utterance_id = utterance_id
        self.offset_begin = offset_begin
        self.offset_end = offset_end
        self.text = text
        self.interim = interim

    def __repr__(self) -> str:
        # Readable form for logs and debugging; the default object repr
        # shows only the memory address.
        return (
            f"{type(self).__name__}("
            f"utterance_id={self.utterance_id!r}, "
            f"offset_begin={self.offset_begin!r}, "
            f"offset_end={self.offset_end!r}, "
            f"text={self.text!r}, "
            f"interim={self.interim!r})"
        )
48
+
49
+
50
class Done:
    """Indicates the completion of a transcription utterance.

    Attributes:
        utterance_id: ID of the completed utterance.
        ended_at: Absolute timestamp when transcription ended (UNIX timestamp).
    """

    __slots__ = ("ended_at", "utterance_id")

    def __init__(self, *, utterance_id: str, ended_at: float) -> None:
        self.utterance_id = utterance_id
        self.ended_at = ended_at

    def __repr__(self) -> str:
        # Readable form for logs and debugging; the default object repr
        # shows only the memory address.
        return (
            f"{type(self).__name__}("
            f"utterance_id={self.utterance_id!r}, "
            f"ended_at={self.ended_at!r})"
        )