openspeechapi 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. openspeech/__init__.py +75 -0
  2. openspeech/__main__.py +5 -0
  3. openspeech/cli.py +413 -0
  4. openspeech/client/__init__.py +4 -0
  5. openspeech/client/client.py +145 -0
  6. openspeech/config.py +212 -0
  7. openspeech/core/__init__.py +0 -0
  8. openspeech/core/base.py +75 -0
  9. openspeech/core/enums.py +39 -0
  10. openspeech/core/models.py +61 -0
  11. openspeech/core/registry.py +37 -0
  12. openspeech/core/settings.py +8 -0
  13. openspeech/demo.py +675 -0
  14. openspeech/dispatch/__init__.py +0 -0
  15. openspeech/dispatch/context.py +34 -0
  16. openspeech/dispatch/dispatcher.py +661 -0
  17. openspeech/dispatch/executors/__init__.py +0 -0
  18. openspeech/dispatch/executors/base.py +34 -0
  19. openspeech/dispatch/executors/in_process.py +66 -0
  20. openspeech/dispatch/executors/remote.py +64 -0
  21. openspeech/dispatch/executors/subprocess_exec.py +446 -0
  22. openspeech/dispatch/fanout.py +95 -0
  23. openspeech/dispatch/filters.py +73 -0
  24. openspeech/dispatch/lifecycle.py +178 -0
  25. openspeech/dispatch/watcher.py +82 -0
  26. openspeech/engine_catalog.py +236 -0
  27. openspeech/engine_registry.yaml +347 -0
  28. openspeech/exceptions.py +51 -0
  29. openspeech/factory.py +325 -0
  30. openspeech/local_engines/__init__.py +12 -0
  31. openspeech/local_engines/aim_resolver.py +91 -0
  32. openspeech/local_engines/backends/__init__.py +1 -0
  33. openspeech/local_engines/backends/docker_backend.py +490 -0
  34. openspeech/local_engines/backends/native_backend.py +902 -0
  35. openspeech/local_engines/base.py +30 -0
  36. openspeech/local_engines/engines/__init__.py +1 -0
  37. openspeech/local_engines/engines/faster_whisper.py +36 -0
  38. openspeech/local_engines/engines/fish_speech.py +33 -0
  39. openspeech/local_engines/engines/sherpa_onnx.py +56 -0
  40. openspeech/local_engines/engines/whisper.py +41 -0
  41. openspeech/local_engines/engines/whisperlivekit.py +60 -0
  42. openspeech/local_engines/manager.py +208 -0
  43. openspeech/local_engines/models.py +50 -0
  44. openspeech/local_engines/progress.py +69 -0
  45. openspeech/local_engines/registry.py +19 -0
  46. openspeech/local_engines/task_store.py +52 -0
  47. openspeech/local_engines/tasks.py +71 -0
  48. openspeech/logging_config.py +607 -0
  49. openspeech/observe/__init__.py +0 -0
  50. openspeech/observe/base.py +79 -0
  51. openspeech/observe/debug.py +44 -0
  52. openspeech/observe/latency.py +19 -0
  53. openspeech/observe/metrics.py +47 -0
  54. openspeech/observe/tracing.py +44 -0
  55. openspeech/observe/usage.py +27 -0
  56. openspeech/providers/__init__.py +0 -0
  57. openspeech/providers/_template.py +101 -0
  58. openspeech/providers/stt/__init__.py +0 -0
  59. openspeech/providers/stt/alibaba.py +86 -0
  60. openspeech/providers/stt/assemblyai.py +135 -0
  61. openspeech/providers/stt/azure_speech.py +99 -0
  62. openspeech/providers/stt/baidu.py +135 -0
  63. openspeech/providers/stt/deepgram.py +311 -0
  64. openspeech/providers/stt/elevenlabs.py +385 -0
  65. openspeech/providers/stt/faster_whisper.py +211 -0
  66. openspeech/providers/stt/google_cloud.py +106 -0
  67. openspeech/providers/stt/iflytek.py +427 -0
  68. openspeech/providers/stt/macos_speech.py +226 -0
  69. openspeech/providers/stt/openai.py +84 -0
  70. openspeech/providers/stt/sherpa_onnx.py +353 -0
  71. openspeech/providers/stt/tencent.py +212 -0
  72. openspeech/providers/stt/volcengine.py +107 -0
  73. openspeech/providers/stt/whisper.py +153 -0
  74. openspeech/providers/stt/whisperlivekit.py +530 -0
  75. openspeech/providers/stt/windows_speech.py +249 -0
  76. openspeech/providers/tts/__init__.py +0 -0
  77. openspeech/providers/tts/alibaba.py +95 -0
  78. openspeech/providers/tts/azure_speech.py +123 -0
  79. openspeech/providers/tts/baidu.py +143 -0
  80. openspeech/providers/tts/coqui.py +64 -0
  81. openspeech/providers/tts/cosyvoice.py +90 -0
  82. openspeech/providers/tts/deepgram.py +174 -0
  83. openspeech/providers/tts/elevenlabs.py +311 -0
  84. openspeech/providers/tts/fish_speech.py +158 -0
  85. openspeech/providers/tts/google_cloud.py +107 -0
  86. openspeech/providers/tts/iflytek.py +209 -0
  87. openspeech/providers/tts/macos_say.py +251 -0
  88. openspeech/providers/tts/minimax.py +122 -0
  89. openspeech/providers/tts/openai.py +104 -0
  90. openspeech/providers/tts/piper.py +104 -0
  91. openspeech/providers/tts/tencent.py +189 -0
  92. openspeech/providers/tts/volcengine.py +117 -0
  93. openspeech/providers/tts/windows_sapi.py +234 -0
  94. openspeech/server/__init__.py +1 -0
  95. openspeech/server/app.py +72 -0
  96. openspeech/server/auth.py +42 -0
  97. openspeech/server/middleware.py +75 -0
  98. openspeech/server/routes/__init__.py +1 -0
  99. openspeech/server/routes/management.py +848 -0
  100. openspeech/server/routes/stt.py +121 -0
  101. openspeech/server/routes/tts.py +159 -0
  102. openspeech/server/routes/webui.py +29 -0
  103. openspeech/server/webui/app.js +2649 -0
  104. openspeech/server/webui/index.html +216 -0
  105. openspeech/server/webui/styles.css +617 -0
  106. openspeech/server/ws/__init__.py +1 -0
  107. openspeech/server/ws/stt_stream.py +263 -0
  108. openspeech/server/ws/tts_stream.py +207 -0
  109. openspeech/telemetry/__init__.py +21 -0
  110. openspeech/telemetry/perf.py +307 -0
  111. openspeech/utils/__init__.py +5 -0
  112. openspeech/utils/audio_converter.py +406 -0
  113. openspeech/utils/audio_playback.py +156 -0
  114. openspeech/vendor_registry.yaml +74 -0
  115. openspeechapi-0.1.0.dist-info/METADATA +101 -0
  116. openspeechapi-0.1.0.dist-info/RECORD +118 -0
  117. openspeechapi-0.1.0.dist-info/WHEEL +4 -0
  118. openspeechapi-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,79 @@
1
+ """DispatchObserver ABC and ObserverManager with error isolation."""
2
+ from __future__ import annotations
3
+ from abc import ABC
4
+ from typing import Any
5
+ from openspeech.logging_config import logger
6
+ from openspeech.core.enums import ExecMode
7
+ from openspeech.dispatch.context import InvokeContext
8
+
9
+
10
class DispatchObserver(ABC):
    """Base class for dispatch observers. All methods are optional no-ops.

    Subclasses override only the hooks they care about; every hook defaults
    to doing nothing and returning ``None``.
    """

    async def on_invoke_start(self, ctx: InvokeContext) -> None:
        """Called just before a provider method is invoked."""

    async def on_invoke_end(self, ctx: InvokeContext, result: Any) -> None:
        """Called after a provider method returns successfully."""

    async def on_invoke_error(self, ctx: InvokeContext, error: Exception) -> None:
        """Called when a provider method raises."""

    async def on_stream_chunk(self, ctx: InvokeContext, chunk: Any) -> None:
        """Called for each chunk produced by a streaming invocation."""

    async def on_provider_start(self, provider: str, exec_mode: ExecMode) -> None:
        """Called when a provider is started."""

    async def on_provider_stop(self, provider: str) -> None:
        """Called when a provider is stopped."""

    async def on_health_change(self, provider: str, healthy: bool) -> None:
        """Called when a provider's health status flips."""
20
+
21
+
22
class ObserverManager:
    """Manages observers with error isolation and auto-deregistration.

    Observer callbacks are awaited one at a time. Any exception raised by an
    observer is caught and logged so it cannot break dispatch; an observer
    that raises ``max_consecutive_errors`` times in a row (with no successful
    call in between) is deregistered automatically.
    """

    def __init__(self, max_consecutive_errors: int = 10) -> None:
        self._observers: list[DispatchObserver] = []
        # Consecutive-error counter keyed by id(observer); reset on success.
        self._error_counts: dict[int, int] = {}
        self._max_errors = max_consecutive_errors

    def add(self, observer: DispatchObserver) -> None:
        """Register *observer* for future notifications."""
        self._observers.append(observer)
        self._error_counts[id(observer)] = 0

    def remove(self, observer: DispatchObserver) -> None:
        """Deregister *observer*. Raises ValueError if it was never added."""
        self._observers.remove(observer)
        self._error_counts.pop(id(observer), None)

    def list(self) -> list[DispatchObserver]:
        """Return a snapshot copy of the registered observers."""
        return list(self._observers)

    async def _notify(self, method: str, *args: Any, **kwargs: Any) -> None:
        """Invoke *method* on every observer, isolating failures.

        Iterates over a snapshot so an observer may call add()/remove()
        (e.g. deregister itself) from inside its own callback without
        corrupting the iteration — iterating the live list would silently
        skip the observer after the removed one.
        """
        to_remove: list[DispatchObserver] = []
        for obs in list(self._observers):
            obs_id = id(obs)
            try:
                fn = getattr(obs, method)
                await fn(*args, **kwargs)
                # Reset only if still registered: the callback may have
                # removed this observer, and an unconditional reset would
                # resurrect a stale counter entry.
                if obs_id in self._error_counts:
                    self._error_counts[obs_id] = 0
            except Exception as e:
                self._error_counts[obs_id] = self._error_counts.get(obs_id, 0) + 1
                logger.warning(f"Observer {type(obs).__name__} raised {type(e).__name__}: {e}")
                if self._error_counts[obs_id] >= self._max_errors:
                    to_remove.append(obs)
                    logger.warning(
                        f"Auto-deregistering {type(obs).__name__} after {self._max_errors} consecutive errors"
                    )
        for obs in to_remove:
            # Guard against an observer that already removed itself.
            if obs in self._observers:
                self.remove(obs)

    async def notify_invoke_start(self, ctx: InvokeContext) -> None:
        await self._notify("on_invoke_start", ctx)

    async def notify_invoke_end(self, ctx: InvokeContext, result: Any) -> None:
        await self._notify("on_invoke_end", ctx, result)

    async def notify_invoke_error(self, ctx: InvokeContext, error: Exception) -> None:
        await self._notify("on_invoke_error", ctx, error)

    async def notify_stream_chunk(self, ctx: InvokeContext, chunk: Any) -> None:
        await self._notify("on_stream_chunk", ctx, chunk)

    async def notify_provider_start(self, provider: str, exec_mode: ExecMode) -> None:
        await self._notify("on_provider_start", provider, exec_mode)

    async def notify_provider_stop(self, provider: str) -> None:
        await self._notify("on_provider_stop", provider)

    async def notify_health_change(self, provider: str, healthy: bool) -> None:
        await self._notify("on_health_change", provider, healthy)
@@ -0,0 +1,44 @@
1
+ """DebugLogObserver — emits loguru debug output for every lifecycle event."""
2
+ from __future__ import annotations
3
+ from typing import Any
4
+ from openspeech.logging_config import logger
5
+ from openspeech.core.enums import ExecMode
6
+ from openspeech.dispatch.context import InvokeContext
7
+ from openspeech.observe.base import DispatchObserver
8
+
9
+
10
class DebugLogObserver(DispatchObserver):
    """Logs all dispatch lifecycle events at DEBUG level via loguru."""

    @staticmethod
    def _ctx_fields(ctx: InvokeContext) -> tuple[Any, Any, Any]:
        # Common positional args shared by every per-invocation message.
        return ctx.provider_name, ctx.method, ctx.request_id

    async def on_invoke_start(self, ctx: InvokeContext) -> None:
        logger.debug(
            "[openspeech] invoke_start provider={} method={} request_id={}",
            *self._ctx_fields(ctx),
        )

    async def on_invoke_end(self, ctx: InvokeContext, result: Any) -> None:
        logger.debug(
            "[openspeech] invoke_end provider={} method={} request_id={} elapsed_ms={:.2f}",
            *self._ctx_fields(ctx),
            ctx.elapsed_ms,
        )

    async def on_invoke_error(self, ctx: InvokeContext, error: Exception) -> None:
        logger.debug(
            "[openspeech] invoke_error provider={} method={} request_id={} error={}",
            *self._ctx_fields(ctx),
            repr(error),
        )

    async def on_stream_chunk(self, ctx: InvokeContext, chunk: Any) -> None:
        logger.debug(
            "[openspeech] stream_chunk provider={} method={} request_id={}",
            *self._ctx_fields(ctx),
        )

    async def on_provider_start(self, provider: str, exec_mode: ExecMode) -> None:
        logger.debug("[openspeech] provider_start provider={} exec_mode={}", provider, exec_mode)

    async def on_provider_stop(self, provider: str) -> None:
        logger.debug("[openspeech] provider_stop provider={}", provider)

    async def on_health_change(self, provider: str, healthy: bool) -> None:
        logger.debug("[openspeech] health_change provider={} healthy={}", provider, healthy)
@@ -0,0 +1,19 @@
1
+ """LatencyObserver — records total_ms and ttfb_ms latency samples."""
2
+ from __future__ import annotations
3
+ from typing import Any
4
+ from openspeech.dispatch.context import InvokeContext
5
+ from openspeech.observe.base import DispatchObserver
6
+
7
+
8
+ class LatencyObserver(DispatchObserver):
9
+ """Accumulates per-invocation latency samples (total duration and TTFB)."""
10
+
11
+ def __init__(self) -> None:
12
+ self.total_ms: list[float] = []
13
+ self.ttfb_ms: list[float] = []
14
+
15
+ async def on_invoke_end(self, ctx: InvokeContext, result: Any) -> None:
16
+ self.total_ms.append(ctx.elapsed_ms)
17
+ if ctx.ttfb_ns is not None:
18
+ ttfb = (ctx.ttfb_ns - ctx.start_time_ns) / 1_000_000
19
+ self.ttfb_ms.append(ttfb)
@@ -0,0 +1,47 @@
1
+ """MetricsObserver — tracks TTFB, duration, and error counts per provider+method."""
2
+ from __future__ import annotations
3
+ from dataclasses import dataclass, field
4
+ from typing import Any
5
+ from openspeech.dispatch.context import InvokeContext
6
+ from openspeech.observe.base import DispatchObserver
7
+
8
+
9
@dataclass
class _ProviderMethodMetrics:
    """Mutable metrics bucket for one ``provider.method`` key."""

    # Number of successful invocations recorded.
    total_calls: int = 0
    # Number of invocations that raised.
    error_count: int = 0
    # Per-call durations in milliseconds.
    durations_ms: list[float] = field(default_factory=list)
    # Per-call time-to-first-byte samples in milliseconds.
    ttfb_ms: list[float] = field(default_factory=list)
15
+
16
+
17
class MetricsObserver(DispatchObserver):
    """Collects invocation metrics (call counts, durations, TTFB, errors) keyed by provider+method."""

    def __init__(self) -> None:
        # One bucket per "provider.method" key, created lazily.
        self._data: dict[str, _ProviderMethodMetrics] = {}

    def _key(self, ctx: InvokeContext) -> str:
        return f"{ctx.provider_name}.{ctx.method}"

    def _get(self, key: str) -> _ProviderMethodMetrics:
        # Fetch or lazily create the bucket for *key*.
        bucket = self._data.get(key)
        if bucket is None:
            bucket = _ProviderMethodMetrics()
            self._data[key] = bucket
        return bucket

    async def on_invoke_end(self, ctx: InvokeContext, result: Any) -> None:
        """Record a successful call: count, duration, and optional TTFB."""
        bucket = self._get(self._key(ctx))
        bucket.total_calls += 1
        bucket.durations_ms.append(ctx.elapsed_ms)
        first_byte_ns = ctx.ttfb_ns
        if first_byte_ns is not None:
            bucket.ttfb_ms.append((first_byte_ns - ctx.start_time_ns) / 1_000_000)

    async def on_invoke_error(self, ctx: InvokeContext, error: Exception) -> None:
        """Record a failed call against the provider+method bucket."""
        self._get(self._key(ctx)).error_count += 1

    def get(self, provider: str, method: str) -> _ProviderMethodMetrics | None:
        """Return accumulated metrics for the given provider+method, or None if not seen."""
        return self._data.get(f"{provider}.{method}")
@@ -0,0 +1,44 @@
1
+ """TracingObserver — OpenTelemetry tracing stub, graceful if not installed."""
2
+ from __future__ import annotations
3
+ from typing import Any
4
+ from openspeech.dispatch.context import InvokeContext
5
+ from openspeech.observe.base import DispatchObserver
6
+
7
+
8
class TracingObserver(DispatchObserver):
    """Wraps each invocation in an OpenTelemetry span. Falls back to a no-op when
    the ``opentelemetry`` package is not installed."""

    def __init__(self, service_name: str = "openspeech") -> None:
        try:
            from opentelemetry import trace
        except ImportError:
            # No SDK available: every hook becomes a no-op.
            self._tracer = None
        else:
            self._tracer = trace.get_tracer(service_name)
        # Open spans keyed by request_id; popped when the call finishes.
        self._spans: dict[str, Any] = {}

    async def on_invoke_start(self, ctx: InvokeContext) -> None:
        tracer = self._tracer
        if tracer is None:
            return
        attributes = {
            "provider": ctx.provider_name,
            "method": ctx.method,
            "exec_mode": ctx.exec_mode.value,
            "request_id": ctx.request_id,
        }
        self._spans[ctx.request_id] = tracer.start_span(
            f"{ctx.provider_name}.{ctx.method}",
            attributes=attributes,
        )

    async def on_invoke_end(self, ctx: InvokeContext, result: Any) -> None:
        active = self._spans.pop(ctx.request_id, None)
        if active:
            active.end()

    async def on_invoke_error(self, ctx: InvokeContext, error: Exception) -> None:
        active = self._spans.pop(ctx.request_id, None)
        if not active:
            return
        # Import is deferred: only reachable when a span exists, which
        # implies opentelemetry imported successfully in __init__.
        from opentelemetry.trace import StatusCode
        active.set_status(StatusCode.ERROR, str(error))
        active.end()
@@ -0,0 +1,27 @@
1
+ """UsageObserver — tracks call counts, audio duration, and character counts."""
2
+ from __future__ import annotations
3
+ from typing import Any
4
+ from openspeech.dispatch.context import InvokeContext
5
+ from openspeech.observe.base import DispatchObserver
6
+
7
+
8
class UsageObserver(DispatchObserver):
    """Accumulates usage statistics: invocation counts, audio duration, and text characters."""

    def __init__(self) -> None:
        # Successful invocations seen so far.
        self.call_count: int = 0
        # Sum of duration_ms over all results that carried one.
        self.total_audio_duration_ms: int = 0
        # Sum of len(text) over all invocations whose metadata carried text.
        self.total_characters: int = 0

    async def on_invoke_end(self, ctx: InvokeContext, result: Any) -> None:
        """Fold one completed invocation into the running totals."""
        self.call_count += 1

        # Accumulate audio duration from AudioData results (STT input or TTS output)
        duration = getattr(result, "duration_ms", None)
        if duration is not None:
            self.total_audio_duration_ms += duration

        # Accumulate text characters from ctx.metadata["text"] (set by caller for TTS)
        text_value = ctx.metadata.get("text")
        if isinstance(text_value, str):
            self.total_characters += len(text_value)
File without changes
@@ -0,0 +1,101 @@
1
+ """
2
+ Provider adapter template — copy-paste starting point for new adapters.
3
+
4
+ Steps to create a new adapter:
5
+ 1. Copy this file to providers/stt/<name>.py or providers/tts/<name>.py
6
+ 2. Replace MyProvider* placeholders with the real names
7
+ 3. Fill in the correct name, provider_type, execution_mode, capabilities
8
+ 4. Implement start() to import the SDK and build the client
9
+ 5. Implement transcribe() / synthesize() (and streaming variants if supported)
10
+ 6. Add the provider to openspeech/core/registry.py if auto-registration is desired
11
+ """
12
+ from __future__ import annotations
13
+
14
+ from collections.abc import AsyncIterator
15
+ from dataclasses import dataclass
16
+ from typing import Any
17
+
18
+ from openspeech.core.base import STTProvider # swap for TTSProvider as needed
19
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
20
+ from openspeech.core.models import AudioData, STTOptions, Transcription
21
+ from openspeech.core.settings import BaseSettings
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Settings dataclass — one field per provider-specific config value
26
+ # ---------------------------------------------------------------------------
27
@dataclass
class MyProviderSettings(BaseSettings):
    """Template settings — one field per provider-specific config value."""

    # Credential for the provider's API; empty string means "unset".
    api_key: str = ""
    # Model identifier passed to the SDK by default.
    model: str = "default-model"
    # add more fields as needed
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Provider class
36
+ # ---------------------------------------------------------------------------
37
class MyProviderSTT(STTProvider):
    """Template STT adapter — replace the MyProvider* names with real ones.

    See the module docstring for the step-by-step checklist.
    """

    # --- class-level metadata (required by SpeechProvider ABC) ---
    name = "my-provider"
    provider_type = ProviderType.STT  # or ProviderType.TTS
    execution_mode = ExecMode.IN_PROCESS  # or SUBPROCESS / REMOTE
    settings_cls = MyProviderSettings
    capabilities = {Capability.BATCH}  # choose from enums.Capability

    def __init__(self, settings: MyProviderSettings | None = None) -> None:
        self.settings = settings or MyProviderSettings()
        self._client: Any = None

    # --- lifecycle ---

    async def start(self) -> None:
        """Import SDK and initialise the client.

        Raises:
            ImportError: when the SDK is missing; the original import error
                is chained as ``__cause__`` so the failing module name is
                preserved in the traceback.
        """
        try:
            import my_sdk  # noqa: F401 replace with real import
            self._client = object()  # replace with real client construction
        except ImportError as e:
            # Chain the cause — a bare re-raise here would hide which
            # module actually failed to import.
            raise ImportError(
                "Install the SDK: pip install openspeech[my-provider]"
            ) from e

    async def stop(self) -> None:
        """Release resources."""
        self._client = None

    async def health_check(self) -> bool:
        """Healthy once start() has built a client."""
        return self._client is not None

    # --- STT interface (remove if implementing TTSProvider) ---

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe *audio* in one batch call. Fill in for the real SDK."""
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        opts = opts or STTOptions()
        raise NotImplementedError("Implement transcribe() for MyProviderSTT")

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming variant; unsupported in the template."""
        raise NotImplementedError(
            "Streaming transcription is not supported by MyProviderSTT"
        )
        # Unreachable yield keeps this a valid async generator.
        yield  # pragma: no cover

    # --- TTS interface (uncomment if implementing TTSProvider) ---

    # async def synthesize(
    #     self, text: str, opts: TTSOptions | None = None
    # ) -> AudioData:
    #     if self._client is None:
    #         raise RuntimeError("Provider not started — call start() first")
    #     raise NotImplementedError("Implement synthesize() for MyProviderTTS")

    # async def synthesize_stream(
    #     self, text: str, opts: TTSOptions | None = None
    # ) -> AsyncIterator[AudioChunk]:
    #     raise NotImplementedError(
    #         "Streaming synthesis is not supported by MyProviderTTS"
    #     )
    #     yield  # pragma: no cover
File without changes
@@ -0,0 +1,86 @@
1
+ """Alibaba Cloud (Bailian/DashScope) STT provider adapter — OpenAI-compatible."""
2
+ from __future__ import annotations
3
+
4
+ from openspeech.logging_config import logger
5
+ import time
6
+ from collections.abc import AsyncIterator
7
+ from dataclasses import dataclass
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from openspeech.core.base import STTProvider
13
+
14
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
15
+ from openspeech.core.models import AudioData, STTOptions, Transcription
16
+ from openspeech.core.settings import BaseSettings
17
+
18
@dataclass
class AlibabaSTTSettings(BaseSettings):
    """Configuration for the Alibaba DashScope STT adapter."""

    # DashScope API key; required for real requests (empty = unset).
    api_key: str = ""
    # Default ASR model; see AlibabaSTT.field_options for known values.
    model: str = "paraformer-v2"
    # OpenAI-compatible endpoint root.
    base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1"
23
+
24
class AlibabaSTT(STTProvider):
    """Batch STT via Alibaba DashScope's OpenAI-compatible transcription endpoint."""

    name = "alibaba-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.IN_PROCESS
    settings_cls = AlibabaSTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["paraformer-v2", "paraformer-realtime-v2", "paraformer-8k-v1"]}

    def __init__(self, settings: AlibabaSTTSettings | None = None) -> None:
        self.settings = settings if settings is not None else AlibabaSTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when we created the client ourselves and must close it in stop().
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create the HTTP client if one was not injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client (only if owned) and drop the reference."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        # Healthy as soon as an API key is configured; no network probe.
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe *audio* via one multipart POST to the DashScope endpoint."""
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        started_at = time.perf_counter()

        options = opts or STTOptions()
        chosen_model = options.model or self.settings.model

        response = await self._client.post(
            f"{self.settings.base_url}/audio/transcriptions",
            headers={"Authorization": f"Bearer {self.settings.api_key}"},
            files={"file": ("audio.wav", audio.data, "audio/wav")},
            data={"model": chosen_model},
        )
        response.raise_for_status()
        payload = response.json()

        transcription = Transcription(text=payload.get("text", ""))
        logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - started_at) * 1000, len(transcription.text))
        return transcription

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming input is unsupported by this adapter."""
        raise NotImplementedError("Alibaba STT streaming not implemented")
        yield  # noqa: unreachable — makes this an async generator
@@ -0,0 +1,135 @@
1
+ """AssemblyAI STT provider adapter (batch, httpx)."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ from openspeech.logging_config import logger
6
+ import time
7
+ from collections.abc import AsyncIterator
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ import httpx
12
+
13
+ from openspeech.core.base import STTProvider
14
+
15
+ from openspeech.core.enums import Capability, ExecMode, ProviderType
16
+ from openspeech.core.models import AudioData, STTOptions, Transcription
17
+ from openspeech.core.settings import BaseSettings
18
+
19
@dataclass
class AssemblyAISTTSettings(BaseSettings):
    """Configuration for the AssemblyAI batch STT adapter."""

    # AssemblyAI API key (sent as the Authorization header; empty = unset).
    api_key: str = ""
    # Speech model tier; see AssemblyAISTT.field_options for known values.
    model: str = "best"
    # Default language code when the caller does not supply one.
    language: str = "en"
24
+
25
class AssemblyAISTT(STTProvider):
    """AssemblyAI batch STT adapter: upload audio, create a transcript job,
    then poll the REST API until the job completes or times out."""

    name = "assemblyai-stt"
    provider_type = ProviderType.STT
    execution_mode = ExecMode.REMOTE
    settings_cls = AssemblyAISTTSettings
    capabilities = {Capability.BATCH, Capability.MULTILINGUAL}
    field_options = {"model": ["best", "nano"], "language": ["en", "zh", "ja", "ko", "es", "fr", "de", "pt", "it", "nl", "ru", "ar", "hi", "auto"]}

    def __init__(self, settings: AssemblyAISTTSettings | None = None) -> None:
        self.settings = settings or AssemblyAISTTSettings()
        self._client: httpx.AsyncClient | None = None
        # True when we created the client ourselves and must close it in stop().
        self._owns_client: bool = True

    def set_http_client(self, client) -> None:
        """Adopt an externally managed httpx client; stop() will not close it."""
        self._client = client
        self._owns_client = False

    async def start(self) -> None:
        """Create the HTTP client if one was not injected."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=60.0)
            self._owns_client = True
        logger.info("{} provider started", self.name)

    async def stop(self) -> None:
        """Close the HTTP client (only if owned) and drop the reference."""
        if self._client and self._owns_client:
            await self._client.aclose()
        self._client = None
        logger.info("{} provider stopped", self.name)

    async def health_check(self) -> bool:
        """Healthy as soon as an API key is configured; no network probe."""
        return bool(self.settings.api_key)

    async def transcribe(
        self, audio: AudioData, opts: STTOptions | None = None
    ) -> Transcription:
        """Transcribe *audio* via AssemblyAI's upload → create → poll flow.

        Raises:
            RuntimeError: if the provider is not started, any HTTP step
                fails, the job reports an error, or polling exceeds the
                60 s wall-clock timeout.
        """
        if self._client is None:
            raise RuntimeError("Provider not started — call start() first")
        logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
        _t0 = time.perf_counter()
        opts = opts or STTOptions()
        language = opts.language or self.settings.language
        headers = {"Authorization": self.settings.api_key}

        # Step 1: Upload audio
        upload_resp = await self._client.post(
            "https://api.assemblyai.com/v2/upload",
            headers=headers,
            content=audio.data,
        )
        if upload_resp.status_code != 200:
            raise RuntimeError(
                f"AssemblyAI upload error {upload_resp.status_code}: {upload_resp.text}"
            )
        upload_url = upload_resp.json()["upload_url"]

        # Step 2: Create transcript
        create_resp = await self._client.post(
            "https://api.assemblyai.com/v2/transcript",
            headers=headers,
            json={
                "audio_url": upload_url,
                "language_code": language,
                "speech_model": self.settings.model,
            },
        )
        if create_resp.status_code != 200:
            raise RuntimeError(
                f"AssemblyAI transcript creation error "
                f"{create_resp.status_code}: {create_resp.text}"
            )
        transcript_id = create_resp.json()["id"]

        # Step 3: Poll until completed. Use a monotonic-clock deadline so
        # the timeout bounds real elapsed time (HTTP round-trips included);
        # counting only the sleeps would let slow poll requests extend the
        # effective timeout unboundedly.
        poll_url = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
        max_wait = 60.0
        deadline = time.monotonic() + max_wait
        while time.monotonic() < deadline:
            poll_resp = await self._client.get(poll_url, headers=headers)
            if poll_resp.status_code != 200:
                raise RuntimeError(
                    f"AssemblyAI poll error {poll_resp.status_code}: {poll_resp.text}"
                )
            data = poll_resp.json()
            status = data.get("status", "")
            if status == "completed":
                result = Transcription(
                    text=data.get("text", ""),
                    language=language,
                    confidence=data.get("confidence"),
                )
                logger.info("{}: completed in {:.0f}ms, result={} chars", self.name, (time.perf_counter() - _t0) * 1000, len(result.text))
                return result
            if status == "error":
                raise RuntimeError(
                    f"AssemblyAI transcription failed: {data.get('error', 'unknown')}"
                )
            await asyncio.sleep(1.0)

        raise RuntimeError(
            f"AssemblyAI transcription timed out after {max_wait}s "
            f"for transcript {transcript_id}"
        )

    async def transcribe_stream(
        self, stream: AsyncIterator[bytes]
    ) -> AsyncIterator[Any]:
        """Streaming input is unsupported for the batch adapter."""
        raise NotImplementedError(
            "AssemblyAI batch provider does not support streaming input"
        )
        yield  # pragma: no cover