PyPI - openspeechapi - Versions diffs - 0.2.5__tar.gz → 0.2.7__tar.gz - Mend

openspeechapi 0.2.5__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (247) hide show

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openspeechapi
-Version: 0.2.5
+Version: 0.2.7
 Summary: Unified speech interface for STT/TTS providers
 Requires-Python: >=3.11
 Requires-Dist: httpx>=0.27

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/openspeechapi/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """OpenSpeechAPI — Unified speech interface for STT/TTS providers."""
-__version__ = "0.2.5"
+__version__ = "0.2.7"
 from openspeechapi.config import load_config
 from openspeechapi.core.base import SpeechProvider, STTProvider, TTSProvider

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/openspeechapi/core/models.py RENAMED Viewed

@@ -60,6 +60,25 @@ class STTOptions:
     # voice assistant. Providers that don't support VAD finalization
     # (Whisper, Faster-Whisper) silently ignore this field.
     vad_eos: int | None = None
+    # ── iFlytek IAT pass-through (matches Java AsrServiceImpl) ───────
+    # Wallex's Java AsrService forwards the client-supplied
+    # ``audio.common`` / ``audio.business`` / extra ``audio.data``
+    # fields verbatim to iFlytek's WS, treating the panel as the
+    # source of truth for ASR parameters. The Python pipeline now
+    # mirrors that contract: when these fields are non-None, the
+    # iFlytek provider uses them as the basis for the WS first frame
+    # (with ``setdefault`` fallback to its own settings for any keys
+    # the client omitted) instead of building the blocks purely from
+    # ``speech_providers.yaml``. ``None`` preserves the existing
+    # yaml-driven behaviour. Other STT providers ignore these fields.
+    iflytek_common: dict | None = None
+    iflytek_business: dict | None = None
+    # Extra fields to merge into the iFlytek ``data`` block beyond the
+    # canonical ``status``/``format``/``encoding``/``audio`` quadruple
+    # (e.g. panel-supplied ``data_type``). Keys that collide with the
+    # canonical set are overridden (the provider's canonical values
+    # always win, since the canonical set is required by the IAT spec).
+    iflytek_data_extras: dict | None = None
 @dataclass

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/openspeechapi/providers/stt/iflytek.py RENAMED Viewed

@@ -52,6 +52,54 @@ class IflytekSTTSettings(BaseSettings):
     # via ``speech_providers.yaml`` so different sites can pick their
     # own latency-vs-tolerance trade-off.
     vad_eos: int = 2000
+    # ``ltc`` — sentence-level timestamp granularity sent in the
+    # business block of the IAT request (1 = sentence segments only;
+    # 2 = + word boundaries; 3 = + character boundaries). Java's
+    # ``AsrConfig.ltc`` defaults to 3; we mirror that so downstream
+    # consumers expecting per-character timing offsets keep working.
+    # Lower values shave a few bytes per response and slightly reduce
+    # post-processing work for callers that don't use the timestamps.
+    ltc: int = 3
+    # ``ws_host`` / ``ws_path`` — iFlytek IAT WebSocket endpoint. The
+    # default ``iat-api.xfyun.cn`` is the global endpoint; multi-region
+    # deployments (e.g. directed-domain endpoints such as
+    # ``ws-api-dx.xfyun.cn``) override these in yaml or via env var
+    # so the WS URL never requires a code change.
+    ws_host: str = "iat-api.xfyun.cn"
+    ws_path: str = "/v2/iat"
+    # ``timeout_secs`` — connect / read timeout for the underlying
+    # httpx AsyncClient. Java's AsrConfig defaults to 15s; we mirror
+    # that. Lower if the network has aggressive proxies, higher only
+    # if the iFlytek endpoint is consistently slow to handshake.
+    timeout_secs: int = 15
+    # ── Java AsrConfig parity (used as setdefault fallbacks) ────────
+    # When a wallex client (panel) supplies ``audio.business``/
+    # ``audio.common`` per-frame, those values flow through via
+    # ``STTOptions.iflytek_business``/``STTOptions.iflytek_common`` and
+    # become the WS first frame body. The settings below act as
+    # ``setdefault`` fallbacks for keys the client omits, mirroring
+    # Java ``AsrConfig``'s field set so the two implementations
+    # produce identical wire frames given the same panel payload.
+    #
+    # ``domain`` — iFlytek IAT domain. Java default ``iat``; a few
+    # vertical models (``medical`` / ``tv``) exist but most
+    # deployments stay on the general one.
+    domain: str = "iat"
+    # ``accent`` — only meaningful when ``language=="zh_cn"`` (selects
+    # mandarin vs. cantonese etc.). Java sends ``mandarin`` blindly;
+    # we keep the same default so the WS frame matches Java byte-for-
+    # byte when the panel omits ``business.accent``. iFlytek treats
+    # it as a no-op for non-Chinese language codes.
+    accent: str = "mandarin"
+    # ``dwa`` — dynamic word adjustment / wpgs (实时纠错). Java's
+    # default ``wpgs`` is the realtime-correction mode panels rely on
+    # for the partial-result protocol described in
+    # ``stt-streaming-spec.md``. Empty disables it.
+    dwa: str = "wpgs"
+    # ``sample_rate`` — required by the IAT directed-domain endpoint
+    # (``ws-api-dx.xfyun.cn``) which expects it in ``business``. Java
+    # AsrConfig.sampleRate=16000.
+    sample_rate: int = 16000
 # iFlytek expects the full locale tag; common ISO short codes need to
@@ -92,9 +140,6 @@ class IflytekSTT(STTProvider):
         "language": ["zh_cn", "en_us", "ja_jp", "ko_kr", "ru-ru"],
     }
-    _WS_HOST = "iat-api.xfyun.cn"
-    _WS_PATH = "/v2/iat"
     def __init__(self, settings: IflytekSTTSettings | None = None) -> None:
         self.settings = settings or IflytekSTTSettings()
         self._client: httpx.AsyncClient | None = None
@@ -106,7 +151,7 @@ class IflytekSTT(STTProvider):
     async def start(self) -> None:
         if self._client is None:
-            self._client = httpx.AsyncClient(timeout=60.0)
+            self._client = httpx.AsyncClient(timeout=float(self.settings.timeout_secs))
             self._owns_client = True
         # Surface the effective language (after alias mapping) and
         # vad_eos at startup so deployments can verify the iFlytek model
@@ -142,10 +187,12 @@ class IflytekSTT(STTProvider):
         now = datetime.now(tz=timezone.utc)
         date = formatdate(timeval=now.timestamp(), localtime=False, usegmt=True)
+        host = self.settings.ws_host
+        path = self.settings.ws_path
         signature_origin = (
-            f"host: {self._WS_HOST}\n"
+            f"host: {host}\n"
             f"date: {date}\n"
-            f"GET {self._WS_PATH} HTTP/1.1"
+            f"GET {path} HTTP/1.1"
         )
         signature_sha = hmac.new(
             self.settings.api_secret.encode("utf-8"),
@@ -165,9 +212,120 @@ class IflytekSTT(STTProvider):
         ).decode("utf-8")
         params = urllib.parse.urlencode(
-            {"authorization": authorization, "date": date, "host": self._WS_HOST}
+            {"authorization": authorization, "date": date, "host": host}
+        )
+        return f"wss://{host}{path}?{params}"
+    def _build_first_frame_blocks(
+        self,
+        opts: STTOptions | None,
+        *,
+        include_dwa: bool,
+    ) -> tuple[dict, dict]:
+        """Build the ``common`` / ``business`` blocks for the WS first frame.
+        Mirrors Java ``AsrServiceImpl.sendToAsr`` semantics: when
+        ``opts.iflytek_common``/``opts.iflytek_business`` is provided
+        (typically by wallex relaying the panel's per-frame
+        ``audio.common`` / ``audio.business``), those dicts are the
+        source of truth. We only ``setdefault`` keys the client omitted,
+        falling back to ``self.settings`` so a panel that misses a
+        single field doesn't get a malformed frame.
+        ``include_dwa`` differs between ``transcribe()`` (batch — no
+        ``dwa`` default because there's no streaming protocol) and
+        ``transcribe_stream()`` (``dwa`` defaults to ``settings.dwa``).
+        """
+        canon = _canonical_language(self.settings.language)
+        eos = (opts.vad_eos
+               if opts is not None and opts.vad_eos is not None
+               else self.settings.vad_eos)
+        # ── business block ─────────────────────────────────────────
+        if opts is not None and opts.iflytek_business:
+            # Panel-supplied is authoritative; copy then fill missing
+            # keys from yaml so we never send a partial frame.
+            business = dict(opts.iflytek_business)
+        else:
+            business = {}
+        business.setdefault("language", canon)
+        business.setdefault("domain", self.settings.domain)
+        business.setdefault("vad_eos", eos)
+        business.setdefault("ltc", self.settings.ltc)
+        if include_dwa:
+            business.setdefault("dwa", self.settings.dwa)
+        # ``accent`` is only meaningful for the Chinese model. Java
+        # sends ``mandarin`` blindly; we keep that for byte-for-byte
+        # parity when the panel omits it AND language is zh_cn. For
+        # other languages we leave it out entirely (sending it is a
+        # no-op on iFlytek's side but confuses log readers).
+        if "accent" not in business and canon == "zh_cn":
+            business["accent"] = self.settings.accent
+        # ── common block ──────────────────────────────────────────
+        if opts is not None and opts.iflytek_common:
+            common = dict(opts.iflytek_common)
+        else:
+            common = {}
+        common.setdefault("app_id", self.settings.app_id)
+        return common, business
+    @staticmethod
+    def _build_data_block(
+        *, status: int, audio_b64: str, opts: STTOptions | None,
+    ) -> dict:
+        """Assemble the ``data`` block, merging panel-supplied extras.
+        Canonical keys (``status``/``format``/``encoding``/``audio``)
+        always win over ``iflytek_data_extras`` because the IAT spec
+        requires them in a specific shape; extras like the panel's
+        ``data_type`` flow through.
+        """
+        if opts is not None and opts.iflytek_data_extras:
+            data = dict(opts.iflytek_data_extras)
+        else:
+            data = {}
+        data["status"] = status
+        data["format"] = "audio/L16;rate=16000"
+        data["encoding"] = "raw"
+        data["audio"] = audio_b64
+        return data
+    async def _connect_with_retry(self) -> "websockets.ClientConnection":
+        """Connect to iFlytek IAT WS with backoff, mirroring Java parity.
+        Java ``AsrServiceImpl.connectWithRetry`` does 4 attempts with
+        300/600/1200ms backoff before giving up. The previous Python
+        path was one-shot: a single TCP/handshake hiccup surfaced as a
+        hard ASR failure. Aligning the retry budget keeps wallex's
+        Python and Java front-ends behaviourally interchangeable on
+        flaky links.
+        """
+        backoffs = (0.3, 0.6, 1.2)  # delays AFTER attempts 1, 2, 3
+        last_exc: Exception | None = None
+        for attempt in range(4):
+            try:
+                url = self._build_auth_url()
+                ws = await websockets.connect(url)
+                if attempt > 0:
+                    logger.info(
+                        "{}: WS connected on attempt {}/4",
+                        self.name, attempt + 1,
+                    )
+                return ws
+            except Exception as e:  # noqa: BLE001 — retry boundary
+                last_exc = e
+                logger.warning(
+                    "{}: WS connect failed (attempt {}/4): {}",
+                    self.name, attempt + 1, e,
+                )
+                if attempt < len(backoffs):
+                    await asyncio.sleep(backoffs[attempt])
+        raise RuntimeError(
+            f"iFlytek STT connect failed after 4 attempts: {last_exc}"
         )
-        return f"wss://{self._WS_HOST}{self._WS_PATH}?{params}"
     async def transcribe(
         self, audio: AudioData, opts: STTOptions | None = None
@@ -177,7 +335,6 @@ class IflytekSTT(STTProvider):
         logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
         _t0 = time.perf_counter()
-        url = self._build_auth_url()
         audio_bytes = audio.data
         # iFlytek recommends ~40ms per frame at 16kHz 16bit mono = 1280 bytes.
         # Use larger frames (8000 bytes = ~250ms) with pacing to avoid server
@@ -190,7 +347,8 @@ class IflytekSTT(STTProvider):
         result_texts: list[str] = []
-        async with websockets.connect(url) as ws:
+        ws = await self._connect_with_retry()
+        async with ws:
             # Send audio in chunks with interleaved receive
             total = len(audio_bytes)
             offset = 0
@@ -209,46 +367,36 @@ class IflytekSTT(STTProvider):
                 frame_data = base64.b64encode(chunk).decode("utf-8")
                 if status == 0:
-                    # First frame includes common and business params.
-                    # ``accent="mandarin"`` is only meaningful for the
-                    # Chinese model; sending it on en_us / ja_jp / etc.
-                    # is a wire-level no-op on iFlytek's side but
-                    # confuses anyone reading the request body, so
-                    # gate it on the canonical language.
-                    canon = _canonical_language(self.settings.language)
-                    # Per-call override (``opts.vad_eos``) trumps the
-                    # provider default. Wallex routes the panel's
-                    # ``parameter.iat.eos`` through here so a kiosk
-                    # can ship a tighter or looser silence threshold
-                    # than the deployment yaml.
-                    eos = (opts.vad_eos
-                           if opts is not None and opts.vad_eos is not None
-                           else self.settings.vad_eos)
-                    business = {
-                        "language": canon,
-                        "domain": "iat",
-                        "vad_eos": eos,
-                    }
-                    if canon == "zh_cn":
-                        business["accent"] = "mandarin"
+                    # First frame: panel-supplied common/business win;
+                    # batch path doesn't carry wpgs (no streaming
+                    # protocol) so include_dwa=False.
+                    common, business = self._build_first_frame_blocks(
+                        opts, include_dwa=False,
+                    )
+                    data_block = self._build_data_block(
+                        status=0, audio_b64=frame_data, opts=opts,
+                    )
                     msg = {
-                        "common": {"app_id": self.settings.app_id},
+                        "common": common,
                         "business": business,
-                        "data": {
-                            "status": 0,
-                            "format": "audio/L16;rate=16000",
-                            "encoding": "raw",
-                            "audio": frame_data,
-                        },
+                        "data": data_block,
                     }
+                    # Java parity: log the exact blocks we're about to
+                    # ship to iFlytek. Debugging "wrong language /
+                    # wrong endpoint" reports needs to see this from
+                    # the log alone — Java's AsrServiceImpl prints the
+                    # equivalent line at INFO.
+                    logger.info(
+                        "{}: ASR first frame business={}, common={}",
+                        self.name,
+                        json.dumps(business, ensure_ascii=False),
+                        json.dumps(common, ensure_ascii=False),
+                    )
                 else:
                     msg = {
-                        "data": {
-                            "status": status,
-                            "format": "audio/L16;rate=16000",
-                            "encoding": "raw",
-                            "audio": frame_data,
-                        }
+                        "data": self._build_data_block(
+                            status=status, audio_b64=frame_data, opts=opts,
+                        )
                     }
                 await ws.send(json.dumps(msg))
@@ -320,7 +468,6 @@ class IflytekSTT(STTProvider):
         if self._client is None:
             raise RuntimeError("Provider not started — call start() first")
-        url = self._build_auth_url()
         results: asyncio.Queue[Transcription | None] = asyncio.Queue()
         _t0 = time.perf_counter()
         _frames_sent = 0
@@ -332,7 +479,8 @@ class IflytekSTT(STTProvider):
         _sender_stop = asyncio.Event()
         logger.debug("{}: connecting to iFlytek WebSocket...", self.name)
-        async with websockets.connect(url) as ws:
+        ws = await self._connect_with_retry()
+        async with ws:
             _t_connected = time.perf_counter()
             logger.info("{}: WS connected in {:.0f}ms", self.name,
                         (_t_connected - _t0) * 1000)
@@ -350,42 +498,38 @@ class IflytekSTT(STTProvider):
                             break
                         frame_data = base64.b64encode(chunk).decode("utf-8")
                         if is_first:
-                            # See transcribe() for rationale on
-                            # canonicalizing language and gating accent.
-                            canon = _canonical_language(self.settings.language)
-                            # Per-call ``opts.vad_eos`` (e.g. wallex
-                            # forwarding the panel's ``parameter.iat.eos``)
-                            # trumps the provider's configured default.
-                            eos = (opts.vad_eos
-                                   if opts is not None and opts.vad_eos is not None
-                                   else self.settings.vad_eos)
-                            business = {
-                                "language": canon,
-                                "domain": "iat",
-                                "dwa": "wpgs",
-                                "vad_eos": eos,
-                            }
-                            if canon == "zh_cn":
-                                business["accent"] = "mandarin"
+                            # First frame: panel-supplied common/business win;
+                            # streaming path always carries wpgs (see
+                            # stt-streaming-spec.md realtime-correction
+                            # protocol) so include_dwa=True.
+                            common, business = self._build_first_frame_blocks(
+                                opts, include_dwa=True,
+                            )
+                            data_block = self._build_data_block(
+                                status=0, audio_b64=frame_data, opts=opts,
+                            )
                             msg = {
-                                "common": {"app_id": self.settings.app_id},
+                                "common": common,
                                 "business": business,
-                                "data": {
-                                    "status": 0,
-                                    "format": "audio/L16;rate=16000",
-                                    "encoding": "raw",
-                                    "audio": frame_data,
-                                },
+                                "data": data_block,
                             }
+                            # Java parity (AsrServiceImpl line 221): log
+                            # the first-frame business + common at INFO so
+                            # operators can verify which language/eos/dwa
+                            # the panel actually requested without
+                            # rebuilding the call from yaml + STTOptions.
+                            logger.info(
+                                "{}: ASR first frame business={}, common={}",
+                                self.name,
+                                json.dumps(business, ensure_ascii=False),
+                                json.dumps(common, ensure_ascii=False),
+                            )
                             is_first = False
                         else:
                             msg = {
-                                "data": {
-                                    "status": 1,
-                                    "format": "audio/L16;rate=16000",
-                                    "encoding": "raw",
-                                    "audio": frame_data,
-                                }
+                                "data": self._build_data_block(
+                                    status=1, audio_b64=frame_data, opts=opts,
+                                )
                             }
                         await ws.send(json.dumps(msg))
                         _frames_sent += 1
@@ -396,12 +540,9 @@ class IflytekSTT(STTProvider):
                     # Send empty last frame to signal end (only if WS still open)
                     if not _sender_stop.is_set():
                         last_msg = {
-                            "data": {
-                                "status": 2,
-                                "format": "audio/L16;rate=16000",
-                                "encoding": "raw",
-                                "audio": "",
-                            }
+                            "data": self._build_data_block(
+                                status=2, audio_b64="", opts=opts,
+                            )
                         }
                         await ws.send(json.dumps(last_msg))
                 except websockets.exceptions.ConnectionClosed:

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/openspeechapi/providers/tts/iflytek.py RENAMED Viewed

@@ -29,16 +29,22 @@ class IflytekTTSSettings(BaseSettings):
     voice: str = "xiaoyan"
     speed: int = 50
     # Audio output encoding requested from iFlytek.
-    #   - "lame": MP3 frames (default; smaller, but caller must decode)
-    #   - "raw":  16-bit PCM @ 16 kHz mono, big-endian L16 (drop-in
-    #             playable as raw PCM; required by callers that wrap the
-    #             bytes in a fixed-format wire envelope and assume PCM,
-    #             e.g. wallex's RESP_VOICE which advertises
-    #             encoding=raw/bitDepth=16/sampleRate=16000 to the
-    #             panel — feeding MP3 bytes through that envelope plays
-    #             back as pure noise on the speaker).
+    #   - "lame":  MP3 frames (default; smaller, but caller must decode)
+    #   - "raw":   16-bit PCM @ 16 kHz mono, big-endian L16 (drop-in
+    #              playable as raw PCM; required by callers that wrap the
+    #              bytes in a fixed-format wire envelope and assume PCM,
+    #              e.g. wallex's RESP_VOICE which advertises
+    #              encoding=raw/bitDepth=16/sampleRate=16000 to the
+    #              panel — feeding MP3 bytes through that envelope plays
+    #              back as pure noise on the speaker).
+    #   - "speex"/"speex-wb-7": Speex narrowband / wideband (low-bitrate,
+    #              used by some embedded Wallex panels with constrained
+    #              uplink). Requires ``speex_size`` to declare the frame
+    #              size iFlytek should produce. Caller must run a Speex
+    #              decoder; not auto-handled by browsers.
     # Default stays "lame" for backward-compat; deployments that need
-    # PCM (wallex / direct hardware playback) override via yaml.
+    # PCM (wallex / direct hardware playback) or Speex (embedded panels)
+    # override via yaml.
     aue: str = "lame"
     # Output sample rate for raw PCM mode (only meaningful when
     # aue="raw"). 16000 matches what the panel and the iFlytek
@@ -50,6 +56,22 @@ class IflytekTTSSettings(BaseSettings):
     volume: int = 50
     # Pitch (0-100). Same rationale as volume.
     pitch: int = 50
+    # Speex frame size (only meaningful when aue startswith "speex").
+    # iFlytek expects an integer that selects a Speex bitrate / frame
+    # mode; ``0`` is "auto-pick by aue tag". Leave 0 unless the client
+    # decoder requires a specific frame size. Mirrors Java
+    # ``AsrConfig.speex-size`` / ``TtsConfig`` parameter.
+    speex_size: int = 0
+    # ``ws_host`` / ``ws_path`` — iFlytek TTS WebSocket endpoint.
+    # Override in yaml (or via ``OPENSPEECH_IFLYTEK_TTS_HOST`` env var)
+    # for region-specific endpoints. Default is the global endpoint.
+    ws_host: str = "tts-api.xfyun.cn"
+    ws_path: str = "/v2/tts"
+    # ``timeout_secs`` — connect / read timeout for the underlying
+    # httpx AsyncClient. Java's TtsConfig defaults to 8s; we mirror
+    # that for parity. Increase only when the iFlytek endpoint is
+    # consistently slow to handshake.
+    timeout_secs: int = 8
 class IflytekTTS(TTSProvider):
     name = "iflytek-tts"
@@ -66,12 +88,9 @@ class IflytekTTS(TTSProvider):
             # English assistant-style voices used by wallex deployments.
             "x4_enuk_ashleigh_assist",
         ],
-        "aue": ["lame", "raw"],
+        "aue": ["lame", "raw", "speex", "speex-wb-7"],
     }
-    _WS_HOST = "tts-api.xfyun.cn"
-    _WS_PATH = "/v2/tts"
     def __init__(self, settings: IflytekTTSSettings | None = None) -> None:
         self.settings = settings or IflytekTTSSettings()
         self._client: httpx.AsyncClient | None = None
@@ -83,7 +102,7 @@ class IflytekTTS(TTSProvider):
     async def start(self) -> None:
         if self._client is None:
-            self._client = httpx.AsyncClient(timeout=60.0)
+            self._client = httpx.AsyncClient(timeout=float(self.settings.timeout_secs))
             self._owns_client = True
     async def stop(self) -> None:
@@ -99,10 +118,12 @@ class IflytekTTS(TTSProvider):
         now = datetime.now(tz=timezone.utc)
         date = formatdate(timeval=now.timestamp(), localtime=False, usegmt=True)
+        host = self.settings.ws_host
+        path = self.settings.ws_path
         signature_origin = (
-            f"host: {self._WS_HOST}\n"
+            f"host: {host}\n"
             f"date: {date}\n"
-            f"GET {self._WS_PATH} HTTP/1.1"
+            f"GET {path} HTTP/1.1"
         )
         signature_sha = hmac.new(
             self.settings.api_secret.encode("utf-8"),
@@ -122,9 +143,9 @@ class IflytekTTS(TTSProvider):
         ).decode("utf-8")
         params = urllib.parse.urlencode(
-            {"authorization": authorization, "date": date, "host": self._WS_HOST}
+            {"authorization": authorization, "date": date, "host": host}
         )
-        return f"wss://{self._WS_HOST}{self._WS_PATH}?{params}"
+        return f"wss://{host}{path}?{params}"
     async def synthesize(
         self, text: str, opts: TTSOptions | None = None
@@ -138,8 +159,17 @@ class IflytekTTS(TTSProvider):
         # callers downstream may set wire-protocol encoding metadata from
         # this field, and a wrong tag on the bytes plays back as noise on
         # raw-PCM consumers.
-        fmt = "pcm_s16le" if self.settings.aue == "raw" else "mp3"
-        sr = self.settings.auf_rate if self.settings.aue == "raw" else 16000
+        aue = self.settings.aue
+        if aue == "raw":
+            fmt = "pcm_s16le"
+            sr = self.settings.auf_rate
+        elif aue.startswith("speex"):
+            # Speex narrowband is 8 kHz, wideband ("speex-wb-*") is 16 kHz.
+            fmt = "speex"
+            sr = 16000 if "wb" in aue else 8000
+        else:
+            fmt = "mp3"
+            sr = 16000
         logger.info(
             "iFlytek TTS: {} chunks, {} bytes total, format={}, sample_rate={}",
             len(parts), len(audio_bytes), fmt, sr,
@@ -191,7 +221,8 @@ class IflytekTTS(TTSProvider):
             "pitch": self.settings.pitch,
             "tte": "UTF8",
         }
-        if self.settings.aue == "lame":
+        aue = self.settings.aue
+        if aue == "lame":
             # ``sfl=1`` (stream-frame-length) is an MP3-only knob that
             # tells iFlytek to emit per-frame audio rather than waiting
             # for the whole file. It has no meaning for raw PCM (raw is
@@ -199,11 +230,17 @@ class IflytekTTS(TTSProvider):
             # combo with a code 10005 "invalid parameter" — so we only
             # send it on the lame path.
             business["sfl"] = 1
-        else:
+        elif aue == "raw":
             # Raw / L16 mode requires ``auf`` to declare the PCM
             # sample-rate iFlytek should produce. Java wallex sends
             # ``audio/L16;rate=16000`` here; we mirror that exactly.
             business["auf"] = f"audio/L16;rate={self.settings.auf_rate}"
+        elif aue.startswith("speex"):
+            # Speex narrowband / wideband. ``speex_size`` is the iFlytek
+            # frame-size selector (0 = engine default; non-zero values
+            # match the Java ``TtsConfig.speex-size`` parameter).
+            if self.settings.speex_size:
+                business["speex_size"] = self.settings.speex_size
         return {
             "common": {"app_id": self.settings.app_id},
             "business": business,

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/providers.example.yaml RENAMED Viewed

@@ -123,7 +123,12 @@ engines:
   #   exec_mode: remote
   #   preload: true
   #   settings:
-  #     language: zh_cn
+  #     language: zh_cn        # zh_cn / en_us / ja_jp / ko_kr / ru-ru
+  #     vad_eos: 2000          # ms of trailing silence before final
+  #     ltc: 3                 # 1 sentence / 2 +word / 3 +char timestamps
+  #     ws_host: iat-api.xfyun.cn      # override for region-specific endpoints
+  #     ws_path: /v2/iat
+  #     timeout_secs: 15
   # # pip install 'openspeechapi[faster-whisper-stt]'
   # faster_whisper_stt:
@@ -190,7 +195,15 @@ engines:
   #   exec_mode: remote
   #   settings:
   #     voice: xiaoyan
-  #     speed: 50
+  #     speed: 50              # 0-100
+  #     volume: 50             # 0-100
+  #     pitch: 50              # 0-100
+  #     aue: lame              # lame / raw / speex / speex-wb-7
+  #     auf_rate: 16000        # only used when aue=raw (8000 / 16000 / 24000)
+  #     speex_size: 0          # only used when aue startswith speex (0 = auto)
+  #     ws_host: tts-api.xfyun.cn      # override for region-specific endpoints
+  #     ws_path: /v2/tts
+  #     timeout_secs: 8
   # # pip install 'openspeechapi[piper-tts]'
   # piper_tts:

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "openspeechapi"
-version = "0.2.5"
+version = "0.2.7"
 description = "Unified speech interface for STT/TTS providers"
 requires-python = ">=3.11"
 dependencies = [

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.dockerignore RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.env.example RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.github/workflows/ci.yml RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.gitignore RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.tmp/audio/en.aiff RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.tmp/audio/en_16k.wav RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.tmp/audio/en_16k_pad6.wav RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.tmp/audio/en_long.aiff RENAMED Viewed

File without changes

{openspeechapi-0.2.5 → openspeechapi-0.2.7}/.tmp/audio/en_long_16k.wav RENAMED Viewed

File without changes

openspeechapi 0.2.5__tar.gz → 0.2.7__tar.gz

openspeechapi 0.2.5__tar.gz → 0.2.7__tar.gz