PyPI - openspeechapi - Versions diffs - 0.2.6__tar.gz → 0.2.7__tar.gz - Mend

openspeechapi 0.2.6tar.gz → 0.2.7tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (247) hide show

{openspeechapi-0.2.6 → openspeechapi-0.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: openspeechapi
-Version: 0.2.6
+Version: 0.2.7
 Summary: Unified speech interface for STT/TTS providers
 Requires-Python: >=3.11
 Requires-Dist: httpx>=0.27

{openspeechapi-0.2.6 → openspeechapi-0.2.7}/openspeechapi/__init__.py RENAMED Viewed

@@ -1,6 +1,6 @@
 """OpenSpeechAPI — Unified speech interface for STT/TTS providers."""
-__version__ = "0.2.6"
+__version__ = "0.2.7"
 from openspeechapi.config import load_config
 from openspeechapi.core.base import SpeechProvider, STTProvider, TTSProvider

{openspeechapi-0.2.6 → openspeechapi-0.2.7}/openspeechapi/core/models.py RENAMED Viewed

@@ -60,6 +60,25 @@ class STTOptions:
     # voice assistant. Providers that don't support VAD finalization
     # (Whisper, Faster-Whisper) silently ignore this field.
     vad_eos: int | None = None
+    # ── iFlytek IAT pass-through (matches Java AsrServiceImpl) ───────
+    # Wallex's Java AsrService forwards the client-supplied
+    # ``audio.common`` / ``audio.business`` / extra ``audio.data``
+    # fields verbatim to iFlytek's WS, treating the panel as the
+    # source of truth for ASR parameters. The Python pipeline now
+    # mirrors that contract: when these fields are non-None, the
+    # iFlytek provider uses them as the basis for the WS first frame
+    # (with ``setdefault`` fallback to its own settings for any keys
+    # the client omitted) instead of building the blocks purely from
+    # ``speech_providers.yaml``. ``None`` preserves the existing
+    # yaml-driven behaviour. Other STT providers ignore these fields.
+    iflytek_common: dict | None = None
+    iflytek_business: dict | None = None
+    # Extra fields to merge into the iFlytek ``data`` block beyond the
+    # canonical ``status``/``format``/``encoding``/``audio`` quadruple
+    # (e.g. panel-supplied ``data_type``). Keys that collide with the
+    # canonical set are overwritten: the provider's values always win,
+    # since the canonical set is required by the IAT spec.
+    iflytek_data_extras: dict | None = None
 @dataclass

{openspeechapi-0.2.6 → openspeechapi-0.2.7}/openspeechapi/providers/stt/iflytek.py RENAMED Viewed

@@ -72,6 +72,34 @@ class IflytekSTTSettings(BaseSettings):
     # that. Lower if the network has aggressive proxies, higher only
     # if the iFlytek endpoint is consistently slow to handshake.
     timeout_secs: int = 15
+    # ── Java AsrConfig parity (used as setdefault fallbacks) ────────
+    # When a wallex client (panel) supplies ``audio.business``/
+    # ``audio.common`` per-frame, those values flow through via
+    # ``STTOptions.iflytek_business``/``STTOptions.iflytek_common`` and
+    # become the WS first frame body. The settings below act as
+    # ``setdefault`` fallbacks for keys the client omits, mirroring
+    # Java ``AsrConfig``'s field set so the two implementations
+    # produce identical wire frames given the same panel payload.
+    #
+    # ``domain`` — iFlytek IAT domain. Java default ``iat``; a few
+    # vertical models (``medical`` / ``tv``) exist but most
+    # deployments stay on the general one.
+    domain: str = "iat"
+    # ``accent`` — only meaningful when ``language=="zh_cn"`` (selects
+    # mandarin vs. cantonese etc.). Java sends ``mandarin`` blindly;
+    # we keep the same default so the WS frame matches Java byte-for-
+    # byte when the panel omits ``business.accent``. iFlytek treats
+    # it as a no-op for non-Chinese language codes.
+    accent: str = "mandarin"
+    # ``dwa`` — dynamic word adjustment / wpgs (实时纠错). Java's
+    # default ``wpgs`` is the realtime-correction mode panels rely on
+    # for the partial-result protocol described in
+    # ``stt-streaming-spec.md``. Empty disables it.
+    dwa: str = "wpgs"
+    # ``sample_rate`` — required by the IAT directed-domain endpoint
+    # (``ws-api-dx.xfyun.cn``) which expects it in ``business``. Java
+    # AsrConfig.sampleRate=16000.
+    sample_rate: int = 16000
 # iFlytek expects the full locale tag; common ISO short codes need to
@@ -188,6 +216,117 @@ class IflytekSTT(STTProvider):
         )
         return f"wss://{host}{path}?{params}"
+    def _build_first_frame_blocks(
+        self,
+        opts: STTOptions | None,
+        *,
+        include_dwa: bool,
+    ) -> tuple[dict, dict]:
+        """Build the ``common`` / ``business`` blocks for the WS first frame.
+        Mirrors Java ``AsrServiceImpl.sendToAsr`` semantics: when
+        ``opts.iflytek_common``/``opts.iflytek_business`` is provided
+        (typically by wallex relaying the panel's per-frame
+        ``audio.common`` / ``audio.business``), those dicts are the
+        source of truth. We only ``setdefault`` keys the client omitted,
+        falling back to ``self.settings`` so a panel that misses a
+        single field doesn't get a malformed frame. The caller's dicts
+        are copied, never mutated.
+        Args:
+            opts: Per-call options, or ``None`` to build both blocks
+                purely from ``self.settings``.
+            include_dwa: Whether ``dwa`` may be defaulted from
+                ``self.settings.dwa``. ``transcribe()`` (batch) passes
+                ``False``; ``transcribe_stream()`` passes ``True``. A
+                panel-supplied ``dwa`` is forwarded either way, and an
+                empty ``settings.dwa`` leaves the key out.
+        Returns:
+            ``(common, business)`` as two fresh dicts.
+        """
+        canon = _canonical_language(self.settings.language)
+        # Per-call ``opts.vad_eos`` trumps the provider default.
+        eos = (opts.vad_eos
+               if opts is not None and opts.vad_eos is not None
+               else self.settings.vad_eos)
+        # ── business block ─────────────────────────────────────────
+        # Panel-supplied is authoritative; copy, then fill missing
+        # keys from yaml so we never send a partial frame.
+        if opts is not None and opts.iflytek_business:
+            business = dict(opts.iflytek_business)
+        else:
+            business = {}
+        business.setdefault("language", canon)
+        business.setdefault("domain", self.settings.domain)
+        business.setdefault("vad_eos", eos)
+        business.setdefault("ltc", self.settings.ltc)
+        # An empty ``settings.dwa`` means "disabled": omit the key
+        # instead of shipping ``"dwa": ""`` on the wire.
+        if include_dwa and self.settings.dwa:
+            business.setdefault("dwa", self.settings.dwa)
+        # ``accent`` is only meaningful for the Chinese model, so gate
+        # it on the language this frame actually requests (which the
+        # panel may have overridden), not on the yaml default. The
+        # panel value is canonicalized for the comparison only; the
+        # value sent on the wire is left exactly as supplied.
+        effective_lang = business["language"]
+        if isinstance(effective_lang, str):
+            effective_lang = _canonical_language(effective_lang)
+        if effective_lang == "zh_cn":
+            business.setdefault("accent", self.settings.accent)
+        # NOTE(review): ``settings.sample_rate`` is not forwarded into
+        # ``business`` here — confirm whether the directed-domain
+        # endpoint really needs it before wiring it in.
+        # ── common block ──────────────────────────────────────────
+        if opts is not None and opts.iflytek_common:
+            common = dict(opts.iflytek_common)
+        else:
+            common = {}
+        common.setdefault("app_id", self.settings.app_id)
+        return common, business
+    @staticmethod
+    def _build_data_block(
+        *, status: int, audio_b64: str, opts: STTOptions | None,
+    ) -> dict:
+        """Return the ``data`` block for one WS frame.
+        Panel-supplied ``opts.iflytek_data_extras`` (e.g. ``data_type``)
+        are laid down first; the four canonical keys
+        (``status``/``format``/``encoding``/``audio``) are then written
+        on top, so an extra that reuses a canonical name is overridden.
+        """
+        extras = opts.iflytek_data_extras if opts is not None else None
+        # Later entries in the literal replace colliding extras while
+        # keeping the extras' original key position.
+        return {
+            **(extras or {}),
+            "status": status,
+            "format": "audio/L16;rate=16000",
+            "encoding": "raw",
+            "audio": audio_b64,
+        }
+    async def _connect_with_retry(self) -> "websockets.ClientConnection":
+        """Connect to iFlytek IAT WS with backoff, mirroring Java parity.
+        Java ``AsrServiceImpl.connectWithRetry`` does 4 attempts with
+        300/600/1200ms backoff before giving up. The previous Python
+        path was one-shot: a single TCP/handshake hiccup surfaced as a
+        hard ASR failure. Aligning the retry budget keeps wallex's
+        Python and Java front-ends behaviourally interchangeable on
+        flaky links.
+        Returns:
+            The open WebSocket connection; the caller owns closing it.
+        Raises:
+            RuntimeError: every attempt failed. The last connect error
+                is chained as ``__cause__``.
+        """
+        backoffs = (0.3, 0.6, 1.2)  # delays AFTER attempts 1, 2, 3
+        max_attempts = len(backoffs) + 1
+        last_exc: Exception | None = None
+        for attempt in range(1, max_attempts + 1):
+            try:
+                # The auth URL is rebuilt on every attempt, presumably
+                # so a signed/timestamped URL is never reused stale.
+                ws = await websockets.connect(self._build_auth_url())
+            except Exception as e:  # noqa: BLE001 — retry boundary
+                last_exc = e
+                logger.warning(
+                    "{}: WS connect failed (attempt {}/{}): {}",
+                    self.name, attempt, max_attempts, e,
+                )
+                if attempt < max_attempts:
+                    await asyncio.sleep(backoffs[attempt - 1])
+            else:
+                # Success handling sits outside the ``try`` so a
+                # failure here cannot trigger a retry that would leak
+                # the socket we just opened.
+                if attempt > 1:
+                    logger.info(
+                        "{}: WS connected on attempt {}/{}",
+                        self.name, attempt, max_attempts,
+                    )
+                return ws
+        raise RuntimeError(
+            f"iFlytek STT connect failed after {max_attempts} attempts: "
+            f"{last_exc}"
+        ) from last_exc
     async def transcribe(
         self, audio: AudioData, opts: STTOptions | None = None
     ) -> Transcription:
@@ -196,7 +335,6 @@ class IflytekSTT(STTProvider):
         logger.info("{}: request received, audio={} bytes", self.name, len(audio.data))
         _t0 = time.perf_counter()
-        url = self._build_auth_url()
         audio_bytes = audio.data
         # iFlytek recommends ~40ms per frame at 16kHz 16bit mono = 1280 bytes.
         # Use larger frames (8000 bytes = ~250ms) with pacing to avoid server
@@ -209,7 +347,8 @@ class IflytekSTT(STTProvider):
         result_texts: list[str] = []
-        async with websockets.connect(url) as ws:
+        ws = await self._connect_with_retry()
+        async with ws:
             # Send audio in chunks with interleaved receive
             total = len(audio_bytes)
             offset = 0
@@ -228,47 +367,36 @@ class IflytekSTT(STTProvider):
                 frame_data = base64.b64encode(chunk).decode("utf-8")
                 if status == 0:
-                    # First frame includes common and business params.
-                    # ``accent="mandarin"`` is only meaningful for the
-                    # Chinese model; sending it on en_us / ja_jp / etc.
-                    # is a wire-level no-op on iFlytek's side but
-                    # confuses anyone reading the request body, so
-                    # gate it on the canonical language.
-                    canon = _canonical_language(self.settings.language)
-                    # Per-call override (``opts.vad_eos``) trumps the
-                    # provider default. Wallex routes the panel's
-                    # ``parameter.iat.eos`` through here so a kiosk
-                    # can ship a tighter or looser silence threshold
-                    # than the deployment yaml.
-                    eos = (opts.vad_eos
-                           if opts is not None and opts.vad_eos is not None
-                           else self.settings.vad_eos)
-                    business = {
-                        "language": canon,
-                        "domain": "iat",
-                        "vad_eos": eos,
-                        "ltc": self.settings.ltc,
-                    }
-                    if canon == "zh_cn":
-                        business["accent"] = "mandarin"
+                    # First frame: panel-supplied common/business win;
+                    # batch path doesn't carry wpgs (no streaming
+                    # protocol) so include_dwa=False.
+                    common, business = self._build_first_frame_blocks(
+                        opts, include_dwa=False,
+                    )
+                    data_block = self._build_data_block(
+                        status=0, audio_b64=frame_data, opts=opts,
+                    )
                     msg = {
-                        "common": {"app_id": self.settings.app_id},
+                        "common": common,
                         "business": business,
-                        "data": {
-                            "status": 0,
-                            "format": "audio/L16;rate=16000",
-                            "encoding": "raw",
-                            "audio": frame_data,
-                        },
+                        "data": data_block,
                     }
+                    # Java parity: log the exact blocks we're about to
+                    # ship to iFlytek. Debugging "wrong language /
+                    # wrong endpoint" reports needs to see this from
+                    # the log alone — Java's AsrServiceImpl prints the
+                    # equivalent line at INFO.
+                    logger.info(
+                        "{}: ASR first frame business={}, common={}",
+                        self.name,
+                        json.dumps(business, ensure_ascii=False),
+                        json.dumps(common, ensure_ascii=False),
+                    )
                 else:
                     msg = {
-                        "data": {
-                            "status": status,
-                            "format": "audio/L16;rate=16000",
-                            "encoding": "raw",
-                            "audio": frame_data,
-                        }
+                        "data": self._build_data_block(
+                            status=status, audio_b64=frame_data, opts=opts,
+                        )
                     }
                 await ws.send(json.dumps(msg))
@@ -340,7 +468,6 @@ class IflytekSTT(STTProvider):
         if self._client is None:
             raise RuntimeError("Provider not started — call start() first")
-        url = self._build_auth_url()
         results: asyncio.Queue[Transcription | None] = asyncio.Queue()
         _t0 = time.perf_counter()
         _frames_sent = 0
@@ -352,7 +479,8 @@ class IflytekSTT(STTProvider):
         _sender_stop = asyncio.Event()
         logger.debug("{}: connecting to iFlytek WebSocket...", self.name)
-        async with websockets.connect(url) as ws:
+        ws = await self._connect_with_retry()
+        async with ws:
             _t_connected = time.perf_counter()
             logger.info("{}: WS connected in {:.0f}ms", self.name,
                         (_t_connected - _t0) * 1000)
@@ -370,43 +498,38 @@ class IflytekSTT(STTProvider):
                             break
                         frame_data = base64.b64encode(chunk).decode("utf-8")
                         if is_first:
-                            # See transcribe() for rationale on
-                            # canonicalizing language and gating accent.
-                            canon = _canonical_language(self.settings.language)
-                            # Per-call ``opts.vad_eos`` (e.g. wallex
-                            # forwarding the panel's ``parameter.iat.eos``)
-                            # trumps the provider's configured default.
-                            eos = (opts.vad_eos
-                                   if opts is not None and opts.vad_eos is not None
-                                   else self.settings.vad_eos)
-                            business = {
-                                "language": canon,
-                                "domain": "iat",
-                                "dwa": "wpgs",
-                                "vad_eos": eos,
-                                "ltc": self.settings.ltc,
-                            }
-                            if canon == "zh_cn":
-                                business["accent"] = "mandarin"
+                            # First frame: panel-supplied common/business win;
+                            # streaming path always carries wpgs (see
+                            # stt-streaming-spec.md realtime-correction
+                            # protocol) so include_dwa=True.
+                            common, business = self._build_first_frame_blocks(
+                                opts, include_dwa=True,
+                            )
+                            data_block = self._build_data_block(
+                                status=0, audio_b64=frame_data, opts=opts,
+                            )
                             msg = {
-                                "common": {"app_id": self.settings.app_id},
+                                "common": common,
                                 "business": business,
-                                "data": {
-                                    "status": 0,
-                                    "format": "audio/L16;rate=16000",
-                                    "encoding": "raw",
-                                    "audio": frame_data,
-                                },
+                                "data": data_block,
                             }
+                            # Java parity (AsrServiceImpl line 221): log
+                            # the first-frame business + common at INFO so
+                            # operators can verify which language/eos/dwa
+                            # the panel actually requested without
+                            # rebuilding the call from yaml + STTOptions.
+                            logger.info(
+                                "{}: ASR first frame business={}, common={}",
+                                self.name,
+                                json.dumps(business, ensure_ascii=False),
+                                json.dumps(common, ensure_ascii=False),
+                            )
                             is_first = False
                         else:
                             msg = {
-                                "data": {
-                                    "status": 1,
-                                    "format": "audio/L16;rate=16000",
-                                    "encoding": "raw",
-                                    "audio": frame_data,
-                                }
+                                "data": self._build_data_block(
+                                    status=1, audio_b64=frame_data, opts=opts,
+                                )
                             }
                         await ws.send(json.dumps(msg))
                         _frames_sent += 1
@@ -417,12 +540,9 @@ class IflytekSTT(STTProvider):
                     # Send empty last frame to signal end (only if WS still open)
                     if not _sender_stop.is_set():
                         last_msg = {
-                            "data": {
-                                "status": 2,
-                                "format": "audio/L16;rate=16000",
-                                "encoding": "raw",
-                                "audio": "",
-                            }
+                            "data": self._build_data_block(
+                                status=2, audio_b64="", opts=opts,
+                            )
                         }
                         await ws.send(json.dumps(last_msg))
                 except websockets.exceptions.ConnectionClosed:

{openspeechapi-0.2.6 → openspeechapi-0.2.7}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "openspeechapi"
-version = "0.2.6"
+version = "0.2.7"
 description = "Unified speech interface for STT/TTS providers"
 requires-python = ">=3.11"
 dependencies = [