npm - @bdky/aaas-pilot-kit - Versions diffs - 1.0.9 → 1.1.0 - Mend

@bdky/aaas-pilot-kit 1.0.9 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/dist/libs/aaas-pilot-kit/src/lib/utils/audio-processing/FloatAPICompat.d.ts ADDED Viewed

@@ -0,0 +1,103 @@
+/**
+ * @file Float API compatibility layer
+ * @description Provides fallbacks for the Float API on older browsers (notably iOS Safari 10-13)
+ *
+ * Background:
+ * - AnalyserNode.getFloatTimeDomainData() and getFloatFrequencyData() do not exist on older iOS/Android versions
+ * - Calling them directly crashes with a TypeError
+ * - The fallback is getByteTimeDomainData() / getByteFrequencyData() followed by a format conversion
+ */
+/**
+ * Compatibility wrapper: getFloatTimeDomainData
+ * Prefers the Float API and falls back to the Byte API with conversion
+ *
+ * @param analyser - AnalyserNode instance
+ * @param output - output buffer (Float32Array)
+ *
+ * @example
+ * ```typescript
+ * const analyser = context.createAnalyser();
+ * const timeData = new Float32Array(analyser.fftSize);
+ * getFloatTimeDomainDataCompat(analyser, timeData);
+ * ```
+ */
+export declare function getFloatTimeDomainDataCompat(analyser: AnalyserNode, output: Float32Array<ArrayBuffer>): void;
+/**
+ * Compatibility wrapper: getFloatFrequencyData
+ * Prefers the Float API and falls back to the Byte API with conversion
+ *
+ * @param analyser - AnalyserNode instance
+ * @param output - output buffer (Float32Array)
+ *
+ * @example
+ * ```typescript
+ * const analyser = context.createAnalyser();
+ * const freqData = new Float32Array(analyser.frequencyBinCount);
+ * getFloatFrequencyDataCompat(analyser, freqData);
+ * ```
+ */
+export declare function getFloatFrequencyDataCompat(analyser: AnalyserNode, output: Float32Array): void;
+/**
+ * Computes the RMS (root mean square) directly from Float32Array samples
+ * More reliable than going through an AnalyserNode; avoids the problem of a branch not being updated
+ *
+ * @param samples - array of audio samples
+ * @returns RMS value (0.0 to 1.0+)
+ *
+ * @example
+ * ```typescript
+ * const inputData = event.inputBuffer.getChannelData(0);
+ * const rms = calculateRMSDirect(inputData);
+ * const energyDb = rmsToDb(rms);
+ * ```
+ */
+export declare function calculateRMSDirect(samples: Float32Array): number;
+/**
+ * Converts an RMS value to dBFS (decibels relative to full scale)
+ *
+ * @param rms - RMS value (0.0 to 1.0+)
+ * @returns dBFS value (-100 to 0+)
+ *
+ * @remarks
+ * - 0 dBFS = maximum (RMS = 1.0)
+ * - -∞ dBFS = silence (RMS = 0.0)
+ * - The practical lower bound is set to -100 dBFS
+ *
+ * @example
+ * ```typescript
+ * rmsToDb(1.0)    // 0 dBFS (maximum)
+ * rmsToDb(0.5)    // -6.02 dBFS
+ * rmsToDb(0.1)    // -20 dBFS
+ * rmsToDb(0.00001) // -100 dBFS (lower bound)
+ * ```
+ */
+export declare function rmsToDb(rms: number): number;
+/**
+ * Converts a dBFS value to a linear gain factor
+ *
+ * @param db - dBFS value
+ * @returns linear gain (0.0 to 1.0+)
+ *
+ * @example
+ * ```typescript
+ * dbToGain(0)    // 1.0 (no attenuation)
+ * dbToGain(-6)   // 0.501 (about 50%)
+ * dbToGain(-20)  // 0.1 (10%)
+ * dbToGain(-40)  // 0.01 (1%)
+ * ```
+ */
+export declare function dbToGain(db: number): number;
+/**
+ * Converts a linear gain to dB
+ *
+ * @param gain - linear gain (0.0 to 1.0+)
+ * @returns dB value
+ *
+ * @example
+ * ```typescript
+ * gainToDb(1.0)   // 0 dB
+ * gainToDb(0.5)   // -6.02 dB
+ * gainToDb(2.0)   // 6.02 dB (amplification)
+ * ```
+ */
+export declare function gainToDb(gain: number): number;

package/dist/libs/aaas-pilot-kit/src/types/config.d.ts CHANGED Viewed

@@ -12,9 +12,247 @@ import { NAbstractDigitalHumanService } from '../lib/service/digital-human/BaseD
 import { IAsrMessageEventPayload, IBusyEventPayload, IInterruptEventPayload, IReplyStartEventPayload } from '../lib/controller';
 import { NDigitalHumanService } from '../lib/service/digital-human/CloudDigitalHumanService';
 import { Language, type LanguageCode } from '../lib/utils/language-mapper';
+import type { AzureSpeechRegionValue, AzureSpeechLanguageValue } from '../lib/constants/azure';
 type TFormatter = (input: string) => string;
 export { Language };
 export type { LanguageCode };
+/**
+ * ASR provider type
+ */
+export type AsrProvider = 'baidu' | 'azure';
+/**
+ * Common ASR configuration (shared by all providers)
+ */
+export interface ICommonAsrConfig {
+    /**
+     * 🎙️ [Optional] Audio constraints
+     * Controls the audio-processing features of getUserMedia
+     *
+     * Uses the Web-standard `MediaTrackConstraints` type directly, so every standard audio constraint is supported.
+     *
+     * ⚙️ Three core features:
+     * - `echoCancellation`: removes speaker output that is picked up again by the microphone
+     * - `noiseSuppression`: filters out background noise
+     * - `autoGainControl`: adjusts the volume automatically to keep the output level stable
+     *
+     * 📚 More available parameters: https://developer.mozilla.org/en-US/docs/Web/API/MediaTrackConstraints
+     *
+     * @default
+     * ```typescript
+     * {
+     *   echoCancellation: true,
+     *   noiseSuppression: true,
+     *   autoGainControl: true
+     * }
+     * ```
+     *
+     * @example Custom configuration
+     * ```typescript
+     * {
+     *   echoCancellation: true,
+     *   noiseSuppression: true,
+     *   autoGainControl: false,  // disable automatic gain control
+     *   sampleRate: { ideal: 48000 },  // 48 kHz high-quality sampling
+     *   channelCount: { exact: 1 },    // force mono
+     *   deviceId: 'specific-device-id' // select a specific device
+     * }
+     * ```
+     */
+    audioConstraints?: MediaTrackConstraints;
+    /**
+     * 🔊 [Optional] Enable external playback suppression (recommended on mobile)
+     * Addresses loudspeaker audio on mobile (e.g. TTS playback) being misrecognized by ASR
+     *
+     * How it works:
+     * - Monitors the microphone audio energy in real time
+     * - Raises the ASR activation threshold while external audio is playing
+     * - Allows ASR recognition only when the microphone energy is significantly above the threshold
+     *
+     * 💡 Use cases:
+     * - Mobile users playing audio through the loudspeaker (no headphones)
+     * - Scenarios where the browser's echo cancellation performs poorly
+     * - Voice interruption must remain available during playback
+     *
+     * ⚠️ Notes:
+     * - Adds roughly 10-20 ms of audio-processing latency
+     * - Heavy ambient noise may reduce effectiveness
+     * - Users need to speak slightly louder than usual
+     *
+     * @default false
+     */
+    enablePlaybackSuppression?: boolean;
+    /**
+     * 🎚️ [Optional] External playback suppression settings (only effective when enablePlaybackSuppression=true)
+     */
+    playbackSuppressionConfig?: {
+        /**
+         * Energy-multiplier threshold while suppression is active
+         * Microphone energy must be N times the baseline to pass
+         * Larger values suppress more aggressively (user speech may be missed)
+         * @default 3.0
+         */
+        energyMultiplier?: number;
+        /**
+         * Baseline energy threshold while idle (dBFS)
+         * Signals below this value are treated as silence or faint echo
+         * @default -45
+         */
+        idleThreshold?: number;
+        /**
+         * Energy smoothing factor (0-1)
+         * Larger values smooth energy changes more (reducing noise interference)
+         * @default 0.7
+         */
+        smoothingFactor?: number;
+        /**
+         * Suppression activation delay (milliseconds)
+         * How long after suppression is enabled before the raised threshold takes effect
+         * @default 100
+         */
+        activationDelay?: number;
+        /**
+         * Suppression recovery delay (milliseconds)
+         * How long after suppression stops before the normal threshold is restored
+         * @default 500
+         */
+        recoveryDelay?: number;
+    };
+}
+/**
+ * Baidu ASR configuration
+ */
+export interface IBaiduAsrConfig extends ICommonAsrConfig {
+    /**
+     * Silence timeout for ASR voice activity detection (VAD), in milliseconds
+     * @default 600
+     */
+    asrVad?: number;
+    /**
+     * Language setting
+     * @default Language.CHINESE
+     */
+    lang?: LanguageCode;
+}
+/**
+ * Azure Speech configuration
+ */
+export interface IAzureSpeechConfig extends ICommonAsrConfig {
+    /**
+     * Azure Speech Service subscription key
+     * @example "YOUR_KEY"
+     */
+    subscriptionKey: string;
+    /**
+     * Azure service region
+     *
+     * 🎯 Recommended:
+     * - Asia Pacific: AzureSpeechRegion.SOUTHEAST_ASIA (Singapore), AzureSpeechRegion.EAST_ASIA (Hong Kong)
+     * - Japan: AzureSpeechRegion.JAPAN_EAST (Tokyo)
+     * - Korea: AzureSpeechRegion.KOREA_CENTRAL (Seoul)
+     * - Europe/Americas: AzureSpeechRegion.WEST_EUROPE (Netherlands), AzureSpeechRegion.EAST_US (US East)
+     *
+     * @example AzureSpeechRegion.SOUTHEAST_ASIA
+     * @example 'southeastasia' // plain strings are also accepted (backward compatible)
+     */
+    region: AzureSpeechRegionValue | string;
+    /**
+     * Recognition language list (supports automatic multi-language detection, up to 4 languages)
+     *
+     * 🎯 Common languages:
+     * - Chinese: AzureSpeechLanguage.CHINESE_SIMPLIFIED_CN
+     * - English: AzureSpeechLanguage.ENGLISH_US
+     * - Japanese: AzureSpeechLanguage.JAPANESE_JP
+     * - Korean: AzureSpeechLanguage.KOREAN_KR
+     *
+     * @default [AzureSpeechLanguage.ENGLISH_US]
+     * @example [AzureSpeechLanguage.CHINESE_SIMPLIFIED_CN, AzureSpeechLanguage.ENGLISH_US]
+     * @example ['zh-CN', 'en-US'] // plain strings are also accepted (backward compatible)
+     */
+    languages?: Array<AzureSpeechLanguageValue | string>;
+    /**
+     * Custom phrase list
+     * Improves recognition accuracy for specific vocabulary
+     * @example ['客悦 ONE', 'Galaxy S25', 'iPhone 17Pro']
+     */
+    phraseList?: string[];
+    /**
+     * Phrase weight (1-10, default 2)
+     */
+    phraseWeight?: number;
+    /**
+     * Initial silence timeout (milliseconds)
+     * @default 30000
+     */
+    initialSilenceTimeoutMs?: number;
+    /**
+     * End silence timeout (milliseconds)
+     * @default 30000
+     */
+    endSilenceTimeoutMs?: number;
+    /**
+     * Segmentation silence timeout (milliseconds)
+     * @default 1000
+     */
+    segmentationSilenceTimeoutMs?: number;
+    /**
+     * Whether to enable audio logging
+     * @default false
+     */
+    enableAudioLogging?: boolean;
+    /**
+     * Custom endpoint ID (e.g. when using Custom Speech)
+     */
+    customEndpointId?: string;
+    /**
+     * Advanced configuration parameters (for passing arbitrary Azure SDK settings)
+     * @example { recognitionMode: 'Interactive' }
+     */
+    advancedConfig?: Record<string, any>;
+    /**
+     * Connection timeout (milliseconds)
+     * Controls how long to wait when connecting to the service
+     * If this parameter is omitted, no timeout logic is applied
+     * @example 10000 - 10-second timeout
+     */
+    connectionTimeoutMs?: number;
+}
+/**
+ * ASR configuration (discriminated union)
+ *
+ * When provider is 'baidu', config is inferred as IBaiduAsrConfig
+ * When provider is 'azure', config is inferred as IAzureSpeechConfig
+ *
+ * @example Baidu ASR
+ * ```typescript
+ * {
+ *   asr: {
+ *     provider: 'baidu',
+ *     config: { asrVad: 600, lang: Language.CHINESE }
+ *   }
+ * }
+ * ```
+ *
+ * @example Azure ASR
+ * ```typescript
+ * {
+ *   asr: {
+ *     provider: 'azure',
+ *     config: {
+ *       subscriptionKey: 'YOUR_KEY',
+ *       region: 'southeastasia',
+ *       languages: ['zh-CN', 'en-US']
+ *     }
+ *   }
+ * }
+ * ```
+ */
+export type AsrConfig = {
+    provider: 'baidu';
+    config: IBaiduAsrConfig;
+} | {
+    provider: 'azure';
+    config: IAzureSpeechConfig;
+};
 export interface IOptions<AS extends BaseAgentService = BaseAgentService> {
     /**
      * 🆔【必填】数字员工形象 ID —— 从平台获取的唯一形象资源标识
@@ -198,6 +436,46 @@ export interface IOptions<AS extends BaseAgentService = BaseAgentService> {
      */
     ttsModel?: 'turbo_v2' | undefined;
     /**
+     * 🎙️【选填】ASR 配置（推荐使用）
+     * —— 使用判别联合类型，根据 provider 自动推断 config 类型
+     *
+     * @example 百度 ASR
+     * ```typescript
+     * {
+     *   asr: {
+     *     provider: 'baidu',
+     *     config: { asrVad: 600, lang: Language.CHINESE }
+     *   }
+     * }
+     * ```
+     *
+     * @example Azure ASR
+     * ```typescript
+     * {
+     *   asr: {
+     *     provider: 'azure',
+     *     config: {
+     *       subscriptionKey: 'YOUR_KEY',
+     *       region: 'southeastasia',
+     *       languages: ['zh-CN', 'en-US']
+     *     }
+     *   }
+     * }
+     * ```
+     */
+    asr?: AsrConfig;
+    /**
+     * @deprecated 请使用 asr.config.asrVad，将在 v2.0 移除
+     *
+     * 迁移示例：
+     * ```typescript
+     * // 旧写法
+     * { asrVad: 600 }
+     *
+     * // 新写法
+     * { asr: { provider: 'baidu', config: { asrVad: 600 } } }
+     * ```
+     *
      * ASR 语音端点检测（VAD）的静音超时时长（单位：毫秒）
      * —— 用于判断用户"说完一句话"的停顿阈值。
      *
@@ -330,6 +608,17 @@ export interface IOptions<AS extends BaseAgentService = BaseAgentService> {
      */
     autoChromaKey?: boolean;
     /**
+     * @deprecated 请使用 asr.config.lang，将在 v2.0 移除
+     *
+     * 迁移示例：
+     * ```typescript
+     * // 旧写法
+     * { lang: Language.CHINESE }
+     *
+     * // 新写法
+     * { asr: { provider: 'baidu', config: { lang: Language.CHINESE } } }
+     * ```
+     *
      * 🌐【选填】语言配置
      *
      * 用于配置 ASR 语音识别和 TTS 语音合成的语言
@@ -377,13 +666,11 @@ export declare const DEFAULT_OPTIONS: {
     readonly minSplitLen: 5;
     readonly ttsModel: "turbo_v2";
     readonly env: "production";
-    readonly asrVad: 600;
     readonly interruptible: true;
     readonly rendererMode: "cloud";
     readonly autoChromaKey: true;
     readonly typeDelay: 163;
     readonly enTypeDelay: 45;
-    readonly lang: "zh";
     readonly inactivityPrompt: "您这么久没讲话，是不是有其它事情要忙，那我先挂断了";
     readonly getMountContainer: () => HTMLElement;
     readonly hotWordReplacementRules: ReplacementRule[];
@@ -392,7 +679,7 @@ export declare const DEFAULT_OPTIONS: {
     readonly rtcID: "appreimunmd7utp";
 };
 type DefaultedOptionKeys = keyof typeof DEFAULT_OPTIONS;
-export type IResolvedOptions<AS extends BaseAgentService = BaseAgentService> = IOptions<AS> & Required<Pick<IOptions<AS>, DefaultedOptionKeys>>;
+export type IResolvedOptions<AS extends BaseAgentService = BaseAgentService> = IOptions<AS> & Required<Pick<IOptions<AS>, DefaultedOptionKeys>> & Required<Pick<IOptions<AS>, 'asr' | 'asrVad' | 'lang'>>;
 export interface IAaaSPilotKitEmitter<AS extends BaseAgentService = BaseAgentService> {
     /**
      * 🔇【状态事件】静音状态变更

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@bdky/aaas-pilot-kit",
-    "version": "1.0.9",
+    "version": "1.1.0",
     "description": "百度数字员工基础套件 - AI智能体、语音识别、数字人渲染全链路SDK，事件驱动、框架无关",
     "keywords": [
         "digital-employee",
@@ -70,6 +70,7 @@
         "fast-xml-parser": "^5.2.5",
         "inversify": "^7.10.4",
         "ky": "^1.14.0",
+        "microsoft-cognitiveservices-speech-sdk": "^1.47.0",
         "p-defer": "^4.0.1",
         "p-queue": "^9.0.0",
         "reflect-metadata": "^0.2.2",