voicepilot-speech-engine 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ export { SpeechToText } from './stt.js';
2
+ export { TextToSpeech } from './tts.js';
3
+ export { AudioStreamManager } from './stream.js';
4
+ export type { STTConfig, TranscriptionResult, TranscriptionCallback, ErrorCallback } from './stt.js';
5
+ export type { TTSConfig, SpeechOptions } from './tts.js';
6
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AACxC,OAAO,EAAE,YAAY,EAAE,MAAM,UAAU,CAAC;AACxC,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AAEjD,YAAY,EAAE,SAAS,EAAE,mBAAmB,EAAE,qBAAqB,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACrG,YAAY,EAAE,SAAS,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,3 @@
1
+ export { SpeechToText } from './stt.js';
2
+ export { TextToSpeech } from './tts.js';
3
+ export { AudioStreamManager } from './stream.js';
@@ -0,0 +1,11 @@
1
/**
 * Browser microphone capture built on the MediaRecorder API.
 * Record with startRecording()/stopRecording(); the latter resolves
 * with the captured audio as a single Blob.
 */
export declare class AudioStreamManager {
    private mediaRecorder?;
    private audioChunks;
    private stream?;
    /** Requests microphone access and begins buffering audio chunks. */
    startRecording(): Promise<void>;
    /** Stops the active recording and resolves with the captured audio. */
    stopRecording(): Promise<Blob>;
    /** True while a recording is in progress. */
    isRecording(): boolean;
    private cleanup;
    /** Probes microphone permission by briefly opening, then closing, a stream. */
    getMicrophonePermission(): Promise<boolean>;
}
11
+ //# sourceMappingURL=stream.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"stream.d.ts","sourceRoot":"","sources":["../src/stream.ts"],"names":[],"mappings":"AAAA,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,aAAa,CAAC,CAAgB;IACtC,OAAO,CAAC,WAAW,CAAc;IACjC,OAAO,CAAC,MAAM,CAAC,CAAc;IAEvB,cAAc,IAAI,OAAO,CAAC,IAAI,CAAC;IAoBrC,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAiB9B,WAAW,IAAI,OAAO;IAItB,OAAO,CAAC,OAAO;IAST,uBAAuB,IAAI,OAAO,CAAC,OAAO,CAAC;CASlD"}
package/dist/stream.js ADDED
@@ -0,0 +1,57 @@
1
/**
 * Captures microphone audio in the browser via the MediaRecorder API.
 *
 * Typical flow: startRecording() ... stopRecording(), which resolves with
 * a single `audio/webm` Blob containing everything captured in between.
 */
export class AudioStreamManager {
    mediaRecorder;
    audioChunks = [];
    stream;

    /**
     * Requests microphone access and starts recording.
     * Safe to call while a recording is active: the previous stream is
     * cleaned up first instead of being orphaned with live tracks.
     * @throws {Error} when microphone access is denied or unavailable.
     */
    async startRecording() {
        // Restarting while active would orphan the previous MediaStream and
        // leave its tracks live (mic indicator stays on) — clean up first.
        if (this.mediaRecorder) {
            this.cleanup();
        }
        try {
            this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            this.mediaRecorder = new MediaRecorder(this.stream);
            this.audioChunks = [];
            this.mediaRecorder.ondataavailable = (event) => {
                if (event.data.size > 0) {
                    this.audioChunks.push(event.data);
                }
            };
            // Emit a data chunk every 100 ms so long recordings accumulate
            // incrementally instead of in one large final buffer.
            this.mediaRecorder.start(100);
        }
        catch (error) {
            // Release any tracks acquired before the failure (e.g. when the
            // MediaRecorder constructor throws after getUserMedia succeeded).
            this.cleanup();
            console.error('Failed to start recording:', error);
            throw new Error('Microphone access denied or not available');
        }
    }

    /**
     * Stops the active recording.
     * @returns {Promise<Blob>} the captured audio as an `audio/webm` Blob;
     *   rejects with 'No active recording' when nothing is being recorded.
     */
    stopRecording() {
        return new Promise((resolve, reject) => {
            if (!this.mediaRecorder) {
                reject(new Error('No active recording'));
                return;
            }
            // stop() on an inactive recorder throws InvalidStateError; surface
            // the same clear error as the missing-recorder case instead.
            if (this.mediaRecorder.state === 'inactive') {
                this.cleanup();
                reject(new Error('No active recording'));
                return;
            }
            this.mediaRecorder.onstop = () => {
                const audioBlob = new Blob(this.audioChunks, { type: 'audio/webm' });
                this.cleanup();
                resolve(audioBlob);
            };
            this.mediaRecorder.stop();
        });
    }

    /** @returns {boolean} true while the recorder is actively recording. */
    isRecording() {
        return this.mediaRecorder?.state === 'recording';
    }

    /** Stops all media tracks and resets recorder, stream, and buffers. */
    cleanup() {
        if (this.stream) {
            this.stream.getTracks().forEach(track => track.stop());
            this.stream = undefined;
        }
        this.mediaRecorder = undefined;
        this.audioChunks = [];
    }

    /**
     * Checks microphone permission by briefly opening a stream and
     * immediately releasing it.
     * @returns {Promise<boolean>} true when access is granted.
     */
    async getMicrophonePermission() {
        try {
            const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            stream.getTracks().forEach(track => track.stop());
            return true;
        }
        catch {
            return false;
        }
    }
}
package/dist/stt.d.ts ADDED
@@ -0,0 +1,23 @@
1
/** Configuration for Azure Speech-to-Text. */
export interface STTConfig {
    /** Azure Speech subscription key. */
    subscriptionKey: string;
    /** Azure region identifier the subscription belongs to. */
    region: string;
    /** Recognition language; the implementation defaults to 'en-US'. */
    language?: string;
}
/** One transcription update, either interim or final. */
export interface TranscriptionResult {
    text: string;
    /** Heuristic confidence: 0.5 for interim results, 1.0 for final ones. */
    confidence: number;
    /** False while the utterance is still being recognized. */
    isFinal: boolean;
}
/** Receives every interim and final transcription result. */
export type TranscriptionCallback = (result: TranscriptionResult) => void;
/** Receives recognition errors and error-cancellations. */
export type ErrorCallback = (error: Error) => void;
/** Continuous and one-shot speech recognition backed by the Azure Speech SDK. */
export declare class SpeechToText {
    private recognizer?;
    private config;
    private isListening;
    constructor(config: STTConfig);
    /** Starts continuous microphone recognition; warns and no-ops if already listening. */
    startContinuousRecognition(onTranscription: TranscriptionCallback, onError?: ErrorCallback): void;
    /** Stops continuous recognition; always resolves, even when stopping fails. */
    stopContinuousRecognition(): Promise<void>;
    /** Recognizes a single utterance and resolves with its text. */
    recognizeOnce(): Promise<string>;
    /** True while continuous recognition is active. */
    getIsListening(): boolean;
}
23
+ //# sourceMappingURL=stt.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../src/stt.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,SAAS;IACxB,eAAe,EAAE,MAAM,CAAC;IACxB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,OAAO,CAAC;CAClB;AAED,MAAM,MAAM,qBAAqB,GAAG,CAAC,MAAM,EAAE,mBAAmB,KAAK,IAAI,CAAC;AAC1E,MAAM,MAAM,aAAa,GAAG,CAAC,KAAK,EAAE,KAAK,KAAK,IAAI,CAAC;AAEnD,qBAAa,YAAY;IACvB,OAAO,CAAC,UAAU,CAAC,CAAuB;IAC1C,OAAO,CAAC,MAAM,CAAY;IAC1B,OAAO,CAAC,WAAW,CAAS;gBAEhB,MAAM,EAAE,SAAS;IAI7B,0BAA0B,CACxB,eAAe,EAAE,qBAAqB,EACtC,OAAO,CAAC,EAAE,aAAa,GACtB,IAAI;IAiEP,yBAAyB,IAAI,OAAO,CAAC,IAAI,CAAC;IAuBpC,aAAa,IAAI,OAAO,CAAC,MAAM,CAAC;IA6BtC,cAAc,IAAI,OAAO;CAG1B"}
package/dist/stt.js ADDED
@@ -0,0 +1,99 @@
1
+ import * as sdk from 'microsoft-cognitiveservices-speech-sdk';
2
/**
 * Speech-to-text on top of the Azure Cognitive Services Speech SDK.
 * Supports continuous dictation from the default microphone and
 * one-shot recognition.
 */
export class SpeechToText {
    recognizer;
    config;
    isListening = false;

    /** @param config subscription key, region, and optional language. */
    constructor(config) {
        this.config = config;
    }

    /**
     * Starts continuous recognition from the default microphone.
     * Interim results arrive via `onTranscription` with confidence 0.5 and
     * isFinal=false; finalized phrases use confidence 1.0 and isFinal=true.
     * Warns and returns if recognition is already running.
     *
     * @param onTranscription invoked for every interim/final result.
     * @param onError optional; invoked on SDK errors and error-cancellations.
     */
    startContinuousRecognition(onTranscription, onError) {
        if (this.isListening) {
            console.warn('Already listening');
            return;
        }
        const speechConfig = sdk.SpeechConfig.fromSubscription(this.config.subscriptionKey, this.config.region);
        speechConfig.speechRecognitionLanguage = this.config.language || 'en-US';
        speechConfig.enableDictation();
        const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput();
        this.recognizer = new sdk.SpeechRecognizer(speechConfig, audioConfig);
        this.recognizer.recognizing = (_sender, event) => {
            if (event.result.reason === sdk.ResultReason.RecognizingSpeech) {
                onTranscription({
                    text: event.result.text,
                    confidence: 0.5, // interim results carry no real confidence score
                    isFinal: false,
                });
            }
        };
        this.recognizer.recognized = (_sender, event) => {
            if (event.result.reason === sdk.ResultReason.RecognizedSpeech) {
                onTranscription({
                    text: event.result.text,
                    confidence: 1.0,
                    isFinal: true,
                });
            }
            else if (event.result.reason === sdk.ResultReason.NoMatch) {
                console.log('No speech could be recognized');
            }
        };
        this.recognizer.canceled = (_sender, event) => {
            console.error('Recognition canceled:', event.reason);
            if (event.reason === sdk.CancellationReason.Error) {
                onError?.(new Error(event.errorDetails));
            }
            this.stopContinuousRecognition();
        };
        this.recognizer.sessionStopped = () => {
            this.isListening = false;
        };
        this.recognizer.startContinuousRecognitionAsync(() => {
            this.isListening = true;
            console.log('Recognition started');
        }, (error) => {
            console.error('Failed to start recognition:', error);
            // Dispose the failed recognizer so a retry does not leak it.
            this.disposeRecognizer();
            onError?.(new Error(error));
        });
    }

    /**
     * Stops continuous recognition and disposes the recognizer.
     * Always resolves — stop failures are logged, never thrown.
     */
    stopContinuousRecognition() {
        return new Promise((resolve) => {
            if (!this.recognizer || !this.isListening) {
                resolve();
                return;
            }
            this.recognizer.stopContinuousRecognitionAsync(() => {
                this.isListening = false;
                this.disposeRecognizer();
                resolve();
            }, (error) => {
                console.error('Failed to stop recognition:', error);
                this.isListening = false;
                // Dispose even on failure so the instance is not leaked by the
                // next startContinuousRecognition().
                this.disposeRecognizer();
                resolve();
            });
        });
    }

    /** Closes and clears the current recognizer; tolerant of double-close. */
    disposeRecognizer() {
        try {
            this.recognizer?.close();
        }
        catch {
            // already closed / closing — nothing further to release
        }
        this.recognizer = undefined;
    }

    /**
     * Recognizes a single utterance from the default microphone.
     * @returns the recognized text; rejects when nothing was recognized
     *   or the SDK reports an error.
     */
    async recognizeOnce() {
        const speechConfig = sdk.SpeechConfig.fromSubscription(this.config.subscriptionKey, this.config.region);
        speechConfig.speechRecognitionLanguage = this.config.language || 'en-US';
        const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput();
        const recognizer = new sdk.SpeechRecognizer(speechConfig, audioConfig);
        return new Promise((resolve, reject) => {
            recognizer.recognizeOnceAsync((result) => {
                if (result.reason === sdk.ResultReason.RecognizedSpeech) {
                    resolve(result.text);
                }
                else {
                    reject(new Error('No speech recognized'));
                }
                recognizer.close();
            }, (error) => {
                reject(new Error(error));
                recognizer.close();
            });
        });
    }

    /** @returns true while continuous recognition is active. */
    getIsListening() {
        return this.isListening;
    }
}
package/dist/tts.d.ts ADDED
@@ -0,0 +1,26 @@
1
/** Configuration for Azure Text-to-Speech. */
export interface TTSConfig {
    /** Azure Speech subscription key. */
    subscriptionKey: string;
    /** Azure region identifier the subscription belongs to. */
    region: string;
    /** Default voice; the implementation falls back to 'en-US-AriaNeural'. */
    voice?: string;
    language?: string;
    /** When true, synthesis defaults to the 'friendly' speaking style. */
    conversational?: boolean;
}
/** Per-utterance synthesis options. */
export interface SpeechOptions {
    text: string;
    /** Overrides TTSConfig.voice for this utterance. */
    voice?: string;
    /** Rate adjustment in percent; omitted/0 yields the natural '0%'. */
    rate?: number;
    /** Pitch adjustment in Hz; omitted defaults to '+5Hz'. */
    pitch?: number;
    style?: 'friendly' | 'cheerful' | 'empathetic' | 'professional';
}
/** SSML-based text-to-speech backed by the Azure Speech SDK. */
export declare class TextToSpeech {
    private synthesizer?;
    private config;
    constructor(config: TTSConfig);
    /** Speaks through the default speaker output. */
    speak(options: SpeechOptions): Promise<void>;
    /** Convenience wrapper: speak plain text with default options. */
    speakText(text: string): Promise<void>;
    /** Synthesizes to an in-memory buffer instead of the speaker. */
    synthesizeToAudio(options: SpeechOptions): Promise<ArrayBuffer>;
    /** Closes any active synthesizer. */
    stop(): void;
    private generateSSML;
    private escapeXML;
}
26
+ //# sourceMappingURL=tts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tts.d.ts","sourceRoot":"","sources":["../src/tts.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,SAAS;IACxB,eAAe,EAAE,MAAM,CAAC;IACxB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,cAAc,CAAC,EAAE,OAAO,CAAC;CAC1B;AAED,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,UAAU,GAAG,UAAU,GAAG,YAAY,GAAG,cAAc,CAAC;CACjE;AAED,qBAAa,YAAY;IACvB,OAAO,CAAC,WAAW,CAAC,CAAwB;IAC5C,OAAO,CAAC,MAAM,CAAY;gBAEd,MAAM,EAAE,SAAS;IAIvB,KAAK,CAAC,OAAO,EAAE,aAAa,GAAG,OAAO,CAAC,IAAI,CAAC;IAqC5C,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAItC,iBAAiB,CAAC,OAAO,EAAE,aAAa,GAAG,OAAO,CAAC,WAAW,CAAC;IAiCrE,IAAI,IAAI,IAAI;IAOZ,OAAO,CAAC,YAAY;IAuBpB,OAAO,CAAC,SAAS;CAQlB"}
package/dist/tts.js ADDED
@@ -0,0 +1,94 @@
1
+ import * as sdk from 'microsoft-cognitiveservices-speech-sdk';
2
/**
 * Text-to-speech via the Azure Cognitive Services Speech SDK.
 * Builds SSML to control voice, language, rate, pitch, and speaking style.
 */
export class TextToSpeech {
    synthesizer;
    config;

    /** @param config subscription key, region, and synthesis defaults. */
    constructor(config) {
        this.config = config;
    }

    /**
     * Speaks through the default speaker output.
     * Voice resolution: options.voice, then config.voice, then 'en-US-AriaNeural'.
     * Rejects with Error when synthesis fails.
     */
    async speak(options) {
        const speechConfig = sdk.SpeechConfig.fromSubscription(this.config.subscriptionKey, this.config.region);
        speechConfig.speechSynthesisVoiceName =
            options.voice ||
                this.config.voice ||
                'en-US-AriaNeural'; // Aria is more conversational than Jenny
        const audioConfig = sdk.AudioConfig.fromDefaultSpeakerOutput();
        // Keep a local reference so an overlapping speak() call cannot close
        // or clear another invocation's synthesizer out from under it.
        const synthesizer = new sdk.SpeechSynthesizer(speechConfig, audioConfig);
        this.synthesizer = synthesizer;
        const ssml = this.generateSSML(options);
        return new Promise((resolve, reject) => {
            const finish = () => {
                synthesizer.close();
                if (this.synthesizer === synthesizer) {
                    this.synthesizer = undefined;
                }
            };
            synthesizer.speakSsmlAsync(ssml, (result) => {
                if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
                    resolve();
                }
                else {
                    reject(new Error('Speech synthesis failed'));
                }
                finish();
            }, (error) => {
                reject(new Error(error));
                finish();
            });
        });
    }

    /** Convenience wrapper: speak plain text with default options. */
    async speakText(text) {
        return this.speak({ text });
    }

    /**
     * Synthesizes speech into an in-memory buffer (no speaker playback).
     * @returns the synthesized audio data.
     */
    async synthesizeToAudio(options) {
        const speechConfig = sdk.SpeechConfig.fromSubscription(this.config.subscriptionKey, this.config.region);
        speechConfig.speechSynthesisVoiceName =
            options.voice ||
                this.config.voice ||
                'en-US-AriaNeural'; // Aria is more conversational than Jenny
        // undefined audio config => capture to memory instead of a device
        const synthesizer = new sdk.SpeechSynthesizer(speechConfig, undefined);
        const ssml = this.generateSSML(options);
        return new Promise((resolve, reject) => {
            synthesizer.speakSsmlAsync(ssml, (result) => {
                if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
                    resolve(result.audioData);
                }
                else {
                    reject(new Error('Speech synthesis failed'));
                }
                synthesizer.close();
            }, (error) => {
                reject(new Error(error));
                synthesizer.close();
            });
        });
    }

    /** Closes the synthesizer created by speak(), if one is active. */
    stop() {
        if (this.synthesizer) {
            this.synthesizer.close();
            this.synthesizer = undefined;
        }
    }

    /**
     * Builds the SSML document for one utterance.
     * xml:lang honors config.language (previously hard-coded to 'en-US',
     * silently ignoring the configured language); defaults to 'en-US'.
     */
    generateSSML(options) {
        const voice = options.voice || this.config.voice || 'en-US-AriaNeural'; // Aria is more conversational
        const rate = options.rate ? `${options.rate}%` : '0%'; // Natural speed
        const pitch = options.pitch ? `${options.pitch > 0 ? '+' : ''}${options.pitch}Hz` : '+5Hz'; // Slight warmth
        const language = this.config.language || 'en-US';
        // Add speaking style for supported neural voices (Aria, Jenny, Guy, Sara)
        const style = options.style || (this.config.conversational ? 'friendly' : undefined);
        const styleAttr = style ? `style="${style}"` : '';
        return `
      <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
             xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}">
        <voice name="${voice}">
          ${style ? `<mstts:express-as ${styleAttr}>` : ''}
          <prosody rate="${rate}" pitch="${pitch}">
            ${this.escapeXML(options.text)}
          </prosody>
          ${style ? `</mstts:express-as>` : ''}
        </voice>
      </speak>
    `.trim();
    }

    /** Escapes the five XML special characters for safe SSML embedding. */
    escapeXML(text) {
        return text
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&apos;');
    }
}
package/package.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "voicepilot-speech-engine",
3
+ "version": "0.1.0",
4
+ "description": "Azure Speech Services integration with enhanced TTS",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "types": "./dist/index.d.ts",
8
+ "files": ["dist"],
9
+ "exports": {
10
+ ".": {
11
+ "import": "./dist/index.js",
12
+ "types": "./dist/index.d.ts"
13
+ }
14
+ },
15
+ "scripts": {
16
+ "build": "tsc",
17
+ "dev": "tsc --watch",
18
+ "clean": "rm -rf dist"
19
+ },
20
+ "dependencies": {
21
+ "microsoft-cognitiveservices-speech-sdk": "^1.42.0"
22
+ },
23
+ "devDependencies": {
24
+ "@types/node": "^22.10.2",
25
+ "@voicepilot/typescript-config": "workspace:*",
26
+ "typescript": "^5.7.2"
27
+ },
28
+ "publishConfig": {
29
+ "access": "public"
30
+ }
31
+ }