speech-to-speech 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +123 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.mjs +121 -16
- package/dist/index.mjs.map +1 -1
- package/dist/stt/index.cjs +87 -35
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.mjs +87 -15
- package/dist/stt/index.mjs.map +1 -1
- package/dist/tts/index.cjs +121 -35
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +67 -1
- package/dist/tts/index.d.ts +67 -1
- package/dist/tts/index.mjs +119 -16
- package/dist/tts/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -1,27 +1,7 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var vadWeb = require('@ricky0123/vad-web');
|
|
4
|
-
var
|
|
5
|
-
|
|
6
|
-
/**
 * Wrap a CommonJS export object so it can be consumed like an ES module
 * namespace.
 *
 * Objects already flagged with `__esModule` are returned untouched. Anything
 * else is mirrored onto a prototype-less object: every own enumerable key
 * except `default` is exposed (existing getters are copied as-is, plain values
 * become live getters that read through to the source), `default` is set to
 * the original value, and the result is frozen.
 *
 * @param {*} e - The value returned by `require()`.
 * @returns {*} `e` itself when it is an ES module, otherwise a frozen namespace object.
 */
function _interopNamespace(e) {
  if (e && e.__esModule) {
    return e;
  }

  const namespace = Object.create(null);

  if (e) {
    for (const key of Object.keys(e)) {
      if (key === 'default') {
        continue;
      }
      const descriptor = Object.getOwnPropertyDescriptor(e, key);
      // Accessor properties are copied verbatim; data properties are turned
      // into getters so later changes on the source stay visible.
      const mirrored = descriptor.get
        ? descriptor
        : { enumerable: true, get: () => e[key] };
      Object.defineProperty(namespace, key, mirrored);
    }
  }

  namespace.default = e;
  return Object.freeze(namespace);
}
|
|
23
|
-
|
|
24
|
-
var piperTts__namespace = /*#__PURE__*/_interopNamespace(piperTts);
|
|
4
|
+
var piperTtsWeb = require('@realtimex/piper-tts-web');
|
|
25
5
|
|
|
26
6
|
// src/stt/reset-stt-logic.ts
|
|
27
7
|
var ResetSTTLogic = class {
|
|
@@ -557,17 +537,101 @@ var sharedAudioPlayer = {
|
|
|
557
537
|
/** Close */
|
|
558
538
|
close: () => AudioPlayer.reset()
|
|
559
539
|
};
|
|
540
|
+
|
|
541
|
+
// src/tts/wasm-cache.ts
|
|
542
|
+
// Name of the Cache Storage bucket passed to caches.open() / caches.delete()
// by the wasm-cache helpers in this file.
var CACHE_NAME = "stt-tts-lib-piper-wasm-v1";
|
|
543
|
+
// Default URL prefix for the Piper phonemizer assets; the helpers append
// ".data" and ".wasm" to it when no explicit cdnBase argument is given.
var DEFAULT_CDN_BASE = "https://cdn.jsdelivr.net/npm/@diffusionstudio/piper-wasm@1.0.0/build/piper_phonemize";
|
|
544
|
+
// Memoized { piperData, piperWasm } result of ensureWasmCached(); stays null
// until the first successful call and is reset to null by clearWasmCache().
// Holds plain CDN URLs (not blob URLs) when the Cache API path is unavailable
// or fails.
var cachedBlobUrls = null;
|
|
545
|
+
/**
 * Report whether a `caches` binding (the Cache Storage entry point) is
 * defined in the current environment.
 *
 * Uses `typeof` so that a missing global does not raise a ReferenceError.
 *
 * @returns {boolean} `true` when `caches` is defined, `false` otherwise.
 */
function isCacheAvailable() {
  const cachesType = typeof caches;
  return cachesType !== "undefined";
}
|
|
548
|
+
/**
 * Resolve `url` to a blob URL, serving it from `cache` when possible and
 * downloading (then storing) it otherwise.
 *
 * Storing the download is best-effort: if `cache.put` rejects (for example
 * because the storage quota is exhausted), a warning is logged and the bytes
 * that were already downloaded are still returned as a blob URL instead of
 * being discarded.
 *
 * @param {{ match: Function, put: Function }} cache - An opened Cache Storage bucket.
 * @param {string} url - Absolute URL of the asset.
 * @returns {Promise<string>} A blob URL created with `URL.createObjectURL`.
 *   The caller owns it and should revoke it when no longer needed.
 * @throws {Error} When the network response is not OK (`Failed to fetch …`),
 *   or when `cache.match`, `fetch`, or reading the body rejects.
 */
async function fetchAndCache(cache, url) {
  const existing = await cache.match(url);
  if (existing) {
    return URL.createObjectURL(await existing.blob());
  }

  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status}`);
  }

  // A response body can be consumed only once, so the cache gets its own clone.
  const responseForCache = response.clone();
  try {
    await cache.put(url, responseForCache);
  } catch (err) {
    // Losing the cache entry only costs a re-download next session; the asset
    // we already have in hand is still perfectly usable.
    console.warn(`[wasm-cache] Could not store ${url} in the cache:`, err);
  }

  const blob = await response.blob();
  return URL.createObjectURL(blob);
}
|
|
563
|
+
// Promise for the ensureWasmCached() run currently in progress, or null.
// Lets overlapping callers share one download instead of each creating (and
// leaking) their own pair of blob URLs.
var wasmCacheInFlight = null;

/**
 * Work out the URLs to use for the Piper `.data` / `.wasm` assets.
 *
 * Never rejects on cache or network problems: any failure on the Cache API
 * path is logged and answered with the plain CDN URLs.
 *
 * @param {string} cdnBase - URL prefix; ".data" and ".wasm" are appended.
 * @returns {Promise<{ piperData: string, piperWasm: string }>} Blob URLs when
 *   caching worked, CDN URLs otherwise.
 */
async function resolveWasmAssetUrls(cdnBase) {
  const dataUrl = `${cdnBase}.data`;
  const wasmUrl = `${cdnBase}.wasm`;

  if (!isCacheAvailable()) {
    console.log("[wasm-cache] Cache API unavailable \u2014 using CDN URLs directly");
    return { piperData: dataUrl, piperWasm: wasmUrl };
  }

  try {
    const cache = await caches.open(CACHE_NAME);
    const [dataResult, wasmResult] = await Promise.allSettled([
      fetchAndCache(cache, dataUrl),
      fetchAndCache(cache, wasmUrl)
    ]);
    const failed = [dataResult, wasmResult].find((result) => result.status === "rejected");
    if (failed) {
      // One asset may have succeeded; release its blob URL because the
      // fallback below hands out CDN URLs and nobody would ever revoke it.
      for (const result of [dataResult, wasmResult]) {
        if (result.status === "fulfilled") {
          URL.revokeObjectURL(result.value);
        }
      }
      throw failed.reason;
    }
    console.log("[wasm-cache] WASM assets served from Cache API (blob URLs)");
    return { piperData: dataResult.value, piperWasm: wasmResult.value };
  } catch (err) {
    console.warn("[wasm-cache] Caching failed, falling back to CDN URLs:", err);
    return { piperData: dataUrl, piperWasm: wasmUrl };
  }
}

/**
 * Make sure the Piper WASM assets are available and return their URLs.
 *
 * The result is memoized in `cachedBlobUrls`, so later calls return the same
 * object without touching the network until `clearWasmCache()` resets it.
 * Calls that overlap with a run still in progress receive that run's promise;
 * as with the memoized value, the `cdnBase` of the first caller wins.
 *
 * @param {string} [cdnBase=DEFAULT_CDN_BASE] - URL prefix of the assets.
 * @returns {Promise<{ piperData: string, piperWasm: string }>} Blob URLs when
 *   the Cache API path succeeded, plain CDN URLs otherwise.
 */
async function ensureWasmCached(cdnBase = DEFAULT_CDN_BASE) {
  if (cachedBlobUrls) return cachedBlobUrls;

  if (!wasmCacheInFlight) {
    wasmCacheInFlight = (async () => {
      try {
        cachedBlobUrls = await resolveWasmAssetUrls(cdnBase);
        return cachedBlobUrls;
      } finally {
        // Allow a fresh attempt after clearWasmCache() or an unexpected error.
        wasmCacheInFlight = null;
      }
    })();
  }
  return wasmCacheInFlight;
}
|
|
587
|
+
/**
 * Check whether both Piper assets (`<cdnBase>.data` and `<cdnBase>.wasm`)
 * are already present in the Cache Storage bucket named by `CACHE_NAME`.
 *
 * @param {string} [cdnBase=DEFAULT_CDN_BASE] - URL prefix of the assets.
 * @returns {Promise<boolean>} `true` only when both entries are found;
 *   `false` when the Cache API is unavailable, an entry is missing, or any
 *   cache operation throws.
 */
async function isWasmCached(cdnBase = DEFAULT_CDN_BASE) {
  if (isCacheAvailable()) {
    try {
      const store = await caches.open(CACHE_NAME);
      const lookups = [".data", ".wasm"].map((suffix) => store.match(`${cdnBase}${suffix}`));
      const entries = await Promise.all(lookups);
      return entries.every(Boolean);
    } catch {
      return false;
    }
  }
  return false;
}
|
|
600
|
+
/**
 * Drop everything the wasm-cache helpers hold on to.
 *
 * First releases the memoized URLs in `cachedBlobUrls` (each revoke is
 * attempted independently and errors are ignored) and resets the memo to
 * `null`. Then, if the Cache API is available, deletes the bucket named by
 * `CACHE_NAME`; a failure there is logged as a warning rather than thrown.
 *
 * @returns {Promise<void>}
 */
async function clearWasmCache() {
  const memoized = cachedBlobUrls;
  if (memoized) {
    for (const assetUrl of [memoized.piperData, memoized.piperWasm]) {
      try {
        URL.revokeObjectURL(assetUrl);
      } catch {
        // Best effort: a failed revoke must not stop the rest of the cleanup.
      }
    }
    cachedBlobUrls = null;
  }

  if (isCacheAvailable()) {
    try {
      await caches.delete(CACHE_NAME);
      console.log("[wasm-cache] Cache cleared");
    } catch (err) {
      console.warn("[wasm-cache] Failed to clear cache:", err);
    }
  }
}
|
|
620
|
+
|
|
621
|
+
// src/tts/piper-synthesizer.ts
|
|
560
622
|
var DEFAULT_VOICE_ID = "en_US-hfc_female-medium";
|
|
561
623
|
var TTSLogic = class {
|
|
562
624
|
constructor(config = {}) {
|
|
563
625
|
this.ready = false;
|
|
564
626
|
this.voiceLoaded = false;
|
|
565
627
|
this.warmUp = true;
|
|
628
|
+
this.ttsSession = null;
|
|
566
629
|
this.config = {
|
|
567
630
|
voiceId: DEFAULT_VOICE_ID,
|
|
568
631
|
sampleRate: 22050,
|
|
569
632
|
useSharedAudioPlayer: true,
|
|
570
633
|
warmUp: true,
|
|
634
|
+
enableWasmCache: true,
|
|
571
635
|
...config
|
|
572
636
|
};
|
|
573
637
|
this.useSharedPlayer = this.config.useSharedAudioPlayer !== false;
|
|
@@ -594,10 +658,11 @@ var TTSLogic = class {
|
|
|
594
658
|
throw new Error("Voice not loaded. Call initialize() first.");
|
|
595
659
|
}
|
|
596
660
|
try {
|
|
597
|
-
|
|
598
|
-
text
|
|
599
|
-
|
|
600
|
-
|
|
661
|
+
if (this.ttsSession) {
|
|
662
|
+
await this.ttsSession.predict(text);
|
|
663
|
+
} else {
|
|
664
|
+
throw new Error("TtsSession not created yet");
|
|
665
|
+
}
|
|
601
666
|
console.log("\u2713 Piper synthesizer warmed up");
|
|
602
667
|
return { synthesized: true };
|
|
603
668
|
} catch (error) {
|
|
@@ -612,11 +677,30 @@ var TTSLogic = class {
|
|
|
612
677
|
try {
|
|
613
678
|
const voiceId = this.config.voiceId;
|
|
614
679
|
console.log("\u{1F4CD} Loading Piper voice:", voiceId);
|
|
615
|
-
|
|
680
|
+
let wasmPaths;
|
|
681
|
+
if (this.config.wasmPaths) {
|
|
682
|
+
console.log("\u{1F4E6} Using custom WASM paths");
|
|
683
|
+
wasmPaths = {
|
|
684
|
+
onnxWasm: this.config.wasmPaths.onnxWasm ?? piperTtsWeb.TtsSession.WASM_LOCATIONS.onnxWasm,
|
|
685
|
+
piperData: this.config.wasmPaths.piperData,
|
|
686
|
+
piperWasm: this.config.wasmPaths.piperWasm
|
|
687
|
+
};
|
|
688
|
+
} else if (this.config.enableWasmCache !== false) {
|
|
689
|
+
console.log("\u{1F4E6} Caching WASM assets via Cache API...");
|
|
690
|
+
const cached = await ensureWasmCached(piperTtsWeb.WASM_BASE);
|
|
691
|
+
wasmPaths = {
|
|
692
|
+
onnxWasm: piperTtsWeb.TtsSession.WASM_LOCATIONS.onnxWasm,
|
|
693
|
+
piperData: cached.piperData,
|
|
694
|
+
piperWasm: cached.piperWasm
|
|
695
|
+
};
|
|
696
|
+
} else {
|
|
697
|
+
wasmPaths = { ...piperTtsWeb.TtsSession.WASM_LOCATIONS };
|
|
698
|
+
}
|
|
699
|
+
const storedVoices = await piperTtsWeb.stored();
|
|
616
700
|
const alreadyCached = Array.isArray(storedVoices) ? storedVoices.includes(voiceId) : false;
|
|
617
701
|
if (!alreadyCached) {
|
|
618
702
|
console.log("\u2B07\uFE0F Downloading voice model...");
|
|
619
|
-
await
|
|
703
|
+
await piperTtsWeb.download(voiceId, (progress) => {
|
|
620
704
|
if (progress?.total) {
|
|
621
705
|
const pct = Math.round(progress.loaded * 100 / progress.total);
|
|
622
706
|
console.log(`\u2B07\uFE0F Downloading: ${pct}%`);
|
|
@@ -625,6 +709,12 @@ var TTSLogic = class {
|
|
|
625
709
|
} else {
|
|
626
710
|
console.log("\u2713 Voice found in cache");
|
|
627
711
|
}
|
|
712
|
+
this.ttsSession = new piperTtsWeb.TtsSession({
|
|
713
|
+
voiceId,
|
|
714
|
+
wasmPaths,
|
|
715
|
+
logger: (msg) => console.log(`[piper] ${msg}`)
|
|
716
|
+
});
|
|
717
|
+
await this.ttsSession.waitReady;
|
|
628
718
|
this.voiceLoaded = true;
|
|
629
719
|
if (this.config.warmUp) {
|
|
630
720
|
const { synthesized } = await this.warmup();
|
|
@@ -660,10 +750,7 @@ var TTSLogic = class {
|
|
|
660
750
|
throw new Error("No text provided for synthesis");
|
|
661
751
|
}
|
|
662
752
|
try {
|
|
663
|
-
const wavBlob = await
|
|
664
|
-
text: trimmed,
|
|
665
|
-
voiceId: this.config.voiceId
|
|
666
|
-
});
|
|
753
|
+
const wavBlob = await this.ttsSession.predict(trimmed);
|
|
667
754
|
const arrayBuffer = await wavBlob.arrayBuffer();
|
|
668
755
|
const audioContext = new (window.AudioContext || window.webkitAudioContext)();
|
|
669
756
|
const decodedBuffer = await audioContext.decodeAudioData(arrayBuffer);
|
|
@@ -690,10 +777,7 @@ var TTSLogic = class {
|
|
|
690
777
|
if (!trimmed) {
|
|
691
778
|
throw new Error("No text provided for synthesis");
|
|
692
779
|
}
|
|
693
|
-
return
|
|
694
|
-
text: trimmed,
|
|
695
|
-
voiceId: this.config.voiceId
|
|
696
|
-
});
|
|
780
|
+
return this.ttsSession.predict(trimmed);
|
|
697
781
|
}
|
|
698
782
|
/**
|
|
699
783
|
* Synthesize text and add to queue (uses shared player by default)
|
|
@@ -717,6 +801,7 @@ var TTSLogic = class {
|
|
|
717
801
|
async dispose() {
|
|
718
802
|
this.ready = false;
|
|
719
803
|
this.voiceLoaded = false;
|
|
804
|
+
this.ttsSession = null;
|
|
720
805
|
}
|
|
721
806
|
};
|
|
722
807
|
function textToPhonemes(_text) {
|
|
@@ -1822,17 +1907,20 @@ exports.STTLogic = STTLogic;
|
|
|
1822
1907
|
exports.SimpleQueue = SimpleQueue;
|
|
1823
1908
|
exports.TTSLogic = TTSLogic;
|
|
1824
1909
|
exports.VADController = VADController;
|
|
1910
|
+
exports.clearWasmCache = clearWasmCache;
|
|
1825
1911
|
exports.configureFillerManager = configureFillerManager;
|
|
1826
1912
|
exports.createAudioPlayer = createAudioPlayer;
|
|
1827
1913
|
exports.createOrtEnvironment = createOrtEnvironment;
|
|
1828
1914
|
exports.emitSentence = emitSentence;
|
|
1829
1915
|
exports.ensureOrtReady = ensureOrtReady;
|
|
1830
1916
|
exports.ensureVoiceLoaded = ensureVoiceLoaded;
|
|
1917
|
+
exports.ensureWasmCached = ensureWasmCached;
|
|
1831
1918
|
exports.getAsyncIterator = getAsyncIterator;
|
|
1832
1919
|
exports.getBackendLabel = getBackendLabel;
|
|
1833
1920
|
exports.getFillerManager = getFillerManager;
|
|
1834
1921
|
exports.handleChunk = handleChunk;
|
|
1835
1922
|
exports.isCorruptModelError = isCorruptModelError;
|
|
1923
|
+
exports.isWasmCached = isWasmCached;
|
|
1836
1924
|
exports.nextBoundaryIndex = nextBoundaryIndex;
|
|
1837
1925
|
exports.playerWorker = playerWorker;
|
|
1838
1926
|
exports.preparePiperVoice = preparePiperVoice;
|