react-native-sherpa-onnx 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
- package/android/src/main/cpp/CMakeLists.txt +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +180 -23
- package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
- package/ios/SherpaOnnx+Alignment.mm +704 -0
- package/ios/SherpaOnnx+STT.mm +6 -0
- package/ios/SherpaOnnx+TTS.mm +624 -50
- package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
- package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/alignment/index.js +27 -0
- package/lib/module/alignment/index.js.map +1 -0
- package/lib/module/alignment/types.js +2 -0
- package/lib/module/alignment/types.js.map +1 -0
- package/lib/module/alignment/vocab.js +40 -0
- package/lib/module/alignment/vocab.js.map +1 -0
- package/lib/module/download/paths.js +9 -1
- package/lib/module/download/paths.js.map +1 -1
- package/lib/module/download/registry.js +17 -1
- package/lib/module/download/registry.js.map +1 -1
- package/lib/module/download/types.js +1 -0
- package/lib/module/download/types.js.map +1 -1
- package/lib/module/index.js +6 -4
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +8 -2
- package/lib/module/licenses.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +68 -2
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/subtitles.js +400 -0
- package/lib/module/tts/subtitles.js.map +1 -0
- package/lib/module/tts/tempAudio.js +17 -0
- package/lib/module/tts/tempAudio.js.map +1 -0
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/alignment/index.d.ts +8 -0
- package/lib/typescript/src/alignment/index.d.ts.map +1 -0
- package/lib/typescript/src/alignment/types.d.ts +23 -0
- package/lib/typescript/src/alignment/types.d.ts.map +1 -0
- package/lib/typescript/src/alignment/vocab.d.ts +5 -0
- package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +5 -2
- package/lib/typescript/src/download/paths.d.ts.map +1 -1
- package/lib/typescript/src/download/registry.d.ts.map +1 -1
- package/lib/typescript/src/download/types.d.ts +2 -1
- package/lib/typescript/src/download/types.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +5 -2
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +2 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/subtitles.d.ts +24 -0
- package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
- package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
- package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
- package/lib/typescript/src/tts/types.d.ts +68 -2
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/alignment-models/README.md +90 -0
- package/scripts/alignment-models/build_and_upload.js +724 -0
- package/scripts/alignment-models/sources.csv +5 -0
- package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
- package/src/NativeSherpaOnnx.ts +35 -3
- package/src/alignment/index.ts +41 -0
- package/src/alignment/types.ts +22 -0
- package/src/alignment/vocab.ts +38 -0
- package/src/download/paths.ts +18 -5
- package/src/download/registry.ts +23 -3
- package/src/download/types.ts +1 -0
- package/src/index.tsx +6 -4
- package/src/licenses.ts +12 -1
- package/src/stt/types.ts +5 -2
- package/src/tts/index.ts +110 -3
- package/src/tts/subtitles.ts +611 -0
- package/src/tts/tempAudio.ts +31 -0
- package/src/tts/types.ts +79 -2
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
id;onnx_url;license;license_type;commercial_use
|
|
2
|
+
wav2vec2-base-960h-int8;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model_int8.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
|
|
3
|
+
wav2vec2-base-960h-fp16;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model_fp16.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
|
|
4
|
+
wav2vec2-base-960h;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
|
|
5
|
+
wav2vec2-base-960h-q4f16;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model_q4f16.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
'use strict';
|
|
4
|
+
|
|
5
|
+
const path = require('node:path');
|
|
6
|
+
const fsp = require('node:fs/promises');
|
|
7
|
+
const { readSources } = require('./build_and_upload.js');
|
|
8
|
+
|
|
9
|
+
const REPO_ROOT = path.join(__dirname, '..', '..');
|
|
10
|
+
const DEFAULT_CSV = path.join(__dirname, 'sources.csv');
|
|
11
|
+
const TARGETS = [
|
|
12
|
+
path.join(
|
|
13
|
+
REPO_ROOT,
|
|
14
|
+
'android/src/main/assets/model_licenses/alignment-models-license-status.csv'
|
|
15
|
+
),
|
|
16
|
+
path.join(
|
|
17
|
+
REPO_ROOT,
|
|
18
|
+
'ios/Resources/model_licenses/alignment-models-license-status.csv'
|
|
19
|
+
),
|
|
20
|
+
];
|
|
21
|
+
|
|
22
|
+
const HEADER =
|
|
23
|
+
'asset_name,license_type,commercial_use,confidence,detection_source,license_file';
|
|
24
|
+
|
|
25
|
+
/**
 * Split a single CSV record into its fields.
 *
 * Fields are separated by commas. A double quote toggles quoted mode, in which
 * commas are taken literally and a doubled quote (`""`) yields one literal
 * quote character. The quote characters that open and close quoted mode are
 * not part of the field value. An unterminated quote simply runs to the end
 * of the line.
 *
 * @param {string} line - One record, without its line terminator.
 * @returns {string[]} The field values; always at least one element.
 */
function parseCsvLine(line) {
  const fields = [];
  let field = '';
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];
    pos += 1; // `pos` now points at the character after `ch`

    if (quoted) {
      if (ch !== '"') {
        field += ch;
      } else if (line[pos] === '"') {
        // Doubled quote inside a quoted section: emit one quote, skip its twin.
        field += '"';
        pos += 1;
      } else {
        quoted = false;
      }
      continue;
    }

    switch (ch) {
      case ',':
        fields.push(field);
        field = '';
        break;
      case '"':
        quoted = true;
        break;
      default:
        field += ch;
    }
  }

  // The last field has no trailing comma to flush it.
  fields.push(field);
  return fields;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Render a value as a single CSV field.
 *
 * `null` and `undefined` become the empty string; anything else is converted
 * with `String()`. The text is wrapped in double quotes (with embedded quotes
 * doubled) only when it contains a quote, comma, line feed or carriage return.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} The field text, quoted if necessary.
 */
function escapeCsvField(value) {
  let text = '';
  if (value !== null && value !== undefined) {
    text = String(value);
  }

  const needsQuoting = /[",\n\r]/.test(text);
  if (!needsQuoting) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
}
|
|
62
|
+
|
|
63
|
+
/**
 * Build the CSV row for one alignment-model source entry.
 *
 * Columns follow the HEADER order: asset name (`<modelId>.tar.bz2`), license
 * type, commercial-use flag, then the fixed values `high` (confidence) and
 * `manual` (detection source), and finally the license URL (empty when the
 * source has none).
 *
 * @param {object} source - Entry with `modelId`, `licenseType`,
 *   `commercialUse` and optionally `licenseUrl`.
 * @returns {string} One comma-joined, CSV-escaped row without a line terminator.
 */
function rowFromSource(source) {
  const { modelId, licenseType, commercialUse, licenseUrl } = source;
  const columns = [
    `${modelId}.tar.bz2`,
    licenseType,
    commercialUse,
    'high',
    'manual',
    licenseUrl || '',
  ];
  return columns.map((column) => escapeCsvField(column)).join(',');
}
|
|
75
|
+
|
|
76
|
+
/**
 * Load the data rows of an existing license-status CSV, keyed by asset name.
 *
 * The first non-empty line is treated as the header and skipped. Every other
 * non-empty line is stored verbatim (without its line terminator) under the
 * value of its first column; lines whose first column is empty are ignored.
 *
 * @param {string} filePath - Path of the CSV file to read.
 * @returns {Promise<Map<string, string>>} Map of asset name to raw CSV row;
 *   empty when the file does not exist or has no data rows.
 * @throws Any read error other than "file not found" (`ENOENT`).
 */
async function readExistingRows(filePath) {
  let text;
  try {
    text = await fsp.readFile(filePath, 'utf8');
  } catch (err) {
    // A missing target just means there is nothing to merge yet. Any other
    // failure (permissions, I/O, path is a directory) must surface: reporting
    // it as "no rows" would let the caller rewrite the file and silently drop
    // the rows that could not be read.
    if (err && err.code === 'ENOENT') {
      return new Map();
    }
    throw err;
  }

  // Accept LF and CRLF so a checkout with converted line endings does not
  // leave a stray '\r' at the end of every preserved row.
  const lines = text.split(/\r?\n/).filter((line) => line.length > 0);

  const rows = new Map();
  // slice(1) skips the header; an empty file yields an empty slice.
  for (const line of lines.slice(1)) {
    const [assetName] = parseCsvLine(line);
    if (assetName) {
      rows.set(assetName, line);
    }
  }
  return rows;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Merge the given sources into one license-status CSV and write it to disk.
 *
 * Rows already present in the file are kept unless a source has the same
 * asset name (`<modelId>.tar.bz2`), in which case the freshly generated row
 * wins. Rows are written sorted by asset name using `localeCompare`, after
 * the HEADER line, and the file ends with a newline. The parent directory is
 * created if needed.
 *
 * @param {string} filePath - Target CSV path.
 * @param {Iterable<object>} sources - Source entries to merge in.
 * @returns {Promise<void>}
 */
async function writeMerged(filePath, sources) {
  const rowsByAsset = await readExistingRows(filePath);
  for (const source of sources) {
    rowsByAsset.set(`${source.modelId}.tar.bz2`, rowFromSource(source));
  }

  // Asset names are unique map keys, so sorting the entries is unambiguous.
  const sortedRows = [...rowsByAsset.entries()]
    .sort(([left], [right]) => left.localeCompare(right))
    .map(([, row]) => row);
  const contents = `${HEADER}\n${sortedRows.join('\n')}\n`;

  await fsp.mkdir(path.dirname(filePath), { recursive: true });
  await fsp.writeFile(filePath, contents, 'utf8');
}
|
|
108
|
+
|
|
109
|
+
/**
 * Read the alignment-model sources and regenerate every target CSV.
 *
 * The sources file is the first command-line argument (resolved against the
 * current working directory) or DEFAULT_CSV when no argument is given.
 * Targets are processed one after another and each one is logged relative to
 * the repository root once written.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const [, , csvArg] = process.argv;
  const csvPath = csvArg ? path.resolve(process.cwd(), csvArg) : DEFAULT_CSV;

  const sources = await readSources(csvPath);
  for (const target of TARGETS) {
    await writeMerged(target, sources);
    console.log(`[sync] ${path.relative(REPO_ROOT, target)}`);
  }
}
|
|
119
|
+
|
|
120
|
+
// Script entry point: run the sync and, on failure, print the error message
// (or the raw rejection value when it is not an Error) and exit with status 1.
main().catch((error) => {
  const detail = error instanceof Error ? error.message : error;
  console.error(detail);
  process.exit(1);
});
|
package/src/NativeSherpaOnnx.ts
CHANGED
|
@@ -341,8 +341,8 @@ export interface Spec extends TurboModule {
|
|
|
341
341
|
* Generate speech with subtitle/timestamp metadata.
|
|
342
342
|
* @param instanceId - Unique ID for this engine instance
|
|
343
343
|
* @param text - Text to convert to speech
|
|
344
|
-
* @param options - Same as {@link generateTts} options
|
|
345
|
-
* @returns Object with samples, sampleRate, subtitles, and
|
|
344
|
+
* @param options - Same as {@link generateTts} options plus subtitle options (`subtitleMode`, `subtitleGranularity`).
|
|
345
|
+
* @returns Object with samples, sampleRate, subtitles, and timingMode
|
|
346
346
|
*/
|
|
347
347
|
generateTtsWithTimestamps(
|
|
348
348
|
instanceId: string,
|
|
@@ -352,7 +352,39 @@ export interface Spec extends TurboModule {
|
|
|
352
352
|
samples: number[];
|
|
353
353
|
sampleRate: number;
|
|
354
354
|
subtitles: Array<{ text: string; start: number; end: number }>;
|
|
355
|
-
|
|
355
|
+
timingMode: string;
|
|
356
|
+
}>;
|
|
357
|
+
|
|
358
|
+
// ==================== Alignment / Subtitle Methods ====================
|
|
359
|
+
|
|
360
|
+
/**
|
|
361
|
+
* Run wav2vec2 CTC forced alignment on an audio file and transcript.
|
|
362
|
+
* @param modelPath - Absolute path to wav2vec2 ONNX model file
|
|
363
|
+
* @param audioPath - Absolute path to input audio file (WAV recommended)
|
|
364
|
+
* @param text - Transcript to align
|
|
365
|
+
* @param vocabJson - JSON map of token -> id (stringified to reduce bridge overhead)
|
|
366
|
+
*/
|
|
367
|
+
runCTCForcedAlignment(
|
|
368
|
+
modelPath: string,
|
|
369
|
+
audioPath: string,
|
|
370
|
+
text: string,
|
|
371
|
+
vocabJson: string
|
|
372
|
+
): Promise<{
|
|
373
|
+
words: Array<{ text: string; start: number; end: number }>;
|
|
374
|
+
chars: Array<{ text: string; start: number; end: number }>;
|
|
375
|
+
}>;
|
|
376
|
+
|
|
377
|
+
detectAlignmentModel(
|
|
378
|
+
modelDir: string,
|
|
379
|
+
modelType?: string
|
|
380
|
+
): Promise<{
|
|
381
|
+
success: boolean;
|
|
382
|
+
error?: string;
|
|
383
|
+
detectedModels: Array<{ type: string; modelDir: string }>;
|
|
384
|
+
modelType?: string;
|
|
385
|
+
paths?: {
|
|
386
|
+
model?: string;
|
|
387
|
+
};
|
|
356
388
|
}>;
|
|
357
389
|
|
|
358
390
|
// ==================== Online (streaming) TTS Methods ====================
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import SherpaOnnx from '../NativeSherpaOnnx';
|
|
2
|
+
import type { ModelPathConfig } from '../types';
|
|
3
|
+
import { resolveModelPath } from '../utils';
|
|
4
|
+
import type { AlignmentDetectResult, AlignmentModelType } from './types';
|
|
5
|
+
|
|
6
|
+
export {
|
|
7
|
+
WAV2VEC2_BLANK_ID,
|
|
8
|
+
WAV2VEC2_FRAME_DURATION_S,
|
|
9
|
+
WAV2VEC2_VOCAB,
|
|
10
|
+
WAV2VEC2_WORD_BOUNDARY_ID,
|
|
11
|
+
} from './vocab';
|
|
12
|
+
|
|
13
|
+
/**
 * Detect an alignment model at the given location via the native module.
 *
 * The model path is resolved first, then passed to the native
 * `detectAlignmentModel` together with the optional model type. The native
 * result is normalized before it is returned:
 * - `error` is included only when it is a non-blank string (trimmed);
 * - `detectedModels` defaults to an empty array;
 * - `modelType` is included only when it is a non-empty value;
 * - `paths.model` is included only when it is a non-blank string (trimmed).
 *
 * @param modelPath - Model location to resolve and inspect.
 * @param options - Optional `modelType` forwarded to the native detector.
 * @returns The normalized detection result.
 */
export async function detectAlignmentModel(
  modelPath: ModelPathConfig,
  options?: { modelType?: AlignmentModelType }
): Promise<AlignmentDetectResult> {
  // Non-string values collapse to '' so the emptiness checks below cover both cases.
  const trimmedOrEmpty = (value: unknown): string =>
    typeof value === 'string' ? value.trim() : '';

  const modelDir = await resolveModelPath(modelPath);
  const raw = await SherpaOnnx.detectAlignmentModel(
    modelDir,
    options?.modelType
  );

  const errorText = trimmedOrEmpty(raw.error);
  const modelFile = trimmedOrEmpty(raw.paths?.model);

  const errorPart = errorText !== '' ? { error: errorText } : {};
  const typePart =
    raw.modelType == null || raw.modelType === ''
      ? {}
      : { modelType: raw.modelType };
  const pathsPart = modelFile !== '' ? { paths: { model: modelFile } } : {};

  // Spread order keeps the result's key order: success, error, detectedModels,
  // modelType, paths.
  return {
    success: raw.success,
    ...errorPart,
    detectedModels: raw.detectedModels ?? [],
    ...typePart,
    ...pathsPart,
  };
}
|
|
35
|
+
|
|
36
|
+
export type {
|
|
37
|
+
AlignmentResult,
|
|
38
|
+
AlignmentTimestamp,
|
|
39
|
+
AlignmentDetectResult,
|
|
40
|
+
AlignmentModelType,
|
|
41
|
+
} from './types';
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/** A piece of text together with the time span it was aligned to. */
export interface AlignmentTimestamp {
  /** The aligned text. */
  text: string;
  /** Start of the span (presumably seconds from the start of the audio; confirm against the native implementation). */
  start: number;
  /** End of the span, in the same unit as `start`. */
  end: number;
}

/** Alignment output at two granularities. */
export interface AlignmentResult {
  /** One entry per aligned word. */
  words: AlignmentTimestamp[];
  /** One entry per aligned character. */
  chars: AlignmentTimestamp[];
}

/** Alignment model kinds that can be requested from model detection. */
export type AlignmentModelType = 'wav2vec2' | 'auto';

/** Result of probing a location for an alignment model. */
export interface AlignmentDetectResult {
  /** Whether detection succeeded. */
  success: boolean;
  /** Error description, when one was reported. */
  error?: string;
  /** Every model found, with its type and directory. */
  detectedModels: { type: string; modelDir: string }[];
  /** Type of the detected model, when known. */
  modelType?: string;
  /** Resolved file paths of the detected model. */
  paths?: {
    /** Path of the model file. */
    model?: string;
  };
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
 * Token-to-id vocabulary for the wav2vec2 alignment model.
 *
 * The tokens are listed in id order: the position of a token in the list is
 * its id (`<pad>` = 0 … `Z` = 31). Do not reorder or insert entries.
 */
export const WAV2VEC2_VOCAB: Record<string, number> = [
  // ids 0-4: special tokens and the word separator
  '<pad>',
  '<s>',
  '</s>',
  '<unk>',
  '|',
  // ids 5-31: letters and the apostrophe
  'E',
  'T',
  'A',
  'O',
  'N',
  'I',
  'H',
  'S',
  'R',
  'D',
  'L',
  'U',
  'W',
  'M',
  'C',
  'F',
  'G',
  'Y',
  'P',
  'B',
  'V',
  'K',
  "'",
  'X',
  'J',
  'Q',
  'Z',
].reduce<Record<string, number>>((vocab, token, id) => {
  vocab[token] = id;
  return vocab;
}, {});

/** Id of the CTC blank token; equals the id of `<pad>` in the vocabulary. */
export const WAV2VEC2_BLANK_ID = 0;

/** Id of the word-boundary token; equals the id of `|` in the vocabulary. */
export const WAV2VEC2_WORD_BOUNDARY_ID = 4;

/** Duration of one model output frame (presumably in seconds, per the `_S` suffix). */
export const WAV2VEC2_FRAME_DURATION_S = 0.02;
|
package/src/download/paths.ts
CHANGED
|
@@ -3,10 +3,14 @@ import { ModelCategory } from './types';
|
|
|
3
3
|
import type { ModelArchiveExt } from './types';
|
|
4
4
|
import { RELEASE_API_BASE } from './constants';
|
|
5
5
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
6
|
+
/** Per-category settings for locating and storing downloadable models. */
type CategoryConfig = {
  /** Release tag of the category; appended to the release API base to form the release URL. */
  tag: string;
  /** File name of the category's cache file (e.g. `tts-models.json`). */
  cacheFile: string;
  /** Directory holding the category's models. */
  baseDir: string;
  /** Release API base URL for this category; the shared default is used when omitted. */
  releaseApiBase?: string;
};
|
|
12
|
+
|
|
13
|
+
export const CATEGORY_CONFIG: Record<ModelCategory, CategoryConfig> = {
|
|
10
14
|
[ModelCategory.Tts]: {
|
|
11
15
|
tag: 'tts-models',
|
|
12
16
|
cacheFile: 'tts-models.json',
|
|
@@ -42,6 +46,13 @@ export const CATEGORY_CONFIG: Record<
|
|
|
42
46
|
cacheFile: 'qnn-models.json',
|
|
43
47
|
baseDir: `${DocumentDirectoryPath}/sherpa-onnx/models/qnn`,
|
|
44
48
|
},
|
|
49
|
+
[ModelCategory.Alignment]: {
|
|
50
|
+
tag: 'alignment-models',
|
|
51
|
+
cacheFile: 'alignment-models.json',
|
|
52
|
+
baseDir: `${DocumentDirectoryPath}/sherpa-onnx/models/alignment`,
|
|
53
|
+
releaseApiBase:
|
|
54
|
+
'https://api.github.com/repos/XDcobra/react-native-sherpa-onnx/releases/tags',
|
|
55
|
+
},
|
|
45
56
|
};
|
|
46
57
|
|
|
47
58
|
export function getCacheDir(): string {
|
|
@@ -131,5 +142,7 @@ export function getNativeAssetExtractedModelDir(modelId: string): string {
|
|
|
131
142
|
}
|
|
132
143
|
|
|
133
144
|
/**
 * Build the release API URL for a model category.
 *
 * Uses the category's own `releaseApiBase` when configured and falls back to
 * `RELEASE_API_BASE` otherwise; the category tag is appended after a slash.
 *
 * @param category - Model category to look up in `CATEGORY_CONFIG`.
 * @returns The release URL for the category's tag.
 */
export function getReleaseUrl(category: ModelCategory): string {
  const { tag, releaseApiBase } = CATEGORY_CONFIG[category];
  const apiBase = releaseApiBase ?? RELEASE_API_BASE;
  return apiBase + '/' + tag;
}
|
package/src/download/registry.ts
CHANGED
|
@@ -39,6 +39,26 @@ const checksumCacheByCategory: Partial<
|
|
|
39
39
|
Record<ModelCategory, Map<string, string>>
|
|
40
40
|
> = {};
|
|
41
41
|
|
|
42
|
+
/** GitHub `owner/repo` used when a category does not configure its own release API base. */
const DEFAULT_RELEASE_REPO = 'k2-fsa/sherpa-onnx';

/**
 * Determine the GitHub `owner/repo` that hosts a category's release assets.
 *
 * The repository is extracted from the category's `releaseApiBase` when that
 * value has the form `https://api.github.com/repos/<owner>/<repo>/releases/tags`
 * (optional trailing slash). When no base is configured, or it does not have
 * that form, `DEFAULT_RELEASE_REPO` is returned.
 *
 * @param category - Model category to look up in `CATEGORY_CONFIG`.
 * @returns The `owner/repo` slug.
 */
function getReleaseRepoFromConfig(category: ModelCategory): string {
  const { releaseApiBase } = CATEGORY_CONFIG[category];
  if (!releaseApiBase) {
    return DEFAULT_RELEASE_REPO;
  }

  const repoPattern =
    /^https:\/\/api\.github\.com\/repos\/([^/]+\/[^/]+)\/releases\/tags\/?$/;
  const captured = repoPattern.exec(releaseApiBase);
  const repo = captured === null ? undefined : captured[1];
  return repo ?? DEFAULT_RELEASE_REPO;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Build the download URL of `checksum.txt` for a model category's release.
 *
 * @param category - Model category whose release tag and repository are used.
 * @returns URL of the checksum file on github.com.
 */
function getChecksumUrl(category: ModelCategory): string {
  const repo = getReleaseRepoFromConfig(category);
  const { tag } = CATEGORY_CONFIG[category];
  return `https://github.com/${repo}/releases/download/${tag}/checksum.txt`;
}
|
|
61
|
+
|
|
42
62
|
export async function fetchChecksumsFromRelease(
|
|
43
63
|
category: ModelCategory
|
|
44
64
|
): Promise<Map<string, string>> {
|
|
@@ -51,9 +71,7 @@ export async function fetchChecksumsFromRelease(
|
|
|
51
71
|
try {
|
|
52
72
|
const checksums = await retryWithBackoff(
|
|
53
73
|
async () => {
|
|
54
|
-
const response = await fetch(
|
|
55
|
-
`https://github.com/k2-fsa/sherpa-onnx/releases/download/${CATEGORY_CONFIG[category].tag}/checksum.txt`
|
|
56
|
-
);
|
|
74
|
+
const response = await fetch(getChecksumUrl(category));
|
|
57
75
|
if (!response.ok) {
|
|
58
76
|
throw new Error(
|
|
59
77
|
`Failed to fetch checksum.txt for ${category}: ${response.status}`
|
|
@@ -176,6 +194,8 @@ function isAssetSupportedForCategory(
|
|
|
176
194
|
lower.includes('binary') &&
|
|
177
195
|
lower.includes('seconds')
|
|
178
196
|
);
|
|
197
|
+
case ModelCategory.Alignment:
|
|
198
|
+
return ext === 'tar.bz2';
|
|
179
199
|
default:
|
|
180
200
|
return false;
|
|
181
201
|
}
|
package/src/download/types.ts
CHANGED
package/src/index.tsx
CHANGED
|
@@ -17,17 +17,19 @@ export {
|
|
|
17
17
|
} from './utils';
|
|
18
18
|
|
|
19
19
|
export { copyFileToContentUri } from './tts';
|
|
20
|
+
export * from './alignment';
|
|
20
21
|
|
|
21
22
|
export { getModelLicenses, type ModelLicense } from './licenses';
|
|
22
23
|
// Note: Feature-specific exports are available via subpath imports:
|
|
23
24
|
// - import { createSTT, createStreamingSTT, ... } from 'react-native-sherpa-onnx/stt'
|
|
24
25
|
// - import { createTTS, ... } from 'react-native-sherpa-onnx/tts'
|
|
26
|
+
// - import { detectAlignmentModel, ... } from 'react-native-sherpa-onnx/alignment'
|
|
25
27
|
// - import { ... } from 'react-native-sherpa-onnx/download'
|
|
26
28
|
// - import { getBundledArchives, listBundledArchives, extractArchive } from 'react-native-sherpa-onnx/extraction'
|
|
27
|
-
// - import { ... } from 'react-native-sherpa-onnx/vad'
|
|
28
|
-
// - import { ... } from 'react-native-sherpa-onnx/diarization'
|
|
29
|
-
// - import { ... } from 'react-native-sherpa-onnx/enhancement'
|
|
30
|
-
// - import { ... } from 'react-native-sherpa-onnx/separation'
|
|
29
|
+
// - import { ... } from 'react-native-sherpa-onnx/vad'
|
|
30
|
+
// - import { ... } from 'react-native-sherpa-onnx/diarization'
|
|
31
|
+
// - import { ... } from 'react-native-sherpa-onnx/enhancement'
|
|
32
|
+
// - import { ... } from 'react-native-sherpa-onnx/separation'
|
|
31
33
|
|
|
32
34
|
/**
|
|
33
35
|
* Test method to verify sherpa-onnx native library is loaded.
|
package/src/licenses.ts
CHANGED
|
@@ -13,6 +13,7 @@ export async function getModelLicenses(): Promise<ModelLicense[]> {
|
|
|
13
13
|
const asrPath = 'model_licenses/asr-models-license-status.csv';
|
|
14
14
|
const qnnPath = 'model_licenses/qnn-asr-models-license-status.csv';
|
|
15
15
|
const ttsPath = 'model_licenses/tts-models-license-status.csv';
|
|
16
|
+
const alignmentPath = 'model_licenses/alignment-models-license-status.csv';
|
|
16
17
|
const speechEnhancementPath =
|
|
17
18
|
'model_licenses/speech-enhancement-models-license-status.csv';
|
|
18
19
|
|
|
@@ -20,10 +21,12 @@ export async function getModelLicenses(): Promise<ModelLicense[]> {
|
|
|
20
21
|
SherpaOnnx.readAssetFileAsUtf8(asrPath),
|
|
21
22
|
SherpaOnnx.readAssetFileAsUtf8(qnnPath),
|
|
22
23
|
SherpaOnnx.readAssetFileAsUtf8(ttsPath),
|
|
24
|
+
SherpaOnnx.readAssetFileAsUtf8(alignmentPath),
|
|
23
25
|
SherpaOnnx.readAssetFileAsUtf8(speechEnhancementPath),
|
|
24
26
|
]);
|
|
25
27
|
|
|
26
|
-
const [asrResult, qnnResult, ttsResult, enhancementResult] =
|
|
28
|
+
const [asrResult, qnnResult, ttsResult, alignmentResult, enhancementResult] =
|
|
29
|
+
results;
|
|
27
30
|
|
|
28
31
|
const licenses: ModelLicense[] = [];
|
|
29
32
|
|
|
@@ -51,6 +54,14 @@ export async function getModelLicenses(): Promise<ModelLicense[]> {
|
|
|
51
54
|
);
|
|
52
55
|
}
|
|
53
56
|
|
|
57
|
+
if (alignmentResult.status === 'fulfilled') {
|
|
58
|
+
licenses.push(...parseCsv(alignmentResult.value));
|
|
59
|
+
} else {
|
|
60
|
+
console.warn(
|
|
61
|
+
`[SherpaOnnx] Failed to load alignment model licenses: ${alignmentResult.reason}`
|
|
62
|
+
);
|
|
63
|
+
}
|
|
64
|
+
|
|
54
65
|
if (enhancementResult.status === 'fulfilled') {
|
|
55
66
|
licenses.push(...parseCsv(enhancementResult.value));
|
|
56
67
|
} else {
|
package/src/stt/types.ts
CHANGED
|
@@ -80,9 +80,12 @@ export interface SttWhisperModelOptions {
|
|
|
80
80
|
task?: 'transcribe' | 'translate';
|
|
81
81
|
/** Padding at end of samples. Kotlin default 1000; C++ default -1. */
|
|
82
82
|
tailPaddings?: number;
|
|
83
|
-
/**
|
|
83
|
+
/**
|
|
84
|
+
* Token-level timestamps (cross-attention / DTW). Requires Whisper ONNX models
|
|
85
|
+
* built with attention outputs (see sherpa-onnx).
|
|
86
|
+
*/
|
|
84
87
|
enableTokenTimestamps?: boolean;
|
|
85
|
-
/** Segment-level timestamps
|
|
88
|
+
/** Segment-level timestamps via Whisper timestamp tokens. */
|
|
86
89
|
enableSegmentTimestamps?: boolean;
|
|
87
90
|
}
|
|
88
91
|
|
package/src/tts/index.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { unlink } from '@dr.pogodin/react-native-fs';
|
|
1
2
|
import SherpaOnnx from '../NativeSherpaOnnx';
|
|
2
3
|
import type {
|
|
3
4
|
TTSInitializeOptions,
|
|
@@ -12,6 +13,11 @@ import type {
|
|
|
12
13
|
} from './types';
|
|
13
14
|
import type { ModelPathConfig } from '../types';
|
|
14
15
|
import { resolveModelPath } from '../utils';
|
|
16
|
+
import {
|
|
17
|
+
assertSubtitleGranularityForMode,
|
|
18
|
+
generateSubtitlesFromAudio,
|
|
19
|
+
} from './subtitles';
|
|
20
|
+
import { saveAlignmentAudioToTempWav } from './tempAudio';
|
|
15
21
|
|
|
16
22
|
let ttsInstanceCounter = 0;
|
|
17
23
|
|
|
@@ -159,6 +165,12 @@ function toNativeTtsOptions(
|
|
|
159
165
|
if (options.numSteps !== undefined) out.numSteps = options.numSteps;
|
|
160
166
|
if (options.extra != null && Object.keys(options.extra).length > 0)
|
|
161
167
|
out.extra = options.extra;
|
|
168
|
+
if (options.subtitles?.mode !== undefined) {
|
|
169
|
+
out.subtitleMode = options.subtitles.mode;
|
|
170
|
+
}
|
|
171
|
+
if (options.subtitles?.granularity !== undefined) {
|
|
172
|
+
out.subtitleGranularity = options.subtitles.granularity;
|
|
173
|
+
}
|
|
162
174
|
return out;
|
|
163
175
|
}
|
|
164
176
|
|
|
@@ -276,7 +288,18 @@ export async function createTTS(
|
|
|
276
288
|
opts?: TtsGenerationOptions
|
|
277
289
|
): Promise<GeneratedAudio> {
|
|
278
290
|
guard();
|
|
279
|
-
|
|
291
|
+
const optionsWithSubtitlesOff: TtsGenerationOptions = {
|
|
292
|
+
...(opts ?? {}),
|
|
293
|
+
subtitles: {
|
|
294
|
+
...(opts?.subtitles ?? {}),
|
|
295
|
+
mode: 'off',
|
|
296
|
+
},
|
|
297
|
+
};
|
|
298
|
+
return SherpaOnnx.generateTts(
|
|
299
|
+
instanceId,
|
|
300
|
+
text,
|
|
301
|
+
toNativeTtsOptions(optionsWithSubtitlesOff)
|
|
302
|
+
);
|
|
280
303
|
},
|
|
281
304
|
|
|
282
305
|
async generateSpeechWithTimestamps(
|
|
@@ -284,11 +307,89 @@ export async function createTTS(
|
|
|
284
307
|
opts?: TtsGenerationOptions
|
|
285
308
|
): Promise<GeneratedAudioWithTimestamps> {
|
|
286
309
|
guard();
|
|
287
|
-
|
|
310
|
+
const subtitleMode = opts?.subtitles?.mode ?? 'fast';
|
|
311
|
+
const subtitleGranularity = opts?.subtitles?.granularity ?? 'sentence';
|
|
312
|
+
|
|
313
|
+
assertSubtitleGranularityForMode(subtitleMode, subtitleGranularity);
|
|
314
|
+
|
|
315
|
+
if (subtitleMode !== 'accurate') {
|
|
316
|
+
const optionsWithDefaultSubtitleMode: TtsGenerationOptions = {
|
|
317
|
+
...(opts ?? {}),
|
|
318
|
+
subtitles: {
|
|
319
|
+
...(opts?.subtitles ?? {}),
|
|
320
|
+
mode: subtitleMode,
|
|
321
|
+
},
|
|
322
|
+
};
|
|
323
|
+
|
|
324
|
+
const native = await SherpaOnnx.generateTtsWithTimestamps(
|
|
325
|
+
instanceId,
|
|
326
|
+
text,
|
|
327
|
+
toNativeTtsOptions(optionsWithDefaultSubtitleMode)
|
|
328
|
+
);
|
|
329
|
+
|
|
330
|
+
const timingMode =
|
|
331
|
+
native.timingMode === 'off' ||
|
|
332
|
+
native.timingMode === 'estimated' ||
|
|
333
|
+
native.timingMode === 'aligned'
|
|
334
|
+
? native.timingMode
|
|
335
|
+
: 'off';
|
|
336
|
+
|
|
337
|
+
return {
|
|
338
|
+
...native,
|
|
339
|
+
timingMode,
|
|
340
|
+
};
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
const alignmentModelPath = opts?.subtitles?.alignmentModelPath?.trim();
|
|
344
|
+
|
|
345
|
+
if (!alignmentModelPath) {
|
|
346
|
+
throw new Error(
|
|
347
|
+
'ALIGNMENT_MODEL_MISSING: Provide subtitles.alignmentModelPath for accurate mode.'
|
|
348
|
+
);
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
const optionsWithSubtitlesOff: TtsGenerationOptions = {
|
|
352
|
+
...(opts ?? {}),
|
|
353
|
+
subtitles: {
|
|
354
|
+
...(opts?.subtitles ?? {}),
|
|
355
|
+
mode: 'off',
|
|
356
|
+
},
|
|
357
|
+
};
|
|
358
|
+
|
|
359
|
+
const generated = await SherpaOnnx.generateTts(
|
|
288
360
|
instanceId,
|
|
289
361
|
text,
|
|
290
|
-
toNativeTtsOptions(
|
|
362
|
+
toNativeTtsOptions(optionsWithSubtitlesOff)
|
|
291
363
|
);
|
|
364
|
+
|
|
365
|
+
let tempAudioPath: string | null = null;
|
|
366
|
+
try {
|
|
367
|
+
tempAudioPath = await saveAlignmentAudioToTempWav(
|
|
368
|
+
generated,
|
|
369
|
+
instanceId
|
|
370
|
+
);
|
|
371
|
+
const subtitleResult = await generateSubtitlesFromAudio(
|
|
372
|
+
text,
|
|
373
|
+
tempAudioPath,
|
|
374
|
+
{
|
|
375
|
+
mode: 'accurate',
|
|
376
|
+
granularity: subtitleGranularity,
|
|
377
|
+
alignmentModelPath,
|
|
378
|
+
}
|
|
379
|
+
);
|
|
380
|
+
|
|
381
|
+
return {
|
|
382
|
+
...generated,
|
|
383
|
+
subtitles: subtitleResult.subtitles,
|
|
384
|
+
timingMode: subtitleResult.timingMode,
|
|
385
|
+
};
|
|
386
|
+
} finally {
|
|
387
|
+
if (tempAudioPath) {
|
|
388
|
+
unlink(tempAudioPath).catch(() => {
|
|
389
|
+
// ignore cleanup errors
|
|
390
|
+
});
|
|
391
|
+
}
|
|
392
|
+
}
|
|
292
393
|
},
|
|
293
394
|
|
|
294
395
|
async updateParams(opts: TtsUpdateOptions): Promise<{
|
|
@@ -437,6 +538,7 @@ export function shareAudioFile(
|
|
|
437
538
|
// Streaming TTS (separate engine; use createStreamingTTS for chunk callbacks and PCM playback)
|
|
438
539
|
export { createStreamingTTS } from './streaming';
|
|
439
540
|
export type { StreamingTtsEngine } from './streamingTypes';
|
|
541
|
+
export { generateSubtitlesFromAudio } from './subtitles';
|
|
440
542
|
|
|
441
543
|
// Export types and runtime type list
|
|
442
544
|
export type {
|
|
@@ -451,6 +553,11 @@ export type {
|
|
|
451
553
|
TtsSupertonicModelOptions,
|
|
452
554
|
TtsUpdateOptions,
|
|
453
555
|
TtsGenerationOptions,
|
|
556
|
+
SubtitleMode,
|
|
557
|
+
SubtitleGranularity,
|
|
558
|
+
SubtitleOptions,
|
|
559
|
+
SubtitleFromAudioOptions,
|
|
560
|
+
SubtitleResult,
|
|
454
561
|
GeneratedAudio,
|
|
455
562
|
GeneratedAudioWithTimestamps,
|
|
456
563
|
TtsSubtitleItem,
|