react-native-sherpa-onnx 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +3 -0
  2. package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
  3. package/android/src/main/cpp/CMakeLists.txt +3 -0
  4. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
  5. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
  6. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
  7. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  10. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
  11. package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
  12. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
  13. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
  14. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +180 -23
  15. package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
  16. package/ios/SherpaOnnx+Alignment.mm +704 -0
  17. package/ios/SherpaOnnx+STT.mm +6 -0
  18. package/ios/SherpaOnnx+TTS.mm +624 -50
  19. package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
  20. package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
  21. package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  22. package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
  23. package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
  24. package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
  25. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  26. package/lib/module/alignment/index.js +27 -0
  27. package/lib/module/alignment/index.js.map +1 -0
  28. package/lib/module/alignment/types.js +2 -0
  29. package/lib/module/alignment/types.js.map +1 -0
  30. package/lib/module/alignment/vocab.js +40 -0
  31. package/lib/module/alignment/vocab.js.map +1 -0
  32. package/lib/module/download/paths.js +9 -1
  33. package/lib/module/download/paths.js.map +1 -1
  34. package/lib/module/download/registry.js +17 -1
  35. package/lib/module/download/registry.js.map +1 -1
  36. package/lib/module/download/types.js +1 -0
  37. package/lib/module/download/types.js.map +1 -1
  38. package/lib/module/index.js +6 -4
  39. package/lib/module/index.js.map +1 -1
  40. package/lib/module/licenses.js +8 -2
  41. package/lib/module/licenses.js.map +1 -1
  42. package/lib/module/stt/types.js.map +1 -1
  43. package/lib/module/tts/index.js +68 -2
  44. package/lib/module/tts/index.js.map +1 -1
  45. package/lib/module/tts/subtitles.js +400 -0
  46. package/lib/module/tts/subtitles.js.map +1 -0
  47. package/lib/module/tts/tempAudio.js +17 -0
  48. package/lib/module/tts/tempAudio.js.map +1 -0
  49. package/lib/module/tts/types.js.map +1 -1
  50. package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
  51. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  52. package/lib/typescript/src/alignment/index.d.ts +8 -0
  53. package/lib/typescript/src/alignment/index.d.ts.map +1 -0
  54. package/lib/typescript/src/alignment/types.d.ts +23 -0
  55. package/lib/typescript/src/alignment/types.d.ts.map +1 -0
  56. package/lib/typescript/src/alignment/vocab.d.ts +5 -0
  57. package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
  58. package/lib/typescript/src/download/paths.d.ts +5 -2
  59. package/lib/typescript/src/download/paths.d.ts.map +1 -1
  60. package/lib/typescript/src/download/registry.d.ts.map +1 -1
  61. package/lib/typescript/src/download/types.d.ts +2 -1
  62. package/lib/typescript/src/download/types.d.ts.map +1 -1
  63. package/lib/typescript/src/index.d.ts +1 -0
  64. package/lib/typescript/src/index.d.ts.map +1 -1
  65. package/lib/typescript/src/licenses.d.ts.map +1 -1
  66. package/lib/typescript/src/stt/types.d.ts +5 -2
  67. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  68. package/lib/typescript/src/tts/index.d.ts +2 -1
  69. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  70. package/lib/typescript/src/tts/subtitles.d.ts +24 -0
  71. package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
  72. package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
  73. package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
  74. package/lib/typescript/src/tts/types.d.ts +68 -2
  75. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  76. package/package.json +6 -1
  77. package/scripts/alignment-models/README.md +90 -0
  78. package/scripts/alignment-models/build_and_upload.js +724 -0
  79. package/scripts/alignment-models/sources.csv +5 -0
  80. package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
  81. package/src/NativeSherpaOnnx.ts +35 -3
  82. package/src/alignment/index.ts +41 -0
  83. package/src/alignment/types.ts +22 -0
  84. package/src/alignment/vocab.ts +38 -0
  85. package/src/download/paths.ts +18 -5
  86. package/src/download/registry.ts +23 -3
  87. package/src/download/types.ts +1 -0
  88. package/src/index.tsx +6 -4
  89. package/src/licenses.ts +12 -1
  90. package/src/stt/types.ts +5 -2
  91. package/src/tts/index.ts +110 -3
  92. package/src/tts/subtitles.ts +611 -0
  93. package/src/tts/tempAudio.ts +31 -0
  94. package/src/tts/types.ts +79 -2
  95. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
@@ -0,0 +1,5 @@
1
+ id;onnx_url;license;license_type;commercial_use
2
+ wav2vec2-base-960h-int8;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model_int8.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
3
+ wav2vec2-base-960h-fp16;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model_fp16.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
4
+ wav2vec2-base-960h;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
5
+ wav2vec2-base-960h-q4f16;https://huggingface.co/onnx-community/wav2vec2-base-960h-ONNX/resolve/main/onnx/model_q4f16.onnx;https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md;apache-2.0;yes
@@ -0,0 +1,123 @@
1
+ #!/usr/bin/env node
2
+
3
+ 'use strict';
4
+
5
+ const path = require('node:path');
6
+ const fsp = require('node:fs/promises');
7
+ const { readSources } = require('./build_and_upload.js');
8
+
9
// Repository root: two directory levels above the directory containing this script.
const REPO_ROOT = path.join(__dirname, '..', '..');
// Source list used when no CSV path is passed on the command line.
const DEFAULT_CSV = path.join(__dirname, 'sources.csv');
// License-status CSV files that this script rewrites (Android assets and iOS resources).
const TARGETS = [
  path.join(
    REPO_ROOT,
    'android/src/main/assets/model_licenses/alignment-models-license-status.csv'
  ),
  path.join(
    REPO_ROOT,
    'ios/Resources/model_licenses/alignment-models-license-status.csv'
  ),
];

// Header row written as the first line of every target file (comma-separated).
const HEADER =
  'asset_name,license_type,commercial_use,confidence,detection_source,license_file';
24
+
25
/**
 * Split one comma-separated line into its fields.
 *
 * A double quote switches quoted mode on or off; inside quoted mode a comma
 * is ordinary text and a doubled quote ("") yields one literal quote.
 * The quote characters themselves are not part of the returned fields.
 *
 * @param line - A single CSV line without its line terminator.
 * @returns The field values in order; an empty line yields [''].
 */
function parseCsvLine(line) {
  const fields = [];
  let field = '';
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];
    pos += 1;

    if (!quoted) {
      if (ch === ',') {
        // Field separator: close the current field and start a new one.
        fields.push(field);
        field = '';
      } else if (ch === '"') {
        quoted = true;
      } else {
        field += ch;
      }
      continue;
    }

    if (ch !== '"') {
      field += ch;
      continue;
    }

    // A quote inside quoted mode: either an escaped quote ("") or the closing quote.
    if (line[pos] === '"') {
      field += '"';
      pos += 1;
    } else {
      quoted = false;
    }
  }

  // The last field has no trailing comma, so flush it explicitly.
  fields.push(field);
  return fields;
}
54
+
55
/**
 * Render a value as one CSV field.
 *
 * null and undefined become the empty string; everything else is converted
 * with String(). A field containing a double quote, comma, line feed or
 * carriage return is wrapped in double quotes with inner quotes doubled.
 *
 * @param value - Any value to serialise.
 * @returns The field text, quoted only when necessary.
 */
function escapeCsvField(value) {
  if (value == null) {
    return '';
  }
  const text = String(value);
  const needsQuoting = /[",\n\r]/.test(text);
  if (!needsQuoting) {
    return text;
  }
  const doubled = text.replace(/"/g, '""');
  return `"${doubled}"`;
}
62
+
63
/**
 * Build the CSV data row for one source entry, in HEADER column order:
 * asset name, license type, commercial use, confidence, detection source,
 * license file.
 *
 * @param source - Entry providing modelId, licenseType, commercialUse and
 *   (optionally) licenseUrl.
 * @returns One comma-joined line with every field CSV-escaped.
 */
function rowFromSource(source) {
  const assetName = `${source.modelId}.tar.bz2`;
  const columns = [
    assetName,
    source.licenseType,
    source.commercialUse,
    // Confidence and detection source are fixed for these entries.
    'high',
    'manual',
    source.licenseUrl || '',
  ];

  const escaped = [];
  for (const column of columns) {
    escaped.push(escapeCsvField(column));
  }
  return escaped.join(',');
}
75
+
76
/**
 * Load the data rows of an existing license-status CSV.
 *
 * The first non-empty line is treated as the header and skipped. Every other
 * non-empty line is stored unparsed, keyed by its first column, so rows that
 * are not regenerated can be written back exactly as they were.
 *
 * @param filePath - Path of the CSV file to read.
 * @returns Map from first-column value to the raw row text; empty when the
 *   file does not exist or has no data rows.
 * @throws Any read error other than "file not found" (ENOENT). Treating such
 *   errors as an empty file would make the caller overwrite the target and
 *   silently drop its existing rows.
 */
async function readExistingRows(filePath) {
  let text;
  try {
    text = await fsp.readFile(filePath, 'utf8');
  } catch (err) {
    // A missing file simply means there is nothing to merge yet.
    if (err && err.code === 'ENOENT') {
      return new Map();
    }
    throw err;
  }

  // Accept both LF and CRLF so rows never carry a stray trailing '\r'.
  const lines = text.split(/\r?\n/).filter((line) => line.length > 0);

  const rows = new Map();
  // Index 0 is the header row.
  for (const line of lines.slice(1)) {
    const [assetName] = parseCsvLine(line);
    if (assetName) {
      rows.set(assetName, line);
    }
  }
  return rows;
}
96
+
97
/**
 * Merge generated rows into one target CSV and rewrite the file.
 *
 * Rows already in the file are kept unless a source has the same asset name
 * (`<modelId>.tar.bz2`), in which case the generated row replaces it. Rows are
 * written sorted by asset name using localeCompare, after the HEADER line.
 * The parent directory is created if it does not exist.
 *
 * @param filePath - Target CSV path.
 * @param sources - Iterable of source entries to turn into rows.
 */
async function writeMerged(filePath, sources) {
  const rowsByAsset = await readExistingRows(filePath);
  for (const source of sources) {
    const assetName = `${source.modelId}.tar.bz2`;
    rowsByAsset.set(assetName, rowFromSource(source));
  }

  const sortedRows = [...rowsByAsset.entries()]
    .sort(([left], [right]) => left.localeCompare(right))
    .map(([, row]) => row);

  const contents = `${HEADER}\n${sortedRows.join('\n')}\n`;
  const targetDir = path.dirname(filePath);
  await fsp.mkdir(targetDir, { recursive: true });
  await fsp.writeFile(filePath, contents, 'utf8');
}
108
+
109
/**
 * Script body: read the sources CSV (first CLI argument, resolved against the
 * current working directory, or DEFAULT_CSV when omitted) and rewrite each
 * file in TARGETS in turn, logging its repo-relative path once written.
 */
async function main() {
  const cliArg = process.argv[2];
  let csvPath = DEFAULT_CSV;
  if (cliArg) {
    csvPath = path.resolve(process.cwd(), cliArg);
  }

  const sources = await readSources(csvPath);

  // Targets are processed one after another, not in parallel.
  for (const targetFile of TARGETS) {
    await writeMerged(targetFile, sources);
    const shown = path.relative(REPO_ROOT, targetFile);
    console.log(`[sync] ${shown}`);
  }
}
119
+
120
// Entry point: run main(); on rejection print the error message (or the raw
// rejected value when it is not an Error) and exit with status 1.
main().catch((e) => {
  console.error(e instanceof Error ? e.message : e);
  process.exit(1);
});
@@ -341,8 +341,8 @@ export interface Spec extends TurboModule {
341
341
  * Generate speech with subtitle/timestamp metadata.
342
342
  * @param instanceId - Unique ID for this engine instance
343
343
  * @param text - Text to convert to speech
344
- * @param options - Same as {@link generateTts} options (cloning: Zipvoice/Pocket; Zipvoice needs `referenceText`).
345
- * @returns Object with samples, sampleRate, subtitles, and estimated flag
344
+ * @param options - Same as {@link generateTts} options plus subtitle options (`subtitleMode`, `subtitleGranularity`).
345
+ * @returns Object with samples, sampleRate, subtitles, and timingMode
346
346
  */
347
347
  generateTtsWithTimestamps(
348
348
  instanceId: string,
@@ -352,7 +352,39 @@ export interface Spec extends TurboModule {
352
352
  samples: number[];
353
353
  sampleRate: number;
354
354
  subtitles: Array<{ text: string; start: number; end: number }>;
355
- estimated: boolean;
355
+ timingMode: string;
356
+ }>;
357
+
358
+ // ==================== Alignment / Subtitle Methods ====================
359
+
360
/**
 * Run wav2vec2 CTC forced alignment on an audio file and transcript.
 * @param modelPath - Absolute path to wav2vec2 ONNX model file
 * @param audioPath - Absolute path to input audio file (WAV recommended)
 * @param text - Transcript to align
 * @param vocabJson - JSON map of token -> id (stringified to reduce bridge overhead)
 * @returns Object with `words` and `chars` arrays; each entry carries its
 *   `text` plus numeric `start` and `end` positions.
 *   NOTE(review): the time unit of `start`/`end` is not stated here —
 *   presumably seconds; confirm against the native implementation.
 */
runCTCForcedAlignment(
  modelPath: string,
  audioPath: string,
  text: string,
  vocabJson: string
): Promise<{
  words: Array<{ text: string; start: number; end: number }>;
  chars: Array<{ text: string; start: number; end: number }>;
}>;
376
+
377
/**
 * Detect an alignment model in a directory.
 * @param modelDir - Directory to inspect
 * @param modelType - Optional model type to look for
 * @returns Object with a `success` flag, an optional `error` message, the list
 *   of `detectedModels` (type and directory of each), and optionally the
 *   selected `modelType` and the model file path under `paths.model`.
 */
detectAlignmentModel(
  modelDir: string,
  modelType?: string
): Promise<{
  success: boolean;
  error?: string;
  detectedModels: Array<{ type: string; modelDir: string }>;
  modelType?: string;
  paths?: {
    model?: string;
  };
}>;
357
389
 
358
390
  // ==================== Online (streaming) TTS Methods ====================
@@ -0,0 +1,41 @@
1
+ import SherpaOnnx from '../NativeSherpaOnnx';
2
+ import type { ModelPathConfig } from '../types';
3
+ import { resolveModelPath } from '../utils';
4
+ import type { AlignmentDetectResult, AlignmentModelType } from './types';
5
+
6
+ export {
7
+ WAV2VEC2_BLANK_ID,
8
+ WAV2VEC2_FRAME_DURATION_S,
9
+ WAV2VEC2_VOCAB,
10
+ WAV2VEC2_WORD_BOUNDARY_ID,
11
+ } from './vocab';
12
+
13
/**
 * Resolve a model path config and ask the native module to detect an
 * alignment model there, returning a normalised result.
 *
 * Normalisation: `error` is included only when it is a non-blank string
 * (trimmed); `detectedModels` defaults to an empty array; `modelType` is
 * included only when non-null and non-empty; `paths.model` is included only
 * when it is a non-blank string (trimmed).
 *
 * @param modelPath - Model location, resolved with `resolveModelPath`.
 * @param options - Optional `modelType` hint forwarded to the native call.
 * @returns The normalised detection result.
 */
export async function detectAlignmentModel(
  modelPath: ModelPathConfig,
  options?: { modelType?: AlignmentModelType }
): Promise<AlignmentDetectResult> {
  const modelDir = await resolveModelPath(modelPath);
  const native = await SherpaOnnx.detectAlignmentModel(
    modelDir,
    options?.modelType
  );

  const errorText =
    typeof native.error === 'string' ? native.error.trim() : '';
  const rawModelFile = native.paths?.model;
  const modelFile =
    typeof rawModelFile === 'string' ? rawModelFile.trim() : '';

  // Optional keys are spread in so that absent values leave no key behind.
  const errorPart = errorText.length > 0 ? { error: errorText } : {};
  const pathsPart = modelFile.length > 0 ? { paths: { model: modelFile } } : {};

  return {
    success: native.success,
    ...errorPart,
    detectedModels: native.detectedModels ?? [],
    ...(native.modelType != null && native.modelType !== ''
      ? { modelType: native.modelType }
      : {}),
    ...pathsPart,
  };
}
35
+
36
+ export type {
37
+ AlignmentResult,
38
+ AlignmentTimestamp,
39
+ AlignmentDetectResult,
40
+ AlignmentModelType,
41
+ } from './types';
@@ -0,0 +1,22 @@
1
/** One aligned text unit together with its start and end positions. */
export interface AlignmentTimestamp {
  /** The aligned text. */
  text: string;
  /** Start position. NOTE(review): unit not stated here — presumably seconds; confirm. */
  start: number;
  /** End position, in the same unit as `start`. */
  end: number;
}
6
+
7
/** Alignment output at two granularities. */
export interface AlignmentResult {
  /** Word-level entries. */
  words: AlignmentTimestamp[];
  /** Character-level entries. */
  chars: AlignmentTimestamp[];
}
11
+
12
/** Alignment model kinds accepted as a detection hint: a concrete type or 'auto'. */
export type AlignmentModelType = 'wav2vec2' | 'auto';
13
+
14
/** Result of an alignment-model detection call. */
export interface AlignmentDetectResult {
  /** Success flag reported by the detection call. */
  success: boolean;
  /** Error message, when one was reported. */
  error?: string;
  /** Models found, each with its type and directory. */
  detectedModels: Array<{ type: string; modelDir: string }>;
  /** Model type reported by detection, when available. */
  modelType?: string;
  /** File paths reported by detection, when available. */
  paths?: {
    /** Path of the model file. */
    model?: string;
  };
}
@@ -0,0 +1,38 @@
1
/**
 * Token -> id map for wav2vec2 alignment: four special tokens (ids 0-3), the
 * '|' token (id 4), the 26 upper-case Latin letters and the apostrophe.
 * NOTE(review): presumably mirrors the vocabulary of the wav2vec2-base-960h
 * models — confirm against the model's vocab file.
 */
export const WAV2VEC2_VOCAB: Record<string, number> = {
  '<pad>': 0,
  '<s>': 1,
  '</s>': 2,
  '<unk>': 3,
  '|': 4,
  'E': 5,
  'T': 6,
  'A': 7,
  'O': 8,
  'N': 9,
  'I': 10,
  'H': 11,
  'S': 12,
  'R': 13,
  'D': 14,
  'L': 15,
  'U': 16,
  'W': 17,
  'M': 18,
  'C': 19,
  'F': 20,
  'G': 21,
  'Y': 22,
  'P': 23,
  'B': 24,
  'V': 25,
  'K': 26,
  "'": 27,
  'X': 28,
  'J': 29,
  'Q': 30,
  'Z': 31,
};
35
+
36
/** Blank token id; same value as the '<pad>' entry of WAV2VEC2_VOCAB. */
export const WAV2VEC2_BLANK_ID = 0;
/** Word-boundary token id; same value as the '|' entry of WAV2VEC2_VOCAB. */
export const WAV2VEC2_WORD_BOUNDARY_ID = 4;
/** Frame duration. NOTE(review): presumably seconds per model output frame (0.02 = 20 ms) — confirm. */
export const WAV2VEC2_FRAME_DURATION_S = 0.02;
@@ -3,10 +3,14 @@ import { ModelCategory } from './types';
3
3
  import type { ModelArchiveExt } from './types';
4
4
  import { RELEASE_API_BASE } from './constants';
5
5
 
6
- export const CATEGORY_CONFIG: Record<
7
- ModelCategory,
8
- { tag: string; cacheFile: string; baseDir: string }
9
- > = {
6
/** Per-category download settings stored in CATEGORY_CONFIG. */
type CategoryConfig = {
  /** Release tag; appended to the release API base by getReleaseUrl. */
  tag: string;
  /** File name of the category's cache file. */
  cacheFile: string;
  /** Base directory for the category's models. */
  baseDir: string;
  /** Optional release API base; getReleaseUrl falls back to RELEASE_API_BASE when unset. */
  releaseApiBase?: string;
};
12
+
13
+ export const CATEGORY_CONFIG: Record<ModelCategory, CategoryConfig> = {
10
14
  [ModelCategory.Tts]: {
11
15
  tag: 'tts-models',
12
16
  cacheFile: 'tts-models.json',
@@ -42,6 +46,13 @@ export const CATEGORY_CONFIG: Record<
42
46
  cacheFile: 'qnn-models.json',
43
47
  baseDir: `${DocumentDirectoryPath}/sherpa-onnx/models/qnn`,
44
48
  },
49
+ [ModelCategory.Alignment]: {
50
+ tag: 'alignment-models',
51
+ cacheFile: 'alignment-models.json',
52
+ baseDir: `${DocumentDirectoryPath}/sherpa-onnx/models/alignment`,
53
+ releaseApiBase:
54
+ 'https://api.github.com/repos/XDcobra/react-native-sherpa-onnx/releases/tags',
55
+ },
45
56
  };
46
57
 
47
58
  export function getCacheDir(): string {
@@ -131,5 +142,7 @@ export function getNativeAssetExtractedModelDir(modelId: string): string {
131
142
  }
132
143
 
133
144
/**
 * Build the release API URL for a model category.
 *
 * Uses the category's own `releaseApiBase` when configured, otherwise the
 * shared RELEASE_API_BASE, and appends the category's release tag.
 *
 * @param category - Model category to look up in CATEGORY_CONFIG.
 * @returns `<base>/<tag>`.
 */
export function getReleaseUrl(category: ModelCategory): string {
  const { tag, releaseApiBase } = CATEGORY_CONFIG[category];
  const base = releaseApiBase ?? RELEASE_API_BASE;
  return `${base}/${tag}`;
}
@@ -39,6 +39,26 @@ const checksumCacheByCategory: Partial<
39
39
  Record<ModelCategory, Map<string, string>>
40
40
  > = {};
41
41
 
42
/** GitHub `owner/repo` used when a category has no (or an unrecognised) `releaseApiBase`. */
const DEFAULT_RELEASE_REPO = 'k2-fsa/sherpa-onnx';
43
+
44
/**
 * Determine the GitHub `owner/repo` that hosts a category's release assets.
 *
 * The repo is extracted from the category's `releaseApiBase` when that has the
 * form `https://api.github.com/repos/<owner>/<repo>/releases/tags` (optional
 * trailing slash). With no override, or an override of another shape,
 * DEFAULT_RELEASE_REPO is returned.
 *
 * @param category - Model category to look up in CATEGORY_CONFIG.
 * @returns The `owner/repo` string.
 */
function getReleaseRepoFromConfig(category: ModelCategory): string {
  const { releaseApiBase } = CATEGORY_CONFIG[category];
  if (releaseApiBase) {
    const pattern =
      /^https:\/\/api\.github\.com\/repos\/([^/]+\/[^/]+)\/releases\/tags\/?$/;
    const parsed = pattern.exec(releaseApiBase);
    return parsed?.[1] ?? DEFAULT_RELEASE_REPO;
  }
  return DEFAULT_RELEASE_REPO;
}
55
+
56
/**
 * Build the download URL of `checksum.txt` for a category's release.
 *
 * @param category - Model category; supplies the release tag and the hosting repo.
 * @returns `https://github.com/<owner>/<repo>/releases/download/<tag>/checksum.txt`.
 */
function getChecksumUrl(category: ModelCategory): string {
  const { tag } = CATEGORY_CONFIG[category];
  return `https://github.com/${getReleaseRepoFromConfig(category)}/releases/download/${tag}/checksum.txt`;
}
61
+
42
62
  export async function fetchChecksumsFromRelease(
43
63
  category: ModelCategory
44
64
  ): Promise<Map<string, string>> {
@@ -51,9 +71,7 @@ export async function fetchChecksumsFromRelease(
51
71
  try {
52
72
  const checksums = await retryWithBackoff(
53
73
  async () => {
54
- const response = await fetch(
55
- `https://github.com/k2-fsa/sherpa-onnx/releases/download/${CATEGORY_CONFIG[category].tag}/checksum.txt`
56
- );
74
+ const response = await fetch(getChecksumUrl(category));
57
75
  if (!response.ok) {
58
76
  throw new Error(
59
77
  `Failed to fetch checksum.txt for ${category}: ${response.status}`
@@ -176,6 +194,8 @@ function isAssetSupportedForCategory(
176
194
  lower.includes('binary') &&
177
195
  lower.includes('seconds')
178
196
  );
197
+ case ModelCategory.Alignment:
198
+ return ext === 'tar.bz2';
179
199
  default:
180
200
  return false;
181
201
  }
@@ -8,6 +8,7 @@ export enum ModelCategory {
8
8
  Enhancement = 'enhancement',
9
9
  Separation = 'separation',
10
10
  Qnn = 'qnn',
11
+ Alignment = 'alignment',
11
12
  }
12
13
 
13
14
  /** TTS model type for meta; 'unknown' when id could not be classified. */
package/src/index.tsx CHANGED
@@ -17,17 +17,19 @@ export {
17
17
  } from './utils';
18
18
 
19
19
  export { copyFileToContentUri } from './tts';
20
+ export * from './alignment';
20
21
 
21
22
  export { getModelLicenses, type ModelLicense } from './licenses';
22
23
  // Note: Feature-specific exports are available via subpath imports:
23
24
  // - import { createSTT, createStreamingSTT, ... } from 'react-native-sherpa-onnx/stt'
24
25
  // - import { createTTS, ... } from 'react-native-sherpa-onnx/tts'
26
+ // - import { detectAlignmentModel, ... } from 'react-native-sherpa-onnx/alignment'
25
27
  // - import { ... } from 'react-native-sherpa-onnx/download'
26
28
  // - import { getBundledArchives, listBundledArchives, extractArchive } from 'react-native-sherpa-onnx/extraction'
27
- // - import { ... } from 'react-native-sherpa-onnx/vad' (planned)
28
- // - import { ... } from 'react-native-sherpa-onnx/diarization' (planned)
29
- // - import { ... } from 'react-native-sherpa-onnx/enhancement' (planned)
30
- // - import { ... } from 'react-native-sherpa-onnx/separation' (planned)
29
+ // - import { ... } from 'react-native-sherpa-onnx/vad'
30
+ // - import { ... } from 'react-native-sherpa-onnx/diarization'
31
+ // - import { ... } from 'react-native-sherpa-onnx/enhancement'
32
+ // - import { ... } from 'react-native-sherpa-onnx/separation'
31
33
 
32
34
  /**
33
35
  * Test method to verify sherpa-onnx native library is loaded.
package/src/licenses.ts CHANGED
@@ -13,6 +13,7 @@ export async function getModelLicenses(): Promise<ModelLicense[]> {
13
13
  const asrPath = 'model_licenses/asr-models-license-status.csv';
14
14
  const qnnPath = 'model_licenses/qnn-asr-models-license-status.csv';
15
15
  const ttsPath = 'model_licenses/tts-models-license-status.csv';
16
+ const alignmentPath = 'model_licenses/alignment-models-license-status.csv';
16
17
  const speechEnhancementPath =
17
18
  'model_licenses/speech-enhancement-models-license-status.csv';
18
19
 
@@ -20,10 +21,12 @@ export async function getModelLicenses(): Promise<ModelLicense[]> {
20
21
  SherpaOnnx.readAssetFileAsUtf8(asrPath),
21
22
  SherpaOnnx.readAssetFileAsUtf8(qnnPath),
22
23
  SherpaOnnx.readAssetFileAsUtf8(ttsPath),
24
+ SherpaOnnx.readAssetFileAsUtf8(alignmentPath),
23
25
  SherpaOnnx.readAssetFileAsUtf8(speechEnhancementPath),
24
26
  ]);
25
27
 
26
- const [asrResult, qnnResult, ttsResult, enhancementResult] = results;
28
+ const [asrResult, qnnResult, ttsResult, alignmentResult, enhancementResult] =
29
+ results;
27
30
 
28
31
  const licenses: ModelLicense[] = [];
29
32
 
@@ -51,6 +54,14 @@ export async function getModelLicenses(): Promise<ModelLicense[]> {
51
54
  );
52
55
  }
53
56
 
57
+ if (alignmentResult.status === 'fulfilled') {
58
+ licenses.push(...parseCsv(alignmentResult.value));
59
+ } else {
60
+ console.warn(
61
+ `[SherpaOnnx] Failed to load alignment model licenses: ${alignmentResult.reason}`
62
+ );
63
+ }
64
+
54
65
  if (enhancementResult.status === 'fulfilled') {
55
66
  licenses.push(...parseCsv(enhancementResult.value));
56
67
  } else {
package/src/stt/types.ts CHANGED
@@ -80,9 +80,12 @@ export interface SttWhisperModelOptions {
80
80
  task?: 'transcribe' | 'translate';
81
81
  /** Padding at end of samples. Kotlin default 1000; C++ default -1. */
82
82
  tailPaddings?: number;
83
- /** Token-level timestamps. Android only; ignored on iOS. */
83
+ /**
84
+ * Token-level timestamps (cross-attention / DTW). Requires Whisper ONNX models
85
+ * built with attention outputs (see sherpa-onnx).
86
+ */
84
87
  enableTokenTimestamps?: boolean;
85
- /** Segment-level timestamps. Android only; ignored on iOS. */
88
+ /** Segment-level timestamps via Whisper timestamp tokens. */
86
89
  enableSegmentTimestamps?: boolean;
87
90
  }
88
91
 
package/src/tts/index.ts CHANGED
@@ -1,3 +1,4 @@
1
+ import { unlink } from '@dr.pogodin/react-native-fs';
1
2
  import SherpaOnnx from '../NativeSherpaOnnx';
2
3
  import type {
3
4
  TTSInitializeOptions,
@@ -12,6 +13,11 @@ import type {
12
13
  } from './types';
13
14
  import type { ModelPathConfig } from '../types';
14
15
  import { resolveModelPath } from '../utils';
16
+ import {
17
+ assertSubtitleGranularityForMode,
18
+ generateSubtitlesFromAudio,
19
+ } from './subtitles';
20
+ import { saveAlignmentAudioToTempWav } from './tempAudio';
15
21
 
16
22
  let ttsInstanceCounter = 0;
17
23
 
@@ -159,6 +165,12 @@ function toNativeTtsOptions(
159
165
  if (options.numSteps !== undefined) out.numSteps = options.numSteps;
160
166
  if (options.extra != null && Object.keys(options.extra).length > 0)
161
167
  out.extra = options.extra;
168
+ if (options.subtitles?.mode !== undefined) {
169
+ out.subtitleMode = options.subtitles.mode;
170
+ }
171
+ if (options.subtitles?.granularity !== undefined) {
172
+ out.subtitleGranularity = options.subtitles.granularity;
173
+ }
162
174
  return out;
163
175
  }
164
176
 
@@ -276,7 +288,18 @@ export async function createTTS(
276
288
  opts?: TtsGenerationOptions
277
289
  ): Promise<GeneratedAudio> {
278
290
  guard();
279
- return SherpaOnnx.generateTts(instanceId, text, toNativeTtsOptions(opts));
291
+ const optionsWithSubtitlesOff: TtsGenerationOptions = {
292
+ ...(opts ?? {}),
293
+ subtitles: {
294
+ ...(opts?.subtitles ?? {}),
295
+ mode: 'off',
296
+ },
297
+ };
298
+ return SherpaOnnx.generateTts(
299
+ instanceId,
300
+ text,
301
+ toNativeTtsOptions(optionsWithSubtitlesOff)
302
+ );
280
303
  },
281
304
 
282
305
  async generateSpeechWithTimestamps(
@@ -284,11 +307,89 @@ export async function createTTS(
284
307
  opts?: TtsGenerationOptions
285
308
  ): Promise<GeneratedAudioWithTimestamps> {
286
309
  guard();
287
- return SherpaOnnx.generateTtsWithTimestamps(
310
+ const subtitleMode = opts?.subtitles?.mode ?? 'fast';
311
+ const subtitleGranularity = opts?.subtitles?.granularity ?? 'sentence';
312
+
313
+ assertSubtitleGranularityForMode(subtitleMode, subtitleGranularity);
314
+
315
+ if (subtitleMode !== 'accurate') {
316
+ const optionsWithDefaultSubtitleMode: TtsGenerationOptions = {
317
+ ...(opts ?? {}),
318
+ subtitles: {
319
+ ...(opts?.subtitles ?? {}),
320
+ mode: subtitleMode,
321
+ },
322
+ };
323
+
324
+ const native = await SherpaOnnx.generateTtsWithTimestamps(
325
+ instanceId,
326
+ text,
327
+ toNativeTtsOptions(optionsWithDefaultSubtitleMode)
328
+ );
329
+
330
+ const timingMode =
331
+ native.timingMode === 'off' ||
332
+ native.timingMode === 'estimated' ||
333
+ native.timingMode === 'aligned'
334
+ ? native.timingMode
335
+ : 'off';
336
+
337
+ return {
338
+ ...native,
339
+ timingMode,
340
+ };
341
+ }
342
+
343
+ const alignmentModelPath = opts?.subtitles?.alignmentModelPath?.trim();
344
+
345
+ if (!alignmentModelPath) {
346
+ throw new Error(
347
+ 'ALIGNMENT_MODEL_MISSING: Provide subtitles.alignmentModelPath for accurate mode.'
348
+ );
349
+ }
350
+
351
+ const optionsWithSubtitlesOff: TtsGenerationOptions = {
352
+ ...(opts ?? {}),
353
+ subtitles: {
354
+ ...(opts?.subtitles ?? {}),
355
+ mode: 'off',
356
+ },
357
+ };
358
+
359
+ const generated = await SherpaOnnx.generateTts(
288
360
  instanceId,
289
361
  text,
290
- toNativeTtsOptions(opts)
362
+ toNativeTtsOptions(optionsWithSubtitlesOff)
291
363
  );
364
+
365
+ let tempAudioPath: string | null = null;
366
+ try {
367
+ tempAudioPath = await saveAlignmentAudioToTempWav(
368
+ generated,
369
+ instanceId
370
+ );
371
+ const subtitleResult = await generateSubtitlesFromAudio(
372
+ text,
373
+ tempAudioPath,
374
+ {
375
+ mode: 'accurate',
376
+ granularity: subtitleGranularity,
377
+ alignmentModelPath,
378
+ }
379
+ );
380
+
381
+ return {
382
+ ...generated,
383
+ subtitles: subtitleResult.subtitles,
384
+ timingMode: subtitleResult.timingMode,
385
+ };
386
+ } finally {
387
+ if (tempAudioPath) {
388
+ unlink(tempAudioPath).catch(() => {
389
+ // ignore cleanup errors
390
+ });
391
+ }
392
+ }
292
393
  },
293
394
 
294
395
  async updateParams(opts: TtsUpdateOptions): Promise<{
@@ -437,6 +538,7 @@ export function shareAudioFile(
437
538
  // Streaming TTS (separate engine; use createStreamingTTS for chunk callbacks and PCM playback)
438
539
  export { createStreamingTTS } from './streaming';
439
540
  export type { StreamingTtsEngine } from './streamingTypes';
541
+ export { generateSubtitlesFromAudio } from './subtitles';
440
542
 
441
543
  // Export types and runtime type list
442
544
  export type {
@@ -451,6 +553,11 @@ export type {
451
553
  TtsSupertonicModelOptions,
452
554
  TtsUpdateOptions,
453
555
  TtsGenerationOptions,
556
+ SubtitleMode,
557
+ SubtitleGranularity,
558
+ SubtitleOptions,
559
+ SubtitleFromAudioOptions,
560
+ SubtitleResult,
454
561
  GeneratedAudio,
455
562
  GeneratedAudioWithTimestamps,
456
563
  TtsSubtitleItem,