@gmessier/nitro-speech 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +165 -148
- package/android/build.gradle +0 -1
- package/android/src/main/cpp/cpp-adapter.cpp +5 -1
- package/android/src/main/java/com/margelo/nitro/nitrospeech/HybridNitroSpeech.kt +2 -0
- package/android/src/main/java/com/margelo/nitro/nitrospeech/recognizer/AutoStopper.kt +80 -16
- package/android/src/main/java/com/margelo/nitro/nitrospeech/recognizer/HybridRecognizer.kt +93 -20
- package/android/src/main/java/com/margelo/nitro/nitrospeech/recognizer/RecognitionListenerSession.kt +27 -15
- package/ios/{BufferUtil.swift → Audio/AudioBufferConverter.swift} +3 -34
- package/ios/Audio/AudioLevelTracker.swift +66 -0
- package/ios/Coordinator.swift +105 -0
- package/ios/Engines/AnalyzerEngine.swift +241 -0
- package/ios/Engines/DictationRuntime.swift +67 -0
- package/ios/Engines/RecognizerEngine.swift +312 -0
- package/ios/Engines/SFSpeechEngine.swift +119 -0
- package/ios/Engines/SpeechRuntime.swift +58 -0
- package/ios/Engines/TranscriberRuntimeProtocol.swift +21 -0
- package/ios/HybridNitroSpeech.swift +1 -10
- package/ios/HybridRecognizer.swift +135 -192
- package/ios/LocaleManager.swift +73 -0
- package/ios/{AppStateObserver.swift → Shared/AppStateObserver.swift} +1 -2
- package/ios/Shared/AutoStopper.swift +147 -0
- package/ios/Shared/HapticImpact.swift +24 -0
- package/ios/Shared/Log.swift +41 -0
- package/ios/Shared/Permissions.swift +59 -0
- package/ios/Shared/Utils.swift +58 -0
- package/lib/NitroSpeech.d.ts +2 -0
- package/lib/NitroSpeech.js +2 -0
- package/lib/Recognizer/RecognizerRef.d.ts +5 -0
- package/lib/Recognizer/RecognizerRef.js +13 -0
- package/lib/Recognizer/SpeechRecognizer.d.ts +8 -0
- package/lib/Recognizer/SpeechRecognizer.js +9 -0
- package/lib/Recognizer/methods.d.ts +8 -0
- package/lib/Recognizer/methods.js +29 -0
- package/lib/Recognizer/types.d.ts +6 -0
- package/lib/Recognizer/types.js +1 -0
- package/lib/Recognizer/useRecognizer.d.ts +16 -0
- package/lib/Recognizer/useRecognizer.js +71 -0
- package/lib/Recognizer/useVoiceInputVolume.d.ts +25 -0
- package/lib/Recognizer/useVoiceInputVolume.js +52 -0
- package/lib/index.d.ts +6 -0
- package/lib/index.js +6 -0
- package/lib/specs/NitroSpeech.nitro.d.ts +8 -0
- package/lib/specs/NitroSpeech.nitro.js +1 -0
- package/lib/specs/Recognizer.nitro.d.ts +95 -0
- package/lib/specs/Recognizer.nitro.js +1 -0
- package/lib/specs/SpeechRecognitionConfig.d.ts +162 -0
- package/lib/specs/SpeechRecognitionConfig.js +1 -0
- package/lib/specs/VolumeChangeEvent.d.ts +31 -0
- package/lib/specs/VolumeChangeEvent.js +1 -0
- package/nitro.json +0 -4
- package/nitrogen/generated/android/NitroSpeech+autolinking.cmake +2 -2
- package/nitrogen/generated/android/NitroSpeechOnLoad.cpp +4 -2
- package/nitrogen/generated/android/c++/JFunc_void_VolumeChangeEvent.hpp +78 -0
- package/nitrogen/generated/android/c++/JFunc_void_std__vector_std__string_.hpp +14 -14
- package/nitrogen/generated/android/c++/JHybridRecognizerSpec.cpp +68 -19
- package/nitrogen/generated/android/c++/JHybridRecognizerSpec.hpp +7 -4
- package/nitrogen/generated/android/c++/JIosPreset.hpp +58 -0
- package/nitrogen/generated/android/c++/JMutableSpeechRecognitionConfig.hpp +79 -0
- package/nitrogen/generated/android/c++/{JSpeechToTextParams.hpp → JSpeechRecognitionConfig.hpp} +48 -30
- package/nitrogen/generated/android/c++/JVolumeChangeEvent.hpp +65 -0
- package/nitrogen/generated/android/kotlin/com/margelo/nitro/nitrospeech/Func_void_VolumeChangeEvent.kt +80 -0
- package/nitrogen/generated/android/kotlin/com/margelo/nitro/nitrospeech/HybridRecognizerSpec.kt +18 -5
- package/nitrogen/generated/android/kotlin/com/margelo/nitro/nitrospeech/IosPreset.kt +23 -0
- package/nitrogen/generated/android/kotlin/com/margelo/nitro/nitrospeech/MutableSpeechRecognitionConfig.kt +76 -0
- package/nitrogen/generated/android/kotlin/com/margelo/nitro/nitrospeech/SpeechRecognitionConfig.kt +121 -0
- package/nitrogen/generated/android/kotlin/com/margelo/nitro/nitrospeech/VolumeChangeEvent.kt +61 -0
- package/nitrogen/generated/ios/NitroSpeech-Swift-Cxx-Bridge.cpp +46 -30
- package/nitrogen/generated/ios/NitroSpeech-Swift-Cxx-Bridge.hpp +203 -70
- package/nitrogen/generated/ios/NitroSpeech-Swift-Cxx-Umbrella.hpp +13 -3
- package/nitrogen/generated/ios/c++/HybridRecognizerSpecSwift.hpp +41 -9
- package/nitrogen/generated/ios/swift/Func_void_VolumeChangeEvent.swift +46 -0
- package/nitrogen/generated/ios/swift/Func_void_std__exception_ptr.swift +46 -0
- package/nitrogen/generated/ios/swift/HybridRecognizerSpec.swift +6 -3
- package/nitrogen/generated/ios/swift/HybridRecognizerSpec_cxx.swift +66 -18
- package/nitrogen/generated/ios/swift/IosPreset.swift +40 -0
- package/nitrogen/generated/ios/swift/MutableSpeechRecognitionConfig.swift +118 -0
- package/nitrogen/generated/ios/swift/{SpeechToTextParams.swift → SpeechRecognitionConfig.swift} +108 -43
- package/nitrogen/generated/ios/swift/VolumeChangeEvent.swift +52 -0
- package/nitrogen/generated/shared/c++/HybridRecognizerSpec.cpp +4 -1
- package/nitrogen/generated/shared/c++/HybridRecognizerSpec.hpp +17 -7
- package/nitrogen/generated/shared/c++/IosPreset.hpp +76 -0
- package/nitrogen/generated/shared/c++/MutableSpeechRecognitionConfig.hpp +105 -0
- package/nitrogen/generated/shared/c++/{SpeechToTextParams.hpp → SpeechRecognitionConfig.hpp} +39 -20
- package/nitrogen/generated/shared/c++/VolumeChangeEvent.hpp +91 -0
- package/package.json +15 -16
- package/src/NitroSpeech.ts +5 -0
- package/src/Recognizer/RecognizerRef.ts +23 -0
- package/src/Recognizer/SpeechRecognizer.ts +10 -0
- package/src/Recognizer/methods.ts +40 -0
- package/src/Recognizer/types.ts +33 -0
- package/src/Recognizer/useRecognizer.ts +85 -0
- package/src/Recognizer/useVoiceInputVolume.ts +65 -0
- package/src/index.ts +6 -182
- package/src/specs/NitroSpeech.nitro.ts +2 -163
- package/src/specs/Recognizer.nitro.ts +110 -0
- package/src/specs/SpeechRecognitionConfig.ts +167 -0
- package/src/specs/VolumeChangeEvent.ts +31 -0
- package/android/proguard-rules.pro +0 -1
- package/ios/AnylyzerTranscriber.swift +0 -331
- package/ios/AutoStopper.swift +0 -69
- package/ios/HapticImpact.swift +0 -32
- package/ios/LegacySpeechRecognizer.swift +0 -161
- package/lib/commonjs/index.js +0 -145
- package/lib/commonjs/index.js.map +0 -1
- package/lib/commonjs/package.json +0 -1
- package/lib/commonjs/specs/NitroSpeech.nitro.js +0 -6
- package/lib/commonjs/specs/NitroSpeech.nitro.js.map +0 -1
- package/lib/module/index.js +0 -138
- package/lib/module/index.js.map +0 -1
- package/lib/module/package.json +0 -1
- package/lib/module/specs/NitroSpeech.nitro.js +0 -4
- package/lib/module/specs/NitroSpeech.nitro.js.map +0 -1
- package/lib/tsconfig.tsbuildinfo +0 -1
- package/lib/typescript/index.d.ts +0 -50
- package/lib/typescript/index.d.ts.map +0 -1
- package/lib/typescript/specs/NitroSpeech.nitro.d.ts +0 -162
- package/lib/typescript/specs/NitroSpeech.nitro.d.ts.map +0 -1
- package/nitrogen/generated/android/kotlin/com/margelo/nitro/nitrospeech/SpeechToTextParams.kt +0 -68
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Speech
|
|
3
|
+
import AVFoundation
|
|
4
|
+
|
|
5
|
+
/// Speech-recognition engine built on the iOS 26 `SpeechAnalyzer` API.
///
/// Pipeline:
///   mic tap (base class) -> `outputContinuation` stream (hardware format)
///   -> optional `AVAudioConverter` pass -> `inputBuilder` (`AnalyzerInput`)
///   -> `SpeechAnalyzer` -> `transcriber.handleResults` -> `handleBatch`
///   -> delegate `result(batches:)`.
@available(iOS 26.0, *)
final class AnalyzerEngine: RecognizerEngine {
    /// Stream of analyzer inputs fed to `SpeechAnalyzer.start(inputSequence:)`.
    private var inputSequence: AsyncStream<AnalyzerInput>?
    private var inputBuilder: AsyncStream<AnalyzerInput>.Continuation?
    /// Continuation receiving raw PCM buffers from the microphone tap.
    private var outputContinuation: AsyncStream<AVAudioPCMBuffer>.Continuation?
    private var analyzer: SpeechAnalyzer?
    /// Backend-specific transcriber wrapper (SpeechTranscriber or DictationTranscriber).
    private let transcriber: TranscriberRuntime

    private var audioProducerTask: Task<Void, Never>?
    private var recognizerTask: Task<(), Error>?
    /// Start time (seconds) of the previous result's audio range; used to
    /// tell "refinement of the last batch" apart from "new batch".
    private var lastBatchStartTime: Float64? = nil
    /// Accumulated transcription batches forwarded to the delegate.
    private var resultBatches: [String] = []

    init(backend: RecognizerBackend, locale: Locale, delegate: RecognizerDelegate) {
        if backend == .speechTranscriber {
            transcriber = SpeechRuntime(with: locale)
        } else {
            transcriber = DictationRuntime(with: locale)
        }
        super.init(locale: locale, delegate: delegate)
    }

    /// Stops the session: closes the input stream, lets the analyzer drain
    /// through end-of-input (falling back to an immediate cancel on error),
    /// then cleans up.
    override func stop() {
        super.stop()
        inputBuilder?.finish()

        Task { [weak self] in
            guard let self = self else { return }

            do {
                try await self.analyzer?.finalizeAndFinishThroughEndOfInput()
            } catch {
                self.reportFailure(
                    from: "stop.finalizeAndFinishThroughEndOfInput",
                    message: "Failed to finalize the end of input",
                    type: .onSession
                )
                await self.analyzer?.cancelAndFinishNow()
            }

            self.cleanup(from: "stopListening")
        }
    }

    /// Prepares the audio engine (base class) and the transcriber,
    /// downloading speech assets if required.
    override func prewarm(for type: FailureType) async {
        await super.prewarm(for: type)
        do {
            // Create transcriber and install assets
            try await transcriber.create(config: self.recognizerDelegate?.config)
        }
        catch {
            self.reportFailure(
                from: "prewarm.assets",
                message: "Failed to create transcriber",
                type: type
            )
        }
    }

    /// Builds the full analyzer pipeline: prewarm, input stream, audio
    /// producer, result consumer, then starts the analyzer.
    override func startSession() async {
        await super.startSession()

        // Prepares transcriber and handles errors.
        // On failure, reportFailure triggers cleanup + engine reselection.
        await prewarm(for: .start)
        // FIX: a prewarm failure already ran cleanup (isActive == false) and
        // scheduled an engine reselection — don't keep building the session.
        guard isActive, !isStopping else { return }

        // Input sequence consumed by the analyzer.
        (inputSequence, inputBuilder) = AsyncStream.makeStream(of: AnalyzerInput.self)

        let modules = transcriber.getModules()
        // Pick the best audio format the analyzer modules can consume.
        guard let audioFormat = await SpeechAnalyzer.bestAvailableAudioFormat(
            compatibleWith: modules
        ) else {
            self.reportFailure(
                from: "startRecognition.SpeechAnalyzer.bestAvailableAudioFormat",
                message: "Failed to find SpeechAnalyzer audio format",
                type: .start
            )
            return
        }

        analyzer = SpeechAnalyzer(modules: modules)

        // Supply audio: tap -> (convert) -> inputBuilder.
        audioProducerTask = Task {
            guard let hardwareFormat else { return }
            // FIX: create the buffer stream (assigning `outputContinuation`)
            // BEFORE installing the tap. The original started the engine
            // first, so buffers arriving before the continuation existed
            // were silently dropped.
            let stream = AsyncStream(
                AVAudioPCMBuffer.self,
                bufferingPolicy: .unbounded
            ) { continuation in
                outputContinuation = continuation
            }
            self.startAudioEngine(
                onBuffer: { [weak self] buffer in
                    self?.outputContinuation?.yield(buffer)
                }
            )

            let needsConversion =
                hardwareFormat.commonFormat != audioFormat.commonFormat ||
                hardwareFormat.sampleRate != audioFormat.sampleRate ||
                hardwareFormat.channelCount != audioFormat.channelCount

            // AVAudioConverter's initializer is failable, not throwing — on
            // failure report directly instead of throwing an empty NSError
            // (same reportFailure the original reached through its catch).
            guard let converter = AVAudioConverter(
                from: hardwareFormat,
                to: audioFormat
            ) else {
                if !Task.isCancelled && !self.isStopping {
                    self.reportFailure(
                        from: "startRecognition.audioProducerTask",
                        message: "Failed to convert audio format",
                        type: .start
                    )
                }
                return
            }
            do {
                for await pcmBuffer in stream {
                    if Task.isCancelled { break }

                    let bufferForAnalyzer: AVAudioPCMBuffer
                    if needsConversion {
                        // Skip analyzing for empty buffers and
                        // throw if buffers are inconvertible.
                        guard let convertedBuffer = try AudioBufferConverter.convertBuffer(
                            converter: converter,
                            audioFormat: audioFormat,
                            pcmBuffer: pcmBuffer
                        ) else {
                            continue
                        }
                        bufferForAnalyzer = convertedBuffer
                    } else {
                        bufferForAnalyzer = pcmBuffer
                    }

                    inputBuilder?.yield(AnalyzerInput(buffer: bufferForAnalyzer))
                }
            } catch {
                // Cancellation / user stop is not an error.
                if Task.isCancelled || self.isStopping {
                    return
                }
                self.reportFailure(
                    from: "startRecognition.audioProducerTask",
                    message: "Failed to convert audio format",
                    type: .start
                )
                return
            }
        }

        // Handle the transcription results.
        recognizerTask = Task {
            do {
                try await transcriber.handleResults(
                    onResult: { [weak self] result in
                        guard let self else { return }
                        self.handleBatch(
                            attrString: result.text,
                            rangeStart: result.rangeStart,
                            isFinal: result.isFinal
                        )
                    }
                )
            } catch {
                if self.isStopping || error is CancellationError {
                    return
                }
                self.reportFailure(
                    from: "startRecognition.recognizerTask",
                    message: "Failed to retrieve transcriber result",
                    type: .onSession
                )
            }
        }

        do {
            if let inputSequence, let analyzer {
                // Bias recognition toward caller-supplied vocabulary.
                if let contextualStrings = self.recognizerDelegate?.config?.contextualStrings {
                    let context = AnalysisContext()
                    context.contextualStrings = [
                        AnalysisContext.ContextualStringsTag.general: contextualStrings
                    ]
                    try await analyzer.setContext(context)
                }
                try await analyzer.start(inputSequence: inputSequence)
            }
        } catch {
            self.reportFailure(
                from: "startRecognition.analyzerStart",
                message: "Failed to start analyze input sequence",
                type: .start
            )
            return
        }

        self.sendFeedbackOnStart()
    }

    /// Tears down all analyzer-specific state on top of the base cleanup.
    override func cleanup(from: String) {
        super.cleanup(from: "overridden.\(from)")

        inputSequence = nil
        inputBuilder = nil
        outputContinuation?.finish()
        outputContinuation = nil
        analyzer = nil
        transcriber.clean()
        audioProducerTask?.cancel()
        audioProducerTask = nil
        recognizerTask?.cancel()
        recognizerTask = nil
        lastBatchStartTime = nil
        resultBatches = []
    }

    /// Folds one transcriber result into `resultBatches` and notifies the
    /// delegate. A result whose audio range starts where the previous one
    /// did (or a final result) replaces the last batch; otherwise it is
    /// appended as a new batch.
    private func handleBatch(attrString: AttributedString, rangeStart: CMTime, isFinal: Bool) {
        var newBatch = String(attrString.characters)
        // Ignore batches with no word characters (\w: letters/digits/underscore).
        if !newBatch.contains(/\w+/) {
            return
        }
        // Track only when transcription is coming
        self.trackPartialActivity()

        let disableRepeatingFilter = self.recognizerDelegate?.config?.disableRepeatingFilter ?? false
        if !disableRepeatingFilter {
            newBatch = Utils.repeatingFilter(newBatch)
        }
        Log.log("[1] lastBatch: \(self.resultBatches.last ?? "") | newBatch: \(newBatch)")
        if self.resultBatches.isEmpty {
            self.resultBatches.append(newBatch)
        } else if CMTimeGetSeconds(rangeStart) == self.lastBatchStartTime || isFinal {
            Log.log("[2] replace, isFinal: \(isFinal)")
            self.resultBatches[self.resultBatches.count - 1] = newBatch
        } else {
            Log.log("[2] add new batch")
            self.resultBatches.append(newBatch)
        }
        self.lastBatchStartTime = CMTimeGetSeconds(rangeStart)
        self.recognizerDelegate?.result(batches: self.resultBatches)
    }
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Speech
|
|
3
|
+
|
|
4
|
+
/// `TranscriberRuntime` implementation that wraps `DictationTranscriber`
/// (iOS 26+). Builds the transcriber from the recognition config, installs
/// on-device assets, exposes it as an analyzer module, and forwards its
/// results.
@available(iOS 26.0, *)
final class DictationRuntime: TranscriberRuntime {
    let locale: Locale
    private var transcriber: DictationTranscriber?

    init(with locale: Locale) {
        self.locale = locale
    }

    /// Creates the `DictationTranscriber` from `config` and downloads any
    /// speech assets the system requires for it.
    func create(config: SpeechRecognitionConfig?) async throws {
        // Transcription options: punctuation on by default, etiquette
        // replacements when offensive-word masking is requested.
        var options: Set<DictationTranscriber.TranscriptionOption> = [.punctuation]
        if config?.maskOffensiveWords == true {
            options.insert(.etiquetteReplacements)
        }
        let punctuationDisabled = config?.iosAddPunctuation == false
            || config?.iosPreset == IosPreset.shortform
        if punctuationDisabled {
            options.remove(.punctuation)
        }

        // Content hints: always short-form / far-field; atypical speech on request.
        var hints: Set<DictationTranscriber.ContentHint> = [.shortForm, .farField]
        if config?.iosAtypicalSpeech == true {
            hints.insert(.atypicalSpeech)
        }

        let dictation = DictationTranscriber(
            locale: locale,
            contentHints: hints,
            transcriptionOptions: options,
            reportingOptions: [.frequentFinalization, .volatileResults],
            attributeOptions: [.audioTimeRange]
        )
        transcriber = dictation

        // Install model assets only when the inventory says they are missing.
        if let request = try await AssetInventory.assetInstallationRequest(supporting: [dictation]) {
            try await request.downloadAndInstall()
        }
    }

    /// Analyzer modules this runtime contributes (empty until `create` ran).
    func getModules() -> [any SpeechModule] {
        transcriber.map { [$0] } ?? []
    }

    /// Streams every transcriber result to `onResult` until the result
    /// sequence finishes or throws. No-op when `create` has not run.
    func handleResults(
        onResult: @escaping (TranscriberResult) -> Void
    ) async throws {
        guard let transcriber else { return }
        for try await item in transcriber.results {
            onResult(
                TranscriberResult(
                    text: item.text,
                    rangeStart: item.range.start,
                    isFinal: item.isFinal)
            )
        }
    }

    /// Drops the transcriber so the next `create` builds a fresh one.
    func clean() {
        transcriber = nil
    }
}
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Speech
|
|
3
|
+
import AVFoundation
|
|
4
|
+
|
|
5
|
+
// Classifies where in the recognition lifecycle a failure happened; the
// case chosen drives the recovery path in `RecognizerEngine.reportFailure`:
//   .prewarm / .start  -> delegate is asked to reselect an engine and retry
//   .system / .onSession -> error is surfaced to the delegate after cleanup
// No practical diff between "system" and "onSession" for now.
// For future: send the level of error to RN
// "onSession" is less critical level, since the session has been started successfully
enum FailureType {
    case system     // engine-agnostic failure (audio engine / audio session)
    case start      // failure while starting recognition
    case prewarm    // failure while preparing the recognizer / assets
    case onSession  // failure after the session successfully started
}
|
|
14
|
+
|
|
15
|
+
/// Engine-agnostic base class for the concrete speech recognizers.
///
/// Owns everything shared by every engine: microphone capture through an
/// `AVAudioEngine` tap, the `AVAudioSession`, per-buffer volume tracking
/// (with `volumeChange` delegate events), silence-based auto-stop, stopping
/// on app-state changes, and uniform failure routing. Subclasses override
/// `prewarm`, `startSession`, `stop` and `cleanup` for the engine-specific
/// pieces.
class RecognizerEngine {
    var isActive = false
    var isStopping = false
    /// Cached input-node format; the first retrieval is expensive, so it is
    /// fetched once during `prepareAudioEngine` and reused.
    var hardwareFormat: AVAudioFormat?
    weak var recognizerDelegate: RecognizerDelegate?

    private let audioLevelTracker: AudioLevelTracker
    private var appStateObserver: AppStateObserver?
    private var audioEngine: AVAudioEngine?
    private var autoStopper: AutoStopper?
    private let logger = Lg(prefix: "RecognizerEngine")

    let locale: Locale

    init(locale: Locale, delegate: RecognizerDelegate) {
        self.locale = locale
        self.recognizerDelegate = delegate
        self.audioLevelTracker = AudioLevelTracker(
            resetAutoFinishVoiceSensitivity: delegate.config?.resetAutoFinishVoiceSensitivity
        )
    }

    // MARK: - Recognizer Methods

    /// Heavy, failure-prone setup that can run ahead of `startSession`.
    /// The base implementation only prepares the audio engine.
    func prewarm(for: FailureType) async {
        self.prepareAudioEngine()
        // for SpeechTranscriber: .isAvailable and async assets
        // for Dictation: only async assets
        // for legacy SF: only sync .isAvailable
    }

    /// Entry point: requests mic/speech authorization, then starts a session.
    func start() {
        guard let recognizerDelegate, !isActive else { return }

        Permissions(
            onGranted: self.startSession,
            onDenied: recognizerDelegate.permissionDenied,
            onError: recognizerDelegate.error
        ).requestAuthorization()
    }

    /// Marks the session as stopping and fires the stop haptic. Subclasses
    /// drain their pipelines and eventually call `cleanup`.
    func stop() {
        guard isActive, !isStopping else { return }
        isStopping = true
        HapticImpact.trigger(with: self.recognizerDelegate?.config?.stopHapticFeedbackStyle)
    }

    /// Brings up the shared session pieces: flags, auto-stop, app-state
    /// observer and the audio session.
    func startSession() async {
        logger.log("[startSession.startSession]")
        // Init everything
        isStopping = false
        isActive = true

        initAutoStop()
        logger.log("[startSession.initAutoStop]")
        startAppStateObserver()
        logger.log("[startSession.startAppStateObserver]")
        startAudioSession()
        logger.log("[startSession.startAudioSession]")
    }

    /// Installs the microphone tap and starts the audio engine. Each buffer
    /// is volume-tracked (emitting `volumeChange` and resetting the
    /// auto-stop timer when voice is detected) before being handed to
    /// `onBuffer`.
    func startAudioEngine(
        onBuffer: @escaping (AVAudioPCMBuffer) -> Void
    ) {
        logger.log("[startAudioEngine]")
        guard let audioEngine, let hardwareFormat else { return }
        audioEngine.inputNode.installTap(
            onBus: 0,
            bufferSize: 1024,
            format: hardwareFormat
        ) { [weak self] buffer, _ in
            guard let self, let delegate = self.recognizerDelegate else { return }
            if let sample = self.audioLevelTracker.process(buffer) {
                // Send buffer volume data
                delegate.volumeChange(
                    event: VolumeChangeEvent(
                        smoothedVolume: sample.smoothed,
                        rawVolume: sample.raw,
                        db: sample.db
                    )
                )
                if sample.resetTimer {
                    self.autoStopper?.resetTimer(from: "rms threshold")
                }
            }
            onBuffer(buffer)
        }
        logger.log("[startAudioEngine.installTap]")
        do {
            audioEngine.prepare()
            logger.log("[startAudioEngine.prepare]")
            try audioEngine.start()
            logger.log("[startAudioEngine.start]")
        } catch {
            self.reportFailure(
                from: "Audio Engine",
                message: "Audio Engine failed to start",
                // RecognizerEngine-agnostic Error
                type: .system
            )
        }
    }

    /// Tells listeners the session is live: start haptic, fresh auto-stop
    /// timer, `readyForSpeech`, and an empty result set.
    func sendFeedbackOnStart() {
        guard let recognizerDelegate else { return }
        logger.log("[sendFeedbackOnStart]")
        HapticImpact.trigger(with: recognizerDelegate.config?.startHapticFeedbackStyle)
        autoStopper?.resetTimer(from: "startListening.sendFeedbackOnStart")
        recognizerDelegate.readyForSpeech()
        recognizerDelegate.result(batches: [])
    }

    /// Applies mutable config changes to a live session (auto-finish
    /// threshold / progress interval / voice sensitivity) and optionally
    /// adds time to or resets the auto-stop timer. No-op unless actively
    /// listening.
    func updateSession(
        newConfig: MutableSpeechRecognitionConfig? = nil,
        addMsToTimer: Double? = nil,
        resetTimer: Bool? = nil
    ) {
        guard let recognizerDelegate, isActive, !isStopping else { return }
        let currentConfig = recognizerDelegate.config
        // Update AutoFinish time
        if let newAutoFinish = newConfig?.autoFinishRecognitionMs,
           newAutoFinish != currentConfig?.autoFinishRecognitionMs {
            autoStopper?.updateThreshold(newAutoFinish, from: "updateSession")
        }
        // Update AutoFinish progress interval
        if let newInterval = newConfig?.autoFinishProgressIntervalMs,
           newInterval != currentConfig?.autoFinishProgressIntervalMs {
            autoStopper?.updateProgressInterval(newInterval, from: "updateSession")
        }
        // Update AutoFinish reset voice sensitivity interval
        if let newSensitivity = newConfig?.resetAutoFinishVoiceSensitivity,
           newSensitivity != currentConfig?.resetAutoFinishVoiceSensitivity {
            audioLevelTracker.updateResetAutoFinishVoiceSensitivity(
                newValue: newSensitivity
            )
        }
        if let addMsToTimer {
            // Add time to the timer once
            autoStopper?.addMsOnce(addMsToTimer, from: "updateSession")
        } else if resetTimer == true {
            // Reset to current baseline threshold.
            autoStopper?.resetTimer(from: "updateSession")
        }
        // Only update new non-nil values in the config
        recognizerDelegate.softlyUpdateConfig(newConfig: newConfig)
    }

    /// Tears down every shared resource and resets the flags; emits a
    /// zeroed `volumeChange` and, if a session was live, `recordingStopped`.
    func cleanup(from: String) {
        logger.log("[cleanup]: \(from)")
        let hadActiveSession = isActive
        deinitAutoStop()
        stopAppStateObserver()
        stopAudioSession()
        audioLevelTracker.reset()

        if let audioEngine, audioEngine.isRunning {
            audioEngine.stop()
        }
        audioEngine?.inputNode.removeTap(onBus: 0)

        audioEngine = nil
        isActive = false
        isStopping = false
        self.recognizerDelegate?.volumeChange(
            event: VolumeChangeEvent(
                smoothedVolume: 0,
                rawVolume: 0,
                db: nil
            )
        )
        if hadActiveSession {
            self.recognizerDelegate?.recordingStopped()
        }
    }

    /// Central failure routing: always cleans up first, then either asks the
    /// delegate to reselect an engine (`.prewarm` / `.start`) or surfaces
    /// the error (`.system` / `.onSession`).
    func reportFailure(from: String, message: String, type: FailureType) {
        // Log message
        logger.log("[Failure] type: \(type), message: \(message)")

        // Cleanup on engine level anyway
        self.cleanup(from: from)

        switch type {
        // Try to reselect engine and try again
        case .prewarm, .start:
            let isPrewarm = type == .prewarm
            self.recognizerDelegate?.reselectEngine(forPrewarm: isPrewarm)
        // System level issue: send onError with description and clean
        // Session has already started: send onError and cleanup
        case .system, .onSession:
            self.recognizerDelegate?.error(message: message)
        }
    }

    /// Resets the auto-stop timer while transcription is still arriving.
    func trackPartialActivity() {
        if !self.isStopping {
            self.autoStopper?.resetTimer(from: "Partial results")
        }
    }

    // MARK: - AudioEngine heavy prepare

    private func prepareAudioEngine() {
        logger.log("[prewarm.start]")
        audioEngine = AVAudioEngine()
        guard let audioEngine else {
            // Defensive: AVAudioEngine() is non-failable; branch kept as a safety net.
            self.reportFailure(
                from: "Audio Engine",
                message: "Audio Engine failed to initiate",
                // RecognizerEngine-agnostic Error
                type: .system
            )
            return
        }
        logger.log("[prewarm.audioEngine]")
        // heavy first hardwareFormat retrieval
        if hardwareFormat == nil {
            hardwareFormat = audioEngine.inputNode.outputFormat(forBus: 0)
            logger.log("[prewarm.hardwareFormat]")
        }
    }

    // MARK: - AutoStopper

    private func initAutoStop() {
        let config = self.recognizerDelegate?.config
        autoStopper = AutoStopper(
            silenceThresholdMs: config?.autoFinishRecognitionMs,
            progressIntervalMs: config?.autoFinishProgressIntervalMs,
            onProgress: { [weak self] timeLeftMs in
                guard let self else { return }
                self.recognizerDelegate?.autoFinishProgress(
                    timeLeftMs: timeLeftMs
                )
            },
            onTimeout: { [weak self] in
                self?.stop()
            }
        )
    }

    private func deinitAutoStop() {
        autoStopper?.stop()
        autoStopper = nil
    }

    // MARK: - App State Observer

    private func startAppStateObserver() {
        appStateObserver = AppStateObserver { [weak self] in
            guard let self, self.isActive else { return }
            self.stop()
        }
    }

    private func stopAppStateObserver() {
        appStateObserver?.stop()
        appStateObserver = nil
    }

    // MARK: - Audio Session

    private func startAudioSession() {
        do {
            let audioSession = AVAudioSession.sharedInstance()
            try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
            // Required for haptic feedback
            try audioSession.setAllowHapticsAndSystemSoundsDuringRecording(true)
            try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
        } catch {
            self.reportFailure(
                from: "startAudioSession",
                message: "Failed to activate audio session: \(error.localizedDescription)",
                // RecognizerEngine-agnostic Error
                type: .system
            )
        }
    }

    private func stopAudioSession() {
        do {
            // TODO: check unduck
            try AVAudioSession.sharedInstance().setActive(false)
        } catch {
            // Just log and no-op - not critical
            logger.log("Failed to deactivate audio session: \(error.localizedDescription)")
        }
    }
}
|