@livx.cc/agentx 0.96.10 → 0.96.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/native/Info.plist +9 -0
- package/dist/native/mic-aec.swift +231 -0
- package/package.json +1 -1
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
|
3
|
+
<plist version="1.0">
|
|
4
|
+
<dict>
|
|
5
|
+
<key>CFBundleIdentifier</key><string>cc.livx.mic-aec</string>
|
|
6
|
+
<key>CFBundleName</key><string>mic-aec</string>
|
|
7
|
+
<key>NSMicrophoneUsageDescription</key><string>Echo-cancelled mic capture for voice experiment</string>
|
|
8
|
+
</dict>
|
|
9
|
+
</plist>
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
// mic-aec — full-duplex echo-cancelled audio for the voice CLI.
|
|
2
|
+
// Apple's voice-processing unit (same as FaceTime) subtracts rendered output from the mic. The
|
|
3
|
+
// ROOT echo fix is rendering TTS through THIS engine: VPIO then has a sample-accurate, time-aligned
|
|
4
|
+
// reference for exactly what we emit (vs. guessing from the system-output loopback when a separate
|
|
5
|
+
// process like ffplay plays it — cancellation quality became a per-run spectrum).
|
|
6
|
+
// stdin (optional): framed playback — [u32le len][s16le mono 44.1kHz PCM]; len==0 = FLUSH
|
|
7
|
+
// (drop queued+playing audio NOW — the barge-in primitive); len==0xFFFFFFFF = PAUSE,
|
|
8
|
+
// 0xFFFFFFFE = RESUME (overlap trail-off — exact-sample hold). In-band framing
|
|
9
|
+
// makes flush perfectly ordered: stale frames already in the pipe are consumed
|
|
10
|
+
// and discarded by the stop, frames after it belong to the new turn.
|
|
11
|
+
// stdout: s16le mono 16kHz echo-cancelled mic PCM.
|
|
12
|
+
// No stdin writer (legacy capture-only use) is fine — the reader thread just blocks.
|
|
13
|
+
// Build: swiftc -O -o mic-aec mic-aec.swift -Xlinker -sectcreate -Xlinker __TEXT -Xlinker __info_plist -Xlinker Info.plist && codesign -fs - mic-aec
|
|
14
|
+
import AVFoundation
|
|
15
|
+
|
|
16
|
+
// VPIO checks mic TCC explicitly (plain HAL capture may work while VPIO silently mutes) — request it
|
|
17
|
+
// Block start-up until the microphone permission prompt has been answered, then report the
// outcome on stderr. The result is only logged; start-up continues either way.
let sem = DispatchSemaphore(value: 0)
var granted = false
AVCaptureDevice.requestAccess(for: .audio) { ok in
    granted = ok
    sem.signal()
}
sem.wait()
FileHandle.standardError.write(Data("mic access granted: \(granted)\n".utf8))
|
|
22
|
+
|
|
23
|
+
let engine = AVAudioEngine()
let input = engine.inputNode
// Voice processing (echo cancellation) is mandatory for this tool: bail out if it cannot be enabled.
do {
    try input.setVoiceProcessingEnabled(true)
} catch {
    FileHandle.standardError.write(Data("AEC unavailable: \(error)\n".utf8))
    exit(1)
}
// The VP I/O unit is full-duplex: input stays muted unless the output side renders too. The
// mixer→output connection must use the VP output's hardware format (nil/default fails
// kAUInitialize -10875).
let outHw = engine.outputNode.outputFormat(forBus: 0)
engine.connect(engine.mainMixerNode, to: engine.outputNode, format: outHw)
// Playback is audible now (the volume was 0 when this path was only a keep-alive).
engine.mainMixerNode.outputVolume = 1
// VP ducks all other system audio by default — turn that off (we ARE the audio).
if #available(macOS 14.0, *) {
    input.voiceProcessingOtherAudioDuckingConfiguration = .init(enableAdvancedDucking: false, duckingLevel: .min)
}

// ---- playback: framed s16le 44.1k mono on stdin → player node (the VPIO reference signal) ----
let playFmt = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: 44100, channels: 1, interleaved: false)!
let player = AVAudioPlayerNode()
engine.attach(player)
engine.connect(player, to: engine.mainMixerNode, format: playFmt)
|
|
44
|
+
|
|
45
|
+
// ---- residual-echo gate: VPIO suppresses our own playback ~40dB, but modern STT transcribes
|
|
46
|
+
// -40dB speech just fine — so acoustic cancellation alone still leaks ghost turns. While WE are
|
|
47
|
+
// audible (scheduled buffers outstanding + a short tail), gate the mic by level: the residue sits
|
|
48
|
+
// near a quiet baseline (tracked adaptively), a real barging user is 10-100x louder and passes. ----
|
|
49
|
+
// Shared gate state. It is touched by both the capture tap and the stdin reader, which take
// `gateLock` around every access.
let gateLock = NSLock()
// CACurrentMediaTime deadline up to which our own playback counts as audible; extended as
// buffers are scheduled and shifted on RESUME.
var audibleUntil = 0.0
// Non-zero while an overlap pause is active. After a ~300ms tail the gate OPENS: nothing is
// playing, so nothing can bleed, and whatever the mic hears is the user. (Gating for the whole
// pause muted the very speech the pause existed to listen for.)
var pausedSince = 0.0
// Adaptive EMA of the residual mic level observed while we are audible.
var echoBaseline: Float = 0
// Playback-envelope-aware gating: residue scales with how LOUD we are right now. Per-window rms
// of the scheduled audio on the playerTime timeline × the adaptive leak factor = expected residue.
let ENV_WIN: Int = 2048 // samples per envelope window @44.1k (~46ms)
var schedEnv = [Float]() // rms per scheduled window (playerTime timeline; cleared on FLUSH)
var leak: Float = 0.05 // mic-residue / playback-loudness ratio (adaptive EMA, clamped)
// Escape hatch while tuning: MIC_GATE=0 disables the gate entirely.
let gateEnabled = ProcessInfo.processInfo.environment["MIC_GATE"] != "0"
// TEST HARNESS: INJECT frames (stdin, len==0xFFFFFFFD) queue s16le 16k mono "user speech" that
// the tap MIXES into the real mic stream BEFORE the gate, so the full live pipeline runs
// unchanged while a conversation simulator plays the human. Like a real voice, injected speech
// is in NO echo-cancellation reference.
var injectBuf = [Int16]()
|
|
67
|
+
|
|
68
|
+
/// Playback rms (s16 scale, as stored in `schedEnv`) around the sample being rendered NOW.
///
/// Looks at the envelope window containing the player's current sample time plus one window on
/// either side, because the alignment between player time and the mic tap is only approximate.
/// Reads `schedEnv` without locking; the capture tap calls this while holding `gateLock`.
///
/// - Returns: the largest rms among the (up to three) windows that exist, or 0 when the player
///   has no render/player time or no window in range has been scheduled.
func playbackEnvNow() -> Float {
    guard let nodeTime = player.lastRenderTime,
          let playerTime = player.playerTime(forNodeTime: nodeTime) else { return 0 }
    let w = Int(playerTime.sampleTime) / ENV_WIN
    // sampleTime is signed: with w <= -2 the closed range max(0, w-1)...(w+1) would have its
    // upper bound below its lower bound, which traps. Clamp both ends and bail out when the
    // window range is empty.
    let lo = max(0, w - 1)
    let hi = min(w + 1, schedEnv.count - 1)
    guard lo <= hi else { return 0 }
    return schedEnv[lo...hi].reduce(0) { max($0, $1) }
}
|
|
76
|
+
|
|
77
|
+
// ---- capture: VPIO exposes a multi-channel (e.g. 9ch) format with IDENTICAL channels (measured —
|
|
78
|
+
// processed mono replicated); AVAudioConverter's N→1 mapping yields silence — take channel 0
|
|
79
|
+
// manually and decimate to 16k by averaging. ----
|
|
80
|
+
let inFmt = input.outputFormat(forBus: 0)
// NOTE(review): integer decimation only lands on exactly 16 kHz when the input rate is a
// multiple of 16000 (48k → 3). At 44.1k this gives decim = 2, i.e. 22.05 kHz output — confirm
// the rate the voice-processing input actually reports.
let decim = max(1, Int(inFmt.sampleRate / 16000))
let out = FileHandle.standardOutput

// Capture tap: take channel 0, box-average `decim` input samples per output sample, mix any
// injected test speech, apply the residual-echo gate, and write s16 mono PCM to stdout.
input.installTap(onBus: 0, bufferSize: 2048, format: inFmt) { buf, _ in
    guard let ch = buf.floatChannelData else { return }
    let src = ch[0]
    let frameCount = Int(buf.frameLength)

    // Decimate by averaging; clamp to [-1, 1] before converting to Int16.
    var pcm = [Int16]()
    pcm.reserveCapacity(frameCount / decim + 1)
    var i = 0
    while i + decim <= frameCount {
        var acc: Float = 0
        for j in 0..<decim { acc += src[i + j] }
        let v = max(-1, min(1, acc / Float(decim)))
        pcm.append(Int16(v * 32767))
        i += decim
    }

    gateLock.lock()
    // Mix simulated user speech (pre-gate, post-AEC — like a real voice).
    if !injectBuf.isEmpty, !pcm.isEmpty {
        let mixed = min(injectBuf.count, pcm.count)
        for k in 0..<mixed { pcm[k] = Int16(clamping: Int32(pcm[k]) + Int32(injectBuf[k])) }
        injectBuf.removeFirst(mixed)
    }
    // Level of what will actually be emitted (after the mix), on the s16 scale.
    var sumSq: Float = 0
    for s in pcm { let f = Float(s) / 32767; sumSq += f * f }
    let rms = pcm.isEmpty ? 0 : (sumSq / Float(pcm.count)).squareRoot() * 32767

    let nowT = CACurrentMediaTime()
    // While paused only a 0.3s tail counts as audible; otherwise the scheduled-audio deadline
    // plus a 0.8s tail.
    let audible = pausedSince > 0 ? nowT < pausedSince + 0.3 : nowT < audibleUntil + 0.8
    var threshold: Float = 0
    if audible {
        if echoBaseline == 0 { echoBaseline = max(rms, 8) }
        // Track the baseline only while the level looks like residue (don't learn the user's voice).
        if rms < echoBaseline * 4 { echoBaseline = echoBaseline * 0.9 + max(rms, 8) * 0.1 }
        let env = playbackEnvNow()
        // Learn the leak ratio ONLY from clearly-residue-shaped moments (≤2.5× current estimate):
        // a looser bound learned the USER'S overlapping speech as leak and inflated the threshold
        // until speech itself was gated. Clamp low: leak beyond ~0.2 means AEC is broken, not
        // that speech should be silenced.
        if env > 300, rms < leak * env * 2.5 {
            leak = min(0.2, max(0.005, leak * 0.9 + (rms / env) * 0.1))
        }
        // 2× (not 3×): user speech must reach the STT, which acts as the VAD/discriminator.
        threshold = max(2 * leak * env, echoBaseline * 4, 120)
    }
    // User speech must clearly dominate the expected residue to pass.
    let gate = gateEnabled && audible && rms < threshold
    gateLock.unlock()

    // Keep the stream's timing — send silence, not nothing.
    if gate { for k in 0..<pcm.count { pcm[k] = 0 } }
    pcm.withUnsafeBufferPointer { p in
        out.write(Data(buffer: p))
    }
}
|
|
136
|
+
|
|
137
|
+
// Start the engine (fatal on failure), then start the player so scheduled buffers render.
do {
    try engine.start()
} catch {
    FileHandle.standardError.write(Data("engine start failed: \(error)\n".utf8))
    exit(1)
}
player.play()
|
|
142
|
+
|
|
143
|
+
// stdin reader (background): framed playback + in-band FLUSH. EOF = parent gone → clean exit.
|
|
144
|
+
// Background stdin reader. Wire format: [u32le len][payload]. Control values of `len`:
//   0          FLUSH  — drop queued+playing audio, restart the player for the next turn
//   0xFFFFFFFF PAUSE  — hold playback at the current sample
//   0xFFFFFFFE RESUME — continue from the held sample
//   0xFFFFFFFD INJECT — followed by [u32le plen][s16le 16k mono] simulated user speech
// Any other value is a playback frame of `len` bytes of s16le mono 44.1kHz PCM (capped at 4MB).
// EOF or a malformed length ends the loop, stops the engine and exits.
DispatchQueue.global(qos: .userInteractive).async {
    let stdin = FileHandle.standardInput
    var pausedAt: Double = 0 // overlap pause (control frames); resume shifts the gate window
    // Envelope window that is still being filled. Frames are not required to be multiples of
    // ENV_WIN, but playbackEnvNow() indexes schedEnv as sampleTime / ENV_WIN, so every entry
    // must cover exactly ENV_WIN scheduled samples; a partial window is carried over into the
    // next frame instead of being closed short.
    var envSumSq: Float = 0
    var envFill = 0

    /// Reads exactly `n` bytes, or returns nil on EOF.
    func readN(_ n: Int) -> Data? {
        var d = Data()
        while d.count < n {
            let chunk = stdin.readData(ofLength: n - d.count)
            if chunk.isEmpty { return nil } // EOF
            d.append(chunk)
        }
        return d
    }

    /// Reads one little-endian u32, or returns nil on EOF.
    func readU32LE() -> UInt32? {
        guard let d = readN(4) else { return nil }
        return UInt32(littleEndian: d.withUnsafeBytes { $0.loadUnaligned(as: UInt32.self) })
    }

    while true {
        guard let len = readU32LE() else { break }

        if len == 0 { // FLUSH: stop drops every scheduled buffer; restart for the next turn
            player.stop()
            player.play()
            pausedAt = 0
            envSumSq = 0
            envFill = 0
            // gate tail only; the envelope timeline resets together with playerTime
            gateLock.lock(); pausedSince = 0; audibleUntil = CACurrentMediaTime(); schedEnv.removeAll(); gateLock.unlock()
            continue
        }
        if len == UInt32.max { // PAUSE: hold playback exactly (overlap trail-off; nothing is lost)
            if pausedAt == 0 {
                player.pause()
                pausedAt = CACurrentMediaTime()
                gateLock.lock(); pausedSince = pausedAt; gateLock.unlock()
            }
            continue
        }
        if len == UInt32.max - 2 { // INJECT (test harness): queue s16le 16k mono simulated user speech
            guard let plen = readU32LE(), plen <= 1 << 22, let payload = readN(Int(plen)) else { break }
            payload.withUnsafeBytes { raw in
                let v = raw.bindMemory(to: Int16.self)
                gateLock.lock(); injectBuf.append(contentsOf: v); gateLock.unlock()
            }
            continue
        }
        if len == UInt32.max - 1 { // RESUME: continue from the precise sample; gate window shifts by the pause
            if pausedAt > 0 {
                gateLock.lock()
                audibleUntil += CACurrentMediaTime() - pausedAt
                pausedSince = 0
                gateLock.unlock()
                pausedAt = 0
                player.play()
            }
            continue
        }

        // Playback frame.
        guard len <= 1 << 22, let payload = readN(Int(len)) else { break } // 4MB sanity cap
        let frames = payload.count / 2
        guard frames > 0,
              let buf = AVAudioPCMBuffer(pcmFormat: playFmt, frameCapacity: AVAudioFrameCount(frames)),
              let dst = buf.floatChannelData?[0] else { continue }
        buf.frameLength = AVAudioFrameCount(frames)
        payload.withUnsafeBytes { raw in
            let s = raw.bindMemory(to: Int16.self)
            for i in 0..<frames { dst[i] = Float(s[i]) / 32768.0 }
            // Playback envelope: rms per ENV_WIN-sample window on the playerTime timeline. The
            // last entry is provisional while its window is still filling and is updated in place.
            gateLock.lock()
            var i = 0
            while i < frames {
                let take = min(ENV_WIN - envFill, frames - i)
                for j in 0..<take { let v = dst[i + j]; envSumSq += v * v }
                envFill += take
                let level = (envSumSq / Float(envFill)).squareRoot() * 32767
                if envFill > take, !schedEnv.isEmpty {
                    schedEnv[schedEnv.count - 1] = level // continuing a window opened by an earlier frame
                } else {
                    schedEnv.append(level)
                }
                if envFill == ENV_WIN { envFill = 0; envSumSq = 0 }
                i += take
            }
            gateLock.unlock()
        }
        player.scheduleBuffer(buf, completionHandler: nil)
        // Extend the audible window by this buffer's duration (the tap adds a 0.8s tail on top).
        gateLock.lock()
        let nowT = CACurrentMediaTime()
        audibleUntil = max(audibleUntil, nowT) + Double(frames) / 44100.0
        gateLock.unlock()
    }
    engine.stop()
    exit(0)
}
|
|
225
|
+
|
|
226
|
+
// Graceful teardown: SIGKILL leaves the VPIO session wedged inside coreaudiod (repeated kills
|
|
227
|
+
// degrade system-wide echo cancellation until coreaudiod restarts) — stop the engine first.
|
|
228
|
+
// SIGTERM/SIGINT are delivered through dispatch sources rather than raw signal() handlers:
// engine.stop() is not async-signal-safe, so it must not run inside a signal handler. The
// default disposition is ignored first, otherwise the process would be killed before the
// source fires. The array keeps the sources alive for the life of the process.
let teardownSources: [DispatchSourceSignal] = [SIGTERM, SIGINT].map { signo in
    signal(signo, SIG_IGN)
    let source = DispatchSource.makeSignalSource(signal: signo, queue: .global(qos: .userInitiated))
    source.setEventHandler {
        engine.stop()
        exit(0)
    }
    source.resume()
    return source
}
FileHandle.standardError.write(Data("aec duplex audio running\n".utf8))
// Park the main thread; the tap, stdin reader and signal sources run on their own threads/queues.
RunLoop.main.run()
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@livx.cc/agentx",
|
|
3
|
-
"version": "0.96.10",
|
|
3
|
+
"version": "0.96.11",
|
|
4
4
|
"description": "Edge-native AI agent runtime — drives a virtual filesystem via any LLM (ai.libx.js). Same bytes run in node, browser, or edge.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|