@mcptoolshop/voice-engine-dsp 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +78 -0
- package/dist/src/adapters/AudioWorkletProcessor.d.ts +31 -0
- package/dist/src/adapters/AudioWorkletProcessor.d.ts.map +1 -0
- package/dist/src/adapters/AudioWorkletProcessor.js +77 -0
- package/dist/src/adapters/NodeStreamAutotune.d.ts +28 -0
- package/dist/src/adapters/NodeStreamAutotune.d.ts.map +1 -0
- package/dist/src/adapters/NodeStreamAutotune.js +103 -0
- package/dist/src/analysis/PitchTrackerRefV1.d.ts +13 -0
- package/dist/src/analysis/PitchTrackerRefV1.d.ts.map +1 -0
- package/dist/src/analysis/PitchTrackerRefV1.js +136 -0
- package/dist/src/analysis/VoicingDetectorRefV1.d.ts +13 -0
- package/dist/src/analysis/VoicingDetectorRefV1.d.ts.map +1 -0
- package/dist/src/analysis/VoicingDetectorRefV1.js +77 -0
- package/dist/src/index.d.ts +8 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +22 -0
- package/dist/src/prosody/AccentRenderer.d.ts +15 -0
- package/dist/src/prosody/AccentRenderer.d.ts.map +1 -0
- package/dist/src/prosody/AccentRenderer.js +66 -0
- package/dist/src/prosody/Presets.d.ts +3 -0
- package/dist/src/prosody/Presets.d.ts.map +1 -0
- package/dist/src/prosody/Presets.js +49 -0
- package/dist/src/prosody/SafetyRails.d.ts +21 -0
- package/dist/src/prosody/SafetyRails.d.ts.map +1 -0
- package/dist/src/prosody/SafetyRails.js +65 -0
- package/dist/src/transformation/FormantStrategyV1.d.ts +5 -0
- package/dist/src/transformation/FormantStrategyV1.d.ts.map +1 -0
- package/dist/src/transformation/FormantStrategyV1.js +39 -0
- package/dist/src/transformation/PitchShifterRefV1.d.ts +9 -0
- package/dist/src/transformation/PitchShifterRefV1.d.ts.map +1 -0
- package/dist/src/transformation/PitchShifterRefV1.js +120 -0
- package/dist/src/tuning/AutotuneExecutor.d.ts +16 -0
- package/dist/src/tuning/AutotuneExecutor.d.ts.map +1 -0
- package/dist/src/tuning/AutotuneExecutor.js +217 -0
- package/dist/src/tuning/CorrectionController.d.ts +5 -0
- package/dist/src/tuning/CorrectionController.d.ts.map +1 -0
- package/dist/src/tuning/CorrectionController.js +91 -0
- package/dist/src/tuning/CorrectionControllerRefV1.d.ts +6 -0
- package/dist/src/tuning/CorrectionControllerRefV1.d.ts.map +1 -0
- package/dist/src/tuning/CorrectionControllerRefV1.js +63 -0
- package/dist/src/tuning/ScaleQuantizer.d.ts +7 -0
- package/dist/src/tuning/ScaleQuantizer.d.ts.map +1 -0
- package/dist/src/tuning/ScaleQuantizer.js +43 -0
- package/dist/src/tuning/StreamingAutotuneEngine.d.ts +43 -0
- package/dist/src/tuning/StreamingAutotuneEngine.d.ts.map +1 -0
- package/dist/src/tuning/StreamingAutotuneEngine.js +389 -0
- package/dist/src/tuning/StreamingAutotuneEngine_Fixed.d.ts +36 -0
- package/dist/src/tuning/StreamingAutotuneEngine_Fixed.d.ts.map +1 -0
- package/dist/src/tuning/StreamingAutotuneEngine_Fixed.js +344 -0
- package/dist/src/tuning/TargetCurveGenerator.d.ts +5 -0
- package/dist/src/tuning/TargetCurveGenerator.d.ts.map +1 -0
- package/dist/src/tuning/TargetCurveGenerator.js +69 -0
- package/dist/src/tuning/TargetCurveRefV1.d.ts +6 -0
- package/dist/src/tuning/TargetCurveRefV1.d.ts.map +1 -0
- package/dist/src/tuning/TargetCurveRefV1.js +69 -0
- package/dist/src/utils/AudioBufferUtils.d.ts +3 -0
- package/dist/src/utils/AudioBufferUtils.d.ts.map +1 -0
- package/dist/src/utils/AudioBufferUtils.js +19 -0
- package/dist/src/version.d.ts +2 -0
- package/dist/src/version.d.ts.map +1 -0
- package/dist/src/version.js +4 -0
- package/package.json +38 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.CorrectionController = void 0;
|
|
4
|
+
class CorrectionController {
    /**
     * Computes a per-frame pitch-correction strength envelope (Q-format, 0..10000).
     *
     * @param {object} f0      Pitch track: f0MhzQ[i]/1000 = Hz, confQ[i]/10000 = confidence,
     *                         plus sampleRateHz/frameHz/hopSamples/t0Samples timing fields.
     * @param {object} voicing Voicing mask; voicedQ[i] > 0 marks a voiced frame.
     * @param {object} target  Target curve; targetCentsQ[i]/1000 = target pitch in cents.
     * @param {object} plan    Correction plan; reads plan.parameters.* quantized controls
     *                         (snapStrengthQ, consonantProtectionQ, retuneSpeedQ,
     *                         globalStrengthQ, attackMsQ, releaseMsQ — all 0..10000/ms).
     * @returns {object} Envelope { sampleRateHz, frameHz, hopSamples, t0Samples, strengthQ }.
     */
    generate(f0, voicing, target, plan) {
        const len = f0.f0MhzQ.length;
        const strengthQ = new Int16Array(len);
        const baseSnap = plan.parameters.snapStrengthQ / 10000.0; // 0..1
        const protection = plan.parameters.consonantProtectionQ / 10000.0; // 0..1
        const speedVal = plan.parameters.retuneSpeedQ; // 0..10000
        // Phase 6 Controls
        const globalMix = (plan.parameters.globalStrengthQ ?? 10000) / 10000.0; // 0..1
        const attackMs = plan.parameters.attackMsQ ?? 0;
        const releaseMs = plan.parameters.releaseMsQ ?? 0;
        // FIX: derive the frame duration from the track instead of the hard-coded
        // 10 ms debugging value that had been left in; fall back to 10 ms only when
        // the track carries no usable timing information.
        const frameDur = (f0.hopSamples > 0 && f0.sampleRateHz > 0)
            ? f0.hopSamples / f0.sampleRateHz
            : 0.01;
        // One-pole smoothing coefficients; an attack/release of 0 ms means "instant".
        const alphaAtt = attackMs > 0 ? (1.0 - Math.exp(-frameDur / (attackMs / 1000.0))) : 1.0;
        const alphaRel = releaseMs > 0 ? (1.0 - Math.exp(-frameDur / (releaseMs / 1000.0))) : 1.0;
        let currentStrength = 0;
        // Retune speed maps linearly onto the correction smoother's alpha:
        // speed=10000 -> alpha=1.0 (instant snap); speed=0 -> alpha=0.01 (very lazy).
        const alpha = Math.max(0.01, speedVal / 10000.0);
        let smoothDiff = 0;
        for (let i = 0; i < len; i++) {
            const isVoiced = voicing.voicedQ[i] > 0;
            if (!isVoiced) {
                strengthQ[i] = 0;
                smoothDiff = 0; // Reset smoother state across unvoiced gaps
                continue;
            }
            // 1. Raw difference (target - input) in cents. Input cents are recomputed
            // from the track here so this stage stays pure (no hidden state).
            let f0Hz = f0.f0MhzQ[i] / 1000.0;
            if (f0Hz < 20)
                f0Hz = 20; // Guard against log2 of junk / near-zero pitch
            const inputCents = 6900 + 1200 * Math.log2(f0Hz / 440); // A4 = 6900 cents
            const targetVal = target.targetCentsQ[i] / 1000.0; // milli-cents -> cents
            const rawDiff = targetVal - inputCents;
            // 2. Retune speed: one-pole smooth the applied correction toward rawDiff.
            smoothDiff += alpha * (rawDiff - smoothDiff);
            // 3. Strength ratio = achieved correction / wanted correction.
            let ratio = 0;
            if (Math.abs(rawDiff) > 0.001) {
                ratio = smoothDiff / rawDiff;
            }
            else {
                ratio = 1.0; // Already on target
            }
            // Clamp 0..1: overshoot (ringing) would push ratio past 1 and cause artifacts.
            ratio = Math.max(0, Math.min(1, ratio));
            // 4. Consonant protection: low pitch confidence lowers the strength ceiling.
            // protection=1 & conf=0 -> ceiling 0; protection=0 -> ceiling 1.
            const conf = f0.confQ[i] / 10000.0;
            const confCheck = Math.max(0, 1.0 - protection * (1.0 - conf));
            // 5. Final target strength = snap * ratio * protection * global mix (Phase 6).
            const targetStrength = baseSnap * ratio * confCheck * globalMix;
            // 6. Attack/release smoothing of the strength envelope itself.
            if (targetStrength > currentStrength) {
                currentStrength += (targetStrength - currentStrength) * alphaAtt;
            }
            else {
                currentStrength += (targetStrength - currentStrength) * alphaRel;
            }
            strengthQ[i] = Math.floor(currentStrength * 10000);
        }
        return {
            sampleRateHz: f0.sampleRateHz,
            frameHz: f0.frameHz,
            hopSamples: f0.hopSamples,
            t0Samples: f0.t0Samples,
            strengthQ
        };
    }
}
|
|
91
|
+
exports.CorrectionController = CorrectionController;
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { F0TrackV1, VoicingMaskV1, TuneScoreV1, CorrectionEnvelopeV1 } from "@mcptoolshop/voice-engine-core";
|
|
2
|
+
/**
 * Reference (V1) correction controller: derives a per-frame correction
 * strength envelope from a pitch track, a voicing mask, and a tune score.
 */
export declare class CorrectionControllerRefV1 {
    constructor();
    /**
     * Builds the correction envelope for the given track.
     * Throws for any score mode other than "scale" (the only mode supported in V1).
     */
    generate(f0Track: F0TrackV1, voicing: VoicingMaskV1, score: TuneScoreV1): CorrectionEnvelopeV1;
}
|
|
6
|
+
//# sourceMappingURL=CorrectionControllerRefV1.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"CorrectionControllerRefV1.d.ts","sourceRoot":"","sources":["../../../src/tuning/CorrectionControllerRefV1.ts"],"names":[],"mappings":"AAAA,OAAO,EACH,SAAS,EAAE,aAAa,EAAE,WAAW,EAAE,oBAAoB,EAE9D,MAAM,gCAAgC,CAAC;AAExC,qBAAa,yBAAyB;;IAG3B,QAAQ,CACV,OAAO,EAAE,SAAS,EAClB,OAAO,EAAE,aAAa,EACtB,KAAK,EAAE,WAAW,GACpB,oBAAoB;CA+D1B"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.CorrectionControllerRefV1 = void 0;
|
|
4
|
+
class CorrectionControllerRefV1 {
    constructor() { }
    /**
     * Derives a per-frame correction strength envelope (Q: 0..10000).
     *
     * Pass 1 scales the configured snap strength by per-frame pitch confidence.
     * Pass 2 softens segment boundaries: frames within 3 frames of an unvoiced
     * frame are attenuated proportionally to their distance from it.
     *
     * Only "scale" mode scores are supported; anything else throws.
     */
    generate(f0Track, voicing, score) {
        const frameTotal = f0Track.f0MhzQ.length;
        const gains = new Int16Array(frameTotal);
        if (score.mode !== "scale") {
            throw new Error("Only scale mode supported in V1");
        }
        const config = score;
        const baseStrength = config.snapStrengthQ;
        // Pass 1: raw strength = snapStrengthQ scaled linearly by confidence.
        for (let frame = 0; frame < frameTotal; frame += 1) {
            if (voicing.voicedQ[frame] > 0) {
                const confidence = f0Track.confQ[frame];
                gains[frame] = Math.round(baseStrength * (confidence / 10000));
            }
            else {
                gains[frame] = 0;
            }
        }
        // Pass 2: two-sweep distance transform measuring, per frame, the distance
        // to the nearest unvoiced frame (unvoiced frames seed distance 0).
        const boundaryDist = Int32Array.from(voicing.voicedQ, (v) => (v === 0 ? 0 : frameTotal + 1));
        // Left-to-right sweep.
        for (let frame = 1; frame < frameTotal; frame += 1) {
            boundaryDist[frame] = Math.min(boundaryDist[frame], boundaryDist[frame - 1] + 1);
        }
        // Right-to-left sweep.
        for (let frame = frameTotal - 2; frame >= 0; frame -= 1) {
            boundaryDist[frame] = Math.min(boundaryDist[frame], boundaryDist[frame + 1] + 1);
        }
        // Erosion: within 3 frames of a boundary, scale strength by dist/3
        // (dist 0 is already zeroed; dist 1 -> 1/3; dist 2 -> 2/3).
        for (let frame = 0; frame < frameTotal; frame += 1) {
            const d = boundaryDist[frame];
            if (d >= 3) {
                continue;
            }
            gains[frame] = Math.floor((gains[frame] * d) / 3);
        }
        return {
            sampleRateHz: f0Track.sampleRateHz,
            frameHz: f0Track.frameHz,
            hopSamples: f0Track.hopSamples,
            t0Samples: f0Track.t0Samples,
            strengthQ: gains
        };
    }
}
|
|
63
|
+
exports.CorrectionControllerRefV1 = CorrectionControllerRefV1;
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export declare class ScaleQuantizer {
    /**
     * Quantizes an input pitch (in cents relative to A4=440Hz=6900) to the nearest allowed note.
     *
     * @param inputCents - Absolute pitch in cents (A4 = 6900).
     * @param allowedPitchClasses - Permitted pitch classes (0..11); the nearest
     *   allowed note within +/-6 semitones of the closest chromatic note wins.
     * @returns The quantized pitch in cents (always a multiple of 100).
     */
    static quantize(inputCents: number, allowedPitchClasses: number[]): number;
}
|
|
7
|
+
//# sourceMappingURL=ScaleQuantizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ScaleQuantizer.d.ts","sourceRoot":"","sources":["../../../src/tuning/ScaleQuantizer.ts"],"names":[],"mappings":"AAAA,qBAAa,cAAc;IACvB;;OAEG;IACH,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,mBAAmB,EAAE,MAAM,EAAE,GAAG,MAAM;CAuC7E"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ScaleQuantizer = void 0;
|
|
4
|
+
class ScaleQuantizer {
    /**
     * Quantizes an input pitch (in cents, A4 = 440 Hz = 6900) to the nearest note
     * whose pitch class appears in `allowedPitchClasses`.
     *
     * The search covers +/-6 semitones around the nearest chromatic note; if no
     * allowed note is found there (e.g. an empty allow-list), the nearest
     * chromatic note itself is returned. Result is always a multiple of 100.
     */
    static quantize(inputCents, allowedPitchClasses) {
        // Work in fractional MIDI space: 100 cents per semitone (A4 = 69).
        const midiFloat = inputCents / 100.0;
        const nearestNote = Math.round(midiFloat);
        const toPitchClass = (note) => ((note % 12) + 12) % 12;
        // Fast path: the nearest chromatic note is already in the scale.
        if (allowedPitchClasses.includes(toPitchClass(nearestNote))) {
            return nearestNote * 100;
        }
        // Scan candidates low-to-high; strict "<" means the lower note wins ties.
        let winner = nearestNote;
        let winnerDist = Infinity;
        for (let offset = -6; offset <= 6; offset += 1) {
            const candidateNote = nearestNote + offset;
            if (!allowedPitchClasses.includes(toPitchClass(candidateNote))) {
                continue;
            }
            const distance = Math.abs(candidateNote - midiFloat);
            if (distance < winnerDist) {
                winnerDist = distance;
                winner = candidateNote;
            }
        }
        return winner * 100;
    }
}
|
|
43
|
+
exports.ScaleQuantizer = ScaleQuantizer;
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import { ProsodyRuntimeStateV1 } from '../../../voice-engine-core/src/prosody/StreamingProsodyTypes';
|
|
2
|
+
/**
 * Real-time autotune engine: segments incoming frames into voiced regions,
 * quantizes pitch to an allowed scale with hysteresis/ramping, overlays
 * scheduled prosody events, and emits per-frame target pitch in cents.
 */
export declare class StreamingAutotuneEngine {
    private state;
    private config;
    private preset;
    private frameCount;
    private eventScheduler;
    private _reusableEventList;
    private allowedSet;
    private _lastOutputCents;
    private _mockPitchHz;
    constructor(config: any, preset: any);
    private hzToCents;
    private quantize;
    /** Last target pitch computed by the pipeline, in absolute cents. */
    getLastOutputCents(): number;
    /** Test hook: injects a fixed pitch used by process()'s mock analysis. */
    setMockPitch(hz: number): void;
    /** Schedules prosody events (sorted by time) for the frame pipeline. */
    enqueueEvents(events: any[]): void;
    /**
     * Processes a raw audio chunk in 128-sample hops; returns a copy of the
     * audio plus one target (cents) per hop.
     */
    process(chunk: Float32Array): {
        audio: Float32Array;
        targets: Float32Array;
    };
    /** Runs one analysis frame through the pipeline (delegates to processFramePipeline). */
    processFrame(analysis: {
        energyDb: number;
        confidenceQ: number;
        pitchHz: number;
    }, frameIndex: number): void;
    /** Core per-frame pipeline: segmentation, quantization, events, reconstruction. */
    processFramePipeline(analysis: {
        energyDb: number;
        confidenceQ: number;
        pitchHz: number;
    }, frameIndex: number): void;
    private getCurrentRampValue;
    private handleSegmentStart;
    private handleSegmentEnd;
    /** Deep-copies the runtime state together with the prosody API version. */
    snapshot(): {
        version: string;
        state: ProsodyRuntimeStateV1;
    };
    /** Restores a snapshot; rejects missing/major-version-incompatible snapshots. */
    restore(snapshot: any): void;
    /** Resets the runtime state to its initial values. */
    reset(): void;
    private createInitialState;
}
|
|
43
|
+
//# sourceMappingURL=StreamingAutotuneEngine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"StreamingAutotuneEngine.d.ts","sourceRoot":"","sources":["../../../src/tuning/StreamingAutotuneEngine.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,qBAAqB,EAAY,MAAM,8DAA8D,CAAC;AA4D/G,qBAAa,uBAAuB;IAChC,OAAO,CAAC,KAAK,CAAwB;IACrC,OAAO,CAAC,MAAM,CAAM;IACpB,OAAO,CAAC,MAAM,CAAM;IACpB,OAAO,CAAC,UAAU,CAAa;IAC/B,OAAO,CAAC,cAAc,CAAsB;IAC5C,OAAO,CAAC,kBAAkB,CAAa;IACvC,OAAO,CAAC,UAAU,CAAc;IAChC,OAAO,CAAC,gBAAgB,CAAa;IACrC,OAAO,CAAC,YAAY,CAAa;gBAErB,MAAM,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG;IAUpC,OAAO,CAAC,SAAS;IAIjB,OAAO,CAAC,QAAQ;IAuBT,kBAAkB,IAAI,MAAM;IAI5B,YAAY,CAAC,EAAE,EAAE,MAAM;IAIvB,aAAa,CAAC,MAAM,EAAE,GAAG,EAAE;IAIlC,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG;QAAE,KAAK,EAAE,YAAY,CAAC;QAAC,OAAO,EAAE,YAAY,CAAA;KAAE;IAsC5E,YAAY,CAAC,QAAQ,EAAE;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,WAAW,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,UAAU,EAAE,MAAM,GAAG,IAAI;IAI5G,oBAAoB,CAAC,QAAQ,EAAE;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,WAAW,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,EAAE,UAAU,EAAE,MAAM,GAAG,IAAI;IAqKpH,OAAO,CAAC,mBAAmB;IAQ3B,OAAO,CAAC,kBAAkB;IAe1B,OAAO,CAAC,gBAAgB;IAMrB,QAAQ,IAAI;QAAE,OAAO,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,qBAAqB,CAAA;KAAE;IASzD,OAAO,CAAC,QAAQ,EAAE,GAAG,GAAG,IAAI;IAqCnC,KAAK,IAAI,IAAI;IAIb,OAAO,CAAC,kBAAkB;CAqC7B"}
|
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.StreamingAutotuneEngine = void 0;
|
|
4
|
+
const OnlineStats_1 = require("../../../voice-engine-core/src/prosody/OnlineStats");
|
|
5
|
+
const version_1 = require("../version");
|
|
6
|
+
/**
 * Fixed-capacity ring buffer of time-ordered prosody events.
 * When full, the oldest event is evicted to make room for new ones.
 */
class RingBufferScheduler {
    buffer;
    capacity;
    head = 0;
    count = 0;

    constructor(capacity = 1024) {
        this.capacity = capacity;
        this.buffer = new Array(capacity);
    }

    /** End timestamp of an event: explicit endTime, else time + duration (default 10). */
    #endOf(event) {
        if (event.endTime !== undefined) {
            return event.endTime;
        }
        return event.time + (event.duration || 10);
    }

    /**
     * Appends a batch of events. The batch is sorted by time in place so the
     * buffer stays locally ordered; on overflow the oldest entry is dropped.
     */
    enqueue(events) {
        events.sort((a, b) => a.time - b.time);
        for (const event of events) {
            if (this.count === this.capacity) {
                // Full: advance head to evict the oldest so the newest always fits.
                this.head = (this.head + 1) % this.capacity;
            }
            else {
                this.count += 1;
            }
            const tailIndex = (this.head + this.count - 1) % this.capacity;
            this.buffer[tailIndex] = event;
        }
    }

    /** Drops events from the front whose end time lies strictly before currentTime. */
    pruneOldEvents(currentTime) {
        while (this.count > 0) {
            const oldest = this.buffer[this.head];
            if (this.#endOf(oldest) >= currentTime) {
                break;
            }
            this.buffer[this.head] = undefined;
            this.head = (this.head + 1) % this.capacity;
            this.count -= 1;
        }
    }

    /**
     * Fills outList (cleared first, reused to avoid allocation) with every
     * buffered event whose [time, end] interval contains currentTime.
     */
    getActiveEvents(currentTime, outList) {
        outList.length = 0;
        for (let i = 0; i < this.count; i++) {
            const slot = (this.head + i) % this.capacity;
            const event = this.buffer[slot];
            if (!event) {
                continue;
            }
            if (event.time <= currentTime && this.#endOf(event) >= currentTime) {
                outList.push(event);
            }
        }
    }
}
|
|
58
|
+
/**
 * Real-time autotune engine. Per frame it: (1) segments audio into voiced
 * regions with enter/exit hysteresis, (2) tracks an online pitch baseline,
 * (3) quantizes pitch to the allowed scale with hold/hysteresis/ramping,
 * (4) overlays scheduled prosody (accent) events, and (5) reconstructs the
 * final target pitch in cents, cached in _lastOutputCents.
 */
class StreamingAutotuneEngine {
    // Runtime state; full shape defined in createInitialState().
    state;
    config;
    preset;
    frameCount = 0;
    eventScheduler;
    // Scratch list reused by getActiveEvents() to avoid per-frame allocation.
    _reusableEventList = [];
    // Allowed pitch classes (0..11) consulted by quantize().
    allowedSet;
    _lastOutputCents = 0;
    _mockPitchHz = 0;
    /**
     * @param {any} config - Engine settings (allowedPitchClasses, rootOffsetCents,
     *   silence/voicing thresholds, hold/ramp frame counts, pfcStrength, ...).
     * @param {any} preset - Stored on the instance; not read within this class.
     */
    constructor(config, preset) {
        this.config = config;
        this.preset = preset;
        this.state = this.createInitialState();
        this.eventScheduler = new RingBufferScheduler();
        // Default to the full chromatic scale when no allow-list is configured.
        const allowed = config.allowedPitchClasses || [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
        this.allowedSet = new Set(allowed);
    }
    // Hz -> absolute cents (A4 = 440 Hz = 6900); inputs <= 1 Hz map to 0.
    hzToCents(h) {
        return (h > 1.0) ? (6900 + 1200 * Math.log2(h / 440)) : 0;
    }
    /**
     * Snaps cents to the nearest allowed note (relative to rootOffsetCents),
     * searching outward up to 6 semitones. Falls back to the nearest chromatic
     * note when nothing allowed is found. Returns { id: midiNote, center: cents }.
     */
    quantize(c) {
        const rootOffset = this.config.rootOffsetCents || 0;
        const local = c - rootOffset;
        const midi = Math.round(local / 100);
        const pc = ((midi % 12) + 12) % 12;
        if (this.allowedSet.has(pc)) {
            return { id: midi, center: midi * 100 + rootOffset };
        }
        // Expanding search: at each radius the upper candidate is tried first.
        for (let i = 1; i <= 6; i++) {
            let m = midi + i;
            let p = ((m % 12) + 12) % 12;
            if (this.allowedSet.has(p))
                return { id: m, center: m * 100 + rootOffset };
            m = midi - i;
            p = ((m % 12) + 12) % 12;
            if (this.allowedSet.has(p))
                return { id: m, center: m * 100 + rootOffset };
        }
        return { id: midi, center: midi * 100 + rootOffset };
    }
    /** Last target pitch (absolute cents) produced by the pipeline. */
    getLastOutputCents() {
        return this._lastOutputCents;
    }
    /** Test hook: fixed pitch consumed by process()'s mock analysis. */
    setMockPitch(hz) {
        this._mockPitchHz = hz;
    }
    /** Forwards prosody events to the ring-buffer scheduler. */
    enqueueEvents(events) {
        this.eventScheduler.enqueue(events);
    }
    /**
     * Processes an audio chunk in 128-sample hops. Analysis here is a mock
     * (RMS energy + injected mock pitch) pending a real pitch tracker; each
     * hop is run through processFramePipeline and its target cents recorded.
     * Returns a copy of the input audio plus one target per hop.
     */
    process(chunk) {
        const hopSize = 128;
        const numFrames = Math.floor(chunk.length / hopSize);
        const targets = new Float32Array(numFrames);
        for (let i = 0; i < numFrames; i++) {
            const startSample = i * hopSize;
            // Mock Analysis (to be replaced by StreamingPitchTracker)
            let sumSq = 0;
            for (let j = 0; j < hopSize; j++) {
                // Check bounds
                if (startSample + j < chunk.length) {
                    const s = chunk[startSample + j];
                    sumSq += s * s;
                }
            }
            const rms = Math.sqrt(sumSq / hopSize);
            const energyDb = rms > 1e-9 ? 20 * Math.log10(rms) : -100;
            // Mock Pitch/Confidence (0 confidence = unvoiced)
            // In real impl, this comes from F0Decomposer
            const isVoiced = energyDb > -50 && this._mockPitchHz > 0;
            const frameAnalysis = {
                energyDb,
                confidenceQ: isVoiced ? 10000 : 0,
                pitchHz: this._mockPitchHz
            };
            // Using the new pipeline
            this.processFramePipeline(frameAnalysis, this.frameCount++);
            targets[i] = this._lastOutputCents;
        }
        return { audio: chunk.slice(), targets: targets };
    }
    /** Thin alias kept for API compatibility; delegates to processFramePipeline. */
    processFrame(analysis, frameIndex) {
        this.processFramePipeline(analysis, frameIndex);
    }
    /**
     * Core per-frame pipeline. Mutates segmenter/stabilizer/baseline/pfc state
     * and leaves the computed target in _lastOutputCents (unchanged on
     * unvoiced frames).
     */
    processFramePipeline(analysis, frameIndex) {
        const { energyDb, confidenceQ, pitchHz } = analysis;
        const config = this.config;
        const segmenter = this.state.segmenter;
        // Defaults
        const silenceDb = config.silenceThresholdDb ?? -60;
        const voicingLimit = config.voicingThresholdQ ?? 2000;
        const enterLimit = config.voicedEnterFrames ?? 2;
        const exitLimit = config.voicedExitFrames ?? 5;
        // 1. Input Conditions
        const isSpeechCandidate = energyDb > silenceDb;
        const isVoicedCandidate = isSpeechCandidate && (confidenceQ > voicingLimit);
        // 2. Hysteresis Logic: require several consecutive candidate frames
        // before entering a voiced segment, and several non-candidates to leave.
        if (segmenter.isVoiced) {
            if (!isVoicedCandidate) {
                segmenter.exitCount++;
                if (segmenter.exitCount > exitLimit) {
                    this.handleSegmentEnd(frameIndex);
                }
            }
            else {
                segmenter.exitCount = 0;
            }
        }
        else {
            if (isVoicedCandidate) {
                segmenter.enterCount++;
                if (segmenter.enterCount >= enterLimit) {
                    this.handleSegmentStart(frameIndex);
                }
            }
            else {
                segmenter.enterCount = 0;
            }
        }
        // 3. Process Voiced Frame Pipeline
        if (segmenter.isVoiced) {
            segmenter.accumulatedConf += confidenceQ;
            segmenter.accumulatedEnergy += energyDb;
            // --- Pipeline Step 1: Decomposition ---
            const rawCents = this.hzToCents(pitchHz);
            // For now, assuming raw input IS the decomposition
            const centerCents = rawCents;
            const residualCents = 0;
            // --- Pipeline Step 2: Baseline/Intent ---
            // Update Online Baseline (using Cents for linear regression on pitch)
            OnlineStats_1.OnlineStats.update(this.state.baseline, frameIndex, rawCents);
            // NOTE(review): this regression result is unused here; Step 5 re-reads it.
            const { slope, intercept } = OnlineStats_1.OnlineStats.getRegression(this.state.baseline);
            const intentCents = centerCents;
            // --- Pipeline Step 3: Stability ---
            const stabilizer = this.state.stabilizer;
            const hysteresis = config.hysteresisCents ?? 15;
            const minHoldFrames = config.minHoldFrames ?? 6;
            const rampFrames = config.rampFrames ?? 3;
            // Calc delta/slope for transition detection.
            // delta is a placeholder 0, so isTransition is always false for now.
            const delta = 0; // TODO: Track previous raw cents
            const isTransition = delta > (config.slopeThreshFrame ?? 5);
            const cand = this.quantize(intentCents);
            // First voiced frame of a segment: adopt the candidate note outright.
            // (currentNoteId === 0 is the "unset" sentinel from handleSegmentStart;
            // NOTE(review): MIDI note 0 would collide with this sentinel.)
            if (stabilizer.currentNoteId === 0) {
                stabilizer.currentNoteId = cand.id;
                stabilizer.lastTargetCents = cand.center;
                stabilizer.holdFrames = 0;
                stabilizer.rampActive = false;
                stabilizer.rampEndCents = cand.center;
                stabilizer.rampStartCents = cand.center;
            }
            else {
                stabilizer.holdFrames++;
                // Switch notes only after the minimum hold, and only when the
                // candidate is better than the current note by > hysteresis cents.
                let shouldSwitch = false;
                if (!isTransition && stabilizer.holdFrames >= minHoldFrames) {
                    const currentErr = Math.abs(intentCents - stabilizer.lastTargetCents);
                    const candErr = Math.abs(intentCents - cand.center);
                    if (currentErr - candErr > hysteresis) {
                        shouldSwitch = true;
                    }
                }
                if (shouldSwitch) {
                    // Start the ramp from the current (possibly mid-ramp) value so
                    // the target pitch never jumps discontinuously.
                    stabilizer.rampStartCents = stabilizer.rampActive ?
                        this.getCurrentRampValue(stabilizer) : stabilizer.lastTargetCents;
                    stabilizer.rampEndCents = cand.center;
                    stabilizer.rampActive = true;
                    stabilizer.rampProgress = 0;
                    stabilizer.currentNoteId = cand.id;
                    stabilizer.lastTargetCents = cand.center;
                    stabilizer.holdFrames = 0;
                }
            }
            // Ramping: advance one frame and interpolate toward the new note.
            let macroCents = stabilizer.lastTargetCents;
            if (stabilizer.rampActive) {
                stabilizer.rampProgress++;
                macroCents = this.getCurrentRampValue(stabilizer);
                if (stabilizer.rampProgress >= rampFrames) {
                    stabilizer.rampActive = false;
                }
            }
            // --- Pipeline Step 4: Events/PFC ---
            this.eventScheduler.pruneOldEvents(frameIndex);
            this.eventScheduler.getActiveEvents(frameIndex, this._reusableEventList);
            // Sum raised-cosine accent bumps from every active event.
            let accentOffset = 0;
            for (const event of this._reusableEventList) {
                const duration = event.duration || 10;
                const strength = event.strength || 0;
                const shape = event.shape || 'rise';
                const radius = duration / 2;
                const d = frameIndex - event.time;
                if (Math.abs(d) <= radius) {
                    const sign = (shape === 'fall' || shape === 'fall-rise') ? -1.0 : 1.0;
                    // Hann-style window centered on the event's time.
                    const w = 0.5 * (1 + Math.cos((Math.PI * d) / radius));
                    accentOffset += w * strength * sign; // Simple additive
                }
            }
            // Update PFC State (post-focus compression: strong accents flatten
            // the pitch toward the baseline, fading out afterwards).
            const pfc = this.state.pfc;
            // Track max accent strength in recent window (simple approach)
            const currentAbsAccent = Math.abs(accentOffset);
            if (currentAbsAccent > 0.01) {
                if (currentAbsAccent > pfc.focusStrength) {
                    pfc.focusStrength = currentAbsAccent;
                    pfc.focusTime = frameIndex;
                }
                pfc.activeFade = 1.0;
            }
            else {
                pfc.activeFade *= 0.95; // Decay
                pfc.focusStrength *= 0.95;
            }
            // --- Pipeline Step 5: Reconstruct ---
            let finalCents = macroCents + accentOffset + residualCents;
            // Apply PFC: compress the output toward the regression baseline,
            // scaled by the fading focus strength and configured pfcStrength.
            if (this.state.pfc.activeFade > 0) {
                const { slope, intercept } = OnlineStats_1.OnlineStats.getRegression(this.state.baseline);
                const baselineAtT = intercept + slope * frameIndex;
                const compressionStrength = this.config.pfcStrength ?? 0.5;
                const factor = this.state.pfc.activeFade * compressionStrength;
                if (!isNaN(intercept)) {
                    finalCents = baselineAtT + (finalCents - baselineAtT) * (1.0 - factor);
                }
            }
            this._lastOutputCents = finalCents;
        }
    }
    /** Linear interpolation between ramp start/end based on rampProgress. */
    getCurrentRampValue(stabilizer) {
        // Simple linear interpolation
        const total = this.config.rampFrames ?? 3;
        if (stabilizer.rampProgress >= total)
            return stabilizer.rampEndCents;
        const t = stabilizer.rampProgress / total;
        return stabilizer.rampStartCents + (stabilizer.rampEndCents - stabilizer.rampStartCents) * t;
    }
    /** Enters a voiced segment: resets counters, accumulators and the stabilizer. */
    handleSegmentStart(index) {
        const segmenter = this.state.segmenter;
        segmenter.isVoiced = true;
        segmenter.enterCount = 0;
        segmenter.exitCount = 0;
        segmenter.currentSegmentStart = index;
        segmenter.accumulatedConf = 0;
        segmenter.accumulatedEnergy = 0;
        // Reset Stabilizer (currentNoteId = 0 is the "unset" sentinel)
        this.state.stabilizer.currentNoteId = 0;
        this.state.stabilizer.holdFrames = 0;
        this.state.stabilizer.rampActive = false;
    }
    /** Leaves a voiced segment; counters reset, accumulators left as-is. */
    handleSegmentEnd(index) {
        const segmenter = this.state.segmenter;
        segmenter.isVoiced = false;
        segmenter.exitCount = 0;
        segmenter.enterCount = 0;
    }
    /**
     * Returns a versioned deep copy of the runtime state. The JSON round-trip
     * turns Float32Arrays into plain index-keyed objects; restore() rehydrates.
     */
    snapshot() {
        // Deep copy state to prevent mutation of the snapshot
        const stateCopy = JSON.parse(JSON.stringify(this.state));
        return {
            version: version_1.PROSODY_API_VERSION,
            state: stateCopy
        };
    }
    /**
     * Restores a snapshot produced by snapshot(). Validates shape and rejects
     * snapshots whose major version differs from the current API version.
     */
    restore(snapshot) {
        if (!snapshot || typeof snapshot !== 'object') {
            throw new Error('Invalid snapshot format');
        }
        const { version, state } = snapshot;
        if (!version || typeof version !== 'string') {
            throw new Error('Snapshot missing version');
        }
        const currentMajor = version_1.PROSODY_API_VERSION.split('.')[0];
        const snapshotMajor = version.split('.')[0];
        if (currentMajor !== snapshotMajor) {
            throw new Error(`Incompatible snapshot version. Current: ${version_1.PROSODY_API_VERSION}, Snapshot: ${version}`);
        }
        if (!state) {
            throw new Error('Snapshot missing state');
        }
        this.state = state;
        // Rehydrate Float32Arrays if they were serialized to objects
        if (this.state.decomposer && this.state.decomposer.buffer && !(this.state.decomposer.buffer instanceof Float32Array)) {
            const buf = this.state.decomposer.buffer;
            const len = Object.keys(buf).length;
            const newBuf = new Float32Array(len);
            for (let i = 0; i < len; i++) {
                newBuf[i] = buf[i];
            }
            this.state.decomposer.buffer = newBuf;
        }
    }
    /** Resets the runtime state to its initial values (config/preset kept). */
    reset() {
        this.state = this.createInitialState();
    }
    /** Builds the zeroed runtime state shared by the constructor and reset(). */
    createInitialState() {
        return {
            segmenter: {
                isVoiced: false,
                enterCount: 0,
                exitCount: 0,
                currentSegmentStart: 0,
                accumulatedConf: 0,
                accumulatedEnergy: 0
            },
            decomposer: {
                buffer: new Float32Array(0),
                microState: undefined
            },
            baseline: {
                sumX: 0,
                sumY: 0,
                sumXY: 0,
                sumXX: 0,
                count: 0
            },
            stabilizer: {
                currentNoteId: 0,
                lastTargetCents: 0,
                holdFrames: 0,
                rampActive: false,
                rampStartCents: 0,
                rampEndCents: 0,
                rampProgress: 0
            },
            pfc: {
                focusTime: null,
                focusStrength: 0,
                activeFade: 0
            }
        };
    }
}
|
|
389
|
+
exports.StreamingAutotuneEngine = StreamingAutotuneEngine;
|