@mcptoolshop/voice-engine-dsp 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +78 -0
  3. package/dist/src/adapters/AudioWorkletProcessor.d.ts +31 -0
  4. package/dist/src/adapters/AudioWorkletProcessor.d.ts.map +1 -0
  5. package/dist/src/adapters/AudioWorkletProcessor.js +77 -0
  6. package/dist/src/adapters/NodeStreamAutotune.d.ts +28 -0
  7. package/dist/src/adapters/NodeStreamAutotune.d.ts.map +1 -0
  8. package/dist/src/adapters/NodeStreamAutotune.js +103 -0
  9. package/dist/src/analysis/PitchTrackerRefV1.d.ts +13 -0
  10. package/dist/src/analysis/PitchTrackerRefV1.d.ts.map +1 -0
  11. package/dist/src/analysis/PitchTrackerRefV1.js +136 -0
  12. package/dist/src/analysis/VoicingDetectorRefV1.d.ts +13 -0
  13. package/dist/src/analysis/VoicingDetectorRefV1.d.ts.map +1 -0
  14. package/dist/src/analysis/VoicingDetectorRefV1.js +77 -0
  15. package/dist/src/index.d.ts +8 -0
  16. package/dist/src/index.d.ts.map +1 -0
  17. package/dist/src/index.js +22 -0
  18. package/dist/src/prosody/AccentRenderer.d.ts +15 -0
  19. package/dist/src/prosody/AccentRenderer.d.ts.map +1 -0
  20. package/dist/src/prosody/AccentRenderer.js +66 -0
  21. package/dist/src/prosody/Presets.d.ts +3 -0
  22. package/dist/src/prosody/Presets.d.ts.map +1 -0
  23. package/dist/src/prosody/Presets.js +49 -0
  24. package/dist/src/prosody/SafetyRails.d.ts +21 -0
  25. package/dist/src/prosody/SafetyRails.d.ts.map +1 -0
  26. package/dist/src/prosody/SafetyRails.js +65 -0
  27. package/dist/src/transformation/FormantStrategyV1.d.ts +5 -0
  28. package/dist/src/transformation/FormantStrategyV1.d.ts.map +1 -0
  29. package/dist/src/transformation/FormantStrategyV1.js +39 -0
  30. package/dist/src/transformation/PitchShifterRefV1.d.ts +9 -0
  31. package/dist/src/transformation/PitchShifterRefV1.d.ts.map +1 -0
  32. package/dist/src/transformation/PitchShifterRefV1.js +120 -0
  33. package/dist/src/tuning/AutotuneExecutor.d.ts +16 -0
  34. package/dist/src/tuning/AutotuneExecutor.d.ts.map +1 -0
  35. package/dist/src/tuning/AutotuneExecutor.js +217 -0
  36. package/dist/src/tuning/CorrectionController.d.ts +5 -0
  37. package/dist/src/tuning/CorrectionController.d.ts.map +1 -0
  38. package/dist/src/tuning/CorrectionController.js +91 -0
  39. package/dist/src/tuning/CorrectionControllerRefV1.d.ts +6 -0
  40. package/dist/src/tuning/CorrectionControllerRefV1.d.ts.map +1 -0
  41. package/dist/src/tuning/CorrectionControllerRefV1.js +63 -0
  42. package/dist/src/tuning/ScaleQuantizer.d.ts +7 -0
  43. package/dist/src/tuning/ScaleQuantizer.d.ts.map +1 -0
  44. package/dist/src/tuning/ScaleQuantizer.js +43 -0
  45. package/dist/src/tuning/StreamingAutotuneEngine.d.ts +43 -0
  46. package/dist/src/tuning/StreamingAutotuneEngine.d.ts.map +1 -0
  47. package/dist/src/tuning/StreamingAutotuneEngine.js +389 -0
  48. package/dist/src/tuning/StreamingAutotuneEngine_Fixed.d.ts +36 -0
  49. package/dist/src/tuning/StreamingAutotuneEngine_Fixed.d.ts.map +1 -0
  50. package/dist/src/tuning/StreamingAutotuneEngine_Fixed.js +344 -0
  51. package/dist/src/tuning/TargetCurveGenerator.d.ts +5 -0
  52. package/dist/src/tuning/TargetCurveGenerator.d.ts.map +1 -0
  53. package/dist/src/tuning/TargetCurveGenerator.js +69 -0
  54. package/dist/src/tuning/TargetCurveRefV1.d.ts +6 -0
  55. package/dist/src/tuning/TargetCurveRefV1.d.ts.map +1 -0
  56. package/dist/src/tuning/TargetCurveRefV1.js +69 -0
  57. package/dist/src/utils/AudioBufferUtils.d.ts +3 -0
  58. package/dist/src/utils/AudioBufferUtils.d.ts.map +1 -0
  59. package/dist/src/utils/AudioBufferUtils.js +19 -0
  60. package/dist/src/version.d.ts +2 -0
  61. package/dist/src/version.d.ts.map +1 -0
  62. package/dist/src/version.js +4 -0
  63. package/package.json +38 -0
@@ -0,0 +1,66 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.AccentRenderer = void 0;
4
class AccentRenderer {
    /**
     * Renders prosodic accent events into a per-frame additive pitch-offset curve.
     *
     * Every event whose `type` is `'accent'` contributes one raised-cosine bump
     * centred on `event.time`, which is used directly as a (possibly fractional)
     * frame position. The bump spans `event.spanFrames` frames, or
     * `style.accentSpanSeconds * frameRateHz` frames when the event carries no
     * positive span. Its peak is
     * `event.strength * style.eventStrengthScale * style.accentMaxCents`, negated
     * for the `'fall'` and `'fall-rise'` shapes. Overlapping bumps are summed.
     *
     * Events whose peak is not a finite number (for example a missing `strength`)
     * are skipped, so that a single malformed event cannot write NaN into the
     * frames it touches.
     *
     * @param events List of prosody events to render
     * @param totalFrames Total number of frames in the output buffer
     * @param style Prosody style configuration (reads `accentMaxCents`,
     *              `accentSpanSeconds` and `eventStrengthScale`)
     * @param frameRateHz Frame rate used to convert the default span to frames (default 100)
     * @returns Float32Array of length `totalFrames` holding additive relative cents
     */
    static render(events, totalFrames, style, frameRateHz = 100) {
        const output = new Float32Array(totalFrames);
        const { accentMaxCents, accentSpanSeconds, eventStrengthScale } = style;
        // Default accent width, converted from seconds to frames.
        const defaultDurationFrames = Math.round(accentSpanSeconds * frameRateHz);
        for (const event of events) {
            if (event.type !== 'accent')
                continue;
            const time = event.time;
            const shape = event.shape || 'rise';
            // Prefer the event's own span; fall back to the style default.
            let duration = event.spanFrames;
            if (!duration || duration <= 0) {
                duration = defaultDurationFrames;
            }
            const radius = duration / 2;
            // A zero-width window contributes nothing (and would divide by zero below).
            if (!(radius > 0))
                continue;
            // Only plain 'fall' and 'fall-rise' are inverted; compound contours are
            // not modelled beyond this sign flip.
            const sign = (shape === 'fall' || shape === 'fall-rise') ? -1.0 : 1.0;
            const peakCents = event.strength * eventStrengthScale * accentMaxCents;
            // Skip malformed events instead of accumulating NaN; a zero peak adds nothing.
            if (!Number.isFinite(peakCents) || peakCents === 0)
                continue;
            // Frames covered by the window, clamped to the buffer.
            const validStart = Math.max(0, Math.ceil(time - radius));
            const validEnd = Math.min(totalFrames - 1, Math.floor(time + radius));
            for (let i = validStart; i <= validEnd; i++) {
                const d = i - time;
                // Floating-point rounding of `time + radius` can admit a frame that lies
                // marginally outside the window, so the distance is re-checked.
                if (Math.abs(d) <= radius) {
                    // Raised cosine: 1 at the centre, 0 at +/- radius.
                    const ratio = d / radius;
                    const w = 0.5 * (1 + Math.cos(Math.PI * ratio));
                    output[i] += w * peakCents * sign;
                }
            }
        }
        return output;
    }
}
66
+ exports.AccentRenderer = AccentRenderer;
@@ -0,0 +1,3 @@
1
+ import { ProsodyPresetV1 } from "../../../voice-engine-core/src/prosody/ProsodyV1.js";
2
+ export declare const PRESETS: Record<string, ProsodyPresetV1>;
3
+ //# sourceMappingURL=Presets.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"Presets.d.ts","sourceRoot":"","sources":["../../../src/prosody/Presets.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAqD,MAAM,qDAAqD,CAAC;AAsBzI,eAAO,MAAM,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,eAAe,CAqDnD,CAAC"}
@@ -0,0 +1,49 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.PRESETS = void 0;
4
+ const ProsodyPresets_js_1 = require("../../../voice-engine-core/src/config/ProsodyPresets.js");
5
/**
 * Builds a complete preset by layering `overrides` on top of the shared
 * default analysis / stabilizer / tuning configurations.
 *
 * Each nested section is merged shallowly (override keys win). The scalar
 * fields fall back to 1.0 (correctionStrength), 20 (attackMs) and
 * 100 (releaseMs) when the override is null or undefined.
 *
 * @param id Preset identifier
 * @param name Display name
 * @param desc Description stored under `description`
 * @param overrides Optional partial preset
 * @returns A new preset object
 */
function createPreset(id, name, desc, overrides = {}) {
    const orDefault = (value, fallback) => (value === null || value === undefined ? fallback : value);
    const analysis = {
        ...ProsodyPresets_js_1.DEFAULT_PROSODY_CONFIG_V1,
        ...(overrides.analysis ? overrides.analysis : {})
    };
    const stabilizer = {
        ...ProsodyPresets_js_1.DEFAULT_STABILIZER_CONFIG_V1,
        ...(overrides.stabilizer ? overrides.stabilizer : {})
    };
    const tuning = {
        ...ProsodyPresets_js_1.DEFAULT_TUNING_CONFIG_V1,
        ...(overrides.tuning ? overrides.tuning : {})
    };
    const preset = { id, name, description: desc, analysis, stabilizer, tuning };
    preset.correctionStrength = orDefault(overrides.correctionStrength, 1.0);
    preset.attackMs = orDefault(overrides.attackMs, 20);
    preset.releaseMs = orDefault(overrides.releaseMs, 100);
    return preset;
}
18
+ exports.PRESETS = {
19
+ DEFAULT_CLEAN: createPreset("default_clean", "Default Clean", "Balanced correction for clean speech", {
20
+ stabilizer: {
21
+ hysteresisCents: 15,
22
+ switchRampMs: 30
23
+ }
24
+ } // Uses defaults mostly
25
+ ),
26
+ HARD_TUNE: createPreset("hard_tune", "Hard Tune", "Zero hysteresis, fast ramp for maximum robotic effect", {
27
+ stabilizer: {
28
+ hysteresisCents: 0, // Note: SafetyRails may clamp this to 5
29
+ switchRampMs: 5, // Fast ramp
30
+ minHoldMs: 10 // Short hold
31
+ },
32
+ correctionStrength: 1.0,
33
+ attackMs: 5,
34
+ releaseMs: 5
35
+ }),
36
+ NO_WARBLE: createPreset("no_warble", "No Warble", "High hysteresis and slow ramp to prevent artifacts", {
37
+ stabilizer: {
38
+ hysteresisCents: 25,
39
+ switchRampMs: 100, // "Slow ramp" - interpreted as 100ms (approx 10 frames @ 100Hz)
40
+ minHoldMs: 100
41
+ },
42
+ analysis: {
43
+ voicingThresholdQ: 3000 // Conservative voicing (higher confidence required)
44
+ }
45
+ }),
46
+ SUBTLE: createPreset("subtle", "Subtle", "Low correction strength for natural enhancement", {
47
+ correctionStrength: 0.3
48
+ })
49
+ };
@@ -0,0 +1,21 @@
1
+ import { ProsodyPresetV1 } from "../../../voice-engine-core/src/prosody/ProsodyV1.js";
2
+ /**
3
+ * Validates and clamps a Prosody configuration to safe operating ranges.
4
+ * This prevents configurations that might cause severe audio artifacts
5
+ * (like rapid switching/warble due to 0 hysteresis or extreme sensitivity).
6
+ *
7
+ * @param config The configuration to validate
8
+ * @returns A new configuration object with clamped values
9
+ */
10
+ export declare function validateAndClampConfig(config: ProsodyPresetV1): ProsodyPresetV1;
11
+ /**
12
+ * Applies an expressiveness scale factor to the configuration.
13
+ *
14
+ * @param config The base configuration
15
+ * @param amount Amount of expressiveness (0.0 to 1.0).
16
+ * 0.0 = Robotic / Strict (Full Correction)
17
+ * 1.0 = Expressive / Natural (Zero Correction or reduced strength)
18
+ * @returns Modified configuration
19
+ */
20
+ export declare function applyExpressiveness(config: ProsodyPresetV1, amount: number): ProsodyPresetV1;
21
+ //# sourceMappingURL=SafetyRails.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"SafetyRails.d.ts","sourceRoot":"","sources":["../../../src/prosody/SafetyRails.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,qDAAqD,CAAC;AAEtF;;;;;;;GAOG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,eAAe,GAAG,eAAe,CAoC/E;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAAC,MAAM,EAAE,eAAe,EAAE,MAAM,EAAE,MAAM,GAAG,eAAe,CAe5F"}
@@ -0,0 +1,65 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.validateAndClampConfig = validateAndClampConfig;
4
+ exports.applyExpressiveness = applyExpressiveness;
5
/**
 * Validates and clamps a Prosody configuration to safe operating ranges.
 * This prevents configurations that might cause severe audio artifacts
 * (like rapid switching/warble due to 0 hysteresis or extreme sensitivity).
 *
 * Rules applied:
 * - `stabilizer.hysteresisCents`: values below 5 are raised to 5. A NaN value
 *   is replaced by 15, the default this function assumes for a missing value.
 *   A missing (null/undefined) value is left as it is.
 * - `analysis.voicingThresholdQ`: a missing (null/undefined) or NaN value
 *   becomes 2000; the result is then clamped to [100, 9000].
 *
 * The input is never mutated: the top level and the `stabilizer` / `analysis`
 * sections are copied before being changed. Other nested sections are shared
 * with the input.
 *
 * @param config The configuration to validate
 * @returns A new configuration object with clamped values
 */
function validateAndClampConfig(config) {
    const MIN_HYSTERESIS_CENTS = 5;
    const DEFAULT_HYSTERESIS_CENTS = 15;
    const DEFAULT_VOICING_THRESHOLD_Q = 2000;
    const MIN_VOICING_THRESHOLD_Q = 100;
    const MAX_VOICING_THRESHOLD_Q = 9000;
    // Shallow copy of the top level; the sections modified below get their own copy.
    const clamped = { ...config };
    if (config.stabilizer) {
        const stabilizer = { ...config.stabilizer };
        const hysteresis = stabilizer.hysteresisCents;
        if (hysteresis !== undefined && hysteresis !== null) {
            if (Number.isNaN(hysteresis)) {
                // NaN compares false against every bound, so it would otherwise slip through.
                stabilizer.hysteresisCents = DEFAULT_HYSTERESIS_CENTS;
            }
            else if (hysteresis < MIN_HYSTERESIS_CENTS) {
                // Hysteresis < 5 cents can cause rapid oscillation between pitch classes (warble).
                stabilizer.hysteresisCents = MIN_HYSTERESIS_CENTS;
            }
        }
        clamped.stabilizer = stabilizer;
    }
    if (config.analysis) {
        const analysis = { ...config.analysis };
        let thresh = analysis.voicingThresholdQ;
        if (thresh === undefined || thresh === null || Number.isNaN(thresh)) {
            thresh = DEFAULT_VOICING_THRESHOLD_Q;
        }
        // Range 0..10000: below 100 almost everything counts as voiced (noise artifacts),
        // above 9000 almost nothing does.
        analysis.voicingThresholdQ = Math.max(MIN_VOICING_THRESHOLD_Q, Math.min(MAX_VOICING_THRESHOLD_Q, thresh));
        clamped.analysis = analysis;
    }
    return clamped;
}
45
/**
 * Applies an expressiveness scale factor to the configuration.
 *
 * Expressiveness reduces the correction strength: the base strength
 * (`config.correctionStrength`, or 1.0 when null/undefined) is multiplied by
 * `1 - amount`. The input object is not mutated; a shallow copy is returned.
 *
 * @param config The base configuration
 * @param amount Amount of expressiveness, clamped to 0.0..1.0.
 *               0.0 = Robotic / Strict (full correction strength kept)
 *               1.0 = Expressive / Natural (correction strength reduced to 0)
 *               A value that is not a number (NaN, undefined) is treated as 0.0
 *               so that the resulting strength is never NaN.
 * @returns Modified configuration
 */
function applyExpressiveness(config, amount) {
    const numericAmount = Number(amount);
    // Math.min/Math.max propagate NaN, so it has to be handled before clamping.
    const safeAmount = Number.isNaN(numericAmount) ? 0 : Math.max(0, Math.min(1, numericAmount));
    const baseStrength = config.correctionStrength ?? 1.0;
    return {
        ...config,
        correctionStrength: baseStrength * (1.0 - safeAmount)
    };
}
@@ -0,0 +1,5 @@
1
+ import { IFormantStrategy, AudioBufferV1 } from "@mcptoolshop/voice-engine-core";
2
+ export declare class FormantStrategyV1 implements IFormantStrategy {
3
+ apply(tuned: AudioBufferV1, original: AudioBufferV1): Promise<AudioBufferV1>;
4
+ }
5
+ //# sourceMappingURL=FormantStrategyV1.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"FormantStrategyV1.d.ts","sourceRoot":"","sources":["../../../src/transformation/FormantStrategyV1.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,aAAa,EAAE,MAAM,gCAAgC,CAAC;AAEjF,qBAAa,iBAAkB,YAAW,gBAAgB;IAChD,KAAK,CAAC,KAAK,EAAE,aAAa,EAAE,QAAQ,EAAE,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC;CAsCrF"}
@@ -0,0 +1,39 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.FormantStrategyV1 = void 0;
4
class FormantStrategyV1 {
    /**
     * Artifact guard applied after pitch shifting: a soft limiter on channel 0.
     *
     * Samples whose magnitude is at most the knee (0.9) pass through unchanged.
     * Above the knee the excess is compressed with `tanh`, so the output
     * approaches but never exceeds +/-1.0. The curve is continuous and
     * monotonic. Applying `tanh` only to samples beyond +/-1.0 would instead
     * make the output jump from 1.0 down to tanh(1) ~= 0.76 at the threshold,
     * which is itself an audible discontinuity.
     *
     * Only `tuned.data[0]` is processed and returned (mono). `original` is
     * accepted for the strategy interface but is not read; no RMS / gain
     * matching against it is performed.
     *
     * @param tuned Pitch-shifted buffer; its first channel is copied, not mutated
     * @param original Unprocessed buffer (currently unused)
     * @returns A copy of `tuned` whose `data` is the single limited channel
     */
    async apply(tuned, original) {
        // Typed-array copy so the caller's samples are left untouched.
        const outData = new Float32Array(tuned.data[0]);
        const KNEE = 0.9;
        const HEADROOM = 1.0 - KNEE;
        for (let i = 0; i < outData.length; i++) {
            const s = outData[i];
            const magnitude = Math.abs(s);
            if (magnitude > KNEE) {
                // tanh(x) ~= x near 0, so value and slope both match at the knee.
                const limited = KNEE + HEADROOM * Math.tanh((magnitude - KNEE) / HEADROOM);
                outData[i] = s < 0 ? -limited : limited;
            }
        }
        return {
            ...tuned,
            data: [outData]
        };
    }
}
39
+ exports.FormantStrategyV1 = FormantStrategyV1;
@@ -0,0 +1,9 @@
1
+ import { IPitchShifter, AudioBufferV1, F0TrackV1, VoicingMaskV1, TargetCurveV1, CorrectionEnvelopeV1 } from "@mcptoolshop/voice-engine-core";
2
+ export declare class PitchShifterRefV1 implements IPitchShifter {
3
+ readonly id = "voice-engine-dsp.pitch-shifter.v1";
4
+ readonly version = "1.0.0";
5
+ private formantStrategy;
6
+ capabilities(): string[];
7
+ shift(audio: AudioBufferV1, f0Track: F0TrackV1, voicing: VoicingMaskV1, target: TargetCurveV1, envelope: CorrectionEnvelopeV1, request?: any): Promise<AudioBufferV1>;
8
+ }
9
+ //# sourceMappingURL=PitchShifterRefV1.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"PitchShifterRefV1.d.ts","sourceRoot":"","sources":["../../../src/transformation/PitchShifterRefV1.ts"],"names":[],"mappings":"AAAA,OAAO,EACH,aAAa,EACb,aAAa,EAAE,SAAS,EAAE,aAAa,EAAE,aAAa,EAAE,oBAAoB,EAC/E,MAAM,gCAAgC,CAAC;AAIxC,qBAAa,iBAAkB,YAAW,aAAa;IACnD,QAAQ,CAAC,EAAE,uCAAuC;IAClD,QAAQ,CAAC,OAAO,WAAW;IAE3B,OAAO,CAAC,eAAe,CAA2B;IAElD,YAAY,IAAI,MAAM,EAAE;IAIlB,KAAK,CACP,KAAK,EAAE,aAAa,EACpB,OAAO,EAAE,SAAS,EAClB,OAAO,EAAE,aAAa,EACtB,MAAM,EAAE,aAAa,EACrB,QAAQ,EAAE,oBAAoB,EAC9B,OAAO,CAAC,EAAE,GAAG,GACd,OAAO,CAAC,aAAa,CAAC;CA4H5B"}
@@ -0,0 +1,120 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.PitchShifterRefV1 = void 0;
4
+ const FormantStrategyV1_1 = require("./FormantStrategyV1");
5
class PitchShifterRefV1 {
    id = "voice-engine-dsp.pitch-shifter.v1";
    version = "1.0.0";
    // Post-processing guard applied to the synthesized output.
    formantStrategy = new FormantStrategyV1_1.FormantStrategyV1();
    /** Capability tags advertised by this shifter. */
    capabilities() {
        return ["pitch-shift", "psola-lite", "formant-guard"];
    }
    /**
     * Pitch-shifts channel 0 of `audio` towards `target` using windowed grains
     * ("PSOLA-lite") and passes the result through the formant strategy.
     *
     * Per output sample the analysis frame is `floor(i / hopSamples)`, clamped
     * to the track length. Frames whose `voicing.voicedQ` entry is not positive
     * copy the input sample straight through. For voiced frames the output
     * pitch is the input pitch moved towards the target by the envelope
     * strength; a phase accumulator running at that pitch triggers one
     * Hann-windowed grain per output period, read from the input around the
     * nearest local amplitude peak and overlap-added into the output.
     *
     * Scaling used by the arithmetic below: `f0MhzQ / 1000` is treated as Hz,
     * `targetCentsQ / 1000` as cents on the scale where 6900 = 440 Hz, and
     * `strengthQ / 10000` as the 0..1 correction strength.
     *
     * A frame whose f0 is missing or not positive is treated as 100 Hz. A frame
     * whose computed output pitch is not a finite positive number (missing or
     * non-finite target / strength) is synthesized at the unshifted input
     * pitch. Without that guard the phase accumulator would become NaN and no
     * further grain would ever be triggered.
     *
     * @param audio Input buffer; only `data[0]` is processed (mono assumption)
     * @param f0Track Pitch track; must share the sample rate of `audio`
     * @param voicing Per-frame voicing mask
     * @param target Per-frame target pitch curve
     * @param envelope Per-frame correction strength
     * @param request Full request, currently unused
     * @returns A copy of `audio` whose `data` is the single processed channel
     * @throws Error("Sample rate mismatch") when the sample rates differ
     */
    async shift(audio, f0Track, voicing, target, envelope, request // Pass full request if needed
    ) {
        const sr = audio.sampleRate;
        if (sr !== f0Track.sampleRateHz) {
            throw new Error("Sample rate mismatch");
        }
        const outData = new Float32Array(audio.data[0].length);
        const inData = audio.data[0]; // Mono assumption V1
        // Fraction of an output pitch period elapsed since the last grain.
        let phase = 0;
        const frames = f0Track.f0MhzQ.length;
        const hop = f0Track.hopSamples;
        // Formant-shifting ("chipmunk") grains are not wired up yet.
        const useChipmunk = false; // TODO: pipe from request
        for (let i = 0; i < outData.length; i++) {
            // 1. Analysis frame for this sample, clamped to the track.
            let frameIdx = Math.floor(i / hop);
            if (frameIdx >= frames)
                frameIdx = frames - 1;
            if (frameIdx < 0)
                frameIdx = 0;
            // 2. Unvoiced bypass.
            if (!(voicing.voicedQ[frameIdx] > 0)) {
                outData[i] = inData[i]; // TODO: Crossfade
                continue;
            }
            // 3. Desired output pitch for this frame.
            const targetValCents = target.targetCentsQ[frameIdx] / 1000.0;
            let inputF0Mhz = f0Track.f0MhzQ[frameIdx];
            if (!(inputF0Mhz > 0))
                inputF0Mhz = 100000;
            const strength = envelope.strengthQ[frameIdx] / 10000;
            const inputHz = inputF0Mhz / 1000;
            // Base: MIDI 69 (A4 440) = 6900 cents.
            const inputCents = 6900 + 1200 * Math.log2(inputHz / 440);
            const desiredCents = inputCents + (targetValCents - inputCents) * strength;
            const shiftCents = desiredCents - inputCents;
            const ratio = Math.pow(2, shiftCents / 1200);
            let outputF0 = inputHz * ratio;
            if (!Number.isFinite(outputF0) || outputF0 <= 0) {
                // Keep the accumulator healthy: fall back to "no shift" for this sample.
                outputF0 = inputHz;
            }
            // 4. Grain trigger: one grain per output period.
            phase += outputF0 / sr;
            if (phase >= 1) {
                phase -= 1;
                // Grain length: two input periods preserves formants; two output
                // periods (chipmunk) would shift them.
                const pInput = sr / inputHz;
                const pBase = useChipmunk ? (sr / outputF0) : pInput;
                const grainLen = Math.floor(2 * pBase);
                const halfGrain = Math.floor(grainLen / 2);
                // Compensates the change in grain density.
                const overlapGain = inputHz / outputF0;
                // Align the read position to the largest-magnitude input sample
                // within half an input period (at most 512 samples) of i.
                const searchWin = Math.min(Math.floor(pInput / 2), 512);
                let maxVal = -1;
                let bestOffset = 0;
                for (let o = -searchWin; o <= searchWin; o++) {
                    const idx = i + o;
                    if (idx >= 0 && idx < inData.length) {
                        const val = Math.abs(inData[idx]);
                        if (val > maxVal) {
                            maxVal = val;
                            bestOffset = o;
                        }
                    }
                }
                const center = i + bestOffset;
                for (let k = 0; k < grainLen; k++) {
                    // The grain is written centred on i and read centred on `center`.
                    const pos = i - halfGrain + k;
                    const readPos = center - halfGrain + k;
                    if (pos >= 0 && pos < outData.length && readPos >= 0 && readPos < inData.length) {
                        const w = 0.5 - 0.5 * Math.cos(2 * Math.PI * k / grainLen);
                        outData[pos] += inData[readPos] * w * overlapGain;
                    }
                }
            }
        }
        // 5. Formant / artifact guard.
        const result = { ...audio, data: [outData] };
        const guarded = await this.formantStrategy.apply(result, audio);
        return guarded;
    }
}
119
+ exports.PitchShifterRefV1 = PitchShifterRefV1;
120
/**
 * Rounds `x` down to the nearest integer (thin wrapper over Math.floor).
 *
 * @param x Value to round down
 * @returns The largest integer less than or equal to `x`
 */
function floor(x) {
    const roundedDown = Math.floor(x);
    return roundedDown;
}
@@ -0,0 +1,16 @@
1
+ import { TuneRequestV1, AudioBufferV1 } from "@mcptoolshop/voice-engine-core";
2
+ export declare class AutotuneExecutor {
3
+ private resolver;
4
+ private tracker;
5
+ private curveGen;
6
+ private envGen;
7
+ private shifter;
8
+ private decomposer;
9
+ private segmenter;
10
+ private baselineModel;
11
+ private stabilizer;
12
+ private resolveProsodyPreset;
13
+ execute(req: TuneRequestV1, audio: AudioBufferV1): Promise<AudioBufferV1>;
14
+ private calculateEnergyDb;
15
+ }
16
+ //# sourceMappingURL=AutotuneExecutor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"AutotuneExecutor.d.ts","sourceRoot":"","sources":["../../../src/tuning/AutotuneExecutor.ts"],"names":[],"mappings":"AAAA,OAAO,EACH,aAAa,EAAE,aAAa,EAG/B,MAAM,gCAAgC,CAAC;AAqBxC,qBAAa,gBAAgB;IACzB,OAAO,CAAC,QAAQ,CAA0B;IAG1C,OAAO,CAAC,OAAO,CAKZ;IAEH,OAAO,CAAC,QAAQ,CAA8B;IAC9C,OAAO,CAAC,MAAM,CAA8B;IAC5C,OAAO,CAAC,OAAO,CAA2B;IAC1C,OAAO,CAAC,UAAU,CAAsB;IACxC,OAAO,CAAC,SAAS,CAA0B;IAC3C,OAAO,CAAC,aAAa,CAA6B;IAClD,OAAO,CAAC,UAAU,CAA0B;IAE5C,OAAO,CAAC,oBAAoB;IActB,OAAO,CAAC,GAAG,EAAE,aAAa,EAAE,KAAK,EAAE,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC;IA0L/E,OAAO,CAAC,iBAAiB;CAqB5B"}
@@ -0,0 +1,217 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.AutotuneExecutor = void 0;
4
+ const voice_engine_core_1 = require("@mcptoolshop/voice-engine-core");
5
+ const PitchTrackerRefV1_js_1 = require("../analysis/PitchTrackerRefV1.js");
6
+ const PitchShifterRefV1_js_1 = require("../transformation/PitchShifterRefV1.js");
7
+ const TargetCurveGenerator_js_1 = require("./TargetCurveGenerator.js");
8
+ const CorrectionController_js_1 = require("./CorrectionController.js");
9
+ const F0Decomposer_js_1 = require("../../../voice-engine-core/src/prosody/F0Decomposer.js");
10
+ const ProsodySegmenter_js_1 = require("../../../voice-engine-core/src/prosody/ProsodySegmenter.js");
11
+ const PhraseBaselineModel_js_1 = require("../../../voice-engine-core/src/prosody/PhraseBaselineModel.js");
12
+ const TargetStabilizer_js_1 = require("../../../voice-engine-core/src/prosody/TargetStabilizer.js");
13
+ const AccentRenderer_js_1 = require("../prosody/AccentRenderer.js");
14
+ const ProsodyPresets_js_1 = require("../../../voice-engine-core/src/config/ProsodyPresets.js");
15
+ const ProsodyStyles_js_1 = require("../../../voice-engine-core/src/config/ProsodyStyles.js");
16
class AutotuneExecutor {
    resolver = new voice_engine_core_1.TunePlanResolver();
    // Default tracker settings - could be overridden or config passed in.
    tracker = new PitchTrackerRefV1_js_1.PitchTrackerRefV1({
        windowMs: 40,
        hopMs: 10,
        f0Min: 50,
        f0Max: 1000
    });
    curveGen = new TargetCurveGenerator_js_1.TargetCurveGenerator();
    envGen = new CorrectionController_js_1.CorrectionController();
    shifter = new PitchShifterRefV1_js_1.PitchShifterRefV1(); // "PSOLA-lite"
    decomposer = new F0Decomposer_js_1.F0Decomposer();
    segmenter = new ProsodySegmenter_js_1.ProsodySegmenter();
    baselineModel = new PhraseBaselineModel_js_1.PhraseBaselineModel();
    stabilizer = new TargetStabilizer_js_1.TargetStabilizer();
    /**
     * Maps a request preset name to a prosody preset.
     *
     * @param presetName "hard" / "robot" -> hard tune, "subtle" -> subtle;
     *                   "natural", "pop" and anything else -> natural
     * @returns The matching preset object
     */
    resolveProsodyPreset(presetName) {
        switch (presetName) {
            case "hard":
            case "robot":
                return ProsodyPresets_js_1.HARD_TUNE_PRESET;
            case "subtle":
                return ProsodyPresets_js_1.SUBTLE_PRESET;
            case "natural":
            case "pop":
            default:
                return ProsodyPresets_js_1.NATURAL_PRESET;
        }
    }
    /**
     * Runs the full autotune pipeline on one buffer.
     *
     * Steps, in order: resolve the tune plan and prosody preset; track pitch;
     * decompose it into macro and micro components; segment the audio; fit the
     * phrase baseline; derive the voicing mask from the voiced segments;
     * stabilize the intent curve; optionally add rendered accents and
     * post-focus compression; add the micro deviation back; generate the
     * correction envelope; pitch-shift.
     *
     * @param req Tune request (reads `preset`, `style` and `events`)
     * @param audio Input buffer; `data[0]` is the channel that is analysed
     * @returns The pitch-shifted buffer produced by the shifter
     */
    async execute(req, audio) {
        // 1. Resolve plan and preset.
        const plan = this.resolver.resolve(req);
        const preset = this.resolveProsodyPreset(req.preset);
        // 2. Analyze pitch.
        const f0Analysis = this.tracker.analyze(audio);
        const frameCount = f0Analysis.f0MhzQ.length;
        // 3. Decompose pitch into macro (intonation) and micro (jitter/vibrato).
        const decomposition = this.decomposer.decompose(f0Analysis);
        // 3b. Prosody segmentation.
        const segments = this.segmenter.segment(audio.data[0], f0Analysis, preset.analysis);
        // 3c. Phrase baseline fitted on the macro curve.
        const baseline = this.baselineModel.analyze(segments, decomposition.macro.valuesHz);
        // 3d. Voicing mask: frames inside a 'voiced' segment are voiced with full
        // probability (10000); everything else stays 0.
        const voicedQ = new Uint8Array(frameCount);
        const voicingProbQ = new Int16Array(frameCount);
        for (const seg of segments) {
            if (seg.kind === 'voiced') {
                for (let i = seg.startFrame; i < seg.endFrame; i++) {
                    voicedQ[i] = 1;
                    voicingProbQ[i] = 10000;
                }
            }
        }
        const voicing = {
            sampleRateHz: f0Analysis.sampleRateHz,
            frameHz: f0Analysis.frameHz,
            hopSamples: f0Analysis.hopSamples,
            t0Samples: f0Analysis.t0Samples,
            voicedQ,
            voicingProbQ
        };
        // 4. Control curves: stabilize the intent curve from the baseline model.
        const stabilized = this.stabilizer.stabilize(baseline.intentHz, segments, {
            allowedPitchClasses: plan.scaleConfig?.allowedPitchClasses,
            hysteresisCents: preset.stabilizer.hysteresisCents,
            minHoldMs: preset.stabilizer.minHoldMs,
            switchRampMs: preset.stabilizer.switchRampMs,
            transitionSlopeThreshCentsPerSec: preset.stabilizer.transitionSlopeThreshCentsPerSec,
            rootOffsetCents: 0
        }, f0Analysis.frameHz);
        // Expressive rendering driven by the style profile.
        const style = (0, ProsodyStyles_js_1.resolveProsodyStyle)(req.style || 'speech_neutral');
        if (req.events && req.events.length > 0) {
            const frameRateHz = f0Analysis.sampleRateHz / f0Analysis.hopSamples;
            const accentOffsets = AccentRenderer_js_1.AccentRenderer.render(req.events, frameCount, style, frameRateHz);
            // Accents only apply to frames that carry a note.
            for (let i = 0; i < frameCount; i++) {
                if (stabilized.noteIds[i] >= 0) {
                    stabilized.targetCents[i] += accentOffsets[i];
                }
            }
            // Post-focus compression: after the strongest accent, pull the target
            // back towards the phrase baseline.
            if (style.postFocusCompression > 0) {
                // Focus = strongest accent in the request (first one wins on ties).
                let maxStrength = 0;
                let focusTime = -1;
                for (const event of req.events) {
                    if (event.type === 'accent' && event.strength > maxStrength) {
                        maxStrength = event.strength;
                        focusTime = event.time;
                    }
                }
                // Only a sufficiently strong focus inside the buffer triggers compression.
                if (maxStrength > 0.5 && focusTime >= 0 && focusTime < frameCount - 1) {
                    const pfcStrength = style.postFocusCompression;
                    // Event times may be fractional (AccentRenderer accepts them), but
                    // array indices must be whole frames: a fractional index reads
                    // undefined and would silently skip every frame.
                    const firstFrame = Math.ceil(focusTime);
                    for (let t = firstFrame; t < frameCount; t++) {
                        const bHz = baseline.baselineHz[t];
                        if (bHz > 10) { // avoid log(0)
                            // Hz -> cents on the scale where 440 Hz = 6900 cents.
                            const baselineCents = 6900 + 1200 * Math.log2(bHz / 440);
                            const deviation = stabilized.targetCents[t] - baselineCents;
                            // Linear ramp-in over the 20 frames after the focus.
                            let ramp = 1.0;
                            if (t < focusTime + 20) {
                                ramp = (t - focusTime) / 20.0;
                            }
                            const effectiveCompression = pfcStrength * ramp;
                            stabilized.targetCents[t] = baselineCents + (deviation * (1 - effectiveCompression));
                        }
                    }
                }
            }
        }
        // Final target = stabilized intent + micro deviation. The baseline
        // declination itself is not added back.
        const finalTargetCentsQ = new Int32Array(frameCount);
        for (let i = 0; i < frameCount; i++) {
            if (stabilized.noteIds[i] >= 0) {
                const stabCents = stabilized.targetCents[i];
                const macroHz = decomposition.macro.valuesHz[i];
                const microHz = decomposition.micro.valuesHz[i];
                // Micro deviation (Hz around the macro curve) expressed in cents.
                let microCents = 0;
                if (macroHz > 10) {
                    const microRatio = (macroHz + microHz) / macroHz;
                    // log2 of a non-positive or non-finite ratio is NaN / -Infinity,
                    // which an Int32Array stores as 0, i.e. a bogus 0-cent target.
                    if (microRatio > 0 && Number.isFinite(microRatio)) {
                        microCents = 1200 * Math.log2(microRatio);
                    }
                }
                const finalCents = stabCents + microCents;
                // Stored as integer milli-cents (x1000).
                finalTargetCentsQ[i] = Math.round(finalCents * 1000);
            }
            else {
                // No note for this frame: target stays 0.
                // NOTE(review): the shifter bypasses frames by the voicing mask, not by
                // noteIds - confirm that note-less frames are never marked voiced.
                finalTargetCentsQ[i] = 0;
            }
        }
        const target = {
            sampleRateHz: f0Analysis.sampleRateHz,
            frameHz: f0Analysis.frameHz,
            hopSamples: f0Analysis.hopSamples,
            t0Samples: f0Analysis.t0Samples,
            targetCentsQ: finalTargetCentsQ
        };
        const envelope = this.envGen.generate(f0Analysis, voicing, target, plan);
        // 5. Apply pitch shift.
        const result = await this.shifter.shift(audio, f0Analysis, voicing, target, envelope);
        return result;
    }
    /**
     * Computes per-frame RMS energy in dB.
     *
     * Frame i covers samples [i * hopSamples, i * hopSamples + windowSamples),
     * truncated at the end of the signal. Empty frames and frames whose RMS is
     * at most 1e-9 report -120 dB.
     *
     * @param signal Sample array
     * @param frameCount Number of frames to compute
     * @param hopSamples Distance between frame starts, in samples
     * @param windowSamples Frame length, in samples
     * @returns Float32Array of `frameCount` dB values
     */
    calculateEnergyDb(signal, frameCount, hopSamples, windowSamples) {
        const energyDb = new Float32Array(frameCount);
        const len = signal.length;
        for (let i = 0; i < frameCount; i++) {
            const start = i * hopSamples;
            const end = Math.min(start + windowSamples, len);
            if (end <= start) {
                energyDb[i] = -120;
                continue;
            }
            let sumSq = 0;
            for (let j = start; j < end; j++) {
                sumSq += signal[j] * signal[j];
            }
            const rms = Math.sqrt(sumSq / (end - start));
            energyDb[i] = rms > 1e-9 ? 20 * Math.log10(rms) : -120;
        }
        return energyDb;
    }
}
217
+ exports.AutotuneExecutor = AutotuneExecutor;
@@ -0,0 +1,5 @@
1
+ import { F0TrackV1, VoicingMaskV1, TunePlanV1, TargetCurveV1, CorrectionEnvelopeV1 } from "@mcptoolshop/voice-engine-core";
2
+ export declare class CorrectionController {
3
+ generate(f0: F0TrackV1, voicing: VoicingMaskV1, target: TargetCurveV1, plan: TunePlanV1): CorrectionEnvelopeV1;
4
+ }
5
+ //# sourceMappingURL=CorrectionController.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"CorrectionController.d.ts","sourceRoot":"","sources":["../../../src/tuning/CorrectionController.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,UAAU,EAAE,aAAa,EAAE,oBAAoB,EAAE,MAAM,gCAAgC,CAAC;AAE3H,qBAAa,oBAAoB;IAC7B,QAAQ,CACJ,EAAE,EAAE,SAAS,EACb,OAAO,EAAE,aAAa,EACtB,MAAM,EAAE,aAAa,EACrB,IAAI,EAAE,UAAU,GACjB,oBAAoB;CAoG1B"}