@apteva/apteva-kit 0.1.136 → 0.1.138

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -4457,22 +4457,6 @@ function base64ToFloat32(base64) {
4457
4457
  }
4458
4458
  return float32Array;
4459
4459
  }
4460
- function resampleAudio(inputData, inputSampleRate, outputSampleRate) {
4461
- if (inputSampleRate === outputSampleRate) {
4462
- return inputData;
4463
- }
4464
- const ratio = inputSampleRate / outputSampleRate;
4465
- const outputLength = Math.floor(inputData.length / ratio);
4466
- const output = new Float32Array(outputLength);
4467
- for (let i = 0; i < outputLength; i++) {
4468
- const srcIndex = i * ratio;
4469
- const srcIndexFloor = Math.floor(srcIndex);
4470
- const srcIndexCeil = Math.min(srcIndexFloor + 1, inputData.length - 1);
4471
- const t = srcIndex - srcIndexFloor;
4472
- output[i] = inputData[srcIndexFloor] * (1 - t) + inputData[srcIndexCeil] * t;
4473
- }
4474
- return output;
4475
- }
4476
4460
 
4477
4461
  // src/hooks/useVoiceSession.ts
4478
4462
  function useVoiceSession(config) {
@@ -4491,6 +4475,10 @@ function useVoiceSession(config) {
4491
4475
  const mutedRef = useRef9(false);
4492
4476
  const configRef = useRef9(config);
4493
4477
  configRef.current = config;
4478
+ const activeSourcesRef = useRef9([]);
4479
+ const responseStartTimeRef = useRef9(0);
4480
+ const totalAudioDurationMsRef = useRef9(0);
4481
+ const interruptedRef = useRef9(false);
4494
4482
  const cleanup = useCallback4(() => {
4495
4483
  if (durationIntervalRef.current) {
4496
4484
  clearInterval(durationIntervalRef.current);
@@ -4527,10 +4515,26 @@ function useVoiceSession(config) {
4527
4515
  }
4528
4516
  nextPlayTimeRef.current = 0;
4529
4517
  mutedRef.current = false;
4518
+ activeSourcesRef.current = [];
4519
+ responseStartTimeRef.current = 0;
4520
+ totalAudioDurationMsRef.current = 0;
4521
+ interruptedRef.current = false;
4530
4522
  setMuted(false);
4531
4523
  setPartialTranscript("");
4532
4524
  setDuration(0);
4533
4525
  }, []);
4526
+ const resetPlayback = useCallback4(() => {
4527
+ activeSourcesRef.current.forEach((source) => {
4528
+ try {
4529
+ source.stop();
4530
+ } catch (_) {
4531
+ }
4532
+ });
4533
+ activeSourcesRef.current = [];
4534
+ nextPlayTimeRef.current = 0;
4535
+ responseStartTimeRef.current = 0;
4536
+ totalAudioDurationMsRef.current = 0;
4537
+ }, []);
4534
4538
  useEffect9(() => {
4535
4539
  return () => {
4536
4540
  cleanup();
@@ -4550,10 +4554,18 @@ function useVoiceSession(config) {
4550
4554
  const source = ctx.createBufferSource();
4551
4555
  source.buffer = audioBuffer;
4552
4556
  source.connect(ctx.destination);
4557
+ activeSourcesRef.current.push(source);
4558
+ source.onended = () => {
4559
+ activeSourcesRef.current = activeSourcesRef.current.filter((s) => s !== source);
4560
+ };
4553
4561
  const currentTime = ctx.currentTime;
4554
4562
  const startTime = Math.max(currentTime, nextPlayTimeRef.current);
4555
4563
  source.start(startTime);
4556
4564
  nextPlayTimeRef.current = startTime + audioBuffer.duration;
4565
+ if (responseStartTimeRef.current === 0) {
4566
+ responseStartTimeRef.current = startTime;
4567
+ }
4568
+ totalAudioDurationMsRef.current += Math.floor(audioBuffer.duration * 1e3);
4557
4569
  }, []);
4558
4570
  const startCaptureRef = useRef9(() => {
4559
4571
  });
@@ -4569,10 +4581,43 @@ function useVoiceSession(config) {
4569
4581
  startCaptureRef.current();
4570
4582
  break;
4571
4583
  case "audio_delta":
4584
+ if (interruptedRef.current) break;
4572
4585
  if (msg.data?.chunk) {
4573
4586
  playAudioChunk(msg.data.chunk);
4574
4587
  }
4575
4588
  break;
4589
+ case "audio_complete":
4590
+ interruptedRef.current = false;
4591
+ break;
4592
+ case "audio_interrupt": {
4593
+ if (activeSourcesRef.current.length === 0) break;
4594
+ let audioEndMs = 0;
4595
+ if (playbackCtxRef.current && responseStartTimeRef.current > 0) {
4596
+ const elapsedMs = Math.max(0, Math.floor(
4597
+ (playbackCtxRef.current.currentTime - responseStartTimeRef.current) * 1e3
4598
+ ));
4599
+ audioEndMs = Math.min(elapsedMs, totalAudioDurationMsRef.current);
4600
+ }
4601
+ const itemId = msg.data?.item_id;
4602
+ const contentIndex = msg.data?.content_index || 0;
4603
+ resetPlayback();
4604
+ if (itemId) {
4605
+ interruptedRef.current = true;
4606
+ }
4607
+ const ws = wsRef.current;
4608
+ if (ws && ws.readyState === WebSocket.OPEN && itemId) {
4609
+ ws.send(JSON.stringify({
4610
+ type: "control",
4611
+ data: {
4612
+ action: "truncate",
4613
+ item_id: itemId,
4614
+ content_index: contentIndex,
4615
+ audio_end_ms: audioEndMs
4616
+ }
4617
+ }));
4618
+ }
4619
+ break;
4620
+ }
4576
4621
  case "transcript":
4577
4622
  if (msg.data) {
4578
4623
  if (msg.data.partial) {
@@ -4591,7 +4636,7 @@ function useVoiceSession(config) {
4591
4636
  break;
4592
4637
  case "tool_call":
4593
4638
  if (msg.data) {
4594
- nextPlayTimeRef.current = 0;
4639
+ resetPlayback();
4595
4640
  cfg.onTranscript?.({
4596
4641
  id: `vt-tool-${Date.now()}`,
4597
4642
  role: "system",
@@ -4603,43 +4648,78 @@ function useVoiceSession(config) {
4603
4648
  break;
4604
4649
  case "tool_result":
4605
4650
  if (msg.data) {
4606
- nextPlayTimeRef.current = 0;
4651
+ const status = msg.data.error ? "failed" : "completed";
4652
+ cfg.onTranscript?.({
4653
+ id: `vt-toolresult-${Date.now()}`,
4654
+ role: "system",
4655
+ content: `Tool ${status}: ${msg.data.name || msg.data.call_id}`,
4656
+ partial: false,
4657
+ timestamp: /* @__PURE__ */ new Date()
4658
+ });
4607
4659
  }
4608
4660
  break;
4661
+ case "turn_end":
4662
+ interruptedRef.current = false;
4663
+ break;
4609
4664
  case "error":
4610
4665
  setState("error");
4611
4666
  cfg.onError?.(new Error(msg.data?.message || "Voice session error"));
4612
4667
  break;
4613
4668
  }
4614
- }, [playAudioChunk]);
4669
+ }, [playAudioChunk, resetPlayback]);
4615
4670
  const startCapture = useCallback4(async () => {
4616
4671
  const ws = wsRef.current;
4617
4672
  if (!ws) return;
4673
+ if (processorRef.current) {
4674
+ processorRef.current.disconnect();
4675
+ processorRef.current = null;
4676
+ }
4677
+ if (mediaStreamRef.current) {
4678
+ mediaStreamRef.current.getTracks().forEach((t) => t.stop());
4679
+ mediaStreamRef.current = null;
4680
+ }
4681
+ if (captureCtxRef.current) {
4682
+ try {
4683
+ captureCtxRef.current.close();
4684
+ } catch (_) {
4685
+ }
4686
+ captureCtxRef.current = null;
4687
+ }
4618
4688
  try {
4619
- captureCtxRef.current = new AudioContext();
4620
- const nativeSampleRate = captureCtxRef.current.sampleRate;
4621
- mediaStreamRef.current = await navigator.mediaDevices.getUserMedia({ audio: true });
4689
+ captureCtxRef.current = new AudioContext({ sampleRate: 24e3 });
4690
+ if (captureCtxRef.current.state === "suspended") {
4691
+ await captureCtxRef.current.resume();
4692
+ }
4693
+ mediaStreamRef.current = await navigator.mediaDevices.getUserMedia({
4694
+ audio: {
4695
+ echoCancellation: true,
4696
+ noiseSuppression: true,
4697
+ autoGainControl: true
4698
+ }
4699
+ });
4622
4700
  const source = captureCtxRef.current.createMediaStreamSource(mediaStreamRef.current);
4623
- processorRef.current = captureCtxRef.current.createScriptProcessor(2048, 1, 1);
4701
+ processorRef.current = captureCtxRef.current.createScriptProcessor(4096, 1, 1);
4624
4702
  processorRef.current.onaudioprocess = (e) => {
4625
4703
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
4626
4704
  if (mutedRef.current) return;
4627
4705
  const inputData = e.inputBuffer.getChannelData(0);
4628
- const resampledData = resampleAudio(inputData, nativeSampleRate, 16e3);
4629
- const int16Data = float32ToInt16(resampledData);
4706
+ const int16Data = float32ToInt16(inputData);
4630
4707
  const base64Data = int16ToBase64(int16Data);
4631
4708
  ws.send(JSON.stringify({
4632
4709
  type: "audio",
4633
- data: { chunk: base64Data }
4710
+ data: { chunk: base64Data, sample_rate: 24e3 }
4634
4711
  }));
4635
4712
  };
4636
4713
  source.connect(processorRef.current);
4637
- processorRef.current.connect(captureCtxRef.current.destination);
4714
+ const silentGain = captureCtxRef.current.createGain();
4715
+ silentGain.gain.value = 0;
4716
+ processorRef.current.connect(silentGain);
4717
+ silentGain.connect(captureCtxRef.current.destination);
4638
4718
  } catch (e) {
4639
4719
  console.warn("Microphone access denied:", e);
4640
4720
  configRef.current.onError?.(new Error("Microphone access denied"));
4641
4721
  }
4642
- }, [cleanup]);
4722
+ }, []);
4643
4723
  startCaptureRef.current = startCapture;
4644
4724
  const start = useCallback4(() => {
4645
4725
  if (state !== "idle") return;
@@ -4656,6 +4736,12 @@ function useVoiceSession(config) {
4656
4736
  }
4657
4737
  const ws = new WebSocket(wsUrl);
4658
4738
  wsRef.current = ws;
4739
+ if (!playbackCtxRef.current) {
4740
+ playbackCtxRef.current = new AudioContext({ sampleRate: 24e3 });
4741
+ }
4742
+ if (playbackCtxRef.current.state === "suspended") {
4743
+ playbackCtxRef.current.resume();
4744
+ }
4659
4745
  ws.onopen = () => {
4660
4746
  const provider = configRef.current.provider || "openai";
4661
4747
  const voice = configRef.current.voice || "ash";
@@ -4679,7 +4765,7 @@ function useVoiceSession(config) {
4679
4765
  cleanup();
4680
4766
  setState("idle");
4681
4767
  };
4682
- }, [state, config.apiUrl, handleMessage, cleanup]);
4768
+ }, [state, config.apiUrl, config.apiKey, handleMessage, cleanup]);
4683
4769
  const stop = useCallback4(() => {
4684
4770
  cleanup();
4685
4771
  setState("idle");