@apteva/apteva-kit 0.1.137 → 0.1.138

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -4457,22 +4457,6 @@ function base64ToFloat32(base64) {
4457
4457
  }
4458
4458
  return float32Array;
4459
4459
  }
4460
- function resampleAudio(inputData, inputSampleRate, outputSampleRate) {
4461
- if (inputSampleRate === outputSampleRate) {
4462
- return inputData;
4463
- }
4464
- const ratio = inputSampleRate / outputSampleRate;
4465
- const outputLength = Math.floor(inputData.length / ratio);
4466
- const output = new Float32Array(outputLength);
4467
- for (let i = 0; i < outputLength; i++) {
4468
- const srcIndex = i * ratio;
4469
- const srcIndexFloor = Math.floor(srcIndex);
4470
- const srcIndexCeil = Math.min(srcIndexFloor + 1, inputData.length - 1);
4471
- const t = srcIndex - srcIndexFloor;
4472
- output[i] = inputData[srcIndexFloor] * (1 - t) + inputData[srcIndexCeil] * t;
4473
- }
4474
- return output;
4475
- }
4476
4460
 
4477
4461
  // src/hooks/useVoiceSession.ts
4478
4462
  function useVoiceSession(config) {
@@ -4491,8 +4475,10 @@ function useVoiceSession(config) {
4491
4475
  const mutedRef = useRef9(false);
4492
4476
  const configRef = useRef9(config);
4493
4477
  configRef.current = config;
4494
- const agentSpeakingRef = useRef9(false);
4495
- const agentSpeakingTimeoutRef = useRef9(null);
4478
+ const activeSourcesRef = useRef9([]);
4479
+ const responseStartTimeRef = useRef9(0);
4480
+ const totalAudioDurationMsRef = useRef9(0);
4481
+ const interruptedRef = useRef9(false);
4496
4482
  const cleanup = useCallback4(() => {
4497
4483
  if (durationIntervalRef.current) {
4498
4484
  clearInterval(durationIntervalRef.current);
@@ -4529,15 +4515,26 @@ function useVoiceSession(config) {
4529
4515
  }
4530
4516
  nextPlayTimeRef.current = 0;
4531
4517
  mutedRef.current = false;
4532
- agentSpeakingRef.current = false;
4533
- if (agentSpeakingTimeoutRef.current) {
4534
- clearTimeout(agentSpeakingTimeoutRef.current);
4535
- agentSpeakingTimeoutRef.current = null;
4536
- }
4518
+ activeSourcesRef.current = [];
4519
+ responseStartTimeRef.current = 0;
4520
+ totalAudioDurationMsRef.current = 0;
4521
+ interruptedRef.current = false;
4537
4522
  setMuted(false);
4538
4523
  setPartialTranscript("");
4539
4524
  setDuration(0);
4540
4525
  }, []);
4526
+ const resetPlayback = useCallback4(() => {
4527
+ activeSourcesRef.current.forEach((source) => {
4528
+ try {
4529
+ source.stop();
4530
+ } catch (_) {
4531
+ }
4532
+ });
4533
+ activeSourcesRef.current = [];
4534
+ nextPlayTimeRef.current = 0;
4535
+ responseStartTimeRef.current = 0;
4536
+ totalAudioDurationMsRef.current = 0;
4537
+ }, []);
4541
4538
  useEffect9(() => {
4542
4539
  return () => {
4543
4540
  cleanup();
@@ -4557,18 +4554,18 @@ function useVoiceSession(config) {
4557
4554
  const source = ctx.createBufferSource();
4558
4555
  source.buffer = audioBuffer;
4559
4556
  source.connect(ctx.destination);
4557
+ activeSourcesRef.current.push(source);
4558
+ source.onended = () => {
4559
+ activeSourcesRef.current = activeSourcesRef.current.filter((s) => s !== source);
4560
+ };
4560
4561
  const currentTime = ctx.currentTime;
4561
4562
  const startTime = Math.max(currentTime, nextPlayTimeRef.current);
4562
4563
  source.start(startTime);
4563
4564
  nextPlayTimeRef.current = startTime + audioBuffer.duration;
4564
- agentSpeakingRef.current = true;
4565
- if (agentSpeakingTimeoutRef.current) {
4566
- clearTimeout(agentSpeakingTimeoutRef.current);
4567
- }
4568
- const remainingMs = (nextPlayTimeRef.current - currentTime) * 1e3 + 150;
4569
- agentSpeakingTimeoutRef.current = setTimeout(() => {
4570
- agentSpeakingRef.current = false;
4571
- }, remainingMs);
4565
+ if (responseStartTimeRef.current === 0) {
4566
+ responseStartTimeRef.current = startTime;
4567
+ }
4568
+ totalAudioDurationMsRef.current += Math.floor(audioBuffer.duration * 1e3);
4572
4569
  }, []);
4573
4570
  const startCaptureRef = useRef9(() => {
4574
4571
  });
@@ -4584,10 +4581,43 @@ function useVoiceSession(config) {
4584
4581
  startCaptureRef.current();
4585
4582
  break;
4586
4583
  case "audio_delta":
4584
+ if (interruptedRef.current) break;
4587
4585
  if (msg.data?.chunk) {
4588
4586
  playAudioChunk(msg.data.chunk);
4589
4587
  }
4590
4588
  break;
4589
+ case "audio_complete":
4590
+ interruptedRef.current = false;
4591
+ break;
4592
+ case "audio_interrupt": {
4593
+ if (activeSourcesRef.current.length === 0) break;
4594
+ let audioEndMs = 0;
4595
+ if (playbackCtxRef.current && responseStartTimeRef.current > 0) {
4596
+ const elapsedMs = Math.max(0, Math.floor(
4597
+ (playbackCtxRef.current.currentTime - responseStartTimeRef.current) * 1e3
4598
+ ));
4599
+ audioEndMs = Math.min(elapsedMs, totalAudioDurationMsRef.current);
4600
+ }
4601
+ const itemId = msg.data?.item_id;
4602
+ const contentIndex = msg.data?.content_index || 0;
4603
+ resetPlayback();
4604
+ if (itemId) {
4605
+ interruptedRef.current = true;
4606
+ }
4607
+ const ws = wsRef.current;
4608
+ if (ws && ws.readyState === WebSocket.OPEN && itemId) {
4609
+ ws.send(JSON.stringify({
4610
+ type: "control",
4611
+ data: {
4612
+ action: "truncate",
4613
+ item_id: itemId,
4614
+ content_index: contentIndex,
4615
+ audio_end_ms: audioEndMs
4616
+ }
4617
+ }));
4618
+ }
4619
+ break;
4620
+ }
4591
4621
  case "transcript":
4592
4622
  if (msg.data) {
4593
4623
  if (msg.data.partial) {
@@ -4606,7 +4636,7 @@ function useVoiceSession(config) {
4606
4636
  break;
4607
4637
  case "tool_call":
4608
4638
  if (msg.data) {
4609
- nextPlayTimeRef.current = 0;
4639
+ resetPlayback();
4610
4640
  cfg.onTranscript?.({
4611
4641
  id: `vt-tool-${Date.now()}`,
4612
4642
  role: "system",
@@ -4618,24 +4648,48 @@ function useVoiceSession(config) {
4618
4648
  break;
4619
4649
  case "tool_result":
4620
4650
  if (msg.data) {
4621
- nextPlayTimeRef.current = 0;
4651
+ const status = msg.data.error ? "failed" : "completed";
4652
+ cfg.onTranscript?.({
4653
+ id: `vt-toolresult-${Date.now()}`,
4654
+ role: "system",
4655
+ content: `Tool ${status}: ${msg.data.name || msg.data.call_id}`,
4656
+ partial: false,
4657
+ timestamp: /* @__PURE__ */ new Date()
4658
+ });
4622
4659
  }
4623
4660
  break;
4661
+ case "turn_end":
4662
+ interruptedRef.current = false;
4663
+ break;
4624
4664
  case "error":
4625
4665
  setState("error");
4626
4666
  cfg.onError?.(new Error(msg.data?.message || "Voice session error"));
4627
4667
  break;
4628
4668
  }
4629
- }, [playAudioChunk]);
4669
+ }, [playAudioChunk, resetPlayback]);
4630
4670
  const startCapture = useCallback4(async () => {
4631
4671
  const ws = wsRef.current;
4632
4672
  if (!ws) return;
4673
+ if (processorRef.current) {
4674
+ processorRef.current.disconnect();
4675
+ processorRef.current = null;
4676
+ }
4677
+ if (mediaStreamRef.current) {
4678
+ mediaStreamRef.current.getTracks().forEach((t) => t.stop());
4679
+ mediaStreamRef.current = null;
4680
+ }
4681
+ if (captureCtxRef.current) {
4682
+ try {
4683
+ captureCtxRef.current.close();
4684
+ } catch (_) {
4685
+ }
4686
+ captureCtxRef.current = null;
4687
+ }
4633
4688
  try {
4634
- captureCtxRef.current = new AudioContext();
4689
+ captureCtxRef.current = new AudioContext({ sampleRate: 24e3 });
4635
4690
  if (captureCtxRef.current.state === "suspended") {
4636
4691
  await captureCtxRef.current.resume();
4637
4692
  }
4638
- const nativeSampleRate = captureCtxRef.current.sampleRate;
4639
4693
  mediaStreamRef.current = await navigator.mediaDevices.getUserMedia({
4640
4694
  audio: {
4641
4695
  echoCancellation: true,
@@ -4644,27 +4698,28 @@ function useVoiceSession(config) {
4644
4698
  }
4645
4699
  });
4646
4700
  const source = captureCtxRef.current.createMediaStreamSource(mediaStreamRef.current);
4647
- processorRef.current = captureCtxRef.current.createScriptProcessor(2048, 1, 1);
4701
+ processorRef.current = captureCtxRef.current.createScriptProcessor(4096, 1, 1);
4648
4702
  processorRef.current.onaudioprocess = (e) => {
4649
4703
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
4650
4704
  if (mutedRef.current) return;
4651
- if (agentSpeakingRef.current) return;
4652
4705
  const inputData = e.inputBuffer.getChannelData(0);
4653
- const resampledData = resampleAudio(inputData, nativeSampleRate, 16e3);
4654
- const int16Data = float32ToInt16(resampledData);
4706
+ const int16Data = float32ToInt16(inputData);
4655
4707
  const base64Data = int16ToBase64(int16Data);
4656
4708
  ws.send(JSON.stringify({
4657
4709
  type: "audio",
4658
- data: { chunk: base64Data }
4710
+ data: { chunk: base64Data, sample_rate: 24e3 }
4659
4711
  }));
4660
4712
  };
4661
4713
  source.connect(processorRef.current);
4662
- processorRef.current.connect(captureCtxRef.current.destination);
4714
+ const silentGain = captureCtxRef.current.createGain();
4715
+ silentGain.gain.value = 0;
4716
+ processorRef.current.connect(silentGain);
4717
+ silentGain.connect(captureCtxRef.current.destination);
4663
4718
  } catch (e) {
4664
4719
  console.warn("Microphone access denied:", e);
4665
4720
  configRef.current.onError?.(new Error("Microphone access denied"));
4666
4721
  }
4667
- }, [cleanup]);
4722
+ }, []);
4668
4723
  startCaptureRef.current = startCapture;
4669
4724
  const start = useCallback4(() => {
4670
4725
  if (state !== "idle") return;
@@ -4710,7 +4765,7 @@ function useVoiceSession(config) {
4710
4765
  cleanup();
4711
4766
  setState("idle");
4712
4767
  };
4713
- }, [state, config.apiUrl, handleMessage, cleanup]);
4768
+ }, [state, config.apiUrl, config.apiKey, handleMessage, cleanup]);
4714
4769
  const stop = useCallback4(() => {
4715
4770
  cleanup();
4716
4771
  setState("idle");