@utterance/core 0.0.2 → 0.0.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -373,7 +373,7 @@ var FeatureExtractor = class {
373
373
  };
374
374
 
375
375
  // src/types.ts
376
- var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
376
+ var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v2/utterance-v2.onnx";
377
377
  var DEFAULT_OPTIONS = {
378
378
  sensitivity: 0.5,
379
379
  pauseTolerance: 1500,
@@ -484,7 +484,9 @@ var ONNXModel = class {
484
484
  }
485
485
  } else if (path === "bundled") {
486
486
  try {
487
- const response = await fetch(new URL("../../models/utterance-v1.onnx", import_meta.url).href);
487
+ const getUrl = new Function("p", "b", "return new URL(p, b).href");
488
+ const href = getUrl("../../models/utterance-v2.onnx", import_meta.url);
489
+ const response = await fetch(href);
488
490
  if (response.ok) {
489
491
  modelSource = await response.arrayBuffer();
490
492
  } else {
@@ -569,6 +571,25 @@ var ONNXModel = class {
569
571
  const dstIdx = i * FEATURE_DIM;
570
572
  input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
571
573
  }
574
+ for (let f = 0; f < 14; f++) {
575
+ let sum = 0;
576
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
577
+ sum += input[i * FEATURE_DIM + f];
578
+ }
579
+ const mean = sum / CONTEXT_FRAMES;
580
+ let varSum = 0;
581
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
582
+ const d = input[i * FEATURE_DIM + f] - mean;
583
+ varSum += d * d;
584
+ }
585
+ const std = Math.sqrt(varSum / CONTEXT_FRAMES) || 1;
586
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
587
+ input[i * FEATURE_DIM + f] = (input[i * FEATURE_DIM + f] - mean) / std;
588
+ }
589
+ }
590
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
591
+ input[i * FEATURE_DIM + 14] /= 500;
592
+ }
572
593
  const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
573
594
  const results = await session.run({ input: tensor });
574
595
  const output = results.output;
@@ -616,6 +637,7 @@ var TurnDetector = class {
616
637
  state = "idle";
617
638
  pauseStart = 0;
618
639
  speakStart = 0;
640
+ interruptFired = false;
619
641
  sensitivity;
620
642
  pauseTolerance;
621
643
  constructor(sensitivity = 0.5, pauseTolerance = 1500) {
@@ -645,6 +667,7 @@ var TurnDetector = class {
645
667
  const threshold = this.sensitivity;
646
668
  switch (label) {
647
669
  case "speaking":
670
+ this.interruptFired = false;
648
671
  if (this.state !== "speaking") {
649
672
  this.state = "speaking";
650
673
  this.speakStart = timestamp;
@@ -680,7 +703,8 @@ var TurnDetector = class {
680
703
  }
681
704
  break;
682
705
  case "interrupt_intent":
683
- if (confidence >= threshold) {
706
+ if (confidence >= threshold && !this.interruptFired) {
707
+ this.interruptFired = true;
684
708
  this.emit("interrupt", { timestamp });
685
709
  }
686
710
  break;
@@ -693,6 +717,7 @@ var TurnDetector = class {
693
717
  this.state = "idle";
694
718
  this.pauseStart = 0;
695
719
  this.speakStart = 0;
720
+ this.interruptFired = false;
696
721
  }
697
722
  emit(event, payload) {
698
723
  this.listeners.get(event)?.forEach((fn) => fn(payload));
package/dist/index.js CHANGED
@@ -337,7 +337,7 @@ var FeatureExtractor = class {
337
337
  };
338
338
 
339
339
  // src/types.ts
340
- var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v0.0.2/utterance-v1.onnx";
340
+ var MODEL_CDN_URL = "https://pub-46a5feb0029246bcbc93fab6162cff94.r2.dev/v2/utterance-v2.onnx";
341
341
  var DEFAULT_OPTIONS = {
342
342
  sensitivity: 0.5,
343
343
  pauseTolerance: 1500,
@@ -447,7 +447,9 @@ var ONNXModel = class {
447
447
  }
448
448
  } else if (path === "bundled") {
449
449
  try {
450
- const response = await fetch(new URL("../../models/utterance-v1.onnx", import.meta.url).href);
450
+ const getUrl = new Function("p", "b", "return new URL(p, b).href");
451
+ const href = getUrl("../../models/utterance-v2.onnx", import.meta.url);
452
+ const response = await fetch(href);
451
453
  if (response.ok) {
452
454
  modelSource = await response.arrayBuffer();
453
455
  } else {
@@ -532,6 +534,25 @@ var ONNXModel = class {
532
534
  const dstIdx = i * FEATURE_DIM;
533
535
  input.set(this.frameBuffer.subarray(srcIdx, srcIdx + FEATURE_DIM), dstIdx);
534
536
  }
537
+ for (let f = 0; f < 14; f++) {
538
+ let sum = 0;
539
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
540
+ sum += input[i * FEATURE_DIM + f];
541
+ }
542
+ const mean = sum / CONTEXT_FRAMES;
543
+ let varSum = 0;
544
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
545
+ const d = input[i * FEATURE_DIM + f] - mean;
546
+ varSum += d * d;
547
+ }
548
+ const std = Math.sqrt(varSum / CONTEXT_FRAMES) || 1;
549
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
550
+ input[i * FEATURE_DIM + f] = (input[i * FEATURE_DIM + f] - mean) / std;
551
+ }
552
+ }
553
+ for (let i = 0; i < CONTEXT_FRAMES; i++) {
554
+ input[i * FEATURE_DIM + 14] /= 500;
555
+ }
535
556
  const tensor = new ort.Tensor("float32", input, [1, CONTEXT_FRAMES, FEATURE_DIM]);
536
557
  const results = await session.run({ input: tensor });
537
558
  const output = results.output;
@@ -579,6 +600,7 @@ var TurnDetector = class {
579
600
  state = "idle";
580
601
  pauseStart = 0;
581
602
  speakStart = 0;
603
+ interruptFired = false;
582
604
  sensitivity;
583
605
  pauseTolerance;
584
606
  constructor(sensitivity = 0.5, pauseTolerance = 1500) {
@@ -608,6 +630,7 @@ var TurnDetector = class {
608
630
  const threshold = this.sensitivity;
609
631
  switch (label) {
610
632
  case "speaking":
633
+ this.interruptFired = false;
611
634
  if (this.state !== "speaking") {
612
635
  this.state = "speaking";
613
636
  this.speakStart = timestamp;
@@ -643,7 +666,8 @@ var TurnDetector = class {
643
666
  }
644
667
  break;
645
668
  case "interrupt_intent":
646
- if (confidence >= threshold) {
669
+ if (confidence >= threshold && !this.interruptFired) {
670
+ this.interruptFired = true;
647
671
  this.emit("interrupt", { timestamp });
648
672
  }
649
673
  break;
@@ -656,6 +680,7 @@ var TurnDetector = class {
656
680
  this.state = "idle";
657
681
  this.pauseStart = 0;
658
682
  this.speakStart = 0;
683
+ this.interruptFired = false;
659
684
  }
660
685
  emit(event, payload) {
661
686
  this.listeners.get(event)?.forEach((fn) => fn(payload));
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@utterance/core",
3
- "version": "0.0.2",
3
+ "version": "0.0.4",
4
4
  "description": "Client-side semantic endpointing. Know when they're done talking.",
5
5
  "type": "module",
6
6
  "main": "dist/index.cjs",
@@ -56,6 +56,8 @@
56
56
  "devDependencies": {
57
57
  "@eslint/js": "^9.0.0",
58
58
  "@tailwindcss/postcss": "^4.2.0",
59
+ "@types/d3-scale": "^4.0.9",
60
+ "@types/d3-shape": "^3.1.8",
59
61
  "@types/mdx": "^2.0.13",
60
62
  "@types/node": "^22.0.0",
61
63
  "@types/react": "^19.2.14",
@@ -76,9 +78,11 @@
76
78
  "dependencies": {
77
79
  "@next/third-parties": "^16.1.6",
78
80
  "@react-three/fiber": "^9.5.0",
79
- "@utterance/core": "^0.0.1",
81
+ "@utterance/core": "^0.0.2",
80
82
  "class-variance-authority": "^0.7.1",
81
83
  "clsx": "^2.1.1",
84
+ "d3-scale": "^4.0.2",
85
+ "d3-shape": "^3.2.0",
82
86
  "fumadocs-core": "^16.6.3",
83
87
  "fumadocs-mdx": "^14.2.7",
84
88
  "fumadocs-ui": "^16.6.3",
package/models/.gitkeep DELETED
File without changes
Binary file