@entros/pulse-sdk 1.5.1 → 1.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -134,8 +134,8 @@ interface VerificationResult {
134
134
  * Server-side safe-reveal (validator → executor → SDK):
135
135
  * - `variance_floor`, `entropy_bounds`, `temporal_coupling_low`,
136
136
  * `phrase_content_mismatch`
137
- * Surfaced for the soft-reject + retry UX (master-list #94) so the
138
- * UI can render a per-category hint.
137
+ * Surfaced for the soft-reject + retry UX so the UI can render a
138
+ * per-category hint.
139
139
  *
140
140
  * Client-side (SDK-emitted):
141
141
  * - `validation_unavailable` — the relayer's `/validate-features`
@@ -360,17 +360,17 @@ declare const SPEAKER_FEATURE_COUNT = 44;
360
360
  /**
361
361
  * Extract speaker-dependent audio features.
362
362
  *
363
- * Captures physiological vocal characteristics (F0, jitter, shimmer, HNR, formant
364
- * ratios) that are stable across different utterances from the same speaker.
365
- * Content-independent by design — different phrases produce similar feature values.
363
+ * Captures physiological vocal characteristics that are stable across
364
+ * different utterances from the same speaker. Content-independent by
365
+ * design — different phrases produce similar feature values.
366
366
  *
367
367
  * Returns 44 values.
368
368
  */
369
369
  /**
370
370
  * Extracts 44 speaker features AND the raw F0 contour.
371
- * The F0 contour is surfaced so Tier 2 cross-modal temporal analysis can be
372
- * performed server-side against the motion time-series. Feature vector shape
373
- * and semantics are unchanged.
371
+ * The F0 contour is surfaced so server-side analysis can pair it with
372
+ * the motion time-series. Feature vector shape and semantics are
373
+ * unchanged.
374
374
  */
375
375
  declare function extractSpeakerFeaturesDetailed(audio: AudioCapture): Promise<{
376
376
  features: number[];
@@ -379,15 +379,15 @@ declare function extractSpeakerFeaturesDetailed(audio: AudioCapture): Promise<{
379
379
  /**
380
380
  * Extracts 44 speaker features. Backward-compatible wrapper that discards
381
381
  * the F0 contour; use `extractSpeakerFeaturesDetailed` when the contour is
382
- * needed (e.g. for Tier 2 server-side cross-modal analysis).
382
+ * needed (e.g. for server-side analysis).
383
383
  */
384
384
  declare function extractSpeakerFeatures(audio: AudioCapture): Promise<number[]>;
385
385
 
386
386
  /**
387
387
  * Compute per-sample acceleration magnitude |a| = √(ax² + ay² + az²) and
388
- * linearly resample to a target frame count. Used for Tier 2 cross-modal
389
- * temporal analysis against the F0 contour; the two time-series must share
390
- * the same frame count for direct correlation.
388
+ * linearly resample to a target frame count. Surfaced for server-side
389
+ * analysis paired against the F0 contour; the two time-series must share
390
+ * the same frame count when consumed downstream.
391
391
  *
392
392
  * Returns an empty array if motion data is absent or too short.
393
393
  */
@@ -513,9 +513,8 @@ declare function submitViaWallet(proof: SolanaProof, commitment: Uint8Array, opt
513
513
  * and sets a 7-day cooldown before the next reset.
514
514
  *
515
515
  * Transaction shape: single instruction (no challenge / verify_proof /
516
- * ZK proof required). Humanness evidence comes from the Tier 1
517
- * validation pipeline invoked at the /attest step (same as mint and
518
- * update).
516
+ * ZK proof required). Humanness evidence comes from the validation
517
+ * pipeline invoked at the /attest step (same as mint and update).
519
518
  */
520
519
  declare function submitResetViaWallet(commitment: Uint8Array, options: {
521
520
  wallet: any;
@@ -659,8 +658,8 @@ declare function loadVerificationData(): Promise<StoredVerificationData | null>;
659
658
  * FALLBACK challenge-phrase generator. Used only when the executor's
660
659
  * `/challenge` endpoint is unreachable; the authoritative phrase comes from
661
660
  * the server (5 real words drawn from a curated English-word dictionary). On
662
- * this fallback path, validation skips server-side phrase content binding —
663
- * Tier 1 acoustic + Tier 2 cross-modal still run.
661
+ * this fallback path, validation skips the phrase verification step —
662
+ * other server-side checks still run.
664
663
  *
665
664
  * Output is 5-6 syllable pairs, forming nonsensical but speakable words.
666
665
  * Uses crypto.getRandomValues for unpredictable challenge generation.
@@ -711,16 +710,14 @@ declare function generateLissajousSequence(count?: number): {
711
710
  *
712
711
  * The executor's `/challenge` endpoint returns a fresh nonce + 5-word phrase
713
712
  * bound to the wallet for a short TTL (default 60s). The phrase is drawn from
714
- * a curated English-word dictionary (source of truth at
715
- * `entros-validation/src/word_dict.rs`); shown to the user as the voice challenge
713
+ * a curated English-word dictionary, shown to the user as the voice challenge
716
714
  * and looked up server-side at `/validate-features` to verify the audio
717
- * matches the issued phrase (master-list #89, phrase content binding).
715
+ * matches the issued phrase.
718
716
  *
719
- * Server-issued phrases are the only safe design for content binding: if the
720
- * client generated the phrase and sent it to the server alongside the audio,
721
- * an attacker would submit their own phrase matching whatever content they
722
- * captured. With server issuance, the phrase is bound to the nonce and the
723
- * client cannot substitute it.
717
+ * Server-issued phrases are the only safe design here: if the client generated
718
+ * the phrase and sent it to the server alongside the audio, an attacker would
719
+ * submit their own phrase matching whatever content they captured. With server
720
+ * issuance, the phrase is bound to the nonce and the client cannot substitute it.
724
721
  */
725
722
  /**
726
723
  * Server-issued challenge artifacts. Returned by `fetchChallenge`.
@@ -745,13 +742,13 @@ declare function fetchChallenge(executorUrl: string, walletAddress: string, apiK
745
742
 
746
743
  /**
747
744
  * Encode captured Float32 audio samples as base64 int16 PCM for transmission
748
- * to the validation service (master-list #89 phrase content binding).
745
+ * to the validation service.
749
746
  *
750
747
  * Audio is captured as `Float32Array` with values in `[-1.0, 1.0]` by the
751
- * Pulse SDK (`sensor/audio.ts`). The validation service's phrase-binding
752
- * module decodes base64 → Vec<i16> → Vec<f32> before feeding Whisper-tiny.
753
- * int16 is the standard compact representation: 2 bytes per sample vs 4 for
754
- * f32, halving wire size without perceptible quality loss for 16kHz speech.
748
+ * Pulse SDK (`sensor/audio.ts`). The validation service decodes the base64
749
+ * payload and feeds the audio into server-side transcription. int16 is the
750
+ * standard compact representation: 2 bytes per sample vs 4 for f32, halving
751
+ * wire size without perceptible quality loss for 16kHz speech.
755
752
  *
756
753
  * Byte layout: little-endian int16 samples, contiguous, no header.
757
754
  */
package/dist/index.d.ts CHANGED
@@ -134,8 +134,8 @@ interface VerificationResult {
134
134
  * Server-side safe-reveal (validator → executor → SDK):
135
135
  * - `variance_floor`, `entropy_bounds`, `temporal_coupling_low`,
136
136
  * `phrase_content_mismatch`
137
- * Surfaced for the soft-reject + retry UX (master-list #94) so the
138
- * UI can render a per-category hint.
137
+ * Surfaced for the soft-reject + retry UX so the UI can render a
138
+ * per-category hint.
139
139
  *
140
140
  * Client-side (SDK-emitted):
141
141
  * - `validation_unavailable` — the relayer's `/validate-features`
@@ -360,17 +360,17 @@ declare const SPEAKER_FEATURE_COUNT = 44;
360
360
  /**
361
361
  * Extract speaker-dependent audio features.
362
362
  *
363
- * Captures physiological vocal characteristics (F0, jitter, shimmer, HNR, formant
364
- * ratios) that are stable across different utterances from the same speaker.
365
- * Content-independent by design — different phrases produce similar feature values.
363
+ * Captures physiological vocal characteristics that are stable across
364
+ * different utterances from the same speaker. Content-independent by
365
+ * design — different phrases produce similar feature values.
366
366
  *
367
367
  * Returns 44 values.
368
368
  */
369
369
  /**
370
370
  * Extracts 44 speaker features AND the raw F0 contour.
371
- * The F0 contour is surfaced so Tier 2 cross-modal temporal analysis can be
372
- * performed server-side against the motion time-series. Feature vector shape
373
- * and semantics are unchanged.
371
+ * The F0 contour is surfaced so server-side analysis can pair it with
372
+ * the motion time-series. Feature vector shape and semantics are
373
+ * unchanged.
374
374
  */
375
375
  declare function extractSpeakerFeaturesDetailed(audio: AudioCapture): Promise<{
376
376
  features: number[];
@@ -379,15 +379,15 @@ declare function extractSpeakerFeaturesDetailed(audio: AudioCapture): Promise<{
379
379
  /**
380
380
  * Extracts 44 speaker features. Backward-compatible wrapper that discards
381
381
  * the F0 contour; use `extractSpeakerFeaturesDetailed` when the contour is
382
- * needed (e.g. for Tier 2 server-side cross-modal analysis).
382
+ * needed (e.g. for server-side analysis).
383
383
  */
384
384
  declare function extractSpeakerFeatures(audio: AudioCapture): Promise<number[]>;
385
385
 
386
386
  /**
387
387
  * Compute per-sample acceleration magnitude |a| = √(ax² + ay² + az²) and
388
- * linearly resample to a target frame count. Used for Tier 2 cross-modal
389
- * temporal analysis against the F0 contour; the two time-series must share
390
- * the same frame count for direct correlation.
388
+ * linearly resample to a target frame count. Surfaced for server-side
389
+ * analysis paired against the F0 contour; the two time-series must share
390
+ * the same frame count when consumed downstream.
391
391
  *
392
392
  * Returns an empty array if motion data is absent or too short.
393
393
  */
@@ -513,9 +513,8 @@ declare function submitViaWallet(proof: SolanaProof, commitment: Uint8Array, opt
513
513
  * and sets a 7-day cooldown before the next reset.
514
514
  *
515
515
  * Transaction shape: single instruction (no challenge / verify_proof /
516
- * ZK proof required). Humanness evidence comes from the Tier 1
517
- * validation pipeline invoked at the /attest step (same as mint and
518
- * update).
516
+ * ZK proof required). Humanness evidence comes from the validation
517
+ * pipeline invoked at the /attest step (same as mint and update).
519
518
  */
520
519
  declare function submitResetViaWallet(commitment: Uint8Array, options: {
521
520
  wallet: any;
@@ -659,8 +658,8 @@ declare function loadVerificationData(): Promise<StoredVerificationData | null>;
659
658
  * FALLBACK challenge-phrase generator. Used only when the executor's
660
659
  * `/challenge` endpoint is unreachable; the authoritative phrase comes from
661
660
  * the server (5 real words drawn from a curated English-word dictionary). On
662
- * this fallback path, validation skips server-side phrase content binding —
663
- * Tier 1 acoustic + Tier 2 cross-modal still run.
661
+ * this fallback path, validation skips the phrase verification step —
662
+ * other server-side checks still run.
664
663
  *
665
664
  * Output is 5-6 syllable pairs, forming nonsensical but speakable words.
666
665
  * Uses crypto.getRandomValues for unpredictable challenge generation.
@@ -711,16 +710,14 @@ declare function generateLissajousSequence(count?: number): {
711
710
  *
712
711
  * The executor's `/challenge` endpoint returns a fresh nonce + 5-word phrase
713
712
  * bound to the wallet for a short TTL (default 60s). The phrase is drawn from
714
- * a curated English-word dictionary (source of truth at
715
- * `entros-validation/src/word_dict.rs`); shown to the user as the voice challenge
713
+ * a curated English-word dictionary, shown to the user as the voice challenge
716
714
  * and looked up server-side at `/validate-features` to verify the audio
717
- * matches the issued phrase (master-list #89, phrase content binding).
715
+ * matches the issued phrase.
718
716
  *
719
- * Server-issued phrases are the only safe design for content binding: if the
720
- * client generated the phrase and sent it to the server alongside the audio,
721
- * an attacker would submit their own phrase matching whatever content they
722
- * captured. With server issuance, the phrase is bound to the nonce and the
723
- * client cannot substitute it.
717
+ * Server-issued phrases are the only safe design here: if the client generated
718
+ * the phrase and sent it to the server alongside the audio, an attacker would
719
+ * submit their own phrase matching whatever content they captured. With server
720
+ * issuance, the phrase is bound to the nonce and the client cannot substitute it.
724
721
  */
725
722
  /**
726
723
  * Server-issued challenge artifacts. Returned by `fetchChallenge`.
@@ -745,13 +742,13 @@ declare function fetchChallenge(executorUrl: string, walletAddress: string, apiK
745
742
 
746
743
  /**
747
744
  * Encode captured Float32 audio samples as base64 int16 PCM for transmission
748
- * to the validation service (master-list #89 phrase content binding).
745
+ * to the validation service.
749
746
  *
750
747
  * Audio is captured as `Float32Array` with values in `[-1.0, 1.0]` by the
751
- * Pulse SDK (`sensor/audio.ts`). The validation service's phrase-binding
752
- * module decodes base64 → Vec<i16> → Vec<f32> before feeding Whisper-tiny.
753
- * int16 is the standard compact representation: 2 bytes per sample vs 4 for
754
- * f32, halving wire size without perceptible quality loss for 16kHz speech.
748
+ * Pulse SDK (`sensor/audio.ts`). The validation service decodes the base64
749
+ * payload and feeds the audio into server-side transcription. int16 is the
750
+ * standard compact representation: 2 bytes per sample vs 4 for f32, halving
751
+ * wire size without perceptible quality loss for 16kHz speech.
755
752
  *
756
753
  * Byte layout: little-endian int16 samples, contiguous, no header.
757
754
  */
package/dist/index.js CHANGED
@@ -150,9 +150,30 @@ async function captureAudio(options = {}) {
150
150
  audio: {
151
151
  sampleRate: TARGET_SAMPLE_RATE,
152
152
  channelCount: 1,
153
+ // Capture without browser-side audio processing — preserves the
154
+ // raw microphone signal for the SDK's downstream feature extraction
155
+ // and for server-side validation. Audio cleanup intended for the
156
+ // transcription path runs server-side, on a parallel path that
157
+ // never feeds back to feature extraction. Matches the mobile SDK's
158
+ // choice of Android's `MIC` source over `VOICE_RECOGNITION` —
159
+ // same architectural decision, two platforms.
153
160
  echoCancellation: false,
154
161
  noiseSuppression: false,
155
- autoGainControl: false
162
+ autoGainControl: false,
163
+ // OS-level voice isolation request (W3C Media Capture Extensions,
164
+ // 2024). Activates the platform DSP on Chrome 124+ / ChromeOS and
165
+ // surfaces Apple Voice Isolation Mic Mode on Safari macOS Sonoma+
166
+ // / iOS 17+ when the user has it enabled in Control Center.
167
+ // Silently ignored on browsers/OSes without support, so the
168
+ // constraint costs nothing where it doesn't help. Distinct
169
+ // mechanism from `noiseSuppression` above — that flag controls
170
+ // WebRTC's hand-tuned AudioProcessingModule, this requests the
171
+ // OS-native neural effect.
172
+ // @ts-expect-error -- W3C Media Capture Extensions property; not
173
+ // yet in lib.dom.d.ts as of TypeScript 6.0. Once lib.dom catches up,
174
+ // this directive itself becomes a compile error (unused
175
+ // `@ts-expect-error`), signaling that it can be deleted.
176
+ voiceIsolation: true
156
177
  }
157
178
  });
158
179
  let ctx;
@@ -652,6 +673,7 @@ async function getMeyda() {
652
673
  }
653
674
  return meydaModule.default ?? meydaModule;
654
675
  }
676
+ var F0_YIELD_EVERY_N_FRAMES = 16;
655
677
  async function detectF0Contour(samples, sampleRate) {
656
678
  const detect = await getPitchDetector(sampleRate);
657
679
  const frameSize = getFrameSize(sampleRate);
@@ -678,6 +700,9 @@ async function detectF0Contour(samples, sampleRate) {
678
700
  sum += (frame[j] ?? 0) * (frame[j] ?? 0);
679
701
  }
680
702
  amplitudes.push(Math.sqrt(sum / frame.length));
703
+ if (i > 0 && i < numFrames - 1 && i % F0_YIELD_EVERY_N_FRAMES === 0) {
704
+ await yieldToMainThread();
705
+ }
681
706
  }
682
707
  return { f0, amplitudes, periods };
683
708
  }
@@ -4744,9 +4769,16 @@ var PulseSession = class {
4744
4769
  audio: {
4745
4770
  sampleRate: 16e3,
4746
4771
  channelCount: 1,
4772
+ // Capture constraints kept in lock-step with `sensor/audio.ts` —
4773
+ // the two entry points (standalone capture vs session-based
4774
+ // capture) must agree or the verify flow and direct-API
4775
+ // consumers diverge.
4747
4776
  echoCancellation: false,
4748
4777
  noiseSuppression: false,
4749
- autoGainControl: false
4778
+ autoGainControl: false,
4779
+ // @ts-expect-error -- W3C Media Capture Extensions property; not
4780
+ // yet in lib.dom.d.ts as of TypeScript 6.0.
4781
+ voiceIsolation: true
4750
4782
  }
4751
4783
  });
4752
4784
  this.audioStageState = "capturing";