npm - @volley/recognition-client-sdk - Versions diffs - 0.1.782 → 0.1.800 - Mend

@volley/recognition-client-sdk 0.1.782 → 0.1.800

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/dist/browser.bundled.d.ts +75 -4
package/dist/index.bundled.d.ts +198 -87
package/dist/index.js +191 -20
package/dist/index.js.map +4 -4
package/dist/recog-client-sdk.browser.js +95 -4
package/dist/recog-client-sdk.browser.js.map +4 -4
package/dist/recognition-client.d.ts +23 -0
package/dist/recognition-client.d.ts.map +1 -1
package/dist/recognition-client.types.d.ts +32 -0
package/dist/recognition-client.types.d.ts.map +1 -1
package/dist/simplified-vgf-recognition-client.d.ts +22 -85
package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
package/dist/utils/audio-resampler.d.ts +32 -0
package/dist/utils/audio-resampler.d.ts.map +1 -0
package/dist/vgf-recognition-mapper.d.ts +9 -17
package/dist/vgf-recognition-mapper.d.ts.map +1 -1
package/dist/vgf-recognition-state.d.ts +103 -0
package/dist/vgf-recognition-state.d.ts.map +1 -1
package/package.json +1 -1
package/src/index.spec.ts +2 -0
package/src/recognition-client.ts +65 -7
package/src/recognition-client.types.ts +37 -0
package/src/simplified-vgf-recognition-client.spec.ts +0 -27
package/src/simplified-vgf-recognition-client.ts +97 -127
package/src/utils/audio-resampler.spec.ts +69 -0
package/src/utils/audio-resampler.ts +79 -0
package/src/vgf-recognition-mapper.spec.ts +143 -0
package/src/vgf-recognition-mapper.ts +35 -45
package/src/vgf-recognition-state.ts +19 -1

package/src/vgf-recognition-mapper.spec.ts CHANGED Viewed

@@ -3,6 +3,8 @@
  */
 import {
+    mapTranscriptionResultToState,
+    mapSessionConfiguredToState,
     resetRecognitionVGFState
 } from './vgf-recognition-mapper.js';
 import {
@@ -11,6 +13,13 @@ import {
     TranscriptionStatus,
     RecognitionActionProcessingState
 } from './vgf-recognition-state.js';
+import {
+    ASRApiType,
+    DetectionTypeV1,
+    RecognitionResultTypeV1,
+    SessionConfiguredV1,
+    TranscriptionResultV1
+} from '@recog/shared-types';
 describe('resetRecognitionVGFState', () => {
     it('should generate a new UUID', () => {
@@ -75,4 +84,138 @@ describe('resetRecognitionVGFState', () => {
         expect(newState.startRecordingStatus).toBe(RecordingStatus.READY);
         expect(newState.recognitionActionProcessingState).toBe(RecognitionActionProcessingState.NOT_STARTED);
     });
+    it('should preserve prompt input fields and clear detections', () => {
+        const originalState: RecognitionState = { // fixture: all three prompt inputs set, plus one detection
+            audioUtteranceId: 'old-uuid-123',
+            pendingTranscript: '',
+            promptSTT: 'hello,world',
+            promptSTF: 'map to play()',
+            promptTTF: 'extract title,artist',
+            detections: [
+                { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.92 }
+            ]
+        };
+        const newState = resetRecognitionVGFState(originalState);
+        expect(newState.promptSTT).toBe('hello,world'); // the three prompt inputs must come through the reset unchanged
+        expect(newState.promptSTF).toBe('map to play()');
+        expect(newState.promptTTF).toBe('extract title,artist');
+        expect(newState.detections).toBeUndefined(); // whereas the seeded detection must be dropped
+    });
+});
+describe('mapTranscriptionResultToState detections', () => {
+    const baseState: RecognitionState = {
+        audioUtteranceId: 'utt-1',
+        pendingTranscript: ''
+    };
+    it('should copy detections from a pending transcript result', () => {
+        const result: TranscriptionResultV1 = { // unfinished result carrying a single detection with timing fields
+            type: RecognitionResultTypeV1.TRANSCRIPTION,
+            audioUtteranceId: 'utt-1',
+            finalTranscript: '',
+            finalTranscriptRaw: '',
+            pendingTranscript: 'one',
+            is_finished: false,
+            detections: [
+                { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.81, startMs: 100, endMs: 900 }
+            ]
+        };
+        const newState = mapTranscriptionResultToState(baseState, result, true); // NOTE(review): third argument is `true` here and `false` in the final-result case — presumably the pending/final flag; confirm against the mapper signature
+        expect(newState.detections).toEqual([ // deep equality: every detection field, including startMs/endMs, must be carried over
+            { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.81, startMs: 100, endMs: 900 }
+        ]);
+    });
+    it('should copy detections from a final transcript result', () => {
+        const result: TranscriptionResultV1 = { // finished result: no pendingTranscript field, one detection without timing fields
+            type: RecognitionResultTypeV1.TRANSCRIPTION,
+            audioUtteranceId: 'utt-1',
+            finalTranscript: 'one time',
+            finalTranscriptRaw: 'one time',
+            is_finished: true,
+            detections: [
+                { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.93 }
+            ]
+        };
+        const newState = mapTranscriptionResultToState(baseState, result, false);
+        expect(newState.detections).toHaveLength(1); // only the count and the first entry's query are checked here, not the whole entry
+        expect(newState.detections?.[0]?.query).toBe('one time');
+    });
+    it('should mirror SessionConfiguredV1 onto state and clear on reset', () => { // NOTE(review): exercises mapSessionConfiguredToState and resetRecognitionVGFState, yet sits in the 'mapTranscriptionResultToState detections' describe — consider moving it to its own describe
+        const sessionConfigured: SessionConfiguredV1 = {
+            type: RecognitionResultTypeV1.SESSION_CONFIGURED,
+            audioUtteranceId: 'utt-1',
+            provider: 'deepgram',
+            model: 'nova-3',
+            sampleRate: 16000,
+            encoding: 'linear16',
+            apiType: ASRApiType.STREAMING,
+            isFallback: false
+        };
+        const afterConfigured = mapSessionConfiguredToState(baseState, sessionConfigured);
+        expect(afterConfigured.sessionConfigured?.provider).toBe('deepgram'); // spot-checks three of the mirrored fields
+        expect(afterConfigured.sessionConfigured?.model).toBe('nova-3');
+        expect(afterConfigured.sessionConfigured?.isFallback).toBe(false);
+        const afterReset = resetRecognitionVGFState(afterConfigured);
+        expect(afterReset.sessionConfigured).toBeUndefined(); // reset must drop the mirrored session info
+    });
+    it('should copy accumulatedAudioTimeMs on pending and final transcripts and clear on reset', () => {
+        const pending: TranscriptionResultV1 = { // step 1: unfinished result with a watermark of 1234
+            type: RecognitionResultTypeV1.TRANSCRIPTION,
+            audioUtteranceId: 'utt-1',
+            finalTranscript: '',
+            finalTranscriptRaw: '',
+            pendingTranscript: 'hi',
+            is_finished: false,
+            accumulatedAudioTimeMs: 1234
+        };
+        const afterPending = mapTranscriptionResultToState(baseState, pending, true);
+        expect(afterPending.accumulatedAudioTimeMs).toBe(1234);
+        const final: TranscriptionResultV1 = { // step 2: finished result with a different watermark, 5678
+            type: RecognitionResultTypeV1.TRANSCRIPTION,
+            audioUtteranceId: 'utt-1',
+            finalTranscript: 'hi',
+            finalTranscriptRaw: 'hi',
+            is_finished: true,
+            accumulatedAudioTimeMs: 5678
+        };
+        const afterFinal = mapTranscriptionResultToState(afterPending, final, false); // applied on top of afterPending, so the earlier 1234 must be overwritten
+        expect(afterFinal.accumulatedAudioTimeMs).toBe(5678);
+        const afterReset = resetRecognitionVGFState(afterFinal); // step 3: reset must drop the watermark
+        expect(afterReset.accumulatedAudioTimeMs).toBeUndefined();
+    });
+    it('should leave existing detections untouched when the result omits them', () => {
+        const stateWithDetections: RecognitionState = { // baseState plus one pre-existing detection
+            ...baseState,
+            detections: [{ type: DetectionTypeV1.SEARCH, query: 'prev', score: 0.5 }]
+        };
+        const result: TranscriptionResultV1 = { // deliberately has no `detections` key at all
+            type: RecognitionResultTypeV1.TRANSCRIPTION,
+            audioUtteranceId: 'utt-1',
+            finalTranscript: '',
+            finalTranscriptRaw: '',
+            pendingTranscript: 'hello',
+            is_finished: false
+        };
+        const newState = mapTranscriptionResultToState(stateWithDetections, result, true);
+        expect(newState.detections).toEqual([ // the prior detection must still be present after mapping
+            { type: DetectionTypeV1.SEARCH, query: 'prev', score: 0.5 }
+        ]);
+    });
 });

package/src/vgf-recognition-mapper.ts CHANGED Viewed

@@ -19,7 +19,7 @@ import {
 } from './recognition-client.types.js';
 import {
     TranscriptionResultV1,
-    ErrorResultV1
+    SessionConfiguredV1
 } from '@recog/shared-types';
 /**
@@ -94,6 +94,9 @@ export function mapTranscriptionResultToState(
         if (result.lastNonSilence !== undefined) {
             newState.lastNonSilence = result.lastNonSilence;
         }
+        if (result.accumulatedAudioTimeMs !== undefined) {
+            newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
+        }
     } else {
         // Transcription is finished
         newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
@@ -110,21 +113,46 @@ export function mapTranscriptionResultToState(
         if (result.lastNonSilence !== undefined) {
             newState.lastNonSilence = result.lastNonSilence;
         }
+        if (result.accumulatedAudioTimeMs !== undefined) {
+            newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
+        }
         // Clear pending when we have final
         newState.pendingTranscript = "";
         newState.pendingConfidence = undefined;
     }
+    // Mirror provider-reported detections (currently Deepgram `search`; future entries
+    // may include keywords/keyterms/speech_contexts from other providers).
+    // Server pre-sorts by score descending, so [0] is the top hit — passed through as-is.
+    if (result.detections !== undefined) {
+        newState.detections = result.detections;
+    }
     return newState;
 }
+/**
+ * Mirrors the SessionConfiguredV1 message onto the VGF state.
+ * Carries the resolved provider/model/sampleRate/encoding/apiType/isFallback
+ * that the server actually chose (after circuit-breaker/fallback). Fires once
+ * per session, before audio streaming begins.
+ *
+ * The input state is not mutated: a shallow copy is returned whose
+ * `sessionConfigured` field is set to the given message. The message object
+ * itself is stored by reference, not cloned.
+ *
+ * @param currentState - State to copy; every other field is carried over unchanged.
+ * @param sessionConfigured - Message to store under `sessionConfigured`,
+ *   replacing any value already present on `currentState`.
+ * @returns A new state object with `sessionConfigured` set.
+ */
+export function mapSessionConfiguredToState(
+    currentState: RecognitionState,
+    sessionConfigured: SessionConfiguredV1
+): RecognitionState {
+    return {
+        ...currentState,
+        sessionConfigured
+    };
+}
 /**
  * Maps error to state
  */
 export function mapErrorToState(
-    currentState: RecognitionState,
-    error: ErrorResultV1
+    currentState: RecognitionState
 ): RecognitionState {
     return {
         ...currentState,
@@ -185,7 +213,10 @@ export function resetRecognitionVGFState(currentState: RecognitionState): Recogn
         recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
         finalTranscript: undefined,
         voiceEnd: undefined,
-        lastNonSilence: undefined
+        lastNonSilence: undefined,
+        accumulatedAudioTimeMs: undefined,
+        detections: undefined,
+        sessionConfigured: undefined
     };
 }
@@ -199,47 +230,6 @@ export function updateStateOnReady(currentState: RecognitionState): RecognitionS
     };
 }
-/**
- * Parses function call from transcript (STEP 3 support)
- * This is a placeholder - actual implementation would use NLP/LLM
- */
-export function extractFunctionCallFromTranscript(
-    transcript: string,
-    gameContext?: any
-): { metadata?: string; confidence?: number } | null {
-    // This would be replaced with actual function call extraction logic
-    // For example, using an LLM to parse intent from the transcript
-    // and map it to game actions
-    // Example stub implementation:
-    const lowerTranscript = transcript.toLowerCase();
-    // Simple pattern matching for demo
-    if (lowerTranscript.includes("play") && lowerTranscript.includes("artist")) {
-        return {
-            metadata: JSON.stringify({ action: "play", target: "artist" }),
-            confidence: 0.8
-        };
-    }
-    return null;
-}
-/**
- * Updates state with function call results (STEP 3)
- */
-export function updateStateWithFunctionCall(
-    currentState: RecognitionState,
-    functionCall: { metadata?: string; confidence?: number }
-): RecognitionState {
-    return {
-        ...currentState,
-        functionCallMetadata: functionCall.metadata,
-        functionCallConfidence: functionCall.confidence,
-        finalFunctionCallTimestamp: new Date().toISOString()
-    };
-}
 // Helper function to generate UUID (simplified version)
 function generateUUID(): string {
     return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {

package/src/vgf-recognition-state.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import { z } from "zod"
+import { DetectionV1Schema, SessionConfiguredSchemaV1 } from "@recog/shared-types"
 /**
  * VGF-style state schema for game-side recognition state/results management.
@@ -25,9 +26,13 @@ export const RecognitionVGFStateSchema = z.object({
     // Voice timing (ms from stream start, prefix-adjusted)
     voiceEnd: z.number().optional(), // voice end time identified by ASR
     lastNonSilence: z.number().optional(), // last non-silence sample time from PCM analysis
+    accumulatedAudioTimeMs: z.number().optional(), // total user audio time watermark (ms) — mirrors TranscriptionResultV1.accumulatedAudioTimeMs
     // Tracking-only metadata
-    asrConfig: z.string().optional(), // Json format of the ASR config
+    asrConfig: z.string().optional(), // Json format of the *requested* ASR config (set once at construction).
+    // For the *resolved* truth — actual provider/model/sampleRate/encoding/apiType/isFallback chosen by the
+    // server after circuit-breaker/fallback — see `sessionConfigured` below.
+    sessionConfigured: SessionConfiguredSchemaV1.optional(), // Mirrors the SessionConfiguredV1 message; populated when the server emits it (before audio streams).
     startRecordingTimestamp: z.string().optional(), // Start of recording. Immutable after set.
     finalRecordingTimestamp: z.string().optional(), // End of recording. Immutable after set. Transcription may still be in progress.
     finalTranscriptionTimestamp: z.string().optional(), // When the final transcript was produced. Immutable after set.
@@ -44,6 +49,19 @@ export const RecognitionVGFStateSchema = z.object({
     // Support for prompt slot mapping - passed to recognition context when present
     promptSlotMap: z.record(z.string(), z.array(z.string())).optional(), // Optional map of slot names to prompt values for recognition context
+    // Optional prompt inputs - when set, forwarded into GameContext at client creation.
+    // Mirror the GameContextV1 fields: STT (ASR keywords/keyterms), STF (speech->function), TTF (text->function).
+    promptSTT: z.string().optional(),
+    promptSTF: z.string().optional(),
+    promptTTF: z.string().optional(),
+    // Provider-reported phrase detections from the last transcript message.
+    // Mirrors TranscriptionResultV1.detections — a heterogeneous list keyed by DetectionTypeV1
+    // (today only 'search' from Deepgram; future entries may include keywords/keyterms/speech_contexts).
+    // Sorted by `score` descending by the server (see deepgram/message-handlers/v1/transform-transcript.ts
+    // and provider-to-recognition-transformer.ts), so [0] is the top hit — no client-side re-rank needed.
+    detections: z.array(DetectionV1Schema).optional(),
     // Recognition action processing state - managed externally, SDK preserves but never modifies
     recognitionActionProcessingState: z.string().optional(), // "NOT_STARTED", "IN_PROGRESS", "COMPLETED"
 })