npm - @space3-npm/cybersoul-client - Versions diffs - 1.0.9 → 1.1.0 - Mend

@space3-npm/cybersoul-client 1.0.9 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/client.d.ts CHANGED

@@ -10,6 +10,23 @@ export declare class CyberSoulClient {
     private buildStateContextPrompt;
     private getImageSchemaParams;
     private getVoiceSchemaParams;
+    private buildVoiceSchemaFromDynamicParams;
+    /**
+     * Returns the JSON schema snippet for voiceArgs to embed in the LLM output schema.
+     * Built from dynamic_params when available, otherwise falls back to static defaults.
+     */
+    private getVoiceSchemaFromState;
+    /**
+     * Returns the natural-language director instruction for generating voiceArgs.
+     * Uses dynamic_param_prompt_template from the voice model when configured.
+     */
+    private getVoiceDirectorInstruction;
+    /**
+     * Extracts and types voiceArgs from a raw standalone LLM response.
+     * The voice-only prompt wraps the result as { voiceArgs: { ... } } — unwraps the inner object.
+     * If the payload is already the inner args object (no voiceArgs wrapper), uses it as-is.
+     */
+    private extractVoiceArgsFromLlmResponse;
     /**
      * Fetches the current dynamic context and daily state.
      */

package/dist/client.js CHANGED

@@ -74,7 +74,53 @@ EMOTIONAL INERTIA RULES:
   }`;
     }
     getVoiceSchemaParams() {
-        return `"voiceArgs": { "style_instruction": "How the line should be spoken (Qwen3 format)", "emotion": "happy | sad | angry | fearful | disgusted | surprised | calm | fluent | whisper (Strictly choose ONE from this exact list.)" }`;
+        // Only reached when no dynamic_params are configured on the voice model.
+        // Configure dynamic_params in DB to match the TTS provider; this fallback is provider-agnostic.
+        console.warn("[CyberSoulClient] voice_model.dynamic_params not configured — using generic fallback schema. Configure dynamic_params in DB for provider-specific behaviour.");
+        return `"voiceArgs": { "style_instruction": "How the line should be spoken (required)" }`;
+    }
+    buildVoiceSchemaFromDynamicParams(dynamicParams) {
+        const fields = dynamicParams
+            .map((p) => {
+            const hint = p.required ? `${p.description} (required)` : `${p.description} (optional)`;
+            return `"${p.name}": "${hint}"`;
+        })
+            .join(", ");
+        return `"voiceArgs": { ${fields} }`;
+    }
+    /**
+     * Returns the JSON schema snippet for voiceArgs to embed in the LLM output schema.
+     * Built from dynamic_params when available, otherwise falls back to static defaults.
+     */
+    getVoiceSchemaFromState(state) {
+        const dynamicParams = state.voice_model?.dynamic_params;
+        if (dynamicParams && dynamicParams.length > 0) {
+            return this.buildVoiceSchemaFromDynamicParams(dynamicParams);
+        }
+        return this.getVoiceSchemaParams();
+    }
+    /**
+     * Returns the natural-language director instruction for generating voiceArgs.
+     * Uses dynamic_param_prompt_template from the voice model when configured.
+     */
+    getVoiceDirectorInstruction(state) {
+        const template = state.voice_model?.dynamic_param_prompt_template?.trim();
+        if (template) {
+            return template;
+        }
+        return "Analyze the text according to the character's relationship stage and emotional inertia to determine the best dynamic voice parameters for TTS.";
+    }
+    /**
+     * Extracts and types voiceArgs from a raw standalone LLM response.
+     * The voice-only prompt wraps the result as { voiceArgs: { ... } } — unwraps the inner object.
+     * If the payload is already the inner args object (no voiceArgs wrapper), uses it as-is.
+     */
+    extractVoiceArgsFromLlmResponse(payload) {
+        const inner = payload.voiceArgs;
+        if (inner && typeof inner === "object" && !Array.isArray(inner)) {
+            return inner;
+        }
+        return payload;
     }
     /**
      * Fetches the current dynamic context and daily state.
@@ -86,25 +132,14 @@ EMOTIONAL INERTIA RULES:
      * Updates the character's relationship temperature or mood.
      */
     async updateDynamicContext(stateUpdate) {
-        if (!stateUpdate)
-            return;
-        // Map TS schema intent (temperatureDelta) to match Backend payload schema (temperature)
-        const payload = { ...stateUpdate };
-        if (payload.temperatureDelta !== undefined) {
-            payload.temperature = payload.temperatureDelta;
-            delete payload.temperatureDelta;
-        }
-        await this.apiFetch("/api/v1/cyber-soul/characters/dynamic-context", {
-            method: "PATCH",
-            body: JSON.stringify(payload),
-        }).catch((e) => console.error("Failed to update dynamic context", e)); // non-blocking error handler
+        return this._updateDynamicContextInternal(stateUpdate);
     }
     /**
      * Manually generate an image of the character outside of chat flow.
      */
     async generateImage(params) {
         let imageParams = {};
-        const state = await this.getState();
+        const state = await this.fetchRemoteState();
         const prompt = `${this.buildStateContextPrompt(state, params.interactParams?.localContext)}
 You are an AI image prompt director. Analyze the scene description according to the character's relationship stage and emotional inertia to determine the best image generation parameters.
@@ -139,12 +174,14 @@ Output strictly valid JSON ONLY. No markdown, no conversational filler. Return e
      */
     async generateVoice(params) {
         let dynamicArgs = {};
-        const state = await this.getState();
+        const state = await this.fetchRemoteState();
         const prompt = `${this.buildStateContextPrompt(state, params.interactParams?.localContext)}
-You are a voice acting director. Analyze the text according to the character's relationship stage and emotional inertia to determine the single best emotion and a style instruction for TTS.
-Allowed emotions: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "calm", "fluent", "whisper".
-Output strictly valid JSON ONLY. No markdown, no conversational filler. Return exactly this format: {"emotion": "chosen_emotion", "style_instruction": "How the line should be spoken"}`;
+You are a voice acting director. ${this.getVoiceDirectorInstruction(state)}
+Output strictly valid JSON ONLY. No markdown, no conversational filler. Return exactly matching this schema:
+{
+  ${this.getVoiceSchemaFromState(state)}
+}`;
         const promptMessages = [
             { role: "system", content: prompt },
             ...(params.interactParams?.history || []),
@@ -156,10 +193,11 @@ Output strictly valid JSON ONLY. No markdown, no conversational filler. Return e
         const llmRes = await this.llm.generate(promptMessages, 800, 0.3);
         console.log("[CyberSoulClient VoiceGen] Raw LLM Response:", llmRes);
         try {
-            dynamicArgs = robustJsonParse(llmRes, "generateVoice args fallback");
+            const parsedVoicePayload = robustJsonParse(llmRes, "generateVoice args fallback");
+            dynamicArgs = this.extractVoiceArgsFromLlmResponse(parsedVoicePayload);
         }
         catch (e) {
-            dynamicArgs = {}; // fallback to empty
+            dynamicArgs = {};
         }
         const res = await this.generatePrimitive("voice", {
             text: params.text,
@@ -263,13 +301,14 @@ ${isAuto
   - If the user wants to hear you, or if appropriate for a voice message, include 'voiceArgs'.`
                 : `Requested types to fulfill: ${types.join(", ")}`}
 If the user's message shifts the emotional mood, establishes new nicknames, or warrants a relationship temperature change, you MUST include a 'stateUpdate' block. Temperature goes from 0 (cold/angry) to 100 (obsessively in love).
+Voice direction for voiceArgs: ${this.getVoiceDirectorInstruction(state)}
 Output JSON Schema:
 {
   "textResponse": "The direct spoken dialogue in Chinese",
   "stateUpdate": { "temperatureDelta": "+1 to -1", "userNickname": "What you now call the user", "agentNickname": "What the user calls you", "talkingStyle": "Current mood/style of talking" },
   ${this.getImageSchemaParams()},
-  ${this.getVoiceSchemaParams()}
+  ${this.getVoiceSchemaFromState(state)}
 }
 Note: If "imageParams", "voiceArgs", or "stateUpdate" are not needed, set their values to null instead of omitting the keys completely (e.g., "imageParams": null). Output MUST be ONLY valid JSON with no markdown block wrappers. CRITICAL: Ensure your JSON has exactly one root object \`{\` and ends with exactly one \`}\` without any trailing garbage or extra brackets.`;
             const promptMessages = [
@@ -319,9 +358,12 @@ Note: If "imageParams", "voiceArgs", or "stateUpdate" are not needed, set their
             const shouldGenerateVoice = types.includes(InteractRequestType.VOICE) ||
                 (isAuto && !!parsedIntent.voiceArgs);
             if (shouldGenerateVoice) {
+                const normalizedVoiceArgs = parsedIntent.voiceArgs && typeof parsedIntent.voiceArgs === "object"
+                    ? parsedIntent.voiceArgs
+                    : {};
                 mediaTasks.push(this.generatePrimitive("voice", {
                     text: parsedIntent.textResponse,
-                    dynamicArgs: parsedIntent.voiceArgs || {},
+                    dynamicArgs: normalizedVoiceArgs,
                 }).then((res) => {
                     finalAudioUrl = res.audio_url;
                     finalDurationSec = res.duration_sec;

package/dist/types.d.ts CHANGED

@@ -35,7 +35,7 @@ export interface InteractResponse {
 export interface DispatcherIntent {
     textResponse?: string;
     imageParams?: any;
-    voiceArgs?: any;
+    voiceArgs?: VoiceArgs | null;
     stateUpdate?: {
         temperatureDelta?: string | number;
         userNickname?: string;
@@ -50,6 +50,32 @@ export interface CoreMemory {
     keyEvents: string[];
     appointments: string[];
 }
+/**
+ * Generic dynamic voice args returned by the LLM and forwarded to backend TTS.
+ *
+ * - T lets callers/project code narrow this to model-specific fields when needed.
+ * - Defaults to fully dynamic key/value pairs for provider-agnostic SDK behavior.
+ */
+export type VoiceArgs<T extends Record<string, unknown> = Record<string, unknown>> = T;
+/**
+ * Optional compatibility shape for currently common fields.
+ * Not used as the SDK contract to avoid coupling to specific providers.
+ */
+export interface CommonVoiceArgs {
+    style_instruction?: string;
+    emotion?: string;
+}
+export interface VoiceModelState {
+    tts_provider?: string;
+    dynamic_param_prompt_template?: string;
+    dynamic_params?: Array<{
+        name: string;
+        description: string;
+        type: string;
+        required: boolean;
+        default?: unknown;
+    }>;
+}
 export interface CharacterState {
     current_time: string;
     active_event?: any;
@@ -57,6 +83,7 @@ export interface CharacterState {
     active_wardrobe?: any;
     core_memory?: CoreMemory;
     dynamic_context?: any;
+    voice_model?: VoiceModelState | null;
     relationship_stage?: string;
     name?: string;
     age?: number;

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@space3-npm/cybersoul-client",
-  "version": "1.0.9",
+  "version": "1.1.0",
   "type": "module",
   "main": "dist/index.js",
   "module": "dist/index.js",