@space3-npm/cybersoul-client 1.0.9 → 1.1.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/client.d.ts CHANGED
@@ -10,6 +10,23 @@ export declare class CyberSoulClient {
10
10
  private buildStateContextPrompt;
11
11
  private getImageSchemaParams;
12
12
  private getVoiceSchemaParams;
13
+ private buildVoiceSchemaFromDynamicParams;
14
+ /**
15
+ * Returns the JSON schema snippet for voiceArgs to embed in the LLM output schema.
16
+ * Built from dynamic_params when available, otherwise falls back to static defaults.
17
+ */
18
+ private getVoiceSchemaFromState;
19
+ /**
20
+ * Returns the natural-language director instruction for generating voiceArgs.
21
+ * Uses dynamic_param_prompt_template from the voice model when configured.
22
+ */
23
+ private getVoiceDirectorInstruction;
24
+ /**
25
+ * Extracts and types voiceArgs from a raw standalone LLM response.
26
+ * The voice-only prompt wraps the result as { voiceArgs: { ... } } — unwraps the inner object.
27
+ * If the payload is already the inner args object (no voiceArgs wrapper), uses it as-is.
28
+ */
29
+ private extractVoiceArgsFromLlmResponse;
13
30
  /**
14
31
  * Fetches the current dynamic context and daily state.
15
32
  */
package/dist/client.js CHANGED
@@ -74,7 +74,53 @@ EMOTIONAL INERTIA RULES:
74
74
  }`;
75
75
  }
76
76
  getVoiceSchemaParams() {
77
- return `"voiceArgs": { "style_instruction": "How the line should be spoken (Qwen3 format)", "emotion": "happy | sad | angry | fearful | disgusted | surprised | calm | fluent | whisper (Strictly choose ONE from this exact list.)" }`;
77
+ // Only reached when no dynamic_params are configured on the voice model.
78
+ // Configure dynamic_params in DB to match the TTS provider; this fallback is provider-agnostic.
79
+ console.warn("[CyberSoulClient] voice_model.dynamic_params not configured — using generic fallback schema. Configure dynamic_params in DB for provider-specific behaviour.");
80
+ return `"voiceArgs": { "style_instruction": "How the line should be spoken (required)" }`;
81
+ }
82
+ buildVoiceSchemaFromDynamicParams(dynamicParams) {
83
+ const fields = dynamicParams
84
+ .map((p) => {
85
+ const hint = p.required ? `${p.description} (required)` : `${p.description} (optional)`;
86
+ return `"${p.name}": "${hint}"`;
87
+ })
88
+ .join(", ");
89
+ return `"voiceArgs": { ${fields} }`;
90
+ }
91
+ /**
92
+ * Returns the JSON schema snippet for voiceArgs to embed in the LLM output schema.
93
+ * Built from dynamic_params when available, otherwise falls back to static defaults.
94
+ */
95
+ getVoiceSchemaFromState(state) {
96
+ const dynamicParams = state.voice_model?.dynamic_params;
97
+ if (dynamicParams && dynamicParams.length > 0) {
98
+ return this.buildVoiceSchemaFromDynamicParams(dynamicParams);
99
+ }
100
+ return this.getVoiceSchemaParams();
101
+ }
102
+ /**
103
+ * Returns the natural-language director instruction for generating voiceArgs.
104
+ * Uses dynamic_param_prompt_template from the voice model when configured.
105
+ */
106
+ getVoiceDirectorInstruction(state) {
107
+ const template = state.voice_model?.dynamic_param_prompt_template?.trim();
108
+ if (template) {
109
+ return template;
110
+ }
111
+ return "Analyze the text according to the character's relationship stage and emotional inertia to determine the best dynamic voice parameters for TTS.";
112
+ }
113
+ /**
114
+ * Extracts and types voiceArgs from a raw standalone LLM response.
115
+ * The voice-only prompt wraps the result as { voiceArgs: { ... } } — unwraps the inner object.
116
+ * If the payload is already the inner args object (no voiceArgs wrapper), uses it as-is.
117
+ */
118
+ extractVoiceArgsFromLlmResponse(payload) {
119
+ const inner = payload.voiceArgs;
120
+ if (inner && typeof inner === "object" && !Array.isArray(inner)) {
121
+ return inner;
122
+ }
123
+ return payload;
78
124
  }
79
125
  /**
80
126
  * Fetches the current dynamic context and daily state.
@@ -86,25 +132,14 @@ EMOTIONAL INERTIA RULES:
86
132
  * Updates the character's relationship temperature or mood.
87
133
  */
88
134
  async updateDynamicContext(stateUpdate) {
89
- if (!stateUpdate)
90
- return;
91
- // Map TS schema intent (temperatureDelta) to match Backend payload schema (temperature)
92
- const payload = { ...stateUpdate };
93
- if (payload.temperatureDelta !== undefined) {
94
- payload.temperature = payload.temperatureDelta;
95
- delete payload.temperatureDelta;
96
- }
97
- await this.apiFetch("/api/v1/cyber-soul/characters/dynamic-context", {
98
- method: "PATCH",
99
- body: JSON.stringify(payload),
100
- }).catch((e) => console.error("Failed to update dynamic context", e)); // non-blocking error handler
135
+ return this._updateDynamicContextInternal(stateUpdate);
101
136
  }
102
137
  /**
103
138
  * Manually generate an image of the character outside of chat flow.
104
139
  */
105
140
  async generateImage(params) {
106
141
  let imageParams = {};
107
- const state = await this.getState();
142
+ const state = await this.fetchRemoteState();
108
143
  const prompt = `${this.buildStateContextPrompt(state, params.interactParams?.localContext)}
109
144
 
110
145
  You are an AI image prompt director. Analyze the scene description according to the character's relationship stage and emotional inertia to determine the best image generation parameters.
@@ -139,12 +174,14 @@ Output strictly valid JSON ONLY. No markdown, no conversational filler. Return e
139
174
  */
140
175
  async generateVoice(params) {
141
176
  let dynamicArgs = {};
142
- const state = await this.getState();
177
+ const state = await this.fetchRemoteState();
143
178
  const prompt = `${this.buildStateContextPrompt(state, params.interactParams?.localContext)}
144
179
 
145
- You are a voice acting director. Analyze the text according to the character's relationship stage and emotional inertia to determine the single best emotion and a style instruction for TTS.
146
- Allowed emotions: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "calm", "fluent", "whisper".
147
- Output strictly valid JSON ONLY. No markdown, no conversational filler. Return exactly this format: {"emotion": "chosen_emotion", "style_instruction": "How the line should be spoken"}`;
180
+ You are a voice acting director. ${this.getVoiceDirectorInstruction(state)}
181
+ Output strictly valid JSON ONLY. No markdown, no conversational filler. Return exactly matching this schema:
182
+ {
183
+ ${this.getVoiceSchemaFromState(state)}
184
+ }`;
148
185
  const promptMessages = [
149
186
  { role: "system", content: prompt },
150
187
  ...(params.interactParams?.history || []),
@@ -156,10 +193,11 @@ Output strictly valid JSON ONLY. No markdown, no conversational filler. Return e
156
193
  const llmRes = await this.llm.generate(promptMessages, 800, 0.3);
157
194
  console.log("[CyberSoulClient VoiceGen] Raw LLM Response:", llmRes);
158
195
  try {
159
- dynamicArgs = robustJsonParse(llmRes, "generateVoice args fallback");
196
+ const parsedVoicePayload = robustJsonParse(llmRes, "generateVoice args fallback");
197
+ dynamicArgs = this.extractVoiceArgsFromLlmResponse(parsedVoicePayload);
160
198
  }
161
199
  catch (e) {
162
- dynamicArgs = {}; // fallback to empty
200
+ dynamicArgs = {};
163
201
  }
164
202
  const res = await this.generatePrimitive("voice", {
165
203
  text: params.text,
@@ -263,13 +301,14 @@ ${isAuto
263
301
  - If the user wants to hear you, or if appropriate for a voice message, include 'voiceArgs'.`
264
302
  : `Requested types to fulfill: ${types.join(", ")}`}
265
303
  If the user's message shifts the emotional mood, establishes new nicknames, or warrants a relationship temperature change, you MUST include a 'stateUpdate' block. Temperature goes from 0 (cold/angry) to 100 (obsessively in love).
304
+ Voice direction for voiceArgs: ${this.getVoiceDirectorInstruction(state)}
266
305
 
267
306
  Output JSON Schema:
268
307
  {
269
308
  "textResponse": "The direct spoken dialogue in Chinese",
270
309
  "stateUpdate": { "temperatureDelta": "+1 to -1", "userNickname": "What you now call the user", "agentNickname": "What the user calls you", "talkingStyle": "Current mood/style of talking" },
271
310
  ${this.getImageSchemaParams()},
272
- ${this.getVoiceSchemaParams()}
311
+ ${this.getVoiceSchemaFromState(state)}
273
312
  }
274
313
  Note: If "imageParams", "voiceArgs", or "stateUpdate" are not needed, set their values to null instead of omitting the keys completely (e.g., "imageParams": null). Output MUST be ONLY valid JSON with no markdown block wrappers. CRITICAL: Ensure your JSON has exactly one root object \`{\` and ends with exactly one \`}\` without any trailing garbage or extra brackets.`;
275
314
  const promptMessages = [
@@ -319,9 +358,12 @@ Note: If "imageParams", "voiceArgs", or "stateUpdate" are not needed, set their
319
358
  const shouldGenerateVoice = types.includes(InteractRequestType.VOICE) ||
320
359
  (isAuto && !!parsedIntent.voiceArgs);
321
360
  if (shouldGenerateVoice) {
361
+ const normalizedVoiceArgs = parsedIntent.voiceArgs && typeof parsedIntent.voiceArgs === "object"
362
+ ? parsedIntent.voiceArgs
363
+ : {};
322
364
  mediaTasks.push(this.generatePrimitive("voice", {
323
365
  text: parsedIntent.textResponse,
324
- dynamicArgs: parsedIntent.voiceArgs || {},
366
+ dynamicArgs: normalizedVoiceArgs,
325
367
  }).then((res) => {
326
368
  finalAudioUrl = res.audio_url;
327
369
  finalDurationSec = res.duration_sec;
package/dist/types.d.ts CHANGED
@@ -35,7 +35,7 @@ export interface InteractResponse {
35
35
  export interface DispatcherIntent {
36
36
  textResponse?: string;
37
37
  imageParams?: any;
38
- voiceArgs?: any;
38
+ voiceArgs?: VoiceArgs | null;
39
39
  stateUpdate?: {
40
40
  temperatureDelta?: string | number;
41
41
  userNickname?: string;
@@ -50,6 +50,32 @@ export interface CoreMemory {
50
50
  keyEvents: string[];
51
51
  appointments: string[];
52
52
  }
53
+ /**
54
+ * Generic dynamic voice args returned by the LLM and forwarded to backend TTS.
55
+ *
56
+ * - T lets callers/project code narrow this to model-specific fields when needed.
57
+ * - Defaults to fully dynamic key/value pairs for provider-agnostic SDK behavior.
58
+ */
59
+ export type VoiceArgs<T extends Record<string, unknown> = Record<string, unknown>> = T;
60
+ /**
61
+ * Optional compatibility shape for currently common fields.
62
+ * Not used as the SDK contract to avoid coupling to specific providers.
63
+ */
64
+ export interface CommonVoiceArgs {
65
+ style_instruction?: string;
66
+ emotion?: string;
67
+ }
68
+ export interface VoiceModelState {
69
+ tts_provider?: string;
70
+ dynamic_param_prompt_template?: string;
71
+ dynamic_params?: Array<{
72
+ name: string;
73
+ description: string;
74
+ type: string;
75
+ required: boolean;
76
+ default?: unknown;
77
+ }>;
78
+ }
53
79
  export interface CharacterState {
54
80
  current_time: string;
55
81
  active_event?: any;
@@ -57,6 +83,7 @@ export interface CharacterState {
57
83
  active_wardrobe?: any;
58
84
  core_memory?: CoreMemory;
59
85
  dynamic_context?: any;
86
+ voice_model?: VoiceModelState | null;
60
87
  relationship_stage?: string;
61
88
  name?: string;
62
89
  age?: number;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@space3-npm/cybersoul-client",
3
- "version": "1.0.9",
3
+ "version": "1.1.0",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.js",