hume 0.13.6 → 0.13.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (244)
  1. package/.mock/definition/empathic-voice/__package__.yml +669 -657
  2. package/.mock/definition/empathic-voice/chat.yml +27 -27
  3. package/.mock/definition/empathic-voice/chatWebhooks.yml +2 -2
  4. package/.mock/definition/tts/__package__.yml +93 -88
  5. package/api/resources/empathicVoice/types/AssistantEnd.d.ts +2 -2
  6. package/api/resources/empathicVoice/types/AssistantInput.d.ts +2 -2
  7. package/api/resources/empathicVoice/types/AssistantMessage.d.ts +8 -8
  8. package/api/resources/empathicVoice/types/AssistantProsody.d.ts +6 -6
  9. package/api/resources/empathicVoice/types/AudioConfiguration.d.ts +2 -2
  10. package/api/resources/empathicVoice/types/AudioInput.d.ts +6 -6
  11. package/api/resources/empathicVoice/types/AudioOutput.d.ts +4 -4
  12. package/api/resources/empathicVoice/types/BuiltinToolConfig.d.ts +1 -1
  13. package/api/resources/empathicVoice/types/ChatMessage.d.ts +2 -2
  14. package/api/resources/empathicVoice/types/ChatMetadata.d.ts +8 -8
  15. package/api/resources/empathicVoice/types/Context.d.ts +6 -6
  16. package/api/resources/empathicVoice/types/LanguageModelType.d.ts +7 -1
  17. package/api/resources/empathicVoice/types/LanguageModelType.js +6 -0
  18. package/api/resources/empathicVoice/types/PauseAssistantMessage.d.ts +2 -2
  19. package/api/resources/empathicVoice/types/ResumeAssistantMessage.d.ts +2 -2
  20. package/api/resources/empathicVoice/types/ReturnConfig.d.ts +2 -2
  21. package/api/resources/empathicVoice/types/SessionSettings.d.ts +27 -27
  22. package/api/resources/empathicVoice/types/Tool.d.ts +6 -6
  23. package/api/resources/empathicVoice/types/ToolCallMessage.d.ts +6 -6
  24. package/api/resources/empathicVoice/types/ToolErrorMessage.d.ts +16 -16
  25. package/api/resources/empathicVoice/types/ToolResponseMessage.d.ts +8 -8
  26. package/api/resources/empathicVoice/types/UserInput.d.ts +2 -2
  27. package/api/resources/empathicVoice/types/UserInterruption.d.ts +4 -4
  28. package/api/resources/empathicVoice/types/UserMessage.d.ts +12 -12
  29. package/api/resources/empathicVoice/types/WebSocketError.d.ts +10 -10
  30. package/api/resources/empathicVoice/types/WebhookEventChatEnded.d.ts +8 -8
  31. package/api/resources/empathicVoice/types/WebhookEventChatStarted.d.ts +6 -6
  32. package/api/resources/empathicVoice/types/index.d.ts +16 -16
  33. package/api/resources/empathicVoice/types/index.js +16 -16
  34. package/api/resources/index.d.ts +1 -1
  35. package/api/resources/index.js +2 -2
  36. package/api/resources/tts/client/Client.d.ts +6 -6
  37. package/api/resources/tts/client/Client.js +35 -35
  38. package/api/resources/tts/types/OctaveVersion.d.ts +4 -0
  39. package/api/resources/tts/types/OctaveVersion.js +5 -0
  40. package/api/resources/tts/types/PostedTts.d.ts +9 -8
  41. package/api/resources/tts/types/PostedUtterance.d.ts +6 -6
  42. package/api/resources/tts/types/ReturnGeneration.d.ts +5 -5
  43. package/api/resources/tts/types/ReturnTts.d.ts +1 -1
  44. package/api/resources/tts/types/Snippet.d.ts +6 -6
  45. package/api/resources/tts/types/SnippetAudioChunk.d.ts +12 -11
  46. package/api/resources/tts/types/index.d.ts +1 -0
  47. package/api/resources/tts/types/index.js +1 -0
  48. package/dist/api/resources/empathicVoice/types/AssistantEnd.d.ts +2 -2
  49. package/dist/api/resources/empathicVoice/types/AssistantInput.d.ts +2 -2
  50. package/dist/api/resources/empathicVoice/types/AssistantMessage.d.ts +8 -8
  51. package/dist/api/resources/empathicVoice/types/AssistantProsody.d.ts +6 -6
  52. package/dist/api/resources/empathicVoice/types/AudioConfiguration.d.ts +2 -2
  53. package/dist/api/resources/empathicVoice/types/AudioInput.d.ts +6 -6
  54. package/dist/api/resources/empathicVoice/types/AudioOutput.d.ts +4 -4
  55. package/dist/api/resources/empathicVoice/types/BuiltinToolConfig.d.ts +1 -1
  56. package/dist/api/resources/empathicVoice/types/ChatMessage.d.ts +2 -2
  57. package/dist/api/resources/empathicVoice/types/ChatMetadata.d.ts +8 -8
  58. package/dist/api/resources/empathicVoice/types/Context.d.ts +6 -6
  59. package/dist/api/resources/empathicVoice/types/LanguageModelType.d.ts +7 -1
  60. package/dist/api/resources/empathicVoice/types/LanguageModelType.js +6 -0
  61. package/dist/api/resources/empathicVoice/types/PauseAssistantMessage.d.ts +2 -2
  62. package/dist/api/resources/empathicVoice/types/ResumeAssistantMessage.d.ts +2 -2
  63. package/dist/api/resources/empathicVoice/types/ReturnConfig.d.ts +2 -2
  64. package/dist/api/resources/empathicVoice/types/SessionSettings.d.ts +27 -27
  65. package/dist/api/resources/empathicVoice/types/Tool.d.ts +6 -6
  66. package/dist/api/resources/empathicVoice/types/ToolCallMessage.d.ts +6 -6
  67. package/dist/api/resources/empathicVoice/types/ToolErrorMessage.d.ts +16 -16
  68. package/dist/api/resources/empathicVoice/types/ToolResponseMessage.d.ts +8 -8
  69. package/dist/api/resources/empathicVoice/types/UserInput.d.ts +2 -2
  70. package/dist/api/resources/empathicVoice/types/UserInterruption.d.ts +4 -4
  71. package/dist/api/resources/empathicVoice/types/UserMessage.d.ts +12 -12
  72. package/dist/api/resources/empathicVoice/types/WebSocketError.d.ts +10 -10
  73. package/dist/api/resources/empathicVoice/types/WebhookEventChatEnded.d.ts +8 -8
  74. package/dist/api/resources/empathicVoice/types/WebhookEventChatStarted.d.ts +6 -6
  75. package/dist/api/resources/empathicVoice/types/index.d.ts +16 -16
  76. package/dist/api/resources/empathicVoice/types/index.js +16 -16
  77. package/dist/api/resources/index.d.ts +1 -1
  78. package/dist/api/resources/index.js +2 -2
  79. package/dist/api/resources/tts/client/Client.d.ts +6 -6
  80. package/dist/api/resources/tts/client/Client.js +35 -35
  81. package/dist/api/resources/tts/types/OctaveVersion.d.ts +4 -0
  82. package/dist/api/resources/tts/types/OctaveVersion.js +5 -0
  83. package/dist/api/resources/tts/types/PostedTts.d.ts +9 -8
  84. package/dist/api/resources/tts/types/PostedUtterance.d.ts +6 -6
  85. package/dist/api/resources/tts/types/ReturnGeneration.d.ts +5 -5
  86. package/dist/api/resources/tts/types/ReturnTts.d.ts +1 -1
  87. package/dist/api/resources/tts/types/Snippet.d.ts +6 -6
  88. package/dist/api/resources/tts/types/SnippetAudioChunk.d.ts +12 -11
  89. package/dist/api/resources/tts/types/index.d.ts +1 -0
  90. package/dist/api/resources/tts/types/index.js +1 -0
  91. package/dist/serialization/resources/empathicVoice/types/AssistantEnd.d.ts +1 -1
  92. package/dist/serialization/resources/empathicVoice/types/AssistantEnd.js +1 -1
  93. package/dist/serialization/resources/empathicVoice/types/AssistantInput.d.ts +1 -1
  94. package/dist/serialization/resources/empathicVoice/types/AssistantInput.js +1 -1
  95. package/dist/serialization/resources/empathicVoice/types/AssistantMessage.d.ts +2 -2
  96. package/dist/serialization/resources/empathicVoice/types/AssistantMessage.js +2 -2
  97. package/dist/serialization/resources/empathicVoice/types/AssistantProsody.d.ts +2 -2
  98. package/dist/serialization/resources/empathicVoice/types/AssistantProsody.js +2 -2
  99. package/dist/serialization/resources/empathicVoice/types/AudioConfiguration.d.ts +1 -1
  100. package/dist/serialization/resources/empathicVoice/types/AudioConfiguration.js +1 -1
  101. package/dist/serialization/resources/empathicVoice/types/AudioInput.d.ts +1 -1
  102. package/dist/serialization/resources/empathicVoice/types/AudioInput.js +1 -1
  103. package/dist/serialization/resources/empathicVoice/types/AudioOutput.d.ts +2 -2
  104. package/dist/serialization/resources/empathicVoice/types/AudioOutput.js +2 -2
  105. package/dist/serialization/resources/empathicVoice/types/BuiltinToolConfig.d.ts +1 -1
  106. package/dist/serialization/resources/empathicVoice/types/BuiltinToolConfig.js +1 -1
  107. package/dist/serialization/resources/empathicVoice/types/ChatMessage.d.ts +1 -1
  108. package/dist/serialization/resources/empathicVoice/types/ChatMessage.js +1 -1
  109. package/dist/serialization/resources/empathicVoice/types/ChatMetadata.d.ts +2 -2
  110. package/dist/serialization/resources/empathicVoice/types/ChatMetadata.js +2 -2
  111. package/dist/serialization/resources/empathicVoice/types/Context.d.ts +1 -1
  112. package/dist/serialization/resources/empathicVoice/types/Context.js +1 -1
  113. package/dist/serialization/resources/empathicVoice/types/LanguageModelType.d.ts +1 -1
  114. package/dist/serialization/resources/empathicVoice/types/LanguageModelType.js +6 -0
  115. package/dist/serialization/resources/empathicVoice/types/PauseAssistantMessage.d.ts +1 -1
  116. package/dist/serialization/resources/empathicVoice/types/PauseAssistantMessage.js +1 -1
  117. package/dist/serialization/resources/empathicVoice/types/ResumeAssistantMessage.d.ts +1 -1
  118. package/dist/serialization/resources/empathicVoice/types/ResumeAssistantMessage.js +1 -1
  119. package/dist/serialization/resources/empathicVoice/types/ReturnConfig.d.ts +1 -1
  120. package/dist/serialization/resources/empathicVoice/types/ReturnConfig.js +1 -1
  121. package/dist/serialization/resources/empathicVoice/types/SessionSettings.d.ts +8 -8
  122. package/dist/serialization/resources/empathicVoice/types/SessionSettings.js +8 -8
  123. package/dist/serialization/resources/empathicVoice/types/Tool.d.ts +3 -3
  124. package/dist/serialization/resources/empathicVoice/types/Tool.js +3 -3
  125. package/dist/serialization/resources/empathicVoice/types/ToolCallMessage.d.ts +3 -3
  126. package/dist/serialization/resources/empathicVoice/types/ToolCallMessage.js +3 -3
  127. package/dist/serialization/resources/empathicVoice/types/ToolErrorMessage.d.ts +6 -6
  128. package/dist/serialization/resources/empathicVoice/types/ToolErrorMessage.js +6 -6
  129. package/dist/serialization/resources/empathicVoice/types/ToolResponseMessage.d.ts +2 -2
  130. package/dist/serialization/resources/empathicVoice/types/ToolResponseMessage.js +2 -2
  131. package/dist/serialization/resources/empathicVoice/types/UserInput.d.ts +1 -1
  132. package/dist/serialization/resources/empathicVoice/types/UserInput.js +1 -1
  133. package/dist/serialization/resources/empathicVoice/types/UserInterruption.d.ts +1 -1
  134. package/dist/serialization/resources/empathicVoice/types/UserInterruption.js +1 -1
  135. package/dist/serialization/resources/empathicVoice/types/UserMessage.d.ts +3 -3
  136. package/dist/serialization/resources/empathicVoice/types/UserMessage.js +3 -3
  137. package/dist/serialization/resources/empathicVoice/types/WebSocketError.d.ts +3 -3
  138. package/dist/serialization/resources/empathicVoice/types/WebSocketError.js +3 -3
  139. package/dist/serialization/resources/empathicVoice/types/WebhookEventChatEnded.d.ts +4 -4
  140. package/dist/serialization/resources/empathicVoice/types/WebhookEventChatEnded.js +4 -4
  141. package/dist/serialization/resources/empathicVoice/types/WebhookEventChatStarted.d.ts +3 -3
  142. package/dist/serialization/resources/empathicVoice/types/WebhookEventChatStarted.js +3 -3
  143. package/dist/serialization/resources/empathicVoice/types/index.d.ts +16 -16
  144. package/dist/serialization/resources/empathicVoice/types/index.js +16 -16
  145. package/dist/serialization/resources/index.d.ts +1 -1
  146. package/dist/serialization/resources/index.js +2 -2
  147. package/dist/serialization/resources/tts/types/OctaveVersion.d.ts +10 -0
  148. package/dist/serialization/resources/tts/types/OctaveVersion.js +41 -0
  149. package/dist/serialization/resources/tts/types/PostedTts.d.ts +5 -3
  150. package/dist/serialization/resources/tts/types/PostedTts.js +5 -3
  151. package/dist/serialization/resources/tts/types/PostedUtterance.d.ts +2 -2
  152. package/dist/serialization/resources/tts/types/PostedUtterance.js +2 -2
  153. package/dist/serialization/resources/tts/types/ReturnGeneration.d.ts +3 -3
  154. package/dist/serialization/resources/tts/types/ReturnGeneration.js +3 -3
  155. package/dist/serialization/resources/tts/types/ReturnTts.d.ts +1 -1
  156. package/dist/serialization/resources/tts/types/ReturnTts.js +1 -1
  157. package/dist/serialization/resources/tts/types/Snippet.d.ts +3 -3
  158. package/dist/serialization/resources/tts/types/Snippet.js +3 -3
  159. package/dist/serialization/resources/tts/types/SnippetAudioChunk.d.ts +7 -6
  160. package/dist/serialization/resources/tts/types/SnippetAudioChunk.js +7 -6
  161. package/dist/serialization/resources/tts/types/index.d.ts +1 -0
  162. package/dist/serialization/resources/tts/types/index.js +1 -0
  163. package/dist/version.d.ts +1 -1
  164. package/dist/version.js +1 -1
  165. package/dist/wrapper/index.d.ts +1 -1
  166. package/dist/wrapper/index.js +53 -3
  167. package/package.json +1 -1
  168. package/reference.md +706 -706
  169. package/serialization/resources/empathicVoice/types/AssistantEnd.d.ts +1 -1
  170. package/serialization/resources/empathicVoice/types/AssistantEnd.js +1 -1
  171. package/serialization/resources/empathicVoice/types/AssistantInput.d.ts +1 -1
  172. package/serialization/resources/empathicVoice/types/AssistantInput.js +1 -1
  173. package/serialization/resources/empathicVoice/types/AssistantMessage.d.ts +2 -2
  174. package/serialization/resources/empathicVoice/types/AssistantMessage.js +2 -2
  175. package/serialization/resources/empathicVoice/types/AssistantProsody.d.ts +2 -2
  176. package/serialization/resources/empathicVoice/types/AssistantProsody.js +2 -2
  177. package/serialization/resources/empathicVoice/types/AudioConfiguration.d.ts +1 -1
  178. package/serialization/resources/empathicVoice/types/AudioConfiguration.js +1 -1
  179. package/serialization/resources/empathicVoice/types/AudioInput.d.ts +1 -1
  180. package/serialization/resources/empathicVoice/types/AudioInput.js +1 -1
  181. package/serialization/resources/empathicVoice/types/AudioOutput.d.ts +2 -2
  182. package/serialization/resources/empathicVoice/types/AudioOutput.js +2 -2
  183. package/serialization/resources/empathicVoice/types/BuiltinToolConfig.d.ts +1 -1
  184. package/serialization/resources/empathicVoice/types/BuiltinToolConfig.js +1 -1
  185. package/serialization/resources/empathicVoice/types/ChatMessage.d.ts +1 -1
  186. package/serialization/resources/empathicVoice/types/ChatMessage.js +1 -1
  187. package/serialization/resources/empathicVoice/types/ChatMetadata.d.ts +2 -2
  188. package/serialization/resources/empathicVoice/types/ChatMetadata.js +2 -2
  189. package/serialization/resources/empathicVoice/types/Context.d.ts +1 -1
  190. package/serialization/resources/empathicVoice/types/Context.js +1 -1
  191. package/serialization/resources/empathicVoice/types/LanguageModelType.d.ts +1 -1
  192. package/serialization/resources/empathicVoice/types/LanguageModelType.js +6 -0
  193. package/serialization/resources/empathicVoice/types/PauseAssistantMessage.d.ts +1 -1
  194. package/serialization/resources/empathicVoice/types/PauseAssistantMessage.js +1 -1
  195. package/serialization/resources/empathicVoice/types/ResumeAssistantMessage.d.ts +1 -1
  196. package/serialization/resources/empathicVoice/types/ResumeAssistantMessage.js +1 -1
  197. package/serialization/resources/empathicVoice/types/ReturnConfig.d.ts +1 -1
  198. package/serialization/resources/empathicVoice/types/ReturnConfig.js +1 -1
  199. package/serialization/resources/empathicVoice/types/SessionSettings.d.ts +8 -8
  200. package/serialization/resources/empathicVoice/types/SessionSettings.js +8 -8
  201. package/serialization/resources/empathicVoice/types/Tool.d.ts +3 -3
  202. package/serialization/resources/empathicVoice/types/Tool.js +3 -3
  203. package/serialization/resources/empathicVoice/types/ToolCallMessage.d.ts +3 -3
  204. package/serialization/resources/empathicVoice/types/ToolCallMessage.js +3 -3
  205. package/serialization/resources/empathicVoice/types/ToolErrorMessage.d.ts +6 -6
  206. package/serialization/resources/empathicVoice/types/ToolErrorMessage.js +6 -6
  207. package/serialization/resources/empathicVoice/types/ToolResponseMessage.d.ts +2 -2
  208. package/serialization/resources/empathicVoice/types/ToolResponseMessage.js +2 -2
  209. package/serialization/resources/empathicVoice/types/UserInput.d.ts +1 -1
  210. package/serialization/resources/empathicVoice/types/UserInput.js +1 -1
  211. package/serialization/resources/empathicVoice/types/UserInterruption.d.ts +1 -1
  212. package/serialization/resources/empathicVoice/types/UserInterruption.js +1 -1
  213. package/serialization/resources/empathicVoice/types/UserMessage.d.ts +3 -3
  214. package/serialization/resources/empathicVoice/types/UserMessage.js +3 -3
  215. package/serialization/resources/empathicVoice/types/WebSocketError.d.ts +3 -3
  216. package/serialization/resources/empathicVoice/types/WebSocketError.js +3 -3
  217. package/serialization/resources/empathicVoice/types/WebhookEventChatEnded.d.ts +4 -4
  218. package/serialization/resources/empathicVoice/types/WebhookEventChatEnded.js +4 -4
  219. package/serialization/resources/empathicVoice/types/WebhookEventChatStarted.d.ts +3 -3
  220. package/serialization/resources/empathicVoice/types/WebhookEventChatStarted.js +3 -3
  221. package/serialization/resources/empathicVoice/types/index.d.ts +16 -16
  222. package/serialization/resources/empathicVoice/types/index.js +16 -16
  223. package/serialization/resources/index.d.ts +1 -1
  224. package/serialization/resources/index.js +2 -2
  225. package/serialization/resources/tts/types/OctaveVersion.d.ts +10 -0
  226. package/serialization/resources/tts/types/OctaveVersion.js +41 -0
  227. package/serialization/resources/tts/types/PostedTts.d.ts +5 -3
  228. package/serialization/resources/tts/types/PostedTts.js +5 -3
  229. package/serialization/resources/tts/types/PostedUtterance.d.ts +2 -2
  230. package/serialization/resources/tts/types/PostedUtterance.js +2 -2
  231. package/serialization/resources/tts/types/ReturnGeneration.d.ts +3 -3
  232. package/serialization/resources/tts/types/ReturnGeneration.js +3 -3
  233. package/serialization/resources/tts/types/ReturnTts.d.ts +1 -1
  234. package/serialization/resources/tts/types/ReturnTts.js +1 -1
  235. package/serialization/resources/tts/types/Snippet.d.ts +3 -3
  236. package/serialization/resources/tts/types/Snippet.js +3 -3
  237. package/serialization/resources/tts/types/SnippetAudioChunk.d.ts +7 -6
  238. package/serialization/resources/tts/types/SnippetAudioChunk.js +7 -6
  239. package/serialization/resources/tts/types/index.d.ts +1 -0
  240. package/serialization/resources/tts/types/index.js +1 -0
  241. package/version.d.ts +1 -1
  242. package/version.js +1 -1
  243. package/wrapper/index.d.ts +1 -1
  244. package/wrapper/index.js +53 -3
package/.mock/definition/empathic-voice/chat.yml
@@ -4,6 +4,21 @@ channel:
  auth: false
  docs: Chat with Empathic Voice Interface (EVI)
  query-parameters:
+ access_token:
+ type: optional<string>
+ default: ''
+ docs: >-
+ Access token used for authenticating the client. If not provided, an
+ `api_key` must be provided to authenticate.
+
+
+ The access token is generated using both an API key and a Secret key,
+ which provides an additional layer of security compared to using just an
+ API key.
+
+
+ For more details, refer to the [Authentication Strategies
+ Guide](/docs/introduction/api-key#authentication-strategies).
  config_id:
  type: optional<string>
  docs: >-
@@ -32,6 +47,13 @@ channel:

  Include this parameter to apply a specific version of an EVI
  configuration. If omitted, the latest version will be applied.
+ event_limit:
+ type: optional<integer>
+ docs: >-
+ The maximum number of chat events to return from chat history. By
+ default, the system returns up to 300 events (100 events per page × 3
+ pages). Set this parameter to a smaller value to limit the number of
+ events returned.
  resumed_chat_group_id:
  type: optional<string>
  docs: >-
@@ -76,12 +98,6 @@ channel:
  Use the GET `/v0/evi/chat_groups` endpoint to obtain the Chat Group IDs
  of all Chat Groups associated with an API key. This endpoint returns a
  list of all available chat groups.
- voice_id:
- type: optional<string>
- docs: >-
- The name or ID of the voice from the `Voice Library` to be used as the
- speaker for this EVI session. This will override the speaker set in the
- selected configuration.
  verbose_transcription:
  type: optional<boolean>
  default: false
@@ -93,28 +109,12 @@ channel:
  field on a
  [UserMessage](/reference/speech-to-speech-evi/chat#receive.UserMessage)
  denotes whether the message is "interim" or "final."
- event_limit:
- type: optional<integer>
- docs: >-
- The maximum number of chat events to return from chat history. By
- default, the system returns up to 300 events (100 events per page × 3
- pages). Set this parameter to a smaller value to limit the number of
- events returned.
- access_token:
+ voice_id:
  type: optional<string>
- default: ''
  docs: >-
- Access token used for authenticating the client. If not provided, an
- `api_key` must be provided to authenticate.
-
-
- The access token is generated using both an API key and a Secret key,
- which provides an additional layer of security compared to using just an
- API key.
-
-
- For more details, refer to the [Authentication Strategies
- Guide](/docs/introduction/api-key#authentication-strategies).
+ The name or ID of the voice from the `Voice Library` to be used as the
+ speaker for this EVI session. This will override the speaker set in the
+ selected configuration.
  api_key:
  type: optional<string>
  default: ''
@@ -136,8 +136,8 @@ channel:
  - messages:
  - type: publish
  body:
- type: audio_input
  data: data
+ type: audio_input
  - type: subscribe
  body:
  type: assistant_end
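
The new `access_token` parameter above documents EVI's token-based authentication strategy. A minimal TypeScript sketch of connecting that way — `HumeClient`, `fetchAccessToken`, and `chat.connect` ship with this package, but the exact option names here are camelCase assumptions based on the query parameters in this diff, not verified against 0.13.8:

```typescript
import { HumeClient, fetchAccessToken } from "hume";

async function openEviChat() {
  // Per the docs string above, the token is generated from an API key
  // plus a Secret key (see the Authentication Strategies guide).
  const accessToken = await fetchAccessToken({
    apiKey: process.env.HUME_API_KEY!,
    secretKey: process.env.HUME_SECRET_KEY!,
  });

  const client = new HumeClient({ accessToken });

  // configId and verboseTranscription mirror the config_id and
  // verbose_transcription query parameters documented above.
  const socket = client.empathicVoice.chat.connect({
    configId: "<your-evi-config-id>",
    verboseTranscription: true,
  });

  socket.on("open", () => console.log("EVI chat connected"));
}
```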
package/.mock/definition/empathic-voice/chatWebhooks.yml
@@ -11,9 +11,9 @@ webhooks:
  - payload:
  chat_group_id: chat_group_id
  chat_id: chat_id
- end_time: 1
  duration_seconds: 1
  end_reason: ACTIVE
+ end_time: 1
  docs: Sent when an EVI chat ends.
  chatStarted:
  audiences: []
@@ -25,6 +25,6 @@ webhooks:
  - payload:
  chat_group_id: chat_group_id
  chat_id: chat_id
- start_time: 1
  chat_start_type: new_chat_group
+ start_time: 1
  docs: Sent when an EVI chat is started.
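
Both webhook payloads above only had their fields reordered, so handlers keyed on field names are unaffected. For reference, a small sketch of telling the two events apart by those fields (the Express wiring and route are illustrative; only the payload keys come from this diff):

```typescript
import express from "express";

const app = express();
app.use(express.json());

app.post("/evi-webhooks", (req, res) => {
  const event = req.body;
  if ("end_reason" in event) {
    // chat_ended payload: chat_group_id, chat_id, duration_seconds,
    // end_reason, end_time
    console.log(`Chat ${event.chat_id} ended (${event.end_reason}) after ${event.duration_seconds}s`);
  } else {
    // chat_started payload: chat_group_id, chat_id, chat_start_type, start_time
    console.log(`Chat ${event.chat_id} started (${event.chat_start_type})`);
  }
  res.sendStatus(200);
});

app.listen(3000);
```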
package/.mock/definition/tts/__package__.yml
@@ -120,30 +120,26 @@ service:
  Middle-aged masculine voice with a clear, rhythmic Scots lilt,
  rounded vowels, and a warm, steady tone with an articulate,
  academic quality.
- synthesize-json-streaming:
- path: /v0/tts/stream/json
+ synthesize-file-streaming:
+ path: /v0/tts/stream/file
  method: POST
  auth: true
  docs: >-
  Streams synthesized speech using the specified voice. If no voice is
  provided, a novel voice will be generated dynamically. Optionally,
  additional context can be included to influence the speech's style and
- prosody.
-
-
- The response is a stream of JSON objects including audio encoded in
- base64.
+ prosody.
  source:
  openapi: tts-openapi.json
- display-name: Text-to-speech (Streamed JSON)
+ display-name: Text-to-speech (Streamed File)
  request:
  body:
  type: PostedTts
  content-type: application/json
- response-stream:
- docs: Successful Response
- type: SnippetAudioChunk
- format: json
+ response:
+ docs: OK
+ type: file
+ status-code: 200
  errors:
  - UnprocessableEntityError
  examples:
@@ -155,26 +151,30 @@ service:
  voice:
  name: Male English Actor
  provider: HUME_AI
- synthesize-file-streaming:
- path: /v0/tts/stream/file
+ synthesize-json-streaming:
+ path: /v0/tts/stream/json
  method: POST
  auth: true
  docs: >-
  Streams synthesized speech using the specified voice. If no voice is
  provided, a novel voice will be generated dynamically. Optionally,
  additional context can be included to influence the speech's style and
- prosody.
+ prosody.
+
+
+ The response is a stream of JSON objects including audio encoded in
+ base64.
  source:
  openapi: tts-openapi.json
- display-name: Text-to-speech (Streamed File)
+ display-name: Text-to-speech (Streamed JSON)
  request:
  body:
  type: PostedTts
  content-type: application/json
- response:
- docs: OK
- type: file
- status-code: 200
+ response-stream:
+ docs: Successful Response
+ type: SnippetAudioChunk
+ format: json
  errors:
  - UnprocessableEntityError
  examples:
@@ -319,14 +319,30 @@ types:
  SnippetAudioChunk:
  docs: Metadata for a chunk of generated audio.
  properties:
- request_id:
+ audio:
  type: string
- docs: ID of the initiating request.
+ docs: The generated audio output chunk in the requested format.
+ audio_format:
+ type: AudioFormatType
+ docs: The generated audio output format.
+ chunk_index:
+ type: integer
+ docs: The index of the audio chunk in the snippet.
  generation_id:
  type: string
  docs: >-
  The generation ID of the parent snippet that this chunk corresponds
  to.
+ is_last_chunk:
+ type: boolean
+ docs: >-
+ Whether or not this is the last chunk streamed back from the decoder
+ for one input snippet.
+ request_id:
+ type: string
+ docs: ID of the initiating request.
+ snippet:
+ type: optional<Snippet>
  snippet_id:
  type: string
  docs: The ID of the parent snippet that this chunk corresponds to.
@@ -339,27 +355,13 @@ types:
  The transcribed text of the generated audio of the parent snippet that
  this chunk corresponds to. It is only present if `instant_mode` is set
  to `false`.
- chunk_index:
- type: integer
- docs: The index of the audio chunk in the snippet.
- audio:
- type: string
- docs: The generated audio output chunk in the requested format.
- audio_format:
- type: AudioFormatType
- docs: The generated audio output format.
- is_last_chunk:
- type: boolean
- docs: >-
- Whether or not this is the last chunk streamed back from the decoder
- for one input snippet.
+ type:
+ type: optional<literal<"audio">>
  utterance_index:
  type: optional<integer>
  docs: >-
  The index of the utterance in the request that the parent snippet of
  this chunk corresponds to.
- snippet:
- type: optional<Snippet>
  source:
  openapi: tts-openapi.json
  PostedContextWithGenerationId:
@@ -395,25 +397,25 @@ types:
  openapi: tts-openapi.json
  ReturnGeneration:
  properties:
- generation_id:
+ audio:
  type: string
  docs: >-
- A unique ID associated with this TTS generation that can be used as
- context for generating consistent speech style and prosody across
- multiple requests.
+ The generated audio output in the requested format, encoded as a
+ base64 string.
  duration:
  type: double
  docs: Duration of the generated audio in seconds.
+ encoding:
+ type: AudioEncoding
  file_size:
  type: integer
  docs: Size of the generated audio in bytes.
- encoding:
- type: AudioEncoding
- audio:
+ generation_id:
  type: string
  docs: >-
- The generated audio output in the requested format, encoded as a
- base64 string.
+ A unique ID associated with this TTS generation that can be used as
+ context for generating consistent speech style and prosody across
+ multiple requests.
  snippets:
  docs: >-
  A list of snippet groups where each group corresponds to an utterance
@@ -466,18 +468,9 @@ types:
  Utterances to use as context for generating consistent speech style
  and prosody across multiple requests. These will not be converted to
  speech output.
- utterances:
- docs: >-
- A list of **Utterances** to be converted to speech output.
-
-
- An **Utterance** is a unit of input for
- [Octave](/docs/text-to-speech-tts/overview), and includes input
- `text`, an optional `description` to serve as the prompt for how the
- speech should be delivered, an optional `voice` specification, and
- additional controls to guide delivery for `speed` and
- `trailing_silence`.
- type: list<PostedUtterance>
+ format:
+ type: optional<Format>
+ docs: Specifies the output audio file format.
  num_generations:
  type: optional<integer>
  docs: Number of generations of the audio to produce.
@@ -485,9 +478,6 @@ types:
  validation:
  min: 1
  max: 5
- format:
- type: optional<Format>
- docs: Specifies the output audio file format.
  split_utterances:
  type: optional<boolean>
  docs: >-
@@ -516,6 +506,20 @@ types:
  if disabled, each chunk's audio will be its own audio file, each with
  its own headers (if applicable).
  default: false
+ utterances:
+ docs: >-
+ A list of **Utterances** to be converted to speech output.
+
+
+ An **Utterance** is a unit of input for
+ [Octave](/docs/text-to-speech-tts/overview), and includes input
+ `text`, an optional `description` to serve as the prompt for how the
+ speech should be delivered, an optional `voice` specification, and
+ additional controls to guide delivery for `speed` and
+ `trailing_silence`.
+ type: list<PostedUtterance>
+ version:
+ type: optional<OctaveVersion>
  instant_mode:
  type: optional<boolean>
  docs: >-
@@ -542,16 +546,17 @@ types:
  openapi: tts-openapi.json
  ReturnTts:
  properties:
+ generations:
+ type: list<ReturnGeneration>
  request_id:
  type: optional<string>
  docs: >-
  A unique ID associated with this request for tracking and
  troubleshooting. Use this ID when contacting [support](/support) for
  troubleshooting assistance.
- generations:
- type: list<ReturnGeneration>
  source:
  openapi: tts-openapi.json
+ OctaveVersion: string
  ReturnVoice:
  docs: An Octave voice available for text-to-speech
  properties:
@@ -577,37 +582,32 @@ types:
  openapi: tts-openapi.json
  Snippet:
  properties:
+ audio:
+ type: string
+ docs: >-
+ The segmented audio output in the requested format, encoded as a
+ base64 string.
+ generation_id:
+ type: string
+ docs: The generation ID this snippet corresponds to.
  id:
  type: string
  docs: A unique ID associated with this **Snippet**.
  text:
  type: string
  docs: The text for this **Snippet**.
- generation_id:
- type: string
- docs: The generation ID this snippet corresponds to.
- utterance_index:
- type: optional<integer>
- docs: The index of the utterance in the request this snippet corresponds to.
  transcribed_text:
  type: optional<string>
  docs: >-
  The transcribed text of the generated audio. It is only present if
  `instant_mode` is set to `false`.
- audio:
- type: string
- docs: >-
- The segmented audio output in the requested format, encoded as a
- base64 string.
+ utterance_index:
+ type: optional<integer>
+ docs: The index of the utterance in the request this snippet corresponds to.
  source:
  openapi: tts-openapi.json
  PostedUtterance:
  properties:
- text:
- type: string
- docs: The input text to be synthesized into speech.
- validation:
- maxLength: 5000
  description:
  type: optional<string>
  docs: >-
@@ -629,14 +629,6 @@ types:
  guide](/docs/text-to-speech-tts/prompting) for design tips.
  validation:
  maxLength: 1000
- voice:
- type: optional<PostedUtteranceVoice>
- docs: >-
- The `name` or `id` associated with a **Voice** from the **Voice
- Library** to be used as the speaker for this and all subsequent
- `utterances`, until the `voice` field is updated again.
-
- See our [voices guide](/docs/text-to-speech-tts/voices) for more details on generating and specifying **Voices**.
  speed:
  type: optional<double>
  docs: >-
@@ -646,6 +638,11 @@ types:
  validation:
  min: 0.5
  max: 2
+ text:
+ type: string
+ docs: The input text to be synthesized into speech.
+ validation:
+ maxLength: 5000
  trailing_silence:
  type: optional<double>
  docs: Duration of trailing silence (in seconds) to add to this utterance
@@ -653,6 +650,14 @@ types:
  validation:
  min: 0
  max: 5
+ voice:
+ type: optional<PostedUtteranceVoice>
+ docs: >-
+ The `name` or `id` associated with a **Voice** from the **Voice
+ Library** to be used as the speaker for this and all subsequent
+ `utterances`, until the `voice` field is updated again.
+
+ See our [voices guide](/docs/text-to-speech-tts/voices) for more details on generating and specifying **Voices**.
  source:
  openapi: tts-openapi.json
  ValidationErrorLocItem:
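
Most of the TTS changes above are alphabetical reordering; the substantive additions are the `version: optional<OctaveVersion>` field on `PostedTts`, the new `OctaveVersion` type, and the `type`/`snippet` fields on `SnippetAudioChunk`. A sketch of consuming the streamed-JSON endpoint — the method name is inferred from the `synthesize-json-streaming` endpoint key and the camelCase field names from this SDK's conventions, so treat both as assumptions:

```typescript
import { HumeClient } from "hume";

async function streamSpeech() {
  const client = new HumeClient({ apiKey: process.env.HUME_API_KEY! });

  const stream = await client.tts.synthesizeJsonStreaming({
    utterances: [
      {
        text: "Welcome to Octave.",
        description: "A calm, articulate narrator.",
      },
    ],
    // version: the new optional<OctaveVersion> field; its accepted
    // values are not shown in this diff, so it is omitted here.
  });

  for await (const chunk of stream) {
    // SnippetAudioChunk per the reordered type above: audio (base64),
    // audioFormat, chunkIndex, generationId, isLastChunk, snippetId.
    console.log(`chunk ${chunk.chunkIndex} of snippet ${chunk.snippetId}`);
    if (chunk.isLastChunk) {
      console.log("snippet complete");
    }
  }
}
```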
package/api/resources/empathicVoice/types/AssistantEnd.d.ts
@@ -5,12 +5,12 @@
  * When provided, the output is an assistant end message.
  */
  export interface AssistantEnd {
+ /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
+ customSessionId?: string;
  /**
  * The type of message sent through the socket; for an Assistant End message, this must be `assistant_end`.
  *
  * This message indicates the conclusion of the assistant's response, signaling that the assistant has finished speaking for the current conversational turn.
  */
  type: "assistant_end";
- /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
- customSessionId?: string;
  }
package/api/resources/empathicVoice/types/AssistantInput.d.ts
@@ -5,8 +5,6 @@
  * When provided, the input is spoken by EVI.
  */
  export interface AssistantInput {
- /** The type of message sent through the socket; must be `assistant_input` for our server to correctly identify and process it as an Assistant Input message. */
- type: "assistant_input";
  /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
  customSessionId?: string;
  /**
@@ -15,4 +13,6 @@ export interface AssistantInput {
  * EVI uses this text to generate spoken audio using our proprietary expressive text-to-speech model. Our model adds appropriate emotional inflections and tones to the text based on the user's expressions and the context of the conversation. The synthesized audio is streamed back to the user as an [Assistant Message](/reference/speech-to-speech-evi/chat#receive.AssistantMessage).
  */
  text: string;
+ /** The type of message sent through the socket; must be `assistant_input` for our server to correctly identify and process it as an Assistant Input message. */
+ type: "assistant_input";
  }
package/api/resources/empathicVoice/types/AssistantMessage.d.ts
@@ -6,20 +6,20 @@ import * as Hume from "../../../index";
  * When provided, the output is an assistant message.
  */
  export interface AssistantMessage {
- /**
- * The type of message sent through the socket; for an Assistant Message, this must be `assistant_message`.
- *
- * This message contains both a transcript of the assistant's response and the expression measurement predictions of the assistant's audio output.
- */
- type: "assistant_message";
  /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
  customSessionId?: string;
+ /** Indicates if this message was inserted into the conversation as text from an [Assistant Input message](/reference/speech-to-speech-evi/chat#send.AssistantInput.text). */
+ fromText: boolean;
  /** ID of the assistant message. Allows the Assistant Message to be tracked and referenced. */
  id?: string;
  /** Transcript of the message. */
  message: Hume.empathicVoice.ChatMessage;
  /** Inference model results. */
  models: Hume.empathicVoice.Inference;
- /** Indicates if this message was inserted into the conversation as text from an [Assistant Input message](/reference/speech-to-speech-evi/chat#send.AssistantInput.text). */
- fromText: boolean;
+ /**
+ * The type of message sent through the socket; for an Assistant Message, this must be `assistant_message`.
+ *
+ * This message contains both a transcript of the assistant's response and the expression measurement predictions of the assistant's audio output.
+ */
+ type: "assistant_message";
  }
package/api/resources/empathicVoice/types/AssistantProsody.d.ts
@@ -6,16 +6,16 @@ import * as Hume from "../../../index";
  * When provided, the output is an Assistant Prosody message.
  */
  export interface AssistantProsody {
+ /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
+ customSessionId?: string;
+ /** Unique identifier for the segment. */
+ id?: string;
+ /** Inference model results. */
+ models: Hume.empathicVoice.Inference;
  /**
  * The type of message sent through the socket; for an Assistant Prosody message, this must be `assistant_PROSODY`.
  *
  * This message the expression measurement predictions of the assistant's audio output.
  */
  type: "assistant_prosody";
- /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
- customSessionId?: string;
- /** Inference model results. */
- models: Hume.empathicVoice.Inference;
- /** Unique identifier for the segment. */
- id?: string;
  }
package/api/resources/empathicVoice/types/AudioConfiguration.d.ts
@@ -3,10 +3,10 @@
  */
  import * as Hume from "../../../index";
  export interface AudioConfiguration {
- /** Encoding format of the audio input, such as `linear16`. */
- encoding: Hume.empathicVoice.Encoding;
  /** Number of audio channels. */
  channels: number;
+ /** Encoding format of the audio input, such as `linear16`. */
+ encoding: Hume.empathicVoice.Encoding;
  /** Audio sample rate. Number of samples per second in the audio input, measured in Hertz. */
  sampleRate: number;
  }
package/api/resources/empathicVoice/types/AudioInput.d.ts
@@ -5,12 +5,6 @@
  * When provided, the input is audio.
  */
  export interface AudioInput {
- /**
- * The type of message sent through the socket; must be `audio_input` for our server to correctly identify and process it as an Audio Input message.
- *
- * This message is used for sending audio input data to EVI for processing and expression measurement. Audio data should be sent as a continuous stream, encoded in Base64.
- */
- type: "audio_input";
  /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
  customSessionId?: string;
  /**
@@ -23,4 +17,10 @@ export interface AudioInput {
  * Hume recommends streaming audio with a buffer window of 20 milliseconds (ms), or 100 milliseconds (ms) for web applications.
  */
  data: string;
+ /**
+ * The type of message sent through the socket; must be `audio_input` for our server to correctly identify and process it as an Audio Input message.
+ *
+ * This message is used for sending audio input data to EVI for processing and expression measurement. Audio data should be sent as a continuous stream, encoded in Base64.
+ */
+ type: "audio_input";
  }
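
The relocated `type` docs on `AudioInput` restate the transport contract: audio is sent as a continuous Base64-encoded stream, ideally in roughly 20 ms buffers (100 ms for web). A sketch, assuming the SDK socket's `sendAudioInput` helper fills in the `audio_input` type; the PCM source here is a stand-in:

```typescript
import { HumeClient } from "hume";

async function streamMicrophone(pcmChunks: AsyncIterable<Buffer>) {
  const client = new HumeClient({ apiKey: process.env.HUME_API_KEY! });
  const socket = client.empathicVoice.chat.connect({});

  for await (const pcm of pcmChunks) {
    // AudioInput.data carries the Base64-encoded audio chunk.
    socket.sendAudioInput({ data: pcm.toString("base64") });
  }
}
```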
package/api/resources/empathicVoice/types/AudioOutput.d.ts
@@ -5,14 +5,14 @@
  * The type of message sent through the socket; for an Audio Output message, this must be `audio_output`.
  */
  export interface AudioOutput {
- /** The type of message sent through the socket; for an Audio Output message, this must be `audio_output`. */
- type: "audio_output";
  /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
  customSessionId?: string;
+ /** Base64 encoded audio output. This encoded audio is transmitted to the client, where it can be decoded and played back as part of the user interaction. */
+ data: string;
  /** ID of the audio output. Allows the Audio Output message to be tracked and referenced. */
  id: string;
  /** Index of the chunk of audio relative to the whole audio segment. */
  index: number;
- /** Base64 encoded audio output. This encoded audio is transmitted to the client, where it can be decoded and played back as part of the user interaction. */
- data: string;
+ /** The type of message sent through the socket; for an Audio Output message, this must be `audio_output`. */
+ type: "audio_output";
  }
package/api/resources/empathicVoice/types/BuiltinToolConfig.d.ts
@@ -3,7 +3,7 @@
  */
  import * as Hume from "../../../index";
  export interface BuiltinToolConfig {
- name: Hume.empathicVoice.BuiltInTool;
  /** Optional text passed to the supplemental LLM if the tool call fails. The LLM then uses this text to generate a response back to the user, ensuring continuity in the conversation. */
  fallbackContent?: string;
+ name: Hume.empathicVoice.BuiltInTool;
  }
package/api/resources/empathicVoice/types/ChatMessage.d.ts
@@ -3,10 +3,10 @@
  */
  import * as Hume from "../../../index";
  export interface ChatMessage {
- /** Role of who is providing the message. */
- role: Hume.empathicVoice.Role;
  /** Transcript of the message. */
  content?: string;
+ /** Role of who is providing the message. */
+ role: Hume.empathicVoice.Role;
  /** Function call name and arguments. */
  toolCall?: Hume.empathicVoice.ToolCallMessage;
  /** Function call response from client. */
package/api/resources/empathicVoice/types/ChatMetadata.d.ts
@@ -5,14 +5,6 @@
  * When provided, the output is a chat metadata message.
  */
  export interface ChatMetadata {
- /**
- * The type of message sent through the socket; for a Chat Metadata message, this must be `chat_metadata`.
- *
- * The Chat Metadata message is the first message you receive after establishing a connection with EVI and contains important identifiers for the current Chat session.
- */
- type: "chat_metadata";
- /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
- customSessionId?: string;
  /**
  * ID of the Chat Group.
  *
@@ -23,6 +15,14 @@ export interface ChatMetadata {
  chatGroupId: string;
  /** ID of the Chat session. Allows the Chat session to be tracked and referenced. */
  chatId: string;
+ /** Used to manage conversational state, correlate frontend and backend data, and persist conversations across EVI sessions. */
+ customSessionId?: string;
  /** ID of the initiating request. */
  requestId?: string;
+ /**
+ * The type of message sent through the socket; for a Chat Metadata message, this must be `chat_metadata`.
+ *
+ * The Chat Metadata message is the first message you receive after establishing a connection with EVI and contains important identifiers for the current Chat session.
+ */
+ type: "chat_metadata";
  }
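
The `ChatMetadata` docs above note it is the first message received after connecting and that it carries the session identifiers. A sketch of persisting `chatGroupId` so a later connection can resume via the `resumed_chat_group_id` query parameter shown earlier; the `SubscribeEvent` union name is an assumption about this SDK's generated types:

```typescript
import type { Hume } from "hume";

let savedChatGroupId: string | undefined;

function handleMessage(message: Hume.empathicVoice.SubscribeEvent): void {
  if (message.type === "chat_metadata") {
    // Reuse later as resumed_chat_group_id to continue this Chat Group.
    savedChatGroupId = message.chatGroupId;
  }
}
```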
package/api/resources/empathicVoice/types/Context.d.ts
@@ -3,6 +3,12 @@
  */
  import * as Hume from "../../../index";
  export interface Context {
+ /**
+ * The context to be injected into the conversation. Helps inform the LLM's response by providing relevant information about the ongoing conversation.
+ *
+ * This text will be appended to the end of [user_messages](/reference/speech-to-speech-evi/chat#receive.UserMessage.message.content) based on the chosen persistence level. For example, if you want to remind EVI of its role as a helpful weather assistant, the context you insert will be appended to the end of user messages as `{Context: You are a helpful weather assistant}`.
+ */
+ text: string;
  /**
  * The persistence level of the injected context. Specifies how long the injected context will remain active in the session.
  *
@@ -11,10 +17,4 @@ export interface Context {
  * - **Persistent**: Context that is applied to all subsequent assistant responses for the remainder of the Chat.
  */
  type?: Hume.empathicVoice.ContextType;
- /**
- * The context to be injected into the conversation. Helps inform the LLM's response by providing relevant information about the ongoing conversation.
- *
- * This text will be appended to the end of [user_messages](/reference/speech-to-speech-evi/chat#receive.UserMessage.message.content) based on the chosen persistence level. For example, if you want to remind EVI of its role as a helpful weather assistant, the context you insert will be appended to the end of user messages as `{Context: You are a helpful weather assistant}`.
- */
- text: string;
  }
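
`Context.text` above is appended to user messages as `{Context: …}` at the chosen persistence level. A hedged sketch of injecting persistent context through session settings — the socket's `sendSessionSettings` helper exists in this SDK, but the payload shape below is assembled from this diff rather than verified against 0.13.8:

```typescript
import type { Hume } from "hume";

// `socket` stands in for the object returned by
// client.empathicVoice.chat.connect().
function injectAssistantRole(socket: {
  sendSessionSettings(settings: Hume.empathicVoice.SessionSettings): void;
}): void {
  socket.sendSessionSettings({
    type: "session_settings",
    context: {
      // Appended to subsequent user messages as `{Context: ...}`.
      text: "You are a helpful weather assistant.",
      type: "persistent", // persists for the remainder of the Chat
    },
  });
}
```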