npm - openai - Versions diffs - 4.87.4 → 4.89.0 - Mend

openai 4.87.4 → 4.89.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80)

package/CHANGELOG.md +29 -0
package/helpers/audio.d.ts +12 -0
package/helpers/audio.d.ts.map +1 -0
package/helpers/audio.js +121 -0
package/helpers/audio.js.map +1 -0
package/helpers/audio.mjs +116 -0
package/helpers/audio.mjs.map +1 -0
package/index.d.mts +2 -0
package/index.d.ts +2 -0
package/index.d.ts.map +1 -1
package/index.js.map +1 -1
package/index.mjs.map +1 -1
package/package.json +8 -29
package/resources/audio/audio.d.ts +5 -4
package/resources/audio/audio.d.ts.map +1 -1
package/resources/audio/audio.js.map +1 -1
package/resources/audio/audio.mjs.map +1 -1
package/resources/audio/index.d.ts +1 -1
package/resources/audio/index.d.ts.map +1 -1
package/resources/audio/index.js.map +1 -1
package/resources/audio/index.mjs.map +1 -1
package/resources/audio/speech.d.ts +7 -2
package/resources/audio/speech.d.ts.map +1 -1
package/resources/audio/transcriptions.d.ts +172 -9
package/resources/audio/transcriptions.d.ts.map +1 -1
package/resources/audio/transcriptions.js.map +1 -1
package/resources/audio/transcriptions.mjs.map +1 -1
package/resources/audio/translations.d.ts +1 -1
package/resources/audio/translations.d.ts.map +1 -1
package/resources/beta/realtime/index.d.ts +1 -0
package/resources/beta/realtime/index.d.ts.map +1 -1
package/resources/beta/realtime/index.js +3 -1
package/resources/beta/realtime/index.js.map +1 -1
package/resources/beta/realtime/index.mjs +1 -0
package/resources/beta/realtime/index.mjs.map +1 -1
package/resources/beta/realtime/realtime.d.ts +383 -36
package/resources/beta/realtime/realtime.d.ts.map +1 -1
package/resources/beta/realtime/realtime.js +4 -0
package/resources/beta/realtime/realtime.js.map +1 -1
package/resources/beta/realtime/realtime.mjs +4 -0
package/resources/beta/realtime/realtime.mjs.map +1 -1
package/resources/beta/realtime/sessions.d.ts +169 -60
package/resources/beta/realtime/sessions.d.ts.map +1 -1
package/resources/beta/realtime/transcription-sessions.d.ts +262 -0
package/resources/beta/realtime/transcription-sessions.d.ts.map +1 -0
package/resources/beta/realtime/transcription-sessions.js +25 -0
package/resources/beta/realtime/transcription-sessions.js.map +1 -0
package/resources/beta/realtime/transcription-sessions.mjs +21 -0
package/resources/beta/realtime/transcription-sessions.mjs.map +1 -0
package/resources/chat/completions/completions.d.ts +1 -1
package/resources/chat/completions/completions.d.ts.map +1 -1
package/resources/responses/responses.d.ts +3 -3
package/resources/responses/responses.d.ts.map +1 -1
package/resources/shared.d.ts +3 -1
package/resources/shared.d.ts.map +1 -1
package/resources.d.ts +2 -0
package/resources.d.ts.map +1 -0
package/resources.js +18 -0
package/resources.js.map +1 -0
package/resources.mjs +2 -0
package/resources.mjs.map +1 -0
package/src/helpers/audio.ts +145 -0
package/src/index.ts +2 -0
package/src/resources/audio/audio.ts +15 -2
package/src/resources/audio/index.ts +6 -0
package/src/resources/audio/speech.ts +8 -2
package/src/resources/audio/transcriptions.ts +215 -9
package/src/resources/audio/translations.ts +1 -1
package/src/resources/beta/realtime/index.ts +5 -0
package/src/resources/beta/realtime/realtime.ts +465 -57
package/src/resources/beta/realtime/sessions.ts +176 -60
package/src/resources/beta/realtime/transcription-sessions.ts +308 -0
package/src/resources/chat/completions/completions.ts +1 -1
package/src/resources/responses/responses.ts +3 -3
package/src/resources/shared.ts +22 -5
package/src/resources.ts +1 -0
package/src/version.ts +1 -1
package/version.d.ts +1 -1
package/version.js +1 -1
package/version.mjs +1 -1

package/src/resources/beta/realtime/transcription-sessions.ts ADDED Viewed

@@ -0,0 +1,308 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+import { APIResource } from '../../../resource';
+import * as Core from '../../../core';
+export class TranscriptionSessions extends APIResource {
+  /**
+   * Create an ephemeral API token for use in client-side applications with the
+   * Realtime API specifically for realtime transcriptions. Can be configured with
+   * the same session parameters as the `transcription_session.update` client event.
+   *
+   * It responds with a session object, plus a `client_secret` key which contains a
+   * usable ephemeral API token that can be used to authenticate browser clients for
+   * the Realtime API.
+   */
+  create(
+    body: TranscriptionSessionCreateParams,
+    options?: Core.RequestOptions,
+  ): Core.APIPromise<TranscriptionSession> {
+    return this._client.post('/realtime/transcription_sessions', {
+      body,
+      ...options,
+      headers: { 'OpenAI-Beta': 'assistants=v2', ...options?.headers },
+    });
+  }
+}
+/**
+ * A new Realtime transcription session configuration.
+ *
+ * When a session is created on the server via REST API, the session object also
+ * contains an ephemeral key. Default TTL for keys is one minute. This property is
+ * not present when a session is updated via the WebSocket API.
+ */
+export interface TranscriptionSession {
+  /**
+   * Ephemeral key returned by the API. Only present when the session is created on
+   * the server via REST API.
+   */
+  client_secret: TranscriptionSession.ClientSecret;
+  /**
+   * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  input_audio_format?: string;
+  /**
+   * Configuration of the transcription model.
+   */
+  input_audio_transcription?: TranscriptionSession.InputAudioTranscription;
+  /**
+   * The set of modalities the model can respond with. To disable audio, set this to
+   * ["text"].
+   */
+  modalities?: Array<'text' | 'audio'>;
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  turn_detection?: TranscriptionSession.TurnDetection;
+}
+export namespace TranscriptionSession {
+  /**
+   * Ephemeral key returned by the API. Only present when the session is created on
+   * the server via REST API.
+   */
+  export interface ClientSecret {
+    /**
+     * Timestamp for when the token expires. Currently, all tokens expire after one
+     * minute.
+     */
+    expires_at: number;
+    /**
+     * Ephemeral key usable in client environments to authenticate connections to the
+     * Realtime API. Use this in client-side environments rather than a standard API
+     * token, which should only be used server-side.
+     */
+    value: string;
+  }
+  /**
+   * Configuration of the transcription model.
+   */
+  export interface InputAudioTranscription {
+    /**
+     * The language of the input audio. Supplying the input language in
+     * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+     * format will improve accuracy and latency.
+     */
+    language?: string;
+    /**
+     * The model to use for transcription. Can be `gpt-4o-transcribe`,
+     * `gpt-4o-mini-transcribe`, or `whisper-1`.
+     */
+    model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
+    /**
+     * An optional text to guide the model's style or continue a previous audio
+     * segment. The
+     * [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+     * should match the audio language.
+     */
+    prompt?: string;
+  }
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  export interface TurnDetection {
+    /**
+     * Amount of audio to include before the VAD detected speech (in milliseconds).
+     * Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
+    /**
+     * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+     * With shorter values the model will respond more quickly, but may jump in on
+     * short pauses from the user.
+     */
+    silence_duration_ms?: number;
+    /**
+     * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
+     * threshold will require louder audio to activate the model, and thus might
+     * perform better in noisy environments.
+     */
+    threshold?: number;
+    /**
+     * Type of turn detection, only `server_vad` is currently supported.
+     */
+    type?: string;
+  }
+}
+export interface TranscriptionSessionCreateParams {
+  /**
+   * The set of items to include in the transcription. Current available items are:
+   *
+   * - `item.input_audio_transcription.logprobs`
+   */
+  include?: Array<string>;
+  /**
+   * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
+   * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
+   * (mono), and little-endian byte order.
+   */
+  input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+  /**
+   * Configuration for input audio noise reduction. This can be set to `null` to turn
+   * off. Noise reduction filters audio added to the input audio buffer before it is
+   * sent to VAD and the model. Filtering the audio can improve VAD and turn
+   * detection accuracy (reducing false positives) and model performance by improving
+   * perception of the input audio.
+   */
+  input_audio_noise_reduction?: TranscriptionSessionCreateParams.InputAudioNoiseReduction;
+  /**
+   * Configuration for input audio transcription. The client can optionally set the
+   * language and prompt for transcription, these offer additional guidance to the
+   * transcription service.
+   */
+  input_audio_transcription?: TranscriptionSessionCreateParams.InputAudioTranscription;
+  /**
+   * The set of modalities the model can respond with. To disable audio, set this to
+   * ["text"].
+   */
+  modalities?: Array<'text' | 'audio'>;
+  /**
+   * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+   * set to `null` to turn off, in which case the client must manually trigger model
+   * response. Server VAD means that the model will detect the start and end of
+   * speech based on audio volume and respond at the end of user speech. Semantic VAD
+   * is more advanced and uses a turn detection model (in conjunction with VAD) to
+   * semantically estimate whether the user has finished speaking, then dynamically
+   * sets a timeout based on this probability. For example, if user audio trails off
+   * with "uhhm", the model will score a low probability of turn end and wait longer
+   * for the user to continue speaking. This can be useful for more natural
+   * conversations, but may have a higher latency.
+   */
+  turn_detection?: TranscriptionSessionCreateParams.TurnDetection;
+}
+export namespace TranscriptionSessionCreateParams {
+  /**
+   * Configuration for input audio noise reduction. This can be set to `null` to turn
+   * off. Noise reduction filters audio added to the input audio buffer before it is
+   * sent to VAD and the model. Filtering the audio can improve VAD and turn
+   * detection accuracy (reducing false positives) and model performance by improving
+   * perception of the input audio.
+   */
+  export interface InputAudioNoiseReduction {
+    /**
+     * Type of noise reduction. `near_field` is for close-talking microphones such as
+     * headphones, `far_field` is for far-field microphones such as laptop or
+     * conference room microphones.
+     */
+    type?: 'near_field' | 'far_field';
+  }
+  /**
+   * Configuration for input audio transcription. The client can optionally set the
+   * language and prompt for transcription, these offer additional guidance to the
+   * transcription service.
+   */
+  export interface InputAudioTranscription {
+    /**
+     * The language of the input audio. Supplying the input language in
+     * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+     * format will improve accuracy and latency.
+     */
+    language?: string;
+    /**
+     * The model to use for transcription, current options are `gpt-4o-transcribe`,
+     * `gpt-4o-mini-transcribe`, and `whisper-1`.
+     */
+    model?: 'gpt-4o-transcribe' | 'gpt-4o-mini-transcribe' | 'whisper-1';
+    /**
+     * An optional text to guide the model's style or continue a previous audio
+     * segment. For `whisper-1`, the
+     * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
+     * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
+     * "expect words related to technology".
+     */
+    prompt?: string;
+  }
+  /**
+   * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+   * set to `null` to turn off, in which case the client must manually trigger model
+   * response. Server VAD means that the model will detect the start and end of
+   * speech based on audio volume and respond at the end of user speech. Semantic VAD
+   * is more advanced and uses a turn detection model (in conjunction with VAD) to
+   * semantically estimate whether the user has finished speaking, then dynamically
+   * sets a timeout based on this probability. For example, if user audio trails off
+   * with "uhhm", the model will score a low probability of turn end and wait longer
+   * for the user to continue speaking. This can be useful for more natural
+   * conversations, but may have a higher latency.
+   */
+  export interface TurnDetection {
+    /**
+     * Whether or not to automatically generate a response when a VAD stop event
+     * occurs.
+     */
+    create_response?: boolean;
+    /**
+     * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+     * will wait longer for the user to continue speaking, `high` will respond more
+     * quickly. `auto` is the default and is equivalent to `medium`.
+     */
+    eagerness?: 'low' | 'medium' | 'high' | 'auto';
+    /**
+     * Whether or not to automatically interrupt any ongoing response with output to
+     * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+     * occurs.
+     */
+    interrupt_response?: boolean;
+    /**
+     * Used only for `server_vad` mode. Amount of audio to include before the VAD
+     * detected speech (in milliseconds). Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
+    /**
+     * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
+     * milliseconds). Defaults to 500ms. With shorter values the model will respond
+     * more quickly, but may jump in on short pauses from the user.
+     */
+    silence_duration_ms?: number;
+    /**
+     * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
+     * defaults to 0.5. A higher threshold will require louder audio to activate the
+     * model, and thus might perform better in noisy environments.
+     */
+    threshold?: number;
+    /**
+     * Type of turn detection.
+     */
+    type?: 'server_vad' | 'semantic_vad';
+  }
+}
+export declare namespace TranscriptionSessions {
+  export {
+    type TranscriptionSession as TranscriptionSession,
+    type TranscriptionSessionCreateParams as TranscriptionSessionCreateParams,
+  };
+}

package/src/resources/chat/completions/completions.ts CHANGED Viewed

@@ -383,7 +383,7 @@ export interface ChatCompletionChunk {
    * **NOTE:** If the stream is interrupted or cancelled, you may not receive the
    * final usage chunk which contains the total token usage for the request.
    */
-  usage?: CompletionsAPI.CompletionUsage;
+  usage?: CompletionsAPI.CompletionUsage | null;
 }
 export namespace ChatCompletionChunk {

package/src/resources/responses/responses.ts CHANGED Viewed

@@ -327,7 +327,7 @@ export interface Response {
    * [model guide](https://platform.openai.com/docs/models) to browse and compare
    * available models.
    */
-  model: (string & {}) | Shared.ChatModel;
+  model: Shared.ResponsesModel;
   /**
    * The object type of this resource - always set to `response`.
@@ -1481,7 +1481,7 @@ export interface ResponseFunctionToolCall {
  */
 export interface ResponseFunctionToolCallItem extends ResponseFunctionToolCall {
   /**
-   * The unique ID of the function call tool output.
+   * The unique ID of the function tool call.
    */
   id: string;
 }
@@ -2679,7 +2679,7 @@ export interface ResponseCreateParamsBase {
    * [model guide](https://platform.openai.com/docs/models) to browse and compare
    * available models.
    */
-  model: (string & {}) | Shared.ChatModel;
+  model: Shared.ResponsesModel;
   /**
    * Specify additional output data to include in the model response. Currently

package/src/resources/shared.ts CHANGED Viewed

@@ -1,5 +1,15 @@
 // File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+export type AllModels =
+  | string
+  | ChatModel
+  | string
+  | ChatModel
+  | 'o1-pro'
+  | 'o1-pro-2025-03-19'
+  | 'computer-use-preview'
+  | 'computer-use-preview-2025-03-11';
 export type ChatModel =
   | 'o3-mini'
   | 'o3-mini-2025-01-31'
@@ -9,11 +19,6 @@ export type ChatModel =
   | 'o1-preview-2024-09-12'
   | 'o1-mini'
   | 'o1-mini-2024-09-12'
-  | 'computer-use-preview'
-  | 'computer-use-preview-2025-02-04'
-  | 'computer-use-preview-2025-03-11'
-  | 'gpt-4.5-preview'
-  | 'gpt-4.5-preview-2025-02-27'
   | 'gpt-4o'
   | 'gpt-4o-2024-11-20'
   | 'gpt-4o-2024-08-06'
@@ -23,6 +28,10 @@ export type ChatModel =
   | 'gpt-4o-audio-preview-2024-12-17'
   | 'gpt-4o-mini-audio-preview'
   | 'gpt-4o-mini-audio-preview-2024-12-17'
+  | 'gpt-4o-search-preview'
+  | 'gpt-4o-mini-search-preview'
+  | 'gpt-4o-search-preview-2025-03-11'
+  | 'gpt-4o-mini-search-preview-2025-03-11'
   | 'chatgpt-4o-latest'
   | 'gpt-4o-mini'
   | 'gpt-4o-mini-2024-07-18'
@@ -265,3 +274,11 @@ export interface ResponseFormatText {
    */
   type: 'text';
 }
+export type ResponsesModel =
+  | (string & {})
+  | ChatModel
+  | 'o1-pro'
+  | 'o1-pro-2025-03-19'
+  | 'computer-use-preview'
+  | 'computer-use-preview-2025-03-11';

package/src/resources.ts ADDED Viewed

@@ -0,0 +1 @@
+export * from './resources/index';

package/src/version.ts CHANGED Viewed

@@ -1 +1 @@
-export const VERSION = '4.87.4'; // x-release-please-version
+export const VERSION = '4.89.0'; // x-release-please-version

package/version.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-export declare const VERSION = "4.87.4";
+export declare const VERSION = "4.89.0";
 //# sourceMappingURL=version.d.ts.map

package/version.js CHANGED Viewed

@@ -1,5 +1,5 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
 exports.VERSION = void 0;
-exports.VERSION = '4.87.4'; // x-release-please-version
+exports.VERSION = '4.89.0'; // x-release-please-version
 //# sourceMappingURL=version.js.map

package/version.mjs CHANGED Viewed

@@ -1,2 +1,2 @@
-export const VERSION = '4.87.4'; // x-release-please-version
+export const VERSION = '4.89.0'; // x-release-please-version
 //# sourceMappingURL=version.mjs.map