@huggingface/transformers 3.2.4 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/README.md +5 -3
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +135 -0
  3. package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
  4. package/dist/transformers.cjs +598 -247
  5. package/dist/transformers.cjs.map +1 -1
  6. package/dist/transformers.js +956 -573
  7. package/dist/transformers.js.map +1 -1
  8. package/dist/transformers.min.cjs +1 -1
  9. package/dist/transformers.min.cjs.map +1 -1
  10. package/dist/transformers.min.js +1 -1
  11. package/dist/transformers.min.js.map +1 -1
  12. package/dist/transformers.min.mjs +1 -1
  13. package/dist/transformers.min.mjs.map +1 -1
  14. package/dist/transformers.mjs +604 -248
  15. package/dist/transformers.mjs.map +1 -1
  16. package/package.json +3 -3
  17. package/src/base/image_processors_utils.js +1 -1
  18. package/src/base/processing_utils.js +11 -0
  19. package/src/env.js +1 -2
  20. package/src/generation/streamers.js +5 -2
  21. package/src/models/grounding_dino/image_processing_grounding_dino.js +29 -0
  22. package/src/models/grounding_dino/processing_grounding_dino.js +101 -0
  23. package/src/models/image_processors.js +1 -0
  24. package/src/models/processors.js +3 -2
  25. package/src/models.js +22 -5
  26. package/src/pipelines.js +39 -16
  27. package/src/utils/audio.js +113 -1
  28. package/src/utils/core.js +26 -0
  29. package/src/utils/image.js +5 -18
  30. package/src/utils/tensor.js +100 -112
  31. package/types/base/image_processors_utils.d.ts +7 -0
  32. package/types/base/image_processors_utils.d.ts.map +1 -1
  33. package/types/base/processing_utils.d.ts +8 -0
  34. package/types/base/processing_utils.d.ts.map +1 -1
  35. package/types/generation/streamers.d.ts +3 -1
  36. package/types/generation/streamers.d.ts.map +1 -1
  37. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  38. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +20 -0
  39. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts.map +1 -0
  40. package/types/models/grounding_dino/processing_grounding_dino.d.ts +27 -0
  41. package/types/models/grounding_dino/processing_grounding_dino.d.ts.map +1 -0
  42. package/types/models/image_processors.d.ts +1 -0
  43. package/types/models/processors.d.ts +3 -2
  44. package/types/models.d.ts +8 -0
  45. package/types/models.d.ts.map +1 -1
  46. package/types/pipelines.d.ts +5 -10
  47. package/types/pipelines.d.ts.map +1 -1
  48. package/types/tsconfig.tsbuildinfo +1 -1
  49. package/types/utils/audio.d.ts +25 -0
  50. package/types/utils/audio.d.ts.map +1 -1
  51. package/types/utils/core.d.ts +6 -0
  52. package/types/utils/core.d.ts.map +1 -1
  53. package/types/utils/image.d.ts.map +1 -1
  54. package/types/utils/tensor.d.ts +14 -2
  55. package/types/utils/tensor.d.ts.map +1 -1
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huggingface/transformers",
3
- "version": "3.2.4",
3
+ "version": "3.3.1",
4
4
  "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
5
5
  "main": "./src/transformers.js",
6
6
  "types": "./types/transformers.d.ts",
@@ -26,7 +26,7 @@
26
26
  "format:check": "prettier --check .",
27
27
  "typegen": "tsc --build",
28
28
  "dev": "webpack serve --no-client-overlay",
29
- "build": "webpack && npm run typegen",
29
+ "build": "webpack && npm run typegen && rm ./dist/ort.bundle.min.mjs && cp ./node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.jsep.mjs ./dist",
30
30
  "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
31
31
  "readme": "python ./docs/scripts/build_readme.py",
32
32
  "docs-api": "node ./docs/scripts/generate.js",
@@ -57,7 +57,7 @@
57
57
  "dependencies": {
58
58
  "@huggingface/jinja": "^0.3.2",
59
59
  "onnxruntime-node": "1.20.1",
60
- "onnxruntime-web": "1.21.0-dev.20241205-d27fecd3d3",
60
+ "onnxruntime-web": "1.21.0-dev.20250114-228dd16893",
61
61
  "sharp": "^0.33.5"
62
62
  },
63
63
  "devDependencies": {
@@ -68,7 +68,7 @@ function enforce_size_divisibility([width, height], divisor) {
68
68
  * @param {number[]} arr The coordinate for the center of the box and its width, height dimensions (center_x, center_y, width, height)
69
69
  * @returns {number[]} The coordinates for the top-left and bottom-right corners of the box (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
70
70
  */
71
- function center_to_corners_format([centerX, centerY, width, height]) {
71
+ export function center_to_corners_format([centerX, centerY, width, height]) {
72
72
  return [
73
73
  centerX - width / 2,
74
74
  centerY - height / 2,
@@ -101,6 +101,17 @@ export class Processor extends Callable {
101
101
  return this.tokenizer.batch_decode(...args);
102
102
  }
103
103
 
104
+ /**
105
+ * @param {Parameters<PreTrainedTokenizer['decode']>} args
106
+ * @returns {ReturnType<PreTrainedTokenizer['decode']>}
107
+ */
108
+ decode(...args) {
109
+ if (!this.tokenizer) {
110
+ throw new Error('Unable to decode without a tokenizer.');
111
+ }
112
+ return this.tokenizer.decode(...args);
113
+ }
114
+
104
115
 
105
116
  /**
106
117
  * Calls the feature_extractor function with the given input.
package/src/env.js CHANGED
@@ -26,7 +26,7 @@ import fs from 'fs';
26
26
  import path from 'path';
27
27
  import url from 'url';
28
28
 
29
- const VERSION = '3.2.4';
29
+ const VERSION = '3.3.1';
30
30
 
31
31
  // Check if various APIs are available (depends on environment)
32
32
  const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
@@ -160,4 +160,3 @@ export const env = {
160
160
  function isEmpty(obj) {
161
161
  return Object.keys(obj).length === 0;
162
162
  }
163
-
@@ -37,6 +37,7 @@ export class TextStreamer extends BaseStreamer {
37
37
  * @param {import('../tokenizers.js').PreTrainedTokenizer} tokenizer
38
38
  * @param {Object} options
39
39
  * @param {boolean} [options.skip_prompt=false] Whether to skip the prompt tokens
40
+ * @param {boolean} [options.skip_special_tokens=true] Whether to skip special tokens when decoding
40
41
  * @param {function(string): void} [options.callback_function=null] Function to call when a piece of text is ready to display
41
42
  * @param {function(bigint[]): void} [options.token_callback_function=null] Function to call when a new token is generated
42
43
  * @param {Object} [options.decode_kwargs={}] Additional keyword arguments to pass to the tokenizer's decode method
@@ -45,6 +46,7 @@ export class TextStreamer extends BaseStreamer {
45
46
  skip_prompt = false,
46
47
  callback_function = null,
47
48
  token_callback_function = null,
49
+ skip_special_tokens = true,
48
50
  decode_kwargs = {},
49
51
  ...kwargs
50
52
  } = {}) {
@@ -53,7 +55,7 @@ export class TextStreamer extends BaseStreamer {
53
55
  this.skip_prompt = skip_prompt;
54
56
  this.callback_function = callback_function ?? stdout_write;
55
57
  this.token_callback_function = token_callback_function;
56
- this.decode_kwargs = { ...decode_kwargs, ...kwargs };
58
+ this.decode_kwargs = { skip_special_tokens, ...decode_kwargs, ...kwargs };
57
59
 
58
60
  // variables used in the streaming process
59
61
  this.token_cache = [];
@@ -169,9 +171,10 @@ export class WhisperTextStreamer extends TextStreamer {
169
171
  } = {}) {
170
172
  super(tokenizer, {
171
173
  skip_prompt,
174
+ skip_special_tokens,
172
175
  callback_function,
173
176
  token_callback_function,
174
- decode_kwargs: { skip_special_tokens, ...decode_kwargs },
177
+ decode_kwargs,
175
178
  });
176
179
  this.timestamp_begin = tokenizer.timestamp_begin;
177
180
 
@@ -0,0 +1,29 @@
1
+
2
+ import {
3
+ ImageProcessor,
4
+ } from "../../base/image_processors_utils.js";
5
+ import { ones } from '../../utils/tensor.js';
6
+
7
+
8
+ /**
9
+ * @typedef {object} GroundingDinoFeatureExtractorResultProps
10
+ * @property {import('../../utils/tensor.js').Tensor} pixel_mask
11
+ * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
12
+ */
13
+
14
+ export class GroundingDinoImageProcessor extends ImageProcessor {
15
+ /**
16
+ * Calls the feature extraction process on an array of images, preprocesses
17
+ * each image, and concatenates the resulting features into a single Tensor.
18
+ * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
19
+ * @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
20
+ */
21
+ async _call(images) {
22
+ const result = await super._call(images);
23
+
24
+ const dims = result.pixel_values.dims;
25
+ const pixel_mask = ones([dims[0], dims[2], dims[3]]);
26
+
27
+ return { ...result, pixel_mask };
28
+ }
29
+ }
@@ -0,0 +1,101 @@
1
+ import { Processor } from "../../base/processing_utils.js";
2
+ import { AutoImageProcessor } from "../auto/image_processing_auto.js";
3
+ import { AutoTokenizer } from "../../tokenizers.js";
4
+ import { center_to_corners_format } from "../../base/image_processors_utils.js";
5
+
6
+ /**
7
+ * Get token ids of phrases from posmaps and input_ids.
8
+ * @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`.
9
+ * @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`.
10
+ */
11
+ function get_phrases_from_posmap(posmaps, input_ids) {
12
+
13
+ const left_idx = 0;
14
+ const right_idx = posmaps.dims.at(-1) - 1;
15
+
16
+ const posmaps_list = posmaps.tolist();
17
+ posmaps_list.fill(false, 0, left_idx + 1);
18
+ posmaps_list.fill(false, right_idx);
19
+
20
+ const input_ids_list = input_ids.tolist();
21
+ return posmaps_list
22
+ .map((val, idx) => val ? idx : null)
23
+ .filter(idx => idx !== null)
24
+ .map(i => input_ids_list[i]);
25
+ }
26
+
27
+ export class GroundingDinoProcessor extends Processor {
28
+ static tokenizer_class = AutoTokenizer
29
+ static image_processor_class = AutoImageProcessor
30
+
31
+ /**
32
+ * @typedef {import('../../utils/image.js').RawImage} RawImage
33
+ */
34
+ /**
35
+ *
36
+ * @param {RawImage|RawImage[]|RawImage[][]} images
37
+ * @param {string|string[]} text
38
+ * @returns {Promise<any>}
39
+ */
40
+ async _call(images, text, options = {}) {
41
+
42
+ const image_inputs = images ? await this.image_processor(images, options) : {};
43
+ const text_inputs = text ? this.tokenizer(text, options) : {};
44
+
45
+ return {
46
+ ...text_inputs,
47
+ ...image_inputs,
48
+ }
49
+ }
50
+ post_process_grounded_object_detection(outputs, input_ids, {
51
+ box_threshold = 0.25,
52
+ text_threshold = 0.25,
53
+ target_sizes = null
54
+ } = {}) {
55
+ const { logits, pred_boxes } = outputs;
56
+ const batch_size = logits.dims[0];
57
+
58
+ if (target_sizes !== null && target_sizes.length !== batch_size) {
59
+ throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
60
+ }
61
+ const num_queries = logits.dims.at(1);
62
+
63
+ const probs = logits.sigmoid(); // (batch_size, num_queries, 256)
64
+ const scores = probs.max(-1).tolist(); // (batch_size, num_queries)
65
+
66
+ // Convert to [x0, y0, x1, y1] format
67
+ const boxes = pred_boxes.tolist() // (batch_size, num_queries, 4)
68
+ .map(batch => batch.map(box => center_to_corners_format(box)));
69
+
70
+ const results = [];
71
+ for (let i = 0; i < batch_size; ++i) {
72
+ const target_size = target_sizes !== null ? target_sizes[i] : null;
73
+
74
+ // Convert from relative [0, 1] to absolute [0, height] coordinates
75
+ if (target_size !== null) {
76
+ boxes[i] = boxes[i].map(box => box.map((x, j) => x * target_size[(j + 1) % 2]));
77
+ }
78
+
79
+ const batch_scores = scores[i];
80
+ const final_scores = [];
81
+ const final_phrases = [];
82
+ const final_boxes = [];
83
+ for (let j = 0; j < num_queries; ++j) {
84
+ const score = batch_scores[j];
85
+ if (score <= box_threshold) {
86
+ continue;
87
+ }
88
+ const box = boxes[i][j];
89
+ const prob = probs[i][j];
90
+
91
+ final_scores.push(score);
92
+ final_boxes.push(box);
93
+
94
+ const phrases = get_phrases_from_posmap(prob.gt(text_threshold), input_ids[i]);
95
+ final_phrases.push(phrases);
96
+ }
97
+ results.push({ scores: final_scores, boxes: final_boxes, labels: this.batch_decode(final_phrases) });
98
+ }
99
+ return results;
100
+ }
101
+ }
@@ -10,6 +10,7 @@ export * from './donut/image_processing_donut.js'
10
10
  export * from './dpt/image_processing_dpt.js'
11
11
  export * from './efficientnet/image_processing_efficientnet.js'
12
12
  export * from './glpn/image_processing_glpn.js'
13
+ export * from './grounding_dino/image_processing_grounding_dino.js'
13
14
  export * from './idefics3/image_processing_idefics3.js'
14
15
  export * from './janus/image_processing_janus.js'
15
16
  export * from './jina_clip/image_processing_jina_clip.js'
@@ -1,9 +1,10 @@
1
1
  export * from './florence2/processing_florence2.js';
2
- export * from './mgp_str/processing_mgp_str.js';
3
- export * from './moonshine/processing_moonshine.js';
2
+ export * from './grounding_dino/processing_grounding_dino.js';
4
3
  export * from './idefics3/processing_idefics3.js';
5
4
  export * from './janus/processing_janus.js';
6
5
  export * from './jina_clip/processing_jina_clip.js';
6
+ export * from './mgp_str/processing_mgp_str.js';
7
+ export * from './moonshine/processing_moonshine.js';
7
8
  export * from './owlvit/processing_owlvit.js';
8
9
  export * from './phi3_v/processing_phi3_v.js';
9
10
  export * from './paligemma/processing_paligemma.js';
package/src/models.js CHANGED
@@ -532,14 +532,23 @@ async function encoderForward(self, model_inputs) {
532
532
  encoderFeeds.inputs_embeds = await self.encode_text({ input_ids: model_inputs.input_ids });
533
533
  }
534
534
  if (session.inputNames.includes('token_type_ids') && !encoderFeeds.token_type_ids) {
535
+ if (!encoderFeeds.input_ids) {
536
+ throw new Error('Both `input_ids` and `token_type_ids` are missing in the model inputs.');
537
+ }
535
538
  // Assign default `token_type_ids` (all zeroes) to the `encoderFeeds` if the model expects it,
536
539
  // but they weren't created by the tokenizer.
537
- encoderFeeds.token_type_ids = new Tensor(
538
- 'int64',
539
- new BigInt64Array(encoderFeeds.input_ids.data.length),
540
- encoderFeeds.input_ids.dims
541
- )
540
+ encoderFeeds.token_type_ids = zeros_like(encoderFeeds.input_ids);
541
+ }
542
+ if (session.inputNames.includes('pixel_mask') && !encoderFeeds.pixel_mask) {
543
+ if (!encoderFeeds.pixel_values) {
544
+ throw new Error('Both `pixel_values` and `pixel_mask` are missing in the model inputs.');
545
+ }
546
+ // Assign default `pixel_mask` (all ones) to the `encoderFeeds` if the model expects it,
547
+ // but they weren't created by the processor.
548
+ const dims = encoderFeeds.pixel_values.dims;
549
+ encoderFeeds.pixel_mask = ones([dims[0], dims[2], dims[3]]);
542
550
  }
551
+
543
552
  return await sessionRun(session, encoderFeeds);
544
553
  }
545
554
 
@@ -5428,6 +5437,8 @@ export class Dinov2WithRegistersForImageClassification extends Dinov2WithRegiste
5428
5437
  }
5429
5438
  }
5430
5439
  //////////////////////////////////////////////////
5440
+ export class GroundingDinoPreTrainedModel extends PreTrainedModel { }
5441
+ export class GroundingDinoForObjectDetection extends GroundingDinoPreTrainedModel { }
5431
5442
 
5432
5443
  //////////////////////////////////////////////////
5433
5444
  export class YolosPreTrainedModel extends PreTrainedModel { }
@@ -6126,6 +6137,9 @@ export class WavLMForAudioFrameClassification extends WavLMPreTrainedModel {
6126
6137
  }
6127
6138
  }
6128
6139
 
6140
+ export class StyleTextToSpeech2PreTrainedModel extends PreTrainedModel { }
6141
+ export class StyleTextToSpeech2Model extends StyleTextToSpeech2PreTrainedModel { }
6142
+
6129
6143
  //////////////////////////////////////////////////
6130
6144
  // SpeechT5 models
6131
6145
  /**
@@ -7089,6 +7103,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
7089
7103
 
7090
7104
  ['maskformer', ['MaskFormerModel', MaskFormerModel]],
7091
7105
  ['mgp-str', ['MgpstrForSceneTextRecognition', MgpstrForSceneTextRecognition]],
7106
+
7107
+ ['style_text_to_speech_2', ['StyleTextToSpeech2Model', StyleTextToSpeech2Model]],
7092
7108
  ]);
7093
7109
 
7094
7110
  const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
@@ -7333,6 +7349,7 @@ const MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = new Map([
7333
7349
  const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
7334
7350
  ['owlvit', ['OwlViTForObjectDetection', OwlViTForObjectDetection]],
7335
7351
  ['owlv2', ['Owlv2ForObjectDetection', Owlv2ForObjectDetection]],
7352
+ ['grounding-dino', ['GroundingDinoForObjectDetection', GroundingDinoForObjectDetection]],
7336
7353
  ]);
7337
7354
 
7338
7355
  const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
package/src/pipelines.js CHANGED
@@ -64,7 +64,8 @@ import {
64
64
  round,
65
65
  } from './utils/maths.js';
66
66
  import {
67
- read_audio
67
+ read_audio,
68
+ RawAudio
68
69
  } from './utils/audio.js';
69
70
  import {
70
71
  Tensor,
@@ -2552,13 +2553,35 @@ export class ZeroShotObjectDetectionPipeline extends (/** @type {new (options: T
2552
2553
  // Run model with both text and pixel inputs
2553
2554
  const output = await this.model({ ...text_inputs, pixel_values });
2554
2555
 
2555
- // @ts-ignore
2556
- const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
2557
- let result = processed.boxes.map((box, i) => ({
2558
- score: processed.scores[i],
2559
- label: candidate_labels[processed.classes[i]],
2560
- box: get_bounding_box(box, !percentage),
2561
- })).sort((a, b) => b.score - a.score);
2556
+ let result;
2557
+ if('post_process_grounded_object_detection' in this.processor) {
2558
+ // @ts-ignore
2559
+ const processed = this.processor.post_process_grounded_object_detection(
2560
+ output,
2561
+ text_inputs.input_ids,
2562
+ {
2563
+ // TODO: support separate threshold values
2564
+ box_threshold: threshold,
2565
+ text_threshold: threshold,
2566
+ target_sizes: imageSize,
2567
+ },
2568
+ )[0];
2569
+ result = processed.boxes.map((box, i) => ({
2570
+ score: processed.scores[i],
2571
+ label: processed.labels[i],
2572
+ box: get_bounding_box(box, !percentage),
2573
+ }))
2574
+ } else {
2575
+ // @ts-ignore
2576
+ const processed = this.processor.image_processor.post_process_object_detection(output, threshold, imageSize, true)[0];
2577
+ result = processed.boxes.map((box, i) => ({
2578
+ score: processed.scores[i],
2579
+ label: candidate_labels[processed.classes[i]],
2580
+ box: get_bounding_box(box, !percentage),
2581
+ }))
2582
+ }
2583
+ result.sort((a, b) => b.score - a.score);
2584
+
2562
2585
  if (top_k !== null) {
2563
2586
  result = result.slice(0, top_k);
2564
2587
  }
@@ -2678,7 +2701,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
2678
2701
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/speecht5_tts', { quantized: false });
2679
2702
  * const speaker_embeddings = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
2680
2703
  * const out = await synthesizer('Hello, my dog is cute', { speaker_embeddings });
2681
- * // {
2704
+ * // RawAudio {
2682
2705
  * // audio: Float32Array(26112) [-0.00005657337896991521, 0.00020583874720614403, ...],
2683
2706
  * // sampling_rate: 16000
2684
2707
  * // }
@@ -2698,7 +2721,7 @@ export class DocumentQuestionAnsweringPipeline extends (/** @type {new (options:
2698
2721
  * ```javascript
2699
2722
  * const synthesizer = await pipeline('text-to-speech', 'Xenova/mms-tts-fra');
2700
2723
  * const out = await synthesizer('Bonjour');
2701
- * // {
2724
+ * // RawAudio {
2702
2725
  * // audio: Float32Array(23808) [-0.00037693005288019776, 0.0003325853613205254, ...],
2703
2726
  * // sampling_rate: 16000
2704
2727
  * // }
@@ -2745,10 +2768,10 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
2745
2768
 
2746
2769
  // @ts-expect-error TS2339
2747
2770
  const sampling_rate = this.model.config.sampling_rate;
2748
- return {
2749
- audio: waveform.data,
2771
+ return new RawAudio(
2772
+ waveform.data,
2750
2773
  sampling_rate,
2751
- }
2774
+ )
2752
2775
  }
2753
2776
 
2754
2777
  async _call_text_to_spectrogram(text_inputs, { speaker_embeddings }) {
@@ -2788,10 +2811,10 @@ export class TextToAudioPipeline extends (/** @type {new (options: TextToAudioPi
2788
2811
  const { waveform } = await this.model.generate_speech(input_ids, speaker_embeddings, { vocoder: this.vocoder });
2789
2812
 
2790
2813
  const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
2791
- return {
2792
- audio: waveform.data,
2814
+ return new RawAudio(
2815
+ waveform.data,
2793
2816
  sampling_rate,
2794
- }
2817
+ )
2795
2818
  }
2796
2819
  }
2797
2820
 
@@ -12,8 +12,10 @@ import {
12
12
  } from './hub.js';
13
13
  import { FFT, max } from './maths.js';
14
14
  import {
15
- calculateReflectOffset,
15
+ calculateReflectOffset, saveBlob,
16
16
  } from './core.js';
17
+ import { apis } from '../env.js';
18
+ import fs from 'fs';
17
19
  import { Tensor, matmul } from './tensor.js';
18
20
 
19
21
 
@@ -702,3 +704,113 @@ export function window_function(window_length, name, {
702
704
 
703
705
  return window;
704
706
  }
707
+
708
+ /**
709
+ * Encode audio data to a WAV file.
710
+ * WAV file specs : https://en.wikipedia.org/wiki/WAV#WAV_File_header
711
+ *
712
+ * Adapted from https://www.npmjs.com/package/audiobuffer-to-wav
713
+ * @param {Float32Array} samples The audio samples.
714
+ * @param {number} rate The sample rate.
715
+ * @returns {ArrayBuffer} The WAV audio buffer.
716
+ */
717
+ function encodeWAV(samples, rate) {
718
+ let offset = 44;
719
+ const buffer = new ArrayBuffer(offset + samples.length * 4);
720
+ const view = new DataView(buffer);
721
+
722
+ /* RIFF identifier */
723
+ writeString(view, 0, "RIFF");
724
+ /* RIFF chunk length */
725
+ view.setUint32(4, 36 + samples.length * 4, true);
726
+ /* RIFF type */
727
+ writeString(view, 8, "WAVE");
728
+ /* format chunk identifier */
729
+ writeString(view, 12, "fmt ");
730
+ /* format chunk length */
731
+ view.setUint32(16, 16, true);
732
+ /* sample format (raw) */
733
+ view.setUint16(20, 3, true);
734
+ /* channel count */
735
+ view.setUint16(22, 1, true);
736
+ /* sample rate */
737
+ view.setUint32(24, rate, true);
738
+ /* byte rate (sample rate * block align) */
739
+ view.setUint32(28, rate * 4, true);
740
+ /* block align (channel count * bytes per sample) */
741
+ view.setUint16(32, 4, true);
742
+ /* bits per sample */
743
+ view.setUint16(34, 32, true);
744
+ /* data chunk identifier */
745
+ writeString(view, 36, "data");
746
+ /* data chunk length */
747
+ view.setUint32(40, samples.length * 4, true);
748
+
749
+ for (let i = 0; i < samples.length; ++i, offset += 4) {
750
+ view.setFloat32(offset, samples[i], true);
751
+ }
752
+
753
+ return buffer;
754
+ }
755
+
756
+ function writeString(view, offset, string) {
757
+ for (let i = 0; i < string.length; ++i) {
758
+ view.setUint8(offset + i, string.charCodeAt(i));
759
+ }
760
+ }
761
+
762
+
763
+ export class RawAudio {
764
+
765
+ /**
766
+ * Create a new `RawAudio` object.
767
+ * @param {Float32Array} audio Audio data
768
+ * @param {number} sampling_rate Sampling rate of the audio data
769
+ */
770
+ constructor(audio, sampling_rate) {
771
+ this.audio = audio
772
+ this.sampling_rate = sampling_rate
773
+ }
774
+
775
+ /**
776
+ * Convert the audio to a wav file buffer.
777
+ * @returns {ArrayBuffer} The WAV file.
778
+ */
779
+ toWav() {
780
+ return encodeWAV(this.audio, this.sampling_rate)
781
+ }
782
+
783
+ /**
784
+ * Convert the audio to a blob.
785
+ * @returns {Blob}
786
+ */
787
+ toBlob() {
788
+ const wav = this.toWav();
789
+ const blob = new Blob([wav], { type: 'audio/wav' });
790
+ return blob;
791
+ }
792
+
793
+ /**
794
+ * Save the audio to a wav file.
795
+ * @param {string} path
796
+ */
797
+ async save(path) {
798
+ let fn;
799
+
800
+ if (apis.IS_BROWSER_ENV) {
801
+ if (apis.IS_WEBWORKER_ENV) {
802
+ throw new Error('Unable to save a file from a Web Worker.')
803
+ }
804
+ fn = saveBlob;
805
+ } else if (apis.IS_FS_AVAILABLE) {
806
+ fn = async (/** @type {string} */ path, /** @type {Blob} */ blob) => {
807
+ let buffer = await blob.arrayBuffer();
808
+ fs.writeFileSync(path, Buffer.from(buffer));
809
+ }
810
+ } else {
811
+ throw new Error('Unable to save because filesystem is disabled in this environment.')
812
+ }
813
+
814
+ await fn(path, this.toBlob())
815
+ }
816
+ }
package/src/utils/core.js CHANGED
@@ -189,6 +189,32 @@ export function calculateReflectOffset(i, w) {
189
189
  return Math.abs((i + w) % (2 * w) - w);
190
190
  }
191
191
 
192
+ /**
193
+ * Save blob file on the web.
194
+ * @param {string} path The path to save the blob to
195
+ * @param {Blob} blob The blob to save
196
+ */
197
+ export function saveBlob(path, blob){
198
+ // Convert the canvas content to a data URL
199
+ const dataURL = URL.createObjectURL(blob);
200
+
201
+ // Create an anchor element with the data URL as the href attribute
202
+ const downloadLink = document.createElement('a');
203
+ downloadLink.href = dataURL;
204
+
205
+ // Set the download attribute to specify the desired filename for the downloaded image
206
+ downloadLink.download = path;
207
+
208
+ // Trigger the download
209
+ downloadLink.click();
210
+
211
+ // Clean up: remove the anchor element from the DOM
212
+ downloadLink.remove();
213
+
214
+ // Revoke the Object URL to free up memory
215
+ URL.revokeObjectURL(dataURL);
216
+ }
217
+
192
218
  /**
193
219
  *
194
220
  * @param {Object} o
@@ -8,9 +8,9 @@
8
8
  * @module utils/image
9
9
  */
10
10
 
11
- import { isNullishDimension } from './core.js';
11
+ import { isNullishDimension, saveBlob } from './core.js';
12
12
  import { getFile } from './hub.js';
13
- import { env, apis } from '../env.js';
13
+ import { apis } from '../env.js';
14
14
  import { Tensor } from './tensor.js';
15
15
 
16
16
  // Will be empty (or not used) if running in browser or web-worker
@@ -793,23 +793,9 @@ export class RawImage {
793
793
  // Convert image to Blob
794
794
  const blob = await this.toBlob(mime);
795
795
 
796
- // Convert the canvas content to a data URL
797
- const dataURL = URL.createObjectURL(blob);
796
+ saveBlob(path, blob)
798
797
 
799
- // Create an anchor element with the data URL as the href attribute
800
- const downloadLink = document.createElement('a');
801
- downloadLink.href = dataURL;
802
-
803
- // Set the download attribute to specify the desired filename for the downloaded image
804
- downloadLink.download = path;
805
-
806
- // Trigger the download
807
- downloadLink.click();
808
-
809
- // Clean up: remove the anchor element from the DOM
810
- downloadLink.remove();
811
-
812
- } else if (!env.useFS) {
798
+ } else if (!apis.IS_FS_AVAILABLE) {
813
799
  throw new Error('Unable to save the image because filesystem is disabled in this environment.')
814
800
 
815
801
  } else {
@@ -837,3 +823,4 @@ export class RawImage {
837
823
  * Helper function to load an image from a URL, path, etc.
838
824
  */
839
825
  export const load_image = RawImage.read.bind(RawImage);
826
+