@huggingface/transformers 3.1.2 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/dist/transformers.cjs +835 -144
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +850 -144
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -1
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -1
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -1
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +850 -144
- package/dist/transformers.mjs.map +1 -1
- package/package.json +1 -1
- package/src/base/image_processors_utils.js +3 -1
- package/src/configs.js +10 -2
- package/src/env.js +1 -1
- package/src/models/feature_extractors.js +1 -0
- package/src/models/idefics3/image_processing_idefics3.js +24 -13
- package/src/models/image_processors.js +1 -0
- package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
- package/src/models/moonshine/processing_moonshine.js +20 -0
- package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
- package/src/models/phi3_v/processing_phi3_v.js +53 -0
- package/src/models/processors.js +2 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
- package/src/models/pyannote/processing_pyannote.js +7 -54
- package/src/models.js +223 -30
- package/src/ops/registry.js +11 -0
- package/src/pipelines.js +31 -1
- package/src/utils/tensor.js +51 -1
- package/types/base/image_processors_utils.d.ts +2 -2
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/configs.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/feature_extractors.d.ts +1 -0
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
- package/types/models/moonshine/processing_moonshine.d.ts +17 -0
- package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts +4 -15
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
- package/types/models.d.ts +64 -1
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +5 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +16 -0
- package/types/utils/tensor.d.ts.map +1 -1
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/transformers",
-  "version": "3.1.2",
+  "version": "3.2.1",
   "description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
   "main": "./src/transformers.js",
   "types": "./types/transformers.d.ts",
package/src/base/image_processors_utils.js
CHANGED

@@ -699,7 +699,7 @@ export class ImageProcessor extends Callable {
      * Pad the image by a certain amount.
      * @param {Float32Array} pixelData The pixel data to pad.
      * @param {number[]} imgDims The dimensions of the image (height, width, channels).
-     * @param {{width:number; height:number}|number} padSize The dimensions of the padded image.
+     * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
      * @param {Object} options The options for padding.
      * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
      * @param {boolean} [options.center=false] Whether to center the image.
@@ -717,6 +717,8 @@ export class ImageProcessor extends Callable {
         if (typeof padSize === 'number') {
             paddedImageWidth = padSize;
             paddedImageHeight = padSize;
+        } else if (padSize === 'square') {
+            paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth);
         } else {
             paddedImageWidth = padSize.width;
             paddedImageHeight = padSize.height;
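The new 'square' value pads both sides to max(height, width). A minimal standalone sketch of the dimension logic added above (resolvePadSize is a hypothetical helper written for illustration, not part of the library):

    // Sketch of the padSize resolution introduced in this release.
    function resolvePadSize(padSize, imageHeight, imageWidth) {
        if (typeof padSize === 'number') {
            return { width: padSize, height: padSize };
        } else if (padSize === 'square') {
            const side = Math.max(imageHeight, imageWidth);
            return { width: side, height: side };
        }
        return { width: padSize.width, height: padSize.height };
    }

    resolvePadSize('square', 480, 640); // { width: 640, height: 640 }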
package/src/configs.js
CHANGED
@@ -95,8 +95,6 @@ function getNormalizedConfig(config) {
         case 'gpt_neox':
         case 'stablelm':
         case 'opt':
-        case 'phi':
-        case 'phi3':
         case 'falcon':
             mapping['num_heads'] = 'num_attention_heads';
             mapping['num_layers'] = 'num_hidden_layers';
@@ -112,6 +110,9 @@ function getNormalizedConfig(config) {
         case 'starcoder2':
         case 'qwen2':
         case 'qwen2_vl':
+        case 'phi':
+        case 'phi3':
+        case 'phi3_v':
             mapping['num_heads'] = 'num_key_value_heads';
             mapping['num_layers'] = 'num_hidden_layers';
             mapping['hidden_size'] = 'hidden_size';
@@ -144,6 +145,12 @@ function getNormalizedConfig(config) {
             mapping['num_layers'] = 'n_layers';
             mapping['hidden_size'] = 'd_model';
             break;
+        case 'exaone':
+            mapping['num_heads'] = 'num_key_value_heads';
+            mapping['num_layers'] = 'num_layers';
+            mapping['dim_kv'] = 'head_dim';
+            mapping['num_attention_heads'] = 'num_attention_heads';
+            break;

         // Encoder-decoder models
         case 't5':
@@ -185,6 +192,7 @@ function getNormalizedConfig(config) {
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'd_model';
             break;
         case 'musicgen_decoder':
+        case 'moonshine':
             mapping['num_encoder_layers'] = mapping['num_decoder_layers'] = 'num_hidden_layers';
             mapping['num_encoder_heads'] = mapping['num_decoder_heads'] = 'num_attention_heads';
             mapping['encoder_hidden_size'] = mapping['decoder_hidden_size'] = 'hidden_size';
package/src/env.js
CHANGED
@@ -26,7 +26,7 @@ import fs from 'fs';
 import path from 'path';
 import url from 'url';

-const VERSION = '3.1.2';
+const VERSION = '3.2.1';

 // Check if various APIs are available (depends on environment)
 const IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
package/src/models/feature_extractors.js
CHANGED

@@ -1,6 +1,7 @@

 export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
 export * from './clap/feature_extraction_clap.js';
+export * from './moonshine/feature_extraction_moonshine.js';
 export * from './pyannote/feature_extraction_pyannote.js';
 export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
 export * from './speecht5/feature_extraction_speecht5.js';
package/src/models/idefics3/image_processing_idefics3.js
CHANGED

@@ -3,7 +3,7 @@
 import {
     ImageProcessor,
 } from "../../base/image_processors_utils.js";
-import { cat, full, interpolate_4d, stack } from "../../utils/tensor.js";
+import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";

 export class Idefics3ImageProcessor extends ImageProcessor {
     constructor(config) {
@@ -186,18 +186,29 @@ export class Idefics3ImageProcessor extends ImageProcessor {
         const optimal_width = Math.ceil(width / num_splits_w);

         // Iterate through each row and column
-        for (let r = 0; r < num_splits_h; r
-            for (let c = 0; c < num_splits_w; c
-
-
-
-
-
-
-
-
-                //
-
+        for (let r = 0; r < num_splits_h; ++r) {
+            for (let c = 0; c < num_splits_w; ++c) {
+                let start_x, start_y, end_x, end_y;
+                if (r === num_splits_h - 1) { // At bottom
+                    start_y = height - optimal_height;
+                    end_y = height;
+                } else {
+                    start_y = r * optimal_height;
+                    end_y = (r + 1) * optimal_height;
+                }
+                if (c === num_splits_w - 1) { // At right
+                    start_x = width - optimal_width;
+                    end_x = width;
+                } else {
+                    start_x = c * optimal_width;
+                    end_x = (c + 1) * optimal_width;
+                }
+
+                const starts = [start_y, start_x];
+                const ends = [end_y, end_x];
+
+                const patch = await slice(pixel_values, starts, ends, [2, 3]);
+                frames.push(patch);
             }
         }

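The rewrite swaps the old cropping code for the new async slice tensor op, which extracts a sub-tensor given start/end indices along selected axes. A hedged sketch of the op in isolation (this assumes slice is re-exported from the package root alongside Tensor; if not, import it from src/utils/tensor.js):

    import { Tensor, slice } from '@huggingface/transformers';

    // Dummy 1x3x4x4 tensor; crop the top-left 2x2 window of the spatial axes (2 and 3).
    const t = new Tensor('float32', new Float32Array(1 * 3 * 4 * 4), [1, 3, 4, 4]);
    const patch = await slice(t, [0, 0], [2, 2], [2, 3]);
    console.log(patch.dims); // [1, 3, 2, 2]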
package/src/models/image_processors.js
CHANGED

@@ -24,6 +24,7 @@ export * from './mobilevit/image_processing_mobilevit.js'
 export * from './nougat/image_processing_nougat.js'
 export * from './owlv2/image_processing_owlv2.js'
 export * from './owlvit/image_processing_owlvit.js'
+export * from './phi3_v/image_processing_phi3_v.js'
 export * from './pvt/image_processing_pvt.js'
 export * from './qwen2_vl/image_processing_qwen2_vl.js'
 export * from './rt_detr/image_processing_rt_detr.js'
package/src/models/moonshine/feature_extraction_moonshine.js
ADDED

@@ -0,0 +1,26 @@
+import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
+import { Tensor } from '../../utils/tensor.js';
+
+
+export class MoonshineFeatureExtractor extends FeatureExtractor {
+    /**
+     * Asynchronously extracts input values from a given audio using the provided configuration.
+     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
+     * @returns {Promise<{ input_values: Tensor; }>} The extracted input values.
+     */
+    async _call(audio) {
+        validate_audio_inputs(audio, 'MoonshineFeatureExtractor');
+
+        if (audio instanceof Float64Array) {
+            audio = new Float32Array(audio);
+        }
+
+        const shape = [
+            1,            /* batch_size */
+            audio.length, /* num_samples */
+        ];
+        return {
+            input_values: new Tensor('float32', audio, shape),
+        };
+    }
+}
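Unlike spectrogram-based extractors, Moonshine's extractor forwards the raw waveform as a [1, num_samples] tensor. A hedged usage sketch (the model id is an assumption; any Moonshine checkpoint with a preprocessor config should behave the same):

    import { AutoFeatureExtractor } from '@huggingface/transformers';

    const extractor = await AutoFeatureExtractor.from_pretrained('onnx-community/moonshine-tiny-ONNX');
    const audio = new Float32Array(16000); // 1 second of silence at 16 kHz
    const { input_values } = await extractor(audio);
    console.log(input_values.dims); // [1, 16000]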
package/src/models/moonshine/processing_moonshine.js
ADDED

@@ -0,0 +1,20 @@
+import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js"
+import { AutoTokenizer } from "../../tokenizers.js"
+import { Processor } from "../../base/processing_utils.js"
+
+/**
+ * Represents a MoonshineProcessor that extracts features from an audio input.
+ */
+export class MoonshineProcessor extends Processor {
+    static tokenizer_class = AutoTokenizer
+    static feature_extractor_class = AutoFeatureExtractor
+
+    /**
+     * Calls the feature_extractor function with the given audio input.
+     * @param {any} audio The audio input to extract features from.
+     * @returns {Promise<any>} A Promise that resolves with the extracted features.
+     */
+    async _call(audio) {
+        return await this.feature_extractor(audio);
+    }
+}
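Together, these two files wire Moonshine into the automatic-speech-recognition pipeline. A hedged end-to-end sketch (the model id and audio URL are assumptions):

    import { pipeline } from '@huggingface/transformers';

    const transcriber = await pipeline(
        'automatic-speech-recognition',
        'onnx-community/moonshine-tiny-ONNX',
    );
    const output = await transcriber('https://example.com/sample.wav');
    console.log(output.text);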
package/src/models/phi3_v/image_processing_phi3_v.js
ADDED

@@ -0,0 +1,163 @@
+import {
+    ImageProcessor,
+} from "../../base/image_processors_utils.js";
+import { cat, interpolate_4d, slice, stack, Tensor } from "../../utils/tensor.js";
+
+const IMAGE_SIZE = 336;
+const SLICE_AXES = [2, 3]; // axes to slice on
+const { ceil, floor, sqrt } = Math;
+
+export class Phi3VImageProcessor extends ImageProcessor {
+    constructor(config) {
+        super({
+            ...config,
+            do_normalize: true,
+            do_pad: true,
+            pad_size: 'custom',
+            do_convert_rgb: true,
+            do_resize: true, // Smart resizing "hd_transform"
+        });
+
+        this._num_crops = config.num_crops;
+    }
+    calc_num_image_tokens_from_image_size(width, height) {
+        // @ts-expect-error
+        const { num_img_tokens } = this.config;
+        return floor(((floor((height / IMAGE_SIZE)) * floor((width / IMAGE_SIZE)) + 1) * num_img_tokens) + 1 + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens));
+    }
+
+    /** @type {ImageProcessor['get_resize_output_image_size']} */
+    get_resize_output_image_size(image, size) {
+        const hd_num = this._num_crops;
+        const [width, height] = image.size
+
+        let ratio = width / height;
+        let scale = 1;
+
+        // Calculate the scaling factor
+        while (scale * Math.ceil(scale / ratio) <= hd_num) {
+            scale += 1;
+        }
+        scale -= 1;
+
+        // Compute the new dimensions
+        const new_w = Math.floor(scale * 336);
+        const new_h = Math.floor(new_w / ratio);
+
+        return [new_w, new_h]
+    }
+
+
+    /** @type {ImageProcessor['pad_image']} */
+    pad_image(pixelData, imgDims, padSize, options = {}) {
+        // Phi3V uses a custom padding strategy:
+        // - Pad to a multiple of 336
+        // - Pad with white pixels
+        const [imageHeight, imageWidth] = imgDims;
+        const height = IMAGE_SIZE * ceil(imageHeight / IMAGE_SIZE);
+        const width = IMAGE_SIZE * ceil(imageWidth / IMAGE_SIZE);
+
+        // NOTE: Since padding is done after normalization, we need to fill with the normalized values
+        const constant_values = [1, 1, 1].map((x, i) => (x - this.image_mean[i]) / this.image_std[i]);
+        return super.pad_image(pixelData, imgDims, { width, height }, {
+            center: true,
+            constant_values,
+            ...options,
+        });
+    }
+
+    async _call(images, {
+        num_crops = null,
+    } = {}) {
+        // @ts-expect-error
+        this._num_crops = num_crops ??= this.config.num_crops;
+        if (num_crops < 4 || sqrt(num_crops) % 1 !== 0) {
+            throw new Error("num_crops must be a square number >= 4");
+        }
+
+        if (!Array.isArray(images)) {
+            images = [images];
+        }
+
+        const num_images = images.length;
+        const imageData = await Promise.all(images.map(x => this.preprocess(x)));
+
+        const original_sizes = imageData.map(x => x.original_size);
+        const reshaped_input_sizes = imageData.map(x => x.reshaped_input_size);
+
+        // Process each image in batch
+        const all_pixel_values = [];
+        for (const { pixel_values } of imageData) {
+            pixel_values.unsqueeze_(0); // Easier processing as 4D tensor
+
+            const [height, width] = pixel_values.dims.slice(-2);
+
+            // Global image (Tensor of shape [num_channels, height, width])
+            const batch_pixel_values = await interpolate_4d(pixel_values, {
+                size: [IMAGE_SIZE, IMAGE_SIZE],
+                mode: 'bicubic',
+            });
+
+            if (num_crops > 0) {
+                const patches = [];
+                const sqrt_patches = sqrt(num_crops);
+                const patch_width = floor(width / sqrt_patches);
+                const patch_height = floor(height / sqrt_patches);
+                for (let y = 0; y < sqrt_patches; ++y) {
+                    for (let x = 0; x < sqrt_patches; ++x) {
+                        let start_x, start_y, end_x, end_y;
+                        if (y === sqrt_patches - 1) { // At bottom
+                            start_y = height - patch_height;
+                            end_y = height;
+                        } else {
+                            start_y = y * patch_height;
+                            end_y = (y + 1) * patch_height;
+                        }
+                        if (x === sqrt_patches - 1) { // At right
+                            start_x = width - patch_width;
+                            end_x = width;
+                        } else {
+                            start_x = x * patch_width;
+                            end_x = (x + 1) * patch_width;
+                        }
+
+                        const starts = [start_y, start_x];
+                        const ends = [end_y, end_x];
+                        const patch = await slice(pixel_values, starts, ends, SLICE_AXES);
+                        patches.push(patch);
+                    }
+                }
+
+                const resized_tensors = await interpolate_4d(cat(patches, 0), {
+                    size: [IMAGE_SIZE, IMAGE_SIZE],
+                    mode: 'bicubic',
+                }); // [num_crops, 3, 336, 336]
+
+                // Concatenate the global image with the patches
+                all_pixel_values.push(cat([batch_pixel_values, resized_tensors], 0));
+            } else {
+                // Only use the global image
+                // NOTE: Not currently supported in modelling code
+                all_pixel_values.push(batch_pixel_values);
+            }
+        }
+
+        // [num_images, 1 + num_crops, num_channels=3, height, width]
+        const pixel_values = stack(all_pixel_values, 0);
+
+        // Calculate padded image sizes
+        const sizes = reshaped_input_sizes.map(x => x.map(y => IMAGE_SIZE * ceil(y / IMAGE_SIZE)));
+
+        const image_sizes = new Tensor(
+            'int64',
+            sizes.flat(),
+            [num_images, 2],
+        );
+
+        const num_img_tokens = sizes.map(
+            ([height, width]) => this.calc_num_image_tokens_from_image_size(width, height),
+        );
+
+        return { pixel_values, original_sizes, reshaped_input_sizes, image_sizes, num_img_tokens };
+    }
+}
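The token-count formula above can be checked by hand. A worked example, assuming config.num_img_tokens = 144 (an assumption, not taken from this diff) and a padded size of 672x1008 (width x height):

    const { floor, sqrt } = Math;
    const IMAGE_SIZE = 336, num_img_tokens = 144; // num_img_tokens is assumed
    const [width, height] = [672, 1008];
    const tokens = floor(
        (floor(height / IMAGE_SIZE) * floor(width / IMAGE_SIZE) + 1) * num_img_tokens
        + 1
        + (floor(height / IMAGE_SIZE) + 1) * sqrt(num_img_tokens)
    );
    console.log(tokens); // (3*2 + 1)*144 + 1 + (3 + 1)*12 = 1057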
package/src/models/phi3_v/processing_phi3_v.js
ADDED

@@ -0,0 +1,53 @@
+import { Processor } from "../../base/processing_utils.js";
+import { AutoImageProcessor } from "../auto/image_processing_auto.js";
+import { AutoTokenizer } from "../../tokenizers.js";
+import { RawImage } from "../../utils/image.js";
+
+const IMAGE_TOKEN = "<|image|>";
+const IMAGE_TOKEN_PATTERN = /<\|image_\d+\|>/g;
+
+export class Phi3VProcessor extends Processor {
+    static image_processor_class = AutoImageProcessor
+    static tokenizer_class = AutoTokenizer
+
+    /**
+     *
+     * @param {string|string[]} text
+     * @param {RawImage|RawImage[]} images
+     * @param {...any} args
+     * @returns {Promise<any>}
+     */
+    async _call(text, images = null, {
+        padding = true,
+        truncation = true,
+        num_crops = null,
+    } = {}) {
+
+        if (!Array.isArray(text)) {
+            text = [text];
+        }
+
+        let text_inputs, image_inputs;
+        if (images) {
+            image_inputs = await this.image_processor(images, { num_crops });
+            const { num_img_tokens } = image_inputs;
+
+            // The original implementation adds a bos_token before the image tokens
+            // TODO: Check if this affects performance, since it looks like a bug in the original implementation
+            const prompt_chunks = text.map((t, i) => t.split(IMAGE_TOKEN_PATTERN).join(IMAGE_TOKEN.repeat(num_img_tokens[i])));
+
+            text_inputs = this.tokenizer(prompt_chunks, { padding, truncation });
+
+            // The model expects image tokens to be negative, so we negate the image token ids
+            const image_token_id = this.tokenizer.model.convert_tokens_to_ids([IMAGE_TOKEN])[0];
+            text_inputs.input_ids.map_(id => (id == image_token_id) ? -id : id);
+        } else {
+            text_inputs = this.tokenizer(text);
+        }
+
+        return {
+            ...text_inputs,
+            ...image_inputs,
+        }
+    }
+}
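A hedged sketch of driving the new processor end to end (the model id and image URL are assumptions; the <|image_1|> placeholder follows the pattern matched by IMAGE_TOKEN_PATTERN above):

    import { AutoProcessor, RawImage } from '@huggingface/transformers';

    const processor = await AutoProcessor.from_pretrained('onnx-community/Phi-3.5-vision-instruct');
    const image = await RawImage.fromURL('https://example.com/cat.jpg');
    const inputs = await processor('<|image_1|>\nWhat is shown in this image?', image);
    // inputs now holds input_ids (with image tokens negated), pixel_values,
    // image_sizes and num_img_tokens, ready for the vision-language model.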
package/src/models/processors.js
CHANGED
@@ -1,9 +1,11 @@
 export * from './florence2/processing_florence2.js';
 export * from './mgp_str/processing_mgp_str.js';
+export * from './moonshine/processing_moonshine.js';
 export * from './idefics3/processing_idefics3.js';
 export * from './janus/processing_janus.js';
 export * from './jina_clip/processing_jina_clip.js';
 export * from './owlvit/processing_owlvit.js';
+export * from './phi3_v/processing_phi3_v.js';
 export * from './paligemma/processing_paligemma.js';
 export * from './pyannote/processing_pyannote.js';
 export * from './qwen2_vl/processing_qwen2_vl.js';
package/src/models/pyannote/feature_extraction_pyannote.js
CHANGED

@@ -1,5 +1,6 @@
 import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
 import { Tensor } from '../../utils/tensor.js';
+import { max, softmax } from '../../utils/maths.js';


 export class PyAnnoteFeatureExtractor extends FeatureExtractor {
@@ -25,4 +26,59 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
         };
     }

+    /**
+     * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
+     * @param {number} samples The number of frames in the audio.
+     * @returns {number} The number of frames in the audio.
+     */
+    samples_to_frames(samples) {
+        return ((samples - this.config.offset) / this.config.step);
+    }
+
+    /**
+     * Post-processes the speaker diarization logits output by the model.
+     * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
+     * @param {number} num_samples Number of samples in the input audio.
+     * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
+     */
+    post_process_speaker_diarization(logits, num_samples) {
+        const ratio = (
+            num_samples / this.samples_to_frames(num_samples)
+        ) / this.config.sampling_rate;
+
+        const results = [];
+        for (const scores of logits.tolist()) {
+            const accumulated_segments = [];
+
+            let current_speaker = -1;
+            for (let i = 0; i < scores.length; ++i) {
+                const probabilities = softmax(scores[i]);
+                const [score, id] = max(probabilities);
+                const [start, end] = [i, i + 1];
+
+                if (id !== current_speaker) {
+                    // Speaker has changed
+                    current_speaker = id;
+                    accumulated_segments.push({ id, start, end, score });
+                } else {
+                    // Continue the current segment
+                    accumulated_segments.at(-1).end = end;
+                    accumulated_segments.at(-1).score += score;
+                }
+            }
+
+            results.push(accumulated_segments.map(
+                // Convert frame-space to time-space
+                // and compute the confidence
+                ({ id, start, end, score }) => ({
+                    id,
+                    start: start * ratio,
+                    end: end * ratio,
+                    confidence: score / (end - start),
+                })
+            ));
+        }
+        return results;
+    }
+
 }
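A worked example of the frame/time conversion used above, with invented config values (offset, step and sampling_rate vary per checkpoint and are assumptions here):

    const config = { offset: 721, step: 270, sampling_rate: 16000 }; // illustrative values
    const num_samples = 160000; // 10 seconds of 16 kHz audio
    const frames = (num_samples - config.offset) / config.step;  // ~589.9 (fractional by design)
    const ratio = (num_samples / frames) / config.sampling_rate; // seconds of audio per frame
    // A segment covering frames [10, 25) maps to [10 * ratio, 25 * ratio) seconds,
    // and its confidence is the accumulated score divided by (25 - 10).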
package/src/models/pyannote/processing_pyannote.js
CHANGED

@@ -1,9 +1,8 @@
 import { Processor } from '../../base/processing_utils.js';
-import {
-import { max, softmax } from '../../utils/maths.js';
+import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';

 export class PyAnnoteProcessor extends Processor {
-    static feature_extractor_class =
+    static feature_extractor_class = PyAnnoteFeatureExtractor

     /**
      * Calls the feature_extractor function with the given audio input.
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
         return await this.feature_extractor(audio)
     }

-    /**
-     * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
-     * @param {number} samples The number of frames in the audio.
-     * @returns {number} The number of frames in the audio.
-     */
-    samples_to_frames(samples) {
-        return ((samples - this.config.offset) / this.config.step);
+    /** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
+    post_process_speaker_diarization(...args) {
+        return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
     }

-    /**
-     * Post-processes the speaker diarization logits output by the model.
-     * @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
-     * @param {number} num_samples Number of samples in the input audio.
-     * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
-     */
-    post_process_speaker_diarization(logits, num_samples) {
-        const ratio = (
-            num_samples / this.samples_to_frames(num_samples)
-        ) / this.config.sampling_rate;
-
-        const results = [];
-        for (const scores of logits.tolist()) {
-            const accumulated_segments = [];
-
-            let current_speaker = -1;
-            for (let i = 0; i < scores.length; ++i) {
-                const probabilities = softmax(scores[i]);
-                const [score, id] = max(probabilities);
-                const [start, end] = [i, i + 1];
-
-                if (id !== current_speaker) {
-                    // Speaker has changed
-                    current_speaker = id;
-                    accumulated_segments.push({ id, start, end, score });
-                } else {
-                    // Continue the current segment
-                    accumulated_segments.at(-1).end = end;
-                    accumulated_segments.at(-1).score += score;
-                }
-            }
-
-            results.push(accumulated_segments.map(
-                // Convert frame-space to time-space
-                // and compute the confidence
-                ({ id, start, end, score }) => ({
-                    id,
-                    start: start * ratio,
-                    end: end * ratio,
-                    confidence: score / (end - start),
-                })
-            ));
-        }
-        return results;
+    get sampling_rate() {
+        return this.feature_extractor.config.sampling_rate;
     }
 }