@huggingface/transformers 3.1.1 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -4
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +1062 -183
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +2239 -1232
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +1 -358
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +1 -421
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +1 -358
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +1082 -181
- package/dist/transformers.mjs.map +1 -1
- package/package.json +11 -16
- package/src/backends/onnx.js +2 -7
- package/src/base/image_processors_utils.js +3 -1
- package/src/configs.js +11 -2
- package/src/env.js +1 -1
- package/src/models/feature_extractors.js +1 -0
- package/src/models/idefics3/image_processing_idefics3.js +24 -13
- package/src/models/image_processors.js +1 -0
- package/src/models/moonshine/feature_extraction_moonshine.js +26 -0
- package/src/models/moonshine/processing_moonshine.js +20 -0
- package/src/models/paligemma/processing_paligemma.js +82 -0
- package/src/models/phi3_v/image_processing_phi3_v.js +163 -0
- package/src/models/phi3_v/processing_phi3_v.js +53 -0
- package/src/models/processors.js +3 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +56 -0
- package/src/models/pyannote/processing_pyannote.js +7 -54
- package/src/models.js +233 -35
- package/src/ops/registry.js +11 -0
- package/src/pipelines.js +30 -0
- package/src/tokenizers.js +12 -1
- package/src/utils/core.js +39 -9
- package/src/utils/hub.js +8 -12
- package/src/utils/image.js +40 -0
- package/src/utils/tensor.js +51 -1
- package/types/backends/onnx.d.ts +2 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/base/feature_extraction_utils.d.ts +1 -1
- package/types/base/feature_extraction_utils.d.ts.map +1 -1
- package/types/base/image_processors_utils.d.ts +4 -4
- package/types/base/image_processors_utils.d.ts.map +1 -1
- package/types/base/processing_utils.d.ts +4 -4
- package/types/base/processing_utils.d.ts.map +1 -1
- package/types/configs.d.ts +7 -7
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +1 -1
- package/types/env.d.ts.map +1 -1
- package/types/generation/configuration_utils.d.ts +2 -2
- package/types/generation/logits_process.d.ts +2 -2
- package/types/generation/logits_process.d.ts.map +1 -1
- package/types/generation/logits_sampler.d.ts.map +1 -1
- package/types/generation/parameters.d.ts +5 -5
- package/types/generation/stopping_criteria.d.ts +1 -1
- package/types/generation/stopping_criteria.d.ts.map +1 -1
- package/types/generation/streamers.d.ts +2 -2
- package/types/generation/streamers.d.ts.map +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -1
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -1
- package/types/models/auto/image_processing_auto.d.ts.map +1 -1
- package/types/models/auto/processing_auto.d.ts +1 -1
- package/types/models/auto/processing_auto.d.ts.map +1 -1
- package/types/models/clap/feature_extraction_clap.d.ts +1 -1
- package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +11 -11
- package/types/models/detr/image_processing_detr.d.ts.map +1 -1
- package/types/models/donut/image_processing_donut.d.ts +1 -1
- package/types/models/donut/image_processing_donut.d.ts.map +1 -1
- package/types/models/feature_extractors.d.ts +1 -0
- package/types/models/florence2/processing_florence2.d.ts.map +1 -1
- package/types/models/idefics3/image_processing_idefics3.d.ts.map +1 -1
- package/types/models/idefics3/processing_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/janus/image_processing_janus.d.ts +1 -1
- package/types/models/janus/image_processing_janus.d.ts.map +1 -1
- package/types/models/janus/processing_janus.d.ts.map +1 -1
- package/types/models/maskformer/image_processing_maskformer.d.ts +8 -8
- package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -1
- package/types/models/mgp_str/processing_mgp_str.d.ts +2 -2
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -1
- package/types/models/moonshine/feature_extraction_moonshine.d.ts +13 -0
- package/types/models/moonshine/feature_extraction_moonshine.d.ts.map +1 -0
- package/types/models/moonshine/processing_moonshine.d.ts +17 -0
- package/types/models/moonshine/processing_moonshine.d.ts.map +1 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -1
- package/types/models/paligemma/processing_paligemma.d.ts +12 -0
- package/types/models/paligemma/processing_paligemma.d.ts.map +1 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/image_processing_phi3_v.d.ts.map +1 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts +17 -0
- package/types/models/phi3_v/processing_phi3_v.d.ts.map +1 -0
- package/types/models/processors.d.ts +3 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +18 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -1
- package/types/models/pyannote/processing_pyannote.d.ts +4 -15
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts.map +1 -1
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +1 -1
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -1
- package/types/models/segformer/image_processing_segformer.d.ts.map +1 -1
- package/types/models/speecht5/processing_speecht5.d.ts.map +1 -1
- package/types/models/swin2sr/image_processing_swin2sr.d.ts +1 -1
- package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -1
- package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -1
- package/types/models/vitpose/image_processing_vitpose.d.ts +1 -1
- package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -1
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -1
- package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -1
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +1 -1
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts +1 -1
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/models/whisper/generation_whisper.d.ts.map +1 -1
- package/types/models/whisper/processing_whisper.d.ts.map +1 -1
- package/types/models/yolos/image_processing_yolos.d.ts.map +1 -1
- package/types/models.d.ts +61 -5
- package/types/models.d.ts.map +1 -1
- package/types/ops/registry.d.ts +1 -0
- package/types/ops/registry.d.ts.map +1 -1
- package/types/pipelines.d.ts +31 -51
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts +10 -6
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/constants.d.ts.map +1 -1
- package/types/utils/core.d.ts +87 -22
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/devices.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts.map +1 -1
- package/types/utils/generic.d.ts.map +1 -1
- package/types/utils/hub.d.ts +3 -3
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +10 -1
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/maths.d.ts +10 -10
- package/types/utils/maths.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +22 -6
- package/types/utils/tensor.d.ts.map +1 -1
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
import { Processor } from '../../base/processing_utils.js';
|
|
2
|
-
import {
|
|
3
|
-
import { max, softmax } from '../../utils/maths.js';
|
|
2
|
+
import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';
|
|
4
3
|
|
|
5
4
|
export class PyAnnoteProcessor extends Processor {
|
|
6
|
-
static feature_extractor_class =
|
|
5
|
+
static feature_extractor_class = PyAnnoteFeatureExtractor
|
|
7
6
|
|
|
8
7
|
/**
|
|
9
8
|
* Calls the feature_extractor function with the given audio input.
|
|
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
|
|
|
14
13
|
return await this.feature_extractor(audio)
|
|
15
14
|
}
|
|
16
15
|
|
|
17
|
-
/**
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
* @returns {number} The number of frames in the audio.
|
|
21
|
-
*/
|
|
22
|
-
samples_to_frames(samples) {
|
|
23
|
-
return ((samples - this.config.offset) / this.config.step);
|
|
16
|
+
/** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
|
|
17
|
+
post_process_speaker_diarization(...args) {
|
|
18
|
+
return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
|
|
24
19
|
}
|
|
25
20
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
* @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
|
|
29
|
-
* @param {number} num_samples Number of samples in the input audio.
|
|
30
|
-
* @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
|
|
31
|
-
*/
|
|
32
|
-
post_process_speaker_diarization(logits, num_samples) {
|
|
33
|
-
const ratio = (
|
|
34
|
-
num_samples / this.samples_to_frames(num_samples)
|
|
35
|
-
) / this.config.sampling_rate;
|
|
36
|
-
|
|
37
|
-
const results = [];
|
|
38
|
-
for (const scores of logits.tolist()) {
|
|
39
|
-
const accumulated_segments = [];
|
|
40
|
-
|
|
41
|
-
let current_speaker = -1;
|
|
42
|
-
for (let i = 0; i < scores.length; ++i) {
|
|
43
|
-
const probabilities = softmax(scores[i]);
|
|
44
|
-
const [score, id] = max(probabilities);
|
|
45
|
-
const [start, end] = [i, i + 1];
|
|
46
|
-
|
|
47
|
-
if (id !== current_speaker) {
|
|
48
|
-
// Speaker has changed
|
|
49
|
-
current_speaker = id;
|
|
50
|
-
accumulated_segments.push({ id, start, end, score });
|
|
51
|
-
} else {
|
|
52
|
-
// Continue the current segment
|
|
53
|
-
accumulated_segments.at(-1).end = end;
|
|
54
|
-
accumulated_segments.at(-1).score += score;
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
results.push(accumulated_segments.map(
|
|
59
|
-
// Convert frame-space to time-space
|
|
60
|
-
// and compute the confidence
|
|
61
|
-
({ id, start, end, score }) => ({
|
|
62
|
-
id,
|
|
63
|
-
start: start * ratio,
|
|
64
|
-
end: end * ratio,
|
|
65
|
-
confidence: score / (end - start),
|
|
66
|
-
})
|
|
67
|
-
));
|
|
68
|
-
}
|
|
69
|
-
return results;
|
|
21
|
+
get sampling_rate() {
|
|
22
|
+
return this.feature_extractor.config.sampling_rate;
|
|
70
23
|
}
|
|
71
24
|
}
|
package/src/models.js
CHANGED
|
@@ -131,6 +131,7 @@ const MODEL_TYPES = {
|
|
|
131
131
|
ImageTextToText: 6,
|
|
132
132
|
Musicgen: 7,
|
|
133
133
|
MultiModality: 8,
|
|
134
|
+
Phi3V: 9,
|
|
134
135
|
}
|
|
135
136
|
//////////////////////////////////////////////////
|
|
136
137
|
|
|
@@ -558,7 +559,9 @@ async function decoderForward(self, model_inputs, is_encoder_decoder = false) {
|
|
|
558
559
|
new_model_inputs.use_cache_branch = boolTensor(!!past_key_values);
|
|
559
560
|
}
|
|
560
561
|
if (session.inputNames.includes('position_ids') && new_model_inputs.attention_mask && !new_model_inputs.position_ids) {
|
|
561
|
-
|
|
562
|
+
// NOTE: Handle a special case for paligemma models, where positions are 1-indexed
|
|
563
|
+
const start_index = self.config.model_type === 'paligemma' ? 1 : 0;
|
|
564
|
+
new_model_inputs.position_ids = createPositionIds(new_model_inputs, past_key_values, start_index);
|
|
562
565
|
}
|
|
563
566
|
|
|
564
567
|
// Unpack the `past_key_values` object into model inputs
|
|
@@ -694,14 +697,14 @@ async function imageTextToTextForward(self, {
|
|
|
694
697
|
* @param {Tensor} attention_mask
|
|
695
698
|
* @returns {{data: BigInt64Array, dims: number[]}}
|
|
696
699
|
*/
|
|
697
|
-
function cumsum_masked_fill(attention_mask) {
|
|
700
|
+
function cumsum_masked_fill(attention_mask, start_index = 0) {
|
|
698
701
|
const [bz, seq_len] = attention_mask.dims;
|
|
699
702
|
const attn_mask_data = attention_mask.data;
|
|
700
703
|
|
|
701
704
|
const data = new BigInt64Array(attn_mask_data.length);
|
|
702
705
|
for (let i = 0; i < bz; ++i) {
|
|
703
706
|
const start = i * seq_len;
|
|
704
|
-
let sum = BigInt(
|
|
707
|
+
let sum = BigInt(start_index);
|
|
705
708
|
for (let j = 0; j < seq_len; ++j) {
|
|
706
709
|
const index = start + j;
|
|
707
710
|
if (attn_mask_data[index] === 0n) {
|
|
@@ -728,10 +731,10 @@ function cumsum_masked_fill(attention_mask) {
|
|
|
728
731
|
* position_ids = position_ids[:, -input_ids.shape[1] :]
|
|
729
732
|
* ```
|
|
730
733
|
*/
|
|
731
|
-
function createPositionIds(model_inputs, past_key_values = null) {
|
|
734
|
+
function createPositionIds(model_inputs, past_key_values = null, start_index = 0) {
|
|
732
735
|
const { input_ids, inputs_embeds, attention_mask } = model_inputs;
|
|
733
736
|
|
|
734
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
737
|
+
const { data, dims } = cumsum_masked_fill(attention_mask, start_index);
|
|
735
738
|
let position_ids = new Tensor('int64', data, dims);
|
|
736
739
|
if (past_key_values) {
|
|
737
740
|
const offset = -(input_ids ?? inputs_embeds).dims.at(1);
|
|
@@ -904,6 +907,10 @@ export class PreTrainedModel extends Callable {
|
|
|
904
907
|
this._forward = imageTextToTextForward;
|
|
905
908
|
this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
|
|
906
909
|
break;
|
|
910
|
+
case MODEL_TYPES.Phi3V:
|
|
911
|
+
this.can_generate = true;
|
|
912
|
+
this._prepare_inputs_for_generation = image_text_to_text_prepare_inputs_for_generation;
|
|
913
|
+
break;
|
|
907
914
|
|
|
908
915
|
case MODEL_TYPES.MultiModality:
|
|
909
916
|
this.can_generate = true;
|
|
@@ -1068,6 +1075,18 @@ export class PreTrainedModel extends Callable {
|
|
|
1068
1075
|
}, options),
|
|
1069
1076
|
]);
|
|
1070
1077
|
|
|
1078
|
+
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
1079
|
+
info = await Promise.all([
|
|
1080
|
+
constructSessions(pretrained_model_name_or_path, {
|
|
1081
|
+
prepare_inputs_embeds: 'prepare_inputs_embeds',
|
|
1082
|
+
model: 'model',
|
|
1083
|
+
vision_encoder: 'vision_encoder',
|
|
1084
|
+
}, options),
|
|
1085
|
+
getOptionalConfigs(pretrained_model_name_or_path, {
|
|
1086
|
+
generation_config: 'generation_config.json',
|
|
1087
|
+
}, options),
|
|
1088
|
+
]);
|
|
1089
|
+
|
|
1071
1090
|
} else { // should be MODEL_TYPES.EncoderOnly
|
|
1072
1091
|
if (modelType !== MODEL_TYPES.EncoderOnly) {
|
|
1073
1092
|
const type = modelName ?? config?.model_type;
|
|
@@ -3340,6 +3359,29 @@ export class WhisperForConditionalGeneration extends WhisperPreTrainedModel {
|
|
|
3340
3359
|
}
|
|
3341
3360
|
//////////////////////////////////////////////////
|
|
3342
3361
|
|
|
3362
|
+
|
|
3363
|
+
//////////////////////////////////////////////////
|
|
3364
|
+
// Moonshine models
|
|
3365
|
+
export class MoonshinePreTrainedModel extends PreTrainedModel {
|
|
3366
|
+
|
|
3367
|
+
requires_attention_mask = false;
|
|
3368
|
+
main_input_name = 'input_values';
|
|
3369
|
+
forward_params = [
|
|
3370
|
+
'input_values',
|
|
3371
|
+
'decoder_input_ids',
|
|
3372
|
+
'past_key_values',
|
|
3373
|
+
];
|
|
3374
|
+
};
|
|
3375
|
+
|
|
3376
|
+
/**
|
|
3377
|
+
* MoonshineModel class for training Moonshine models without a language model head.
|
|
3378
|
+
*/
|
|
3379
|
+
export class MoonshineModel extends MoonshinePreTrainedModel { }
|
|
3380
|
+
|
|
3381
|
+
export class MoonshineForConditionalGeneration extends MoonshinePreTrainedModel { }
|
|
3382
|
+
//////////////////////////////////////////////////
|
|
3383
|
+
|
|
3384
|
+
|
|
3343
3385
|
//////////////////////////////////////////////////
|
|
3344
3386
|
/**
|
|
3345
3387
|
* Vision Encoder-Decoder model based on OpenAI's GPT architecture for image captioning and other vision tasks
|
|
@@ -3548,6 +3590,30 @@ export class Florence2ForConditionalGeneration extends Florence2PreTrainedModel
|
|
|
3548
3590
|
}
|
|
3549
3591
|
}
|
|
3550
3592
|
|
|
3593
|
+
export class PaliGemmaPreTrainedModel extends PreTrainedModel {
|
|
3594
|
+
forward_params = [
|
|
3595
|
+
'input_ids',
|
|
3596
|
+
// 'inputs_embeds',
|
|
3597
|
+
'attention_mask',
|
|
3598
|
+
'pixel_values',
|
|
3599
|
+
'position_ids',
|
|
3600
|
+
'past_key_values',
|
|
3601
|
+
];
|
|
3602
|
+
}
|
|
3603
|
+
|
|
3604
|
+
export class PaliGemmaForConditionalGeneration extends PaliGemmaPreTrainedModel {
|
|
3605
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
3606
|
+
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
3607
|
+
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
3608
|
+
|
|
3609
|
+
return default_merge_input_ids_with_image_features({
|
|
3610
|
+
// @ts-ignore
|
|
3611
|
+
image_token_id: this.config.image_token_index,
|
|
3612
|
+
...kwargs,
|
|
3613
|
+
image_features: reshaped_image_hidden_states,
|
|
3614
|
+
})
|
|
3615
|
+
}
|
|
3616
|
+
}
|
|
3551
3617
|
|
|
3552
3618
|
//////////////////////////////////////////////////
|
|
3553
3619
|
// Idefics3 Models
|
|
@@ -3586,6 +3652,77 @@ export class Idefics3ForConditionalGeneration extends Idefics3PreTrainedModel {
|
|
|
3586
3652
|
}
|
|
3587
3653
|
//////////////////////////////////////////////////
|
|
3588
3654
|
|
|
3655
|
+
export class Phi3VPreTrainedModel extends PreTrainedModel {
|
|
3656
|
+
forward_params = [
|
|
3657
|
+
'input_ids',
|
|
3658
|
+
'inputs_embeds',
|
|
3659
|
+
'attention_mask',
|
|
3660
|
+
'position_ids',
|
|
3661
|
+
'pixel_values',
|
|
3662
|
+
'image_sizes',
|
|
3663
|
+
'past_key_values',
|
|
3664
|
+
];
|
|
3665
|
+
}
|
|
3666
|
+
export class Phi3VForCausalLM extends Phi3VPreTrainedModel {
|
|
3667
|
+
|
|
3668
|
+
async forward({
|
|
3669
|
+
// Produced by the tokenizer/processor:
|
|
3670
|
+
input_ids = null,
|
|
3671
|
+
attention_mask = null,
|
|
3672
|
+
pixel_values = null,
|
|
3673
|
+
image_sizes = null,
|
|
3674
|
+
|
|
3675
|
+
// Used during generation:
|
|
3676
|
+
position_ids = null,
|
|
3677
|
+
inputs_embeds = null,
|
|
3678
|
+
past_key_values = null,
|
|
3679
|
+
|
|
3680
|
+
// Generic generation parameters
|
|
3681
|
+
generation_config = null,
|
|
3682
|
+
logits_processor = null,
|
|
3683
|
+
|
|
3684
|
+
// TODO: needed?
|
|
3685
|
+
...kwargs
|
|
3686
|
+
}) {
|
|
3687
|
+
if (!inputs_embeds) {
|
|
3688
|
+
let image_features;
|
|
3689
|
+
if (pixel_values && input_ids.dims[1] !== 1) {
|
|
3690
|
+
if (!image_sizes) {
|
|
3691
|
+
throw new Error('`image_sizes` must be provided when `pixel_values` is provided.');
|
|
3692
|
+
}
|
|
3693
|
+
|
|
3694
|
+
// Encode the image
|
|
3695
|
+
({ image_features } = await sessionRun(this.sessions['vision_encoder'], {
|
|
3696
|
+
pixel_values,
|
|
3697
|
+
image_sizes,
|
|
3698
|
+
}));
|
|
3699
|
+
} else {
|
|
3700
|
+
const hidden_size = this.config.normalized_config.hidden_size;
|
|
3701
|
+
image_features = new Tensor(
|
|
3702
|
+
'float32',
|
|
3703
|
+
[],
|
|
3704
|
+
[0, hidden_size],
|
|
3705
|
+
);
|
|
3706
|
+
}
|
|
3707
|
+
|
|
3708
|
+
({ inputs_embeds } = await sessionRun(this.sessions['prepare_inputs_embeds'], {
|
|
3709
|
+
input_ids,
|
|
3710
|
+
image_features,
|
|
3711
|
+
}));
|
|
3712
|
+
}
|
|
3713
|
+
|
|
3714
|
+
const outputs = await decoderForward(this, {
|
|
3715
|
+
inputs_embeds,
|
|
3716
|
+
past_key_values,
|
|
3717
|
+
attention_mask,
|
|
3718
|
+
position_ids,
|
|
3719
|
+
generation_config,
|
|
3720
|
+
logits_processor,
|
|
3721
|
+
}, false);
|
|
3722
|
+
return outputs;
|
|
3723
|
+
}
|
|
3724
|
+
}
|
|
3725
|
+
|
|
3589
3726
|
//////////////////////////////////////////////////
|
|
3590
3727
|
export class CLIPPreTrainedModel extends PreTrainedModel { }
|
|
3591
3728
|
|
|
@@ -3640,9 +3777,11 @@ export class CLIPModel extends CLIPPreTrainedModel { }
|
|
|
3640
3777
|
export class CLIPTextModel extends CLIPPreTrainedModel {
|
|
3641
3778
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3642
3779
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3643
|
-
|
|
3644
|
-
|
|
3645
|
-
|
|
3780
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3781
|
+
// Update default model file name if not provided
|
|
3782
|
+
model_file_name: 'text_model',
|
|
3783
|
+
...options,
|
|
3784
|
+
});
|
|
3646
3785
|
}
|
|
3647
3786
|
}
|
|
3648
3787
|
|
|
@@ -3675,9 +3814,11 @@ export class CLIPTextModel extends CLIPPreTrainedModel {
|
|
|
3675
3814
|
export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
3676
3815
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3677
3816
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3678
|
-
|
|
3679
|
-
|
|
3680
|
-
|
|
3817
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3818
|
+
// Update default model file name if not provided
|
|
3819
|
+
model_file_name: 'text_model',
|
|
3820
|
+
...options,
|
|
3821
|
+
});
|
|
3681
3822
|
}
|
|
3682
3823
|
}
|
|
3683
3824
|
|
|
@@ -3687,9 +3828,11 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
|
3687
3828
|
export class CLIPVisionModel extends CLIPPreTrainedModel {
|
|
3688
3829
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3689
3830
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3690
|
-
|
|
3691
|
-
|
|
3692
|
-
|
|
3831
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3832
|
+
// Update default model file name if not provided
|
|
3833
|
+
model_file_name: 'vision_model',
|
|
3834
|
+
...options,
|
|
3835
|
+
});
|
|
3693
3836
|
}
|
|
3694
3837
|
}
|
|
3695
3838
|
|
|
@@ -3722,9 +3865,11 @@ export class CLIPVisionModel extends CLIPPreTrainedModel {
|
|
|
3722
3865
|
export class CLIPVisionModelWithProjection extends CLIPPreTrainedModel {
|
|
3723
3866
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3724
3867
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3725
|
-
|
|
3726
|
-
|
|
3727
|
-
|
|
3868
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3869
|
+
// Update default model file name if not provided
|
|
3870
|
+
model_file_name: 'vision_model',
|
|
3871
|
+
...options,
|
|
3872
|
+
});
|
|
3728
3873
|
}
|
|
3729
3874
|
}
|
|
3730
3875
|
//////////////////////////////////////////////////
|
|
@@ -3808,9 +3953,11 @@ export class SiglipModel extends SiglipPreTrainedModel { }
|
|
|
3808
3953
|
export class SiglipTextModel extends SiglipPreTrainedModel {
|
|
3809
3954
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3810
3955
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3811
|
-
|
|
3812
|
-
|
|
3813
|
-
|
|
3956
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3957
|
+
// Update default model file name if not provided
|
|
3958
|
+
model_file_name: 'text_model',
|
|
3959
|
+
...options,
|
|
3960
|
+
});
|
|
3814
3961
|
}
|
|
3815
3962
|
}
|
|
3816
3963
|
|
|
@@ -3843,9 +3990,11 @@ export class SiglipTextModel extends SiglipPreTrainedModel {
|
|
|
3843
3990
|
export class SiglipVisionModel extends CLIPPreTrainedModel {
|
|
3844
3991
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3845
3992
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3846
|
-
|
|
3847
|
-
|
|
3848
|
-
|
|
3993
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
3994
|
+
// Update default model file name if not provided
|
|
3995
|
+
model_file_name: 'vision_model',
|
|
3996
|
+
...options,
|
|
3997
|
+
});
|
|
3849
3998
|
}
|
|
3850
3999
|
}
|
|
3851
4000
|
//////////////////////////////////////////////////
|
|
@@ -3900,18 +4049,22 @@ export class JinaCLIPModel extends JinaCLIPPreTrainedModel {
|
|
|
3900
4049
|
export class JinaCLIPTextModel extends JinaCLIPPreTrainedModel {
|
|
3901
4050
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3902
4051
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3903
|
-
|
|
3904
|
-
|
|
3905
|
-
|
|
4052
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
4053
|
+
// Update default model file name if not provided
|
|
4054
|
+
model_file_name: 'text_model',
|
|
4055
|
+
...options,
|
|
4056
|
+
});
|
|
3906
4057
|
}
|
|
3907
4058
|
}
|
|
3908
4059
|
|
|
3909
4060
|
export class JinaCLIPVisionModel extends JinaCLIPPreTrainedModel {
|
|
3910
4061
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
3911
4062
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3912
|
-
|
|
3913
|
-
|
|
3914
|
-
|
|
4063
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
4064
|
+
// Update default model file name if not provided
|
|
4065
|
+
model_file_name: 'vision_model',
|
|
4066
|
+
...options,
|
|
4067
|
+
});
|
|
3915
4068
|
}
|
|
3916
4069
|
}
|
|
3917
4070
|
//////////////////////////////////////////////////
|
|
@@ -4071,6 +4224,14 @@ export class LlamaForCausalLM extends LlamaPreTrainedModel { }
|
|
|
4071
4224
|
//////////////////////////////////////////////////
|
|
4072
4225
|
|
|
4073
4226
|
|
|
4227
|
+
//////////////////////////////////////////////////
|
|
4228
|
+
// EXAONE models
|
|
4229
|
+
export class ExaonePreTrainedModel extends PreTrainedModel { }
|
|
4230
|
+
export class ExaoneModel extends ExaonePreTrainedModel { }
|
|
4231
|
+
export class ExaoneForCausalLM extends ExaonePreTrainedModel { }
|
|
4232
|
+
//////////////////////////////////////////////////
|
|
4233
|
+
|
|
4234
|
+
|
|
4074
4235
|
//////////////////////////////////////////////////
|
|
4075
4236
|
// MobileLLM models
|
|
4076
4237
|
export class MobileLLMPreTrainedModel extends PreTrainedModel { }
|
|
@@ -4086,6 +4247,13 @@ export class OlmoModel extends OlmoPreTrainedModel { }
|
|
|
4086
4247
|
export class OlmoForCausalLM extends OlmoPreTrainedModel { }
|
|
4087
4248
|
//////////////////////////////////////////////////
|
|
4088
4249
|
|
|
4250
|
+
//////////////////////////////////////////////////
|
|
4251
|
+
// OLMo2 models
|
|
4252
|
+
export class Olmo2PreTrainedModel extends PreTrainedModel { }
|
|
4253
|
+
export class Olmo2Model extends Olmo2PreTrainedModel { }
|
|
4254
|
+
export class Olmo2ForCausalLM extends Olmo2PreTrainedModel { }
|
|
4255
|
+
//////////////////////////////////////////////////
|
|
4256
|
+
|
|
4089
4257
|
|
|
4090
4258
|
//////////////////////////////////////////////////
|
|
4091
4259
|
// Granite models
|
|
@@ -4502,6 +4670,20 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
|
|
|
4502
4670
|
//////////////////////////////////////////////////
|
|
4503
4671
|
|
|
4504
4672
|
|
|
4673
|
+
//////////////////////////////////////////////////
|
|
4674
|
+
export class IJepaPreTrainedModel extends PreTrainedModel { }
|
|
4675
|
+
export class IJepaModel extends IJepaPreTrainedModel { }
|
|
4676
|
+
export class IJepaForImageClassification extends IJepaPreTrainedModel {
|
|
4677
|
+
/**
|
|
4678
|
+
* @param {any} model_inputs
|
|
4679
|
+
*/
|
|
4680
|
+
async _call(model_inputs) {
|
|
4681
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
4682
|
+
}
|
|
4683
|
+
}
|
|
4684
|
+
//////////////////////////////////////////////////
|
|
4685
|
+
|
|
4686
|
+
|
|
4505
4687
|
//////////////////////////////////////////////////
|
|
4506
4688
|
export class VitPosePreTrainedModel extends PreTrainedModel { }
|
|
4507
4689
|
|
|
@@ -6112,9 +6294,11 @@ export class ClapModel extends ClapPreTrainedModel { }
|
|
|
6112
6294
|
export class ClapTextModelWithProjection extends ClapPreTrainedModel {
|
|
6113
6295
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
6114
6296
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
6115
|
-
|
|
6116
|
-
|
|
6117
|
-
|
|
6297
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
6298
|
+
// Update default model file name if not provided
|
|
6299
|
+
model_file_name: 'text_model',
|
|
6300
|
+
...options,
|
|
6301
|
+
});
|
|
6118
6302
|
}
|
|
6119
6303
|
}
|
|
6120
6304
|
|
|
@@ -6147,9 +6331,11 @@ export class ClapTextModelWithProjection extends ClapPreTrainedModel {
|
|
|
6147
6331
|
export class ClapAudioModelWithProjection extends ClapPreTrainedModel {
|
|
6148
6332
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
6149
6333
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
6150
|
-
|
|
6151
|
-
|
|
6152
|
-
|
|
6334
|
+
return super.from_pretrained(pretrained_model_name_or_path, {
|
|
6335
|
+
// Update default model file name if not provided
|
|
6336
|
+
model_file_name: 'audio_model',
|
|
6337
|
+
...options,
|
|
6338
|
+
});
|
|
6153
6339
|
}
|
|
6154
6340
|
}
|
|
6155
6341
|
//////////////////////////////////////////////////
|
|
@@ -6772,6 +6958,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
|
|
|
6772
6958
|
['rt_detr', ['RTDetrModel', RTDetrModel]],
|
|
6773
6959
|
['table-transformer', ['TableTransformerModel', TableTransformerModel]],
|
|
6774
6960
|
['vit', ['ViTModel', ViTModel]],
|
|
6961
|
+
['ijepa', ['IJepaModel', IJepaModel]],
|
|
6775
6962
|
['pvt', ['PvtModel', PvtModel]],
|
|
6776
6963
|
['vit_msn', ['ViTMSNModel', ViTMSNModel]],
|
|
6777
6964
|
['vit_mae', ['ViTMAEModel', ViTMAEModel]],
|
|
@@ -6835,7 +7022,9 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
6835
7022
|
['gpt_neox', ['GPTNeoXModel', GPTNeoXModel]],
|
|
6836
7023
|
['codegen', ['CodeGenModel', CodeGenModel]],
|
|
6837
7024
|
['llama', ['LlamaModel', LlamaModel]],
|
|
7025
|
+
['exaone', ['ExaoneModel', ExaoneModel]],
|
|
6838
7026
|
['olmo', ['OlmoModel', OlmoModel]],
|
|
7027
|
+
['olmo2', ['Olmo2Model', Olmo2Model]],
|
|
6839
7028
|
['mobilellm', ['MobileLLMModel', MobileLLMModel]],
|
|
6840
7029
|
['granite', ['GraniteModel', GraniteModel]],
|
|
6841
7030
|
['cohere', ['CohereModel', CohereModel]],
|
|
@@ -6856,6 +7045,7 @@ const MODEL_MAPPING_NAMES_DECODER_ONLY = new Map([
|
|
|
6856
7045
|
const MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = new Map([
|
|
6857
7046
|
['speecht5', ['SpeechT5ForSpeechToText', SpeechT5ForSpeechToText]],
|
|
6858
7047
|
['whisper', ['WhisperForConditionalGeneration', WhisperForConditionalGeneration]],
|
|
7048
|
+
['moonshine', ['MoonshineForConditionalGeneration', MoonshineForConditionalGeneration]],
|
|
6859
7049
|
]);
|
|
6860
7050
|
|
|
6861
7051
|
const MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = new Map([
|
|
@@ -6926,7 +7116,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
|
|
|
6926
7116
|
['gpt_neox', ['GPTNeoXForCausalLM', GPTNeoXForCausalLM]],
|
|
6927
7117
|
['codegen', ['CodeGenForCausalLM', CodeGenForCausalLM]],
|
|
6928
7118
|
['llama', ['LlamaForCausalLM', LlamaForCausalLM]],
|
|
7119
|
+
['exaone', ['ExaoneForCausalLM', ExaoneForCausalLM]],
|
|
6929
7120
|
['olmo', ['OlmoForCausalLM', OlmoForCausalLM]],
|
|
7121
|
+
['olmo2', ['Olmo2ForCausalLM', Olmo2ForCausalLM]],
|
|
6930
7122
|
['mobilellm', ['MobileLLMForCausalLM', MobileLLMForCausalLM]],
|
|
6931
7123
|
['granite', ['GraniteForCausalLM', GraniteForCausalLM]],
|
|
6932
7124
|
['cohere', ['CohereForCausalLM', CohereForCausalLM]],
|
|
@@ -6944,6 +7136,9 @@ const MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = new Map([
|
|
|
6944
7136
|
['falcon', ['FalconForCausalLM', FalconForCausalLM]],
|
|
6945
7137
|
['trocr', ['TrOCRForCausalLM', TrOCRForCausalLM]],
|
|
6946
7138
|
['stablelm', ['StableLmForCausalLM', StableLmForCausalLM]],
|
|
7139
|
+
|
|
7140
|
+
// Also image-text-to-text
|
|
7141
|
+
['phi3_v', ['Phi3VForCausalLM', Phi3VForCausalLM]],
|
|
6947
7142
|
]);
|
|
6948
7143
|
|
|
6949
7144
|
const MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = new Map([
|
|
@@ -7000,6 +7195,7 @@ const MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = new Map([
|
|
|
7000
7195
|
['florence2', ['Florence2ForConditionalGeneration', Florence2ForConditionalGeneration]],
|
|
7001
7196
|
['qwen2-vl', ['Qwen2VLForConditionalGeneration', Qwen2VLForConditionalGeneration]],
|
|
7002
7197
|
['idefics3', ['Idefics3ForConditionalGeneration', Idefics3ForConditionalGeneration]],
|
|
7198
|
+
['paligemma', ['PaliGemmaForConditionalGeneration', PaliGemmaForConditionalGeneration]],
|
|
7003
7199
|
]);
|
|
7004
7200
|
|
|
7005
7201
|
const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
@@ -7008,6 +7204,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
|
7008
7204
|
|
|
7009
7205
|
const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
|
|
7010
7206
|
['vit', ['ViTForImageClassification', ViTForImageClassification]],
|
|
7207
|
+
['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]],
|
|
7011
7208
|
['pvt', ['PvtForImageClassification', PvtForImageClassification]],
|
|
7012
7209
|
['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
|
|
7013
7210
|
['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
|
|
@@ -7179,6 +7376,7 @@ const CUSTOM_MAPPING = [
|
|
|
7179
7376
|
// OVERRIDE:
|
|
7180
7377
|
// TODO: Refactor to allow class to specify model
|
|
7181
7378
|
['MusicgenForConditionalGeneration', MusicgenForConditionalGeneration, MODEL_TYPES.Musicgen],
|
|
7379
|
+
['Phi3VForCausalLM', Phi3VForCausalLM, MODEL_TYPES.Phi3V],
|
|
7182
7380
|
|
|
7183
7381
|
['CLIPTextModelWithProjection', CLIPTextModelWithProjection, MODEL_TYPES.EncoderOnly],
|
|
7184
7382
|
['SiglipTextModel', SiglipTextModel, MODEL_TYPES.EncoderOnly],
|
package/src/ops/registry.js
CHANGED
|
@@ -100,4 +100,15 @@ export class TensorOpRegistry {
|
|
|
100
100
|
}
|
|
101
101
|
return this._top_k;
|
|
102
102
|
}
|
|
103
|
+
|
|
104
|
+
static get slice() {
|
|
105
|
+
if (!this._slice) {
|
|
106
|
+
this._slice = wrap(
|
|
107
|
+
[8, 7, 18, 0, 58, 96, 10, 25, 10, 1, 120, 10, 1, 115, 10, 1, 101, 10, 1, 97, 10, 1, 116, 18, 1, 121, 34, 5, 83, 108, 105, 99, 101, 18, 1, 114, 90, 9, 10, 1, 120, 18, 4, 10, 2, 8, 1, 90, 9, 10, 1, 115, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 101, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 97, 18, 4, 10, 2, 8, 7, 90, 9, 10, 1, 116, 18, 4, 10, 2, 8, 7, 98, 9, 10, 1, 121, 18, 4, 10, 2, 8, 1, 66, 2, 16, 13],
|
|
108
|
+
this.session_options,
|
|
109
|
+
'y',
|
|
110
|
+
)
|
|
111
|
+
}
|
|
112
|
+
return this._slice;
|
|
113
|
+
}
|
|
103
114
|
}
|
package/src/pipelines.js
CHANGED
|
@@ -1729,6 +1729,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1729
1729
|
case 'unispeech-sat':
|
|
1730
1730
|
case 'hubert':
|
|
1731
1731
|
return this._call_wav2vec2(audio, kwargs)
|
|
1732
|
+
case 'moonshine':
|
|
1733
|
+
return this._call_moonshine(audio, kwargs)
|
|
1732
1734
|
default:
|
|
1733
1735
|
throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)
|
|
1734
1736
|
}
|
|
@@ -1882,6 +1884,34 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
|
|
|
1882
1884
|
}
|
|
1883
1885
|
return single ? toReturn[0] : toReturn;
|
|
1884
1886
|
}
|
|
1887
|
+
|
|
1888
|
+
/**
|
|
1889
|
+
* @type {AutomaticSpeechRecognitionPipelineCallback}
|
|
1890
|
+
* @private
|
|
1891
|
+
*/
|
|
1892
|
+
async _call_moonshine(audio, kwargs) {
|
|
1893
|
+
const single = !Array.isArray(audio);
|
|
1894
|
+
if (single) {
|
|
1895
|
+
audio = [/** @type {AudioInput} */ (audio)];
|
|
1896
|
+
}
|
|
1897
|
+
const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
|
|
1898
|
+
const preparedAudios = await prepareAudios(audio, sampling_rate);
|
|
1899
|
+
const toReturn = [];
|
|
1900
|
+
for (const aud of preparedAudios) {
|
|
1901
|
+
const inputs = await this.processor(aud);
|
|
1902
|
+
|
|
1903
|
+
// According to the [paper](https://arxiv.org/pdf/2410.15608):
|
|
1904
|
+
// "We use greedy decoding, with a heuristic limit of 6 output tokens
|
|
1905
|
+
// per second of audio to avoid repeated output sequences."
|
|
1906
|
+
const max_new_tokens = Math.floor(aud.length / sampling_rate) * 6;
|
|
1907
|
+
const outputs = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
|
|
1908
|
+
|
|
1909
|
+
const text = this.processor.batch_decode(outputs, { skip_special_tokens: true })[0];
|
|
1910
|
+
toReturn.push({ text });
|
|
1911
|
+
}
|
|
1912
|
+
return single ? toReturn[0] : toReturn;
|
|
1913
|
+
}
|
|
1914
|
+
|
|
1885
1915
|
}
|
|
1886
1916
|
|
|
1887
1917
|
/**
|
package/src/tokenizers.js
CHANGED
|
@@ -2605,6 +2605,12 @@ export class PreTrainedTokenizer extends Callable {
|
|
|
2605
2605
|
this.unk_token = this.getToken('unk_token');
|
|
2606
2606
|
this.unk_token_id = this.model.tokens_to_ids.get(this.unk_token);
|
|
2607
2607
|
|
|
2608
|
+
this.bos_token = this.getToken('bos_token');
|
|
2609
|
+
this.bos_token_id = this.model.tokens_to_ids.get(this.bos_token);
|
|
2610
|
+
|
|
2611
|
+
this.eos_token = this.getToken('eos_token');
|
|
2612
|
+
this.eos_token_id = this.model.tokens_to_ids.get(this.eos_token);
|
|
2613
|
+
|
|
2608
2614
|
this.model_max_length = tokenizerConfig.model_max_length;
|
|
2609
2615
|
|
|
2610
2616
|
/** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */
|
|
@@ -3577,6 +3583,11 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
|
|
|
3577
3583
|
let chunk = new_chunk();
|
|
3578
3584
|
let time_offset = 0.0;
|
|
3579
3585
|
const timestamp_begin = this.timestamp_begin;
|
|
3586
|
+
// Whisper timestamp tokens start from 0.00 and go to timestamp 30.00 in 0.02 increments.
|
|
3587
|
+
// We can calculate the last time stamp token as timestamp_begin plus the number of tokens
|
|
3588
|
+
// from 0.00 to 30.00, which is 1500.
|
|
3589
|
+
const total_timestamp_tokens = 1500; // (30.00 - 0.00) / 0.02
|
|
3590
|
+
const timestamp_end = timestamp_begin + total_timestamp_tokens;
|
|
3580
3591
|
|
|
3581
3592
|
let previous_tokens = [];
|
|
3582
3593
|
let previous_token_timestamps = [];
|
|
@@ -3664,7 +3675,7 @@ export class WhisperTokenizer extends PreTrainedTokenizer {
|
|
|
3664
3675
|
} else {
|
|
3665
3676
|
// 2/ This is a regular special token, ignoring it
|
|
3666
3677
|
}
|
|
3667
|
-
} else if (token >= timestamp_begin) {
|
|
3678
|
+
} else if (token >= timestamp_begin && token <= timestamp_end) {
|
|
3668
3679
|
// 3/ Timestamp token
|
|
3669
3680
|
const time = (token - timestamp_begin) * time_precision + time_offset;
|
|
3670
3681
|
const rounded_time = round(time, 2);
|