@huggingface/transformers 3.0.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +376 -0
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +30741 -0
- package/dist/transformers.cjs.map +1 -0
- package/dist/transformers.js +33858 -0
- package/dist/transformers.js.map +1 -0
- package/dist/transformers.min.cjs +173 -0
- package/dist/transformers.min.cjs.map +1 -0
- package/dist/transformers.min.js +231 -0
- package/dist/transformers.min.js.map +1 -0
- package/package.json +92 -0
- package/src/backends/onnx.js +151 -0
- package/src/configs.js +360 -0
- package/src/env.js +152 -0
- package/src/generation/configuration_utils.js +381 -0
- package/src/generation/logits_process.js +716 -0
- package/src/generation/logits_sampler.js +204 -0
- package/src/generation/parameters.js +35 -0
- package/src/generation/stopping_criteria.js +156 -0
- package/src/generation/streamers.js +212 -0
- package/src/models/whisper/common_whisper.js +151 -0
- package/src/models/whisper/generation_whisper.js +89 -0
- package/src/models.js +7028 -0
- package/src/ops/registry.js +92 -0
- package/src/pipelines.js +3341 -0
- package/src/processors.js +2614 -0
- package/src/tokenizers.js +4395 -0
- package/src/transformers.js +28 -0
- package/src/utils/audio.js +704 -0
- package/src/utils/constants.js +2 -0
- package/src/utils/core.js +149 -0
- package/src/utils/data-structures.js +445 -0
- package/src/utils/devices.js +11 -0
- package/src/utils/dtypes.js +62 -0
- package/src/utils/generic.js +35 -0
- package/src/utils/hub.js +671 -0
- package/src/utils/image.js +745 -0
- package/src/utils/maths.js +1050 -0
- package/src/utils/tensor.js +1378 -0
- package/types/backends/onnx.d.ts +26 -0
- package/types/backends/onnx.d.ts.map +1 -0
- package/types/configs.d.ts +59 -0
- package/types/configs.d.ts.map +1 -0
- package/types/env.d.ts +106 -0
- package/types/env.d.ts.map +1 -0
- package/types/generation/configuration_utils.d.ts +320 -0
- package/types/generation/configuration_utils.d.ts.map +1 -0
- package/types/generation/logits_process.d.ts +354 -0
- package/types/generation/logits_process.d.ts.map +1 -0
- package/types/generation/logits_sampler.d.ts +51 -0
- package/types/generation/logits_sampler.d.ts.map +1 -0
- package/types/generation/parameters.d.ts +47 -0
- package/types/generation/parameters.d.ts.map +1 -0
- package/types/generation/stopping_criteria.d.ts +81 -0
- package/types/generation/stopping_criteria.d.ts.map +1 -0
- package/types/generation/streamers.d.ts +81 -0
- package/types/generation/streamers.d.ts.map +1 -0
- package/types/models/whisper/common_whisper.d.ts +8 -0
- package/types/models/whisper/common_whisper.d.ts.map +1 -0
- package/types/models/whisper/generation_whisper.d.ts +76 -0
- package/types/models/whisper/generation_whisper.d.ts.map +1 -0
- package/types/models.d.ts +3845 -0
- package/types/models.d.ts.map +1 -0
- package/types/ops/registry.d.ts +11 -0
- package/types/ops/registry.d.ts.map +1 -0
- package/types/pipelines.d.ts +2403 -0
- package/types/pipelines.d.ts.map +1 -0
- package/types/processors.d.ts +917 -0
- package/types/processors.d.ts.map +1 -0
- package/types/tokenizers.d.ts +999 -0
- package/types/tokenizers.d.ts.map +1 -0
- package/types/transformers.d.ts +13 -0
- package/types/transformers.d.ts.map +1 -0
- package/types/utils/audio.d.ts +130 -0
- package/types/utils/audio.d.ts.map +1 -0
- package/types/utils/constants.d.ts +2 -0
- package/types/utils/constants.d.ts.map +1 -0
- package/types/utils/core.d.ts +91 -0
- package/types/utils/core.d.ts.map +1 -0
- package/types/utils/data-structures.d.ts +236 -0
- package/types/utils/data-structures.d.ts.map +1 -0
- package/types/utils/devices.d.ts +8 -0
- package/types/utils/devices.d.ts.map +1 -0
- package/types/utils/dtypes.d.ts +22 -0
- package/types/utils/dtypes.d.ts.map +1 -0
- package/types/utils/generic.d.ts +11 -0
- package/types/utils/generic.d.ts.map +1 -0
- package/types/utils/hub.d.ts +191 -0
- package/types/utils/hub.d.ts.map +1 -0
- package/types/utils/image.d.ts +119 -0
- package/types/utils/image.d.ts.map +1 -0
- package/types/utils/maths.d.ts +280 -0
- package/types/utils/maths.d.ts.map +1 -0
- package/types/utils/tensor.d.ts +392 -0
- package/types/utils/tensor.d.ts.map +1 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
// Whisper language table as [language code, lowercase English name] pairs.
// Both Maps below are built from this list and iterate in this order.
const WHISPER_LANGUAGES = [
    ["en", "english"], ["zh", "chinese"], ["de", "german"], ["es", "spanish"],
    ["ru", "russian"], ["ko", "korean"], ["fr", "french"], ["ja", "japanese"],
    ["pt", "portuguese"], ["tr", "turkish"], ["pl", "polish"], ["ca", "catalan"],
    ["nl", "dutch"], ["ar", "arabic"], ["sv", "swedish"], ["it", "italian"],
    ["id", "indonesian"], ["hi", "hindi"], ["fi", "finnish"], ["vi", "vietnamese"],
    ["he", "hebrew"], ["uk", "ukrainian"], ["el", "greek"], ["ms", "malay"],
    ["cs", "czech"], ["ro", "romanian"], ["da", "danish"], ["hu", "hungarian"],
    ["ta", "tamil"], ["no", "norwegian"], ["th", "thai"], ["ur", "urdu"],
    ["hr", "croatian"], ["bg", "bulgarian"], ["lt", "lithuanian"], ["la", "latin"],
    ["mi", "maori"], ["ml", "malayalam"], ["cy", "welsh"], ["sk", "slovak"],
    ["te", "telugu"], ["fa", "persian"], ["lv", "latvian"], ["bn", "bengali"],
    ["sr", "serbian"], ["az", "azerbaijani"], ["sl", "slovenian"], ["kn", "kannada"],
    ["et", "estonian"], ["mk", "macedonian"], ["br", "breton"], ["eu", "basque"],
    ["is", "icelandic"], ["hy", "armenian"], ["ne", "nepali"], ["mn", "mongolian"],
    ["bs", "bosnian"], ["kk", "kazakh"], ["sq", "albanian"], ["sw", "swahili"],
    ["gl", "galician"], ["mr", "marathi"], ["pa", "punjabi"], ["si", "sinhala"],
    ["km", "khmer"], ["sn", "shona"], ["yo", "yoruba"], ["so", "somali"],
    ["af", "afrikaans"], ["oc", "occitan"], ["ka", "georgian"], ["be", "belarusian"],
    ["tg", "tajik"], ["sd", "sindhi"], ["gu", "gujarati"], ["am", "amharic"],
    ["yi", "yiddish"], ["lo", "lao"], ["uz", "uzbek"], ["fo", "faroese"],
    ["ht", "haitian creole"], ["ps", "pashto"], ["tk", "turkmen"], ["nn", "nynorsk"],
    ["mt", "maltese"], ["sa", "sanskrit"], ["lb", "luxembourgish"], ["my", "myanmar"],
    ["bo", "tibetan"], ["tl", "tagalog"], ["mg", "malagasy"], ["as", "assamese"],
    ["tt", "tatar"], ["haw", "hawaiian"], ["ln", "lingala"], ["ha", "hausa"],
    ["ba", "bashkir"], ["jw", "javanese"], ["su", "sundanese"],
];

// Additional [name, language code] pairs: alternative names for codes already in the table above.
const WHISPER_LANGUAGE_ALIASES = [
    ["burmese", "my"], ["valencian", "ca"], ["flemish", "nl"], ["haitian", "ht"],
    ["letzeburgesch", "lb"], ["pushto", "ps"], ["panjabi", "pa"], ["moldavian", "ro"],
    ["moldovan", "ro"], ["sinhalese", "si"], ["castilian", "es"],
];

// Language code -> language name (e.g., "en" -> "english").
// @ts-ignore
export const WHISPER_LANGUAGE_MAPPING = new Map(WHISPER_LANGUAGES);

// Language name -> language code (e.g., "english" -> "en"): the inverted table first, then the aliases.
// @ts-ignore
export const WHISPER_TO_LANGUAGE_CODE_MAPPING = new Map(WHISPER_LANGUAGES.map(([code, name]) => [name, code]).concat(WHISPER_LANGUAGE_ALIASES));
|
|
124
|
+
|
|
125
|
+
/**
 * Resolve a user-supplied Whisper language identifier to its language code.
 *
 * The input is lower-cased, then looked up as a language name
 * (e.g., "english" -> "en"). If it is not a known name, it is returned as-is
 * when it is already a known language code (e.g., "en").
 *
 * @param {string} language The language name or code
 * @returns {string} The language code
 * @throws {Error} If `language` is neither a supported language name nor a supported language code.
 */
export function whisper_language_to_code(language) {
    const normalized = language.toLowerCase();

    // Map to code from user-friendly name (e.g., "english" -> "en")
    const code_from_name = WHISPER_TO_LANGUAGE_CODE_MAPPING.get(normalized);
    if (code_from_name !== undefined) {
        return code_from_name;
    }

    // User provided the language code directly (e.g., "en")
    if (WHISPER_LANGUAGE_MAPPING.has(normalized)) {
        return normalized;
    }

    // User provided something that is not a language code or name.
    // A two-character input is treated as an attempted code, so the codes are listed;
    // anything else is treated as an attempted name, so the names are listed.
    const is_language_code = normalized.length === 2;
    const langs = is_language_code ? WHISPER_LANGUAGE_MAPPING.keys() : WHISPER_LANGUAGE_MAPPING.values();

    // `Map.prototype.keys()`/`values()` return iterators, which `JSON.stringify` serializes
    // as "{}"; convert to an array so the supported values actually appear in the message.
    throw new Error(`Language "${normalized}" is not supported. Must be one of: ${JSON.stringify(Array.from(langs))}`);
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { GenerationConfig } from "../../generation/configuration_utils.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Generation configuration for Whisper: adds Whisper-specific options on top of `GenerationConfig`.
 * Every option defaults to `null` except `max_initial_timestamp_index`, which defaults to `1`.
 */
export class WhisperGenerationConfig extends GenerationConfig {

    /**
     * If set, timestamps are returned together with the text.
     * Turning this on enables the `WhisperTimestampsLogitsProcessor`.
     * @type {boolean}
     */
    return_timestamps = null;

    /**
     * If set, token-level timestamps are returned together with the text.
     * Works independently of `return_timestamps`. For word-level timestamps,
     * group the tokens into words with the tokenizer.
     * @type {boolean}
     */
    return_token_timestamps = null;

    /**
     * Number of audio frames available in this chunk.
     * Only used when generating word-level timestamps.
     * @type {number}
     */
    num_frames = null;

    /**
     * Alignment heads used to predict word-level timestamps, given as a list of
     * [layer, head] pairs selecting the cross-attention heads that are highly
     * correlated to word-level timing.
     * @type {[number, number][]}
     */
    alignment_heads = null;

    /**
     * The generation task: either "translate" or "transcribe".
     * @type {string}
     */
    task = null;

    /**
     * The language token to generate with, in the form `<|en|>`, `en` or `english`.
     * All possible language tokens are listed in the `model.generation_config.lang_to_id` dictionary.
     * @type {string}
     */
    language = null;

    /**
     * Token id of the `"<|notimestamps|>"` token.
     * @type {number}
     */
    no_timestamps_token_id = null;

    /**
     * Rank-1 list of token IDs, created by passing text to [`~WhisperProcessor.get_prompt_ids`],
     * that is supplied as a prompt to each chunk. Use it to provide (or "prompt-engineer") context
     * for the transcription, such as custom vocabularies or proper nouns, so those words are more
     * likely to be predicted correctly. Cannot be combined with `decoder_start_token_id`, because
     * it overwrites that value.
     * @type {number[]}
     */
    prompt_ids = null;

    /**
     * Whether or not the model is multilingual.
     * @type {boolean}
     */
    is_multilingual = null;

    /**
     * (Optional) Mapping from language tokens to their corresponding IDs.
     * Only required for multilingual models.
     * @type {Record<string, number>|null}
     */
    lang_to_id = null;

    /**
     * (Optional) Mapping from task tokens to their corresponding IDs.
     * @type {Record<string, number>|null}
     */
    task_to_id = null;

    /**
     * Upper bound for the initial timestamp. Prevents the model from predicting
     * timestamps that are too far in the future.
     * @type {number}
     */
    max_initial_timestamp_index = 1;
}
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* @typedef {import('../../generation/parameters.js').GenerationFunctionParameters & {generation_config: WhisperGenerationConfig} & WhisperGenerationConfig} WhisperGenerationFunctionParameters
|
|
89
|
+
*/
|