@huggingface/transformers 3.0.0-alpha.15 → 3.0.0-alpha.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +108 -91
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +459 -511
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +6 -6
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +16 -16
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +6 -6
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +108 -91
- package/dist/transformers.mjs.map +1 -1
- package/package.json +2 -2
- package/src/configs.js +16 -4
- package/src/env.js +1 -1
- package/src/models.js +43 -55
- package/src/tokenizers.js +22 -19
- package/src/utils/core.js +12 -0
- package/src/utils/data-structures.js +13 -11
- package/src/utils/hub.js +1 -1
- package/types/configs.d.ts +25 -3
- package/types/configs.d.ts.map +1 -1
- package/types/models.d.ts +1 -2
- package/types/models.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/core.d.ts +7 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts +6 -6
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
package/dist/transformers.mjs
CHANGED
|
@@ -4301,16 +4301,23 @@ function getKeyValueShapes(config, {
|
|
|
4301
4301
|
class PretrainedConfig {
|
|
4302
4302
|
// NOTE: Typo in original
|
|
4303
4303
|
|
|
4304
|
+
/** @type {string|null} */
|
|
4305
|
+
model_type = null;
|
|
4306
|
+
|
|
4307
|
+
/** @type {boolean} */
|
|
4308
|
+
is_encoder_decoder = false;
|
|
4309
|
+
|
|
4310
|
+
/** @type {number} */
|
|
4304
4311
|
max_position_embeddings;
|
|
4305
4312
|
|
|
4313
|
+
/** @type {TransformersJSConfig} */
|
|
4314
|
+
'transformers.js_config';
|
|
4315
|
+
|
|
4306
4316
|
/**
|
|
4307
4317
|
* Create a new PreTrainedTokenizer instance.
|
|
4308
4318
|
* @param {Object} configJSON The JSON of the config.
|
|
4309
4319
|
*/
|
|
4310
4320
|
constructor(configJSON) {
|
|
4311
|
-
this.model_type = null;
|
|
4312
|
-
this.is_encoder_decoder = false;
|
|
4313
|
-
|
|
4314
4321
|
Object.assign(this, configJSON);
|
|
4315
4322
|
this.normalized_config = getNormalizedConfig(this);
|
|
4316
4323
|
}
|
|
@@ -4362,7 +4369,12 @@ class AutoConfig {
|
|
|
4362
4369
|
/**
|
|
4363
4370
|
* Transformers.js-specific configuration, possibly present in config.json under the key `transformers.js_config`.
|
|
4364
4371
|
* @typedef {Object} TransformersJSConfig
|
|
4365
|
-
* @property {import('./
|
|
4372
|
+
* @property {import('./utils/tensor.js').DataType} [kv_cache_dtype] The data type of the key-value cache.
|
|
4373
|
+
* @property {Record<string, number>} [free_dimension_overrides] Override the free dimensions of the model.
|
|
4374
|
+
* See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
|
|
4375
|
+
* for more information.
|
|
4376
|
+
* @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
|
|
4377
|
+
* @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
|
|
4366
4378
|
*/
|
|
4367
4379
|
|
|
4368
4380
|
|
|
@@ -4410,7 +4422,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
4410
4422
|
|
|
4411
4423
|
|
|
4412
4424
|
|
|
4413
|
-
const VERSION = '3.0.0-alpha.
|
|
4425
|
+
const VERSION = '3.0.0-alpha.17';
|
|
4414
4426
|
|
|
4415
4427
|
// Check if various APIs are available (depends on environment)
|
|
4416
4428
|
const IS_BROWSER_ENV = typeof self !== 'undefined';
|
|
@@ -6856,7 +6868,8 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
|
|
|
6856
6868
|
* @private
|
|
6857
6869
|
*/
|
|
6858
6870
|
async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
6859
|
-
|
|
6871
|
+
const custom_config = options.config?.['transformers.js_config'] ?? {};
|
|
6872
|
+
let device = options.device ?? custom_config.device;
|
|
6860
6873
|
if (device && typeof device !== 'string') {
|
|
6861
6874
|
if (device.hasOwnProperty(fileName)) {
|
|
6862
6875
|
device = device[fileName];
|
|
@@ -6874,7 +6887,7 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
6874
6887
|
|
|
6875
6888
|
// If options.dtype is specified, we use it to choose the suffix for the model file.
|
|
6876
6889
|
// Otherwise, we use the default dtype for the device.
|
|
6877
|
-
let dtype = options.dtype;
|
|
6890
|
+
let dtype = options.dtype ?? custom_config.dtype;
|
|
6878
6891
|
if (typeof dtype !== 'string') {
|
|
6879
6892
|
if (dtype && dtype.hasOwnProperty(fileName)) {
|
|
6880
6893
|
dtype = dtype[fileName];
|
|
@@ -6901,6 +6914,16 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
6901
6914
|
// Overwrite `executionProviders` if not specified
|
|
6902
6915
|
session_options.executionProviders ??= executionProviders;
|
|
6903
6916
|
|
|
6917
|
+
// Overwrite `freeDimensionOverrides` if specified in config and not set in session options
|
|
6918
|
+
const free_dimension_overrides = custom_config.free_dimension_overrides;
|
|
6919
|
+
if (free_dimension_overrides) {
|
|
6920
|
+
session_options.freeDimensionOverrides ??= free_dimension_overrides;
|
|
6921
|
+
} else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) {
|
|
6922
|
+
console.warn(
|
|
6923
|
+
'WebNN does not currently support dynamic shapes and requires `free_dimension_overrides` to be set in config.json as a field within "transformers.js_config". ' +
|
|
6924
|
+
'When `free_dimension_overrides` is not set, you may experience significant performance degradation.'
|
|
6925
|
+
);
|
|
6926
|
+
}
|
|
6904
6927
|
|
|
6905
6928
|
const bufferPromise = (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, modelFileName, true, options);
|
|
6906
6929
|
|
|
@@ -6949,6 +6972,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
6949
6972
|
/** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
|
|
6950
6973
|
const preferredOutputLocation = {};
|
|
6951
6974
|
for (const key in shapes) {
|
|
6975
|
+
// TODO: For now, we keep encoder outputs on the CPU
|
|
6976
|
+
// (otherwise, this causes a memory leak or throws an error "Error: previous buffer is not registered")
|
|
6977
|
+
if (key.includes('encoder')) continue;
|
|
6952
6978
|
preferredOutputLocation[key] = 'gpu-buffer';
|
|
6953
6979
|
}
|
|
6954
6980
|
session_options.preferredOutputLocation = preferredOutputLocation;
|
|
@@ -7104,37 +7130,6 @@ function toI64Tensor(items) {
|
|
|
7104
7130
|
}
|
|
7105
7131
|
}
|
|
7106
7132
|
|
|
7107
|
-
/**
|
|
7108
|
-
* Prepares an attention mask for a sequence of tokens based on configuration options.
|
|
7109
|
-
* @param {Object} self The calling object instance.
|
|
7110
|
-
* @param {Tensor} tokens The input tokens.
|
|
7111
|
-
* @returns {Tensor} The attention mask tensor.
|
|
7112
|
-
* @private
|
|
7113
|
-
*/
|
|
7114
|
-
function prepareAttentionMask(self, tokens) {
|
|
7115
|
-
|
|
7116
|
-
// Prepare attention mask
|
|
7117
|
-
let pad_token_id = self.config.pad_token_id ?? null;
|
|
7118
|
-
let eos_token_id = self.config.eos_token_id ?? null;
|
|
7119
|
-
if ((0,_utils_core_js__WEBPACK_IMPORTED_MODULE_4__.isIntegralNumber)(eos_token_id)) {
|
|
7120
|
-
eos_token_id = [eos_token_id];
|
|
7121
|
-
}
|
|
7122
|
-
|
|
7123
|
-
let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1;
|
|
7124
|
-
let is_pad_token_not_equal_to_eos_token_id = (eos_token_id === null) || !eos_token_id.includes(pad_token_id)
|
|
7125
|
-
|
|
7126
|
-
if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) {
|
|
7127
|
-
let data = BigInt64Array.from(
|
|
7128
|
-
// Note: != so that int matches bigint
|
|
7129
|
-
// @ts-ignore
|
|
7130
|
-
tokens.data.map(x => x != pad_token_id)
|
|
7131
|
-
)
|
|
7132
|
-
return new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor('int64', data, tokens.dims)
|
|
7133
|
-
} else {
|
|
7134
|
-
return (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.ones_like)(tokens);
|
|
7135
|
-
}
|
|
7136
|
-
}
|
|
7137
|
-
|
|
7138
7133
|
/**
|
|
7139
7134
|
* Creates a boolean tensor with a single value.
|
|
7140
7135
|
* @param {boolean} value The value of the tensor.
|
|
@@ -7405,8 +7400,8 @@ function image_text_to_text_prepare_inputs_for_generation(self, ...args) {
|
|
|
7405
7400
|
} else {
|
|
7406
7401
|
return decoder_prepare_inputs_for_generation(self, ...args);
|
|
7407
7402
|
}
|
|
7408
|
-
|
|
7409
7403
|
}
|
|
7404
|
+
|
|
7410
7405
|
//////////////////////////////////////////////////
|
|
7411
7406
|
|
|
7412
7407
|
//////////////////////////////////////////////////
|
|
@@ -8169,13 +8164,12 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8169
8164
|
// - GenerationMode.BEAM_SEARCH
|
|
8170
8165
|
// - GenerationMode.BEAM_SAMPLE
|
|
8171
8166
|
////////////////////////////////////////////////////
|
|
8172
|
-
let
|
|
8167
|
+
let outputs;
|
|
8173
8168
|
let attentions = {};
|
|
8174
8169
|
while (true) {
|
|
8175
8170
|
// prepare model inputs
|
|
8176
8171
|
model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config);
|
|
8177
|
-
|
|
8178
|
-
const outputs = await this.forward(model_inputs);
|
|
8172
|
+
outputs = await this.forward(model_inputs);
|
|
8179
8173
|
|
|
8180
8174
|
if (generation_config.output_attentions && generation_config.return_dict_in_generate) {
|
|
8181
8175
|
// Get attentions if they are present
|
|
@@ -8222,10 +8216,6 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8222
8216
|
|
|
8223
8217
|
const stop = prepared_stopping_criteria(all_input_ids);
|
|
8224
8218
|
if (stop.every(x => x)) {
|
|
8225
|
-
if (generation_config.return_dict_in_generate) {
|
|
8226
|
-
// Get past key values without disposing buffers
|
|
8227
|
-
past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, false);
|
|
8228
|
-
}
|
|
8229
8219
|
break;
|
|
8230
8220
|
}
|
|
8231
8221
|
|
|
@@ -8238,6 +8228,9 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8238
8228
|
streamer.end();
|
|
8239
8229
|
}
|
|
8240
8230
|
|
|
8231
|
+
// Retrieve and dispose all final past key values (including encoder attentions)
|
|
8232
|
+
const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true);
|
|
8233
|
+
|
|
8241
8234
|
// TODO: ensure all_input_ids is padded correctly...
|
|
8242
8235
|
const sequences = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]);
|
|
8243
8236
|
|
|
@@ -8251,6 +8244,12 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8251
8244
|
// logits,
|
|
8252
8245
|
}
|
|
8253
8246
|
} else {
|
|
8247
|
+
// Dispose all remaining tensors
|
|
8248
|
+
for (const tensor of Object.values(outputs)) {
|
|
8249
|
+
if (tensor.location === 'gpu-buffer') {
|
|
8250
|
+
tensor.dispose();
|
|
8251
|
+
}
|
|
8252
|
+
}
|
|
8254
8253
|
return sequences;
|
|
8255
8254
|
}
|
|
8256
8255
|
}
|
|
@@ -8260,31 +8259,32 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8260
8259
|
*
|
|
8261
8260
|
* @param {Object} decoderResults The decoder results object.
|
|
8262
8261
|
* @param {Object} pastKeyValues The previous past key values.
|
|
8263
|
-
* @param {boolean} [dispose=true] Whether to dispose of the old gpu buffer.
|
|
8264
8262
|
* @returns {Object} An object containing past key values.
|
|
8265
8263
|
*/
|
|
8266
|
-
getPastKeyValues(decoderResults, pastKeyValues,
|
|
8264
|
+
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
8267
8265
|
const pkvs = Object.create(null);
|
|
8268
8266
|
|
|
8269
8267
|
for (const name in decoderResults) {
|
|
8270
8268
|
if (name.startsWith('present')) {
|
|
8271
8269
|
const newName = name.replace('present', 'past_key_values');
|
|
8272
|
-
|
|
8273
|
-
if (
|
|
8274
|
-
// Optimization introduced by optimum to reuse past key values.
|
|
8275
|
-
// outputs with the previous past key values.
|
|
8270
|
+
const is_encoder_pkv = name.includes('encoder');
|
|
8271
|
+
if (is_encoder_pkv && pastKeyValues) {
|
|
8272
|
+
// Optimization introduced by optimum to reuse past key values.
|
|
8273
|
+
// So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values.
|
|
8276
8274
|
// https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704
|
|
8277
8275
|
pkvs[newName] = pastKeyValues[newName];
|
|
8278
|
-
} else {
|
|
8279
|
-
if (dispose && pastKeyValues) {
|
|
8280
|
-
// Free old gpu buffer
|
|
8281
|
-
const t = pastKeyValues[newName];
|
|
8282
|
-
if (t.location === 'gpu-buffer') {
|
|
8283
|
-
t.dispose();
|
|
8284
|
-
}
|
|
8285
|
-
}
|
|
8276
|
+
} else { // decoder or using first encoder PKVs
|
|
8286
8277
|
pkvs[newName] = decoderResults[name];
|
|
8287
8278
|
}
|
|
8279
|
+
|
|
8280
|
+
if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) {
|
|
8281
|
+
// - Always dispose decoder PKVs
|
|
8282
|
+
// - Only dispose encoder past key values when requested (after generation)
|
|
8283
|
+
const t = pastKeyValues[newName];
|
|
8284
|
+
if (t.location === 'gpu-buffer') {
|
|
8285
|
+
t.dispose();
|
|
8286
|
+
}
|
|
8287
|
+
}
|
|
8288
8288
|
}
|
|
8289
8289
|
}
|
|
8290
8290
|
return pkvs;
|
|
@@ -20600,7 +20600,7 @@ function clean_up_tokenization(text) {
|
|
|
20600
20600
|
* @returns {string} The text with accents removed.
|
|
20601
20601
|
*/
|
|
20602
20602
|
function remove_accents(text) {
|
|
20603
|
-
return text.replace(/
|
|
20603
|
+
return text.replace(/\p{M}/gu, '');
|
|
20604
20604
|
}
|
|
20605
20605
|
|
|
20606
20606
|
/**
|
|
@@ -20946,18 +20946,18 @@ class Unigram extends TokenizerModel {
|
|
|
20946
20946
|
this.unk_token = this.vocab[config.unk_id];
|
|
20947
20947
|
|
|
20948
20948
|
this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i]));
|
|
20949
|
-
this.
|
|
20949
|
+
this.bos_token = ' '; // beginning of a sentence token
|
|
20950
20950
|
|
|
20951
|
-
this.
|
|
20952
|
-
this.
|
|
20951
|
+
this.bos_token_id = this.tokens_to_ids.get(this.bos_token); // NOTE: may be undefined
|
|
20952
|
+
this.eos_token = moreConfig.eos_token;
|
|
20953
20953
|
|
|
20954
|
-
this.
|
|
20955
|
-
this.
|
|
20954
|
+
this.eos_token_id = this.tokens_to_ids.get(this.eos_token);
|
|
20955
|
+
this.unk_token = this.vocab[this.unk_token_id];
|
|
20956
20956
|
|
|
20957
20957
|
this.minScore = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.min)(this.scores)[0];
|
|
20958
20958
|
|
|
20959
|
-
this.
|
|
20960
|
-
this.scores[this.unk_token_id] = this.
|
|
20959
|
+
this.unk_score = this.minScore - 10.0;
|
|
20960
|
+
this.scores[this.unk_token_id] = this.unk_score;
|
|
20961
20961
|
|
|
20962
20962
|
this.trie = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.CharTrie();
|
|
20963
20963
|
this.trie.extend(this.vocab);
|
|
@@ -20972,26 +20972,27 @@ class Unigram extends TokenizerModel {
|
|
|
20972
20972
|
* @param {TokenLattice} lattice The token lattice to populate with nodes.
|
|
20973
20973
|
*/
|
|
20974
20974
|
populateNodes(lattice) {
|
|
20975
|
-
const
|
|
20976
|
-
const
|
|
20975
|
+
const chars = lattice.chars;
|
|
20976
|
+
const mblen = 1;
|
|
20977
20977
|
let beginPos = 0;
|
|
20978
|
-
while (beginPos <
|
|
20979
|
-
const mblen = 1;
|
|
20978
|
+
while (beginPos < chars.length) {
|
|
20980
20979
|
let hasSingleNode = false;
|
|
20981
|
-
const tokens = [];
|
|
20982
20980
|
|
|
20983
|
-
|
|
20981
|
+
const tokens = [];
|
|
20982
|
+
const sliced = chars.slice(beginPos).join('');
|
|
20983
|
+
const prefixedTokens = this.trie.commonPrefixSearch(sliced);
|
|
20984
|
+
for (const token of prefixedTokens) {
|
|
20984
20985
|
tokens.push(token);
|
|
20985
20986
|
const tokenId = this.tokens_to_ids.get(token);
|
|
20986
20987
|
const tokenScore = this.scores[tokenId];
|
|
20987
|
-
const n = token
|
|
20988
|
+
const n = (0,_utils_core_js__WEBPACK_IMPORTED_MODULE_1__.len)(token);
|
|
20988
20989
|
lattice.insert(beginPos, n, tokenScore, tokenId);
|
|
20989
20990
|
if (!hasSingleNode && n === mblen) {
|
|
20990
20991
|
hasSingleNode = true;
|
|
20991
20992
|
}
|
|
20992
20993
|
}
|
|
20993
20994
|
if (!hasSingleNode) {
|
|
20994
|
-
lattice.insert(beginPos, mblen, this.
|
|
20995
|
+
lattice.insert(beginPos, mblen, this.unk_score, this.unk_token_id);
|
|
20995
20996
|
}
|
|
20996
20997
|
beginPos += mblen;
|
|
20997
20998
|
}
|
|
@@ -21004,7 +21005,7 @@ class Unigram extends TokenizerModel {
|
|
|
21004
21005
|
* @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model.
|
|
21005
21006
|
*/
|
|
21006
21007
|
tokenize(normalized) {
|
|
21007
|
-
const lattice = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.TokenLattice(normalized, this.
|
|
21008
|
+
const lattice = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.TokenLattice(normalized, this.bos_token_id, this.eos_token_id);
|
|
21008
21009
|
this.populateNodes(lattice);
|
|
21009
21010
|
return lattice.tokens();
|
|
21010
21011
|
}
|
|
@@ -21619,7 +21620,8 @@ class BertNormalizer extends Normalizer {
|
|
|
21619
21620
|
* @returns {string} The text with accents removed.
|
|
21620
21621
|
*/
|
|
21621
21622
|
stripAccents(text) {
|
|
21622
|
-
|
|
21623
|
+
// "Mark, Nonspacing" (Mn)
|
|
21624
|
+
return text.normalize('NFD').replace(/\p{Mn}/gu, '');
|
|
21623
21625
|
}
|
|
21624
21626
|
|
|
21625
21627
|
|
|
@@ -22737,7 +22739,7 @@ class Precompiled extends Normalizer {
|
|
|
22737
22739
|
// TODO: detect when a different `this.charsmap` is used.
|
|
22738
22740
|
|
|
22739
22741
|
text = text.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, ''); // Remove control characters
|
|
22740
|
-
text = text.replace(/[\u0009\u000A\u000C\u000D\u1680\
|
|
22742
|
+
text = text.replace(/[\u0009\u000A\u000C\u000D\u00A0\u1680\u2000-\u200F\u2028\u2029\u202F\u205F\u2581\u3000\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
|
|
22741
22743
|
|
|
22742
22744
|
if (text.includes('\uFF5E')) {
|
|
22743
22745
|
// To match the sentencepiece implementation 100%, we must handle a very strange edge-case.
|
|
@@ -25590,6 +25592,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
25590
25592
|
/* harmony export */ escapeRegExp: () => (/* binding */ escapeRegExp),
|
|
25591
25593
|
/* harmony export */ isIntegralNumber: () => (/* binding */ isIntegralNumber),
|
|
25592
25594
|
/* harmony export */ isTypedArray: () => (/* binding */ isTypedArray),
|
|
25595
|
+
/* harmony export */ len: () => (/* binding */ len),
|
|
25593
25596
|
/* harmony export */ mergeArrays: () => (/* binding */ mergeArrays),
|
|
25594
25597
|
/* harmony export */ pick: () => (/* binding */ pick),
|
|
25595
25598
|
/* harmony export */ pop: () => (/* binding */ pop),
|
|
@@ -25746,6 +25749,18 @@ function pick(o, props) {
|
|
|
25746
25749
|
);
|
|
25747
25750
|
}
|
|
25748
25751
|
|
|
25752
|
+
/**
|
|
25753
|
+
* Calculate the length of a string, taking multi-byte characters into account.
|
|
25754
|
+
* This mimics the behavior of Python's `len` function.
|
|
25755
|
+
* @param {string} s The string to calculate the length of.
|
|
25756
|
+
* @returns {number} The length of the string.
|
|
25757
|
+
*/
|
|
25758
|
+
function len(s) {
|
|
25759
|
+
let length = 0;
|
|
25760
|
+
for (const c of s) ++length;
|
|
25761
|
+
return length;
|
|
25762
|
+
}
|
|
25763
|
+
|
|
25749
25764
|
|
|
25750
25765
|
/***/ }),
|
|
25751
25766
|
|
|
@@ -25992,7 +26007,7 @@ class CharTrie {
|
|
|
25992
26007
|
* @param {string[]} texts The strings to add to the trie.
|
|
25993
26008
|
*/
|
|
25994
26009
|
extend(texts) {
|
|
25995
|
-
for (
|
|
26010
|
+
for (const text of texts) {
|
|
25996
26011
|
this.push(text);
|
|
25997
26012
|
}
|
|
25998
26013
|
}
|
|
@@ -26003,7 +26018,7 @@ class CharTrie {
|
|
|
26003
26018
|
*/
|
|
26004
26019
|
push(text) {
|
|
26005
26020
|
let node = this.root;
|
|
26006
|
-
for (
|
|
26021
|
+
for (const ch of text) {
|
|
26007
26022
|
let child = node.children.get(ch);
|
|
26008
26023
|
if (child === undefined) {
|
|
26009
26024
|
child = CharTrieNode.default();
|
|
@@ -26021,12 +26036,14 @@ class CharTrie {
|
|
|
26021
26036
|
*/
|
|
26022
26037
|
*commonPrefixSearch(text) {
|
|
26023
26038
|
let node = this.root;
|
|
26039
|
+
if (node === undefined) return;
|
|
26040
|
+
|
|
26024
26041
|
let prefix = "";
|
|
26025
|
-
for (
|
|
26026
|
-
const ch = text[i];
|
|
26042
|
+
for (const ch of text) {
|
|
26027
26043
|
prefix += ch;
|
|
26028
26044
|
node = node.children.get(ch);
|
|
26029
|
-
if (node
|
|
26045
|
+
if (node === undefined) return;
|
|
26046
|
+
if (node.isLeaf) {
|
|
26030
26047
|
yield prefix;
|
|
26031
26048
|
}
|
|
26032
26049
|
}
|
|
@@ -26068,8 +26085,8 @@ class TokenLattice {
|
|
|
26068
26085
|
* @param {number} eosTokenId The end-of-sequence token ID.
|
|
26069
26086
|
*/
|
|
26070
26087
|
constructor(sentence, bosTokenId, eosTokenId) {
|
|
26071
|
-
this.
|
|
26072
|
-
this.len =
|
|
26088
|
+
this.chars = Array.from(sentence);
|
|
26089
|
+
this.len = this.chars.length;
|
|
26073
26090
|
this.bosTokenId = bosTokenId;
|
|
26074
26091
|
this.eosTokenId = eosTokenId;
|
|
26075
26092
|
this.nodes = [];
|
|
@@ -26103,7 +26120,7 @@ class TokenLattice {
|
|
|
26103
26120
|
/**
|
|
26104
26121
|
* Implements the Viterbi algorithm to compute the most likely sequence of tokens.
|
|
26105
26122
|
*
|
|
26106
|
-
* @returns {TokenLatticeNode[]} The
|
|
26123
|
+
* @returns {TokenLatticeNode[]} The most likely sequence of tokens.
|
|
26107
26124
|
*/
|
|
26108
26125
|
viterbi() {
|
|
26109
26126
|
const len = this.len;
|
|
@@ -26157,11 +26174,11 @@ class TokenLattice {
|
|
|
26157
26174
|
* @returns {string} The array of nodes representing the most likely sequence of tokens.
|
|
26158
26175
|
*/
|
|
26159
26176
|
piece(node) {
|
|
26160
|
-
return this.
|
|
26177
|
+
return this.chars.slice(node.pos, node.pos + node.length).join('');
|
|
26161
26178
|
}
|
|
26162
26179
|
|
|
26163
26180
|
/**
|
|
26164
|
-
* @returns {
|
|
26181
|
+
* @returns {string[]} The most likely sequence of tokens.
|
|
26165
26182
|
*/
|
|
26166
26183
|
tokens() {
|
|
26167
26184
|
const nodes = this.viterbi();
|
|
@@ -26169,7 +26186,7 @@ class TokenLattice {
|
|
|
26169
26186
|
}
|
|
26170
26187
|
|
|
26171
26188
|
/**
|
|
26172
|
-
* @returns {
|
|
26189
|
+
* @returns {number[]} The most likely sequence of token ids.
|
|
26173
26190
|
*/
|
|
26174
26191
|
tokenIds() {
|
|
26175
26192
|
const nodes = this.viterbi();
|
|
@@ -26406,7 +26423,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
26406
26423
|
/**
|
|
26407
26424
|
* @typedef {Object} PretrainedOptions Options for loading a pretrained model.
|
|
26408
26425
|
* @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
|
|
26409
|
-
* @property {
|
|
26426
|
+
* @property {import('../configs.js').PretrainedConfig} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
|
26410
26427
|
* - The model is a model provided by the library (loaded with the *model id* string of a pretrained model).
|
|
26411
26428
|
* - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory.
|
|
26412
26429
|
* @property {string} [cache_dir=null] Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.
|