@huggingface/transformers 3.0.0-alpha.15 → 3.0.0-alpha.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/transformers.cjs +108 -91
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +108 -91
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +6 -6
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +6 -6
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +6 -6
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +108 -91
- package/dist/transformers.mjs.map +1 -1
- package/package.json +1 -1
- package/src/configs.js +16 -4
- package/src/env.js +1 -1
- package/src/models.js +43 -55
- package/src/tokenizers.js +22 -19
- package/src/utils/core.js +12 -0
- package/src/utils/data-structures.js +13 -11
- package/src/utils/hub.js +1 -1
- package/types/configs.d.ts +25 -3
- package/types/configs.d.ts.map +1 -1
- package/types/models.d.ts +1 -2
- package/types/models.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/core.d.ts +7 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts +6 -6
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
package/dist/transformers.js
CHANGED
|
@@ -6985,16 +6985,23 @@ function getKeyValueShapes(config, {
|
|
|
6985
6985
|
class PretrainedConfig {
|
|
6986
6986
|
// NOTE: Typo in original
|
|
6987
6987
|
|
|
6988
|
+
/** @type {string|null} */
|
|
6989
|
+
model_type = null;
|
|
6990
|
+
|
|
6991
|
+
/** @type {boolean} */
|
|
6992
|
+
is_encoder_decoder = false;
|
|
6993
|
+
|
|
6994
|
+
/** @type {number} */
|
|
6988
6995
|
max_position_embeddings;
|
|
6989
6996
|
|
|
6997
|
+
/** @type {TransformersJSConfig} */
|
|
6998
|
+
'transformers.js_config';
|
|
6999
|
+
|
|
6990
7000
|
/**
|
|
6991
7001
|
* Create a new PreTrainedTokenizer instance.
|
|
6992
7002
|
* @param {Object} configJSON The JSON of the config.
|
|
6993
7003
|
*/
|
|
6994
7004
|
constructor(configJSON) {
|
|
6995
|
-
this.model_type = null;
|
|
6996
|
-
this.is_encoder_decoder = false;
|
|
6997
|
-
|
|
6998
7005
|
Object.assign(this, configJSON);
|
|
6999
7006
|
this.normalized_config = getNormalizedConfig(this);
|
|
7000
7007
|
}
|
|
@@ -7046,7 +7053,12 @@ class AutoConfig {
|
|
|
7046
7053
|
/**
|
|
7047
7054
|
* Transformers.js-specific configuration, possibly present in config.json under the key `transformers.js_config`.
|
|
7048
7055
|
* @typedef {Object} TransformersJSConfig
|
|
7049
|
-
* @property {import('./
|
|
7056
|
+
* @property {import('./utils/tensor.js').DataType} [kv_cache_dtype] The data type of the key-value cache.
|
|
7057
|
+
* @property {Record<string, number>} [free_dimension_overrides] Override the free dimensions of the model.
|
|
7058
|
+
* See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
|
|
7059
|
+
* for more information.
|
|
7060
|
+
* @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
|
|
7061
|
+
* @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
|
|
7050
7062
|
*/
|
|
7051
7063
|
|
|
7052
7064
|
|
|
@@ -7094,7 +7106,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
7094
7106
|
|
|
7095
7107
|
|
|
7096
7108
|
|
|
7097
|
-
const VERSION = '3.0.0-alpha.15';
|
|
7109
|
+
const VERSION = '3.0.0-alpha.16';
|
|
7098
7110
|
|
|
7099
7111
|
// Check if various APIs are available (depends on environment)
|
|
7100
7112
|
const IS_BROWSER_ENV = typeof self !== 'undefined';
|
|
@@ -9540,7 +9552,8 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
|
|
|
9540
9552
|
* @private
|
|
9541
9553
|
*/
|
|
9542
9554
|
async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
9543
|
-
|
|
9555
|
+
const custom_config = options.config?.['transformers.js_config'] ?? {};
|
|
9556
|
+
let device = options.device ?? custom_config.device;
|
|
9544
9557
|
if (device && typeof device !== 'string') {
|
|
9545
9558
|
if (device.hasOwnProperty(fileName)) {
|
|
9546
9559
|
device = device[fileName];
|
|
@@ -9558,7 +9571,7 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
9558
9571
|
|
|
9559
9572
|
// If options.dtype is specified, we use it to choose the suffix for the model file.
|
|
9560
9573
|
// Otherwise, we use the default dtype for the device.
|
|
9561
|
-
let dtype = options.dtype;
|
|
9574
|
+
let dtype = options.dtype ?? custom_config.dtype;
|
|
9562
9575
|
if (typeof dtype !== 'string') {
|
|
9563
9576
|
if (dtype && dtype.hasOwnProperty(fileName)) {
|
|
9564
9577
|
dtype = dtype[fileName];
|
|
@@ -9585,6 +9598,16 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
9585
9598
|
// Overwrite `executionProviders` if not specified
|
|
9586
9599
|
session_options.executionProviders ??= executionProviders;
|
|
9587
9600
|
|
|
9601
|
+
// Overwrite `freeDimensionOverrides` if specified in config and not set in session options
|
|
9602
|
+
const free_dimension_overrides = custom_config.free_dimension_overrides;
|
|
9603
|
+
if (free_dimension_overrides) {
|
|
9604
|
+
session_options.freeDimensionOverrides ??= free_dimension_overrides;
|
|
9605
|
+
} else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) {
|
|
9606
|
+
console.warn(
|
|
9607
|
+
'WebNN does not currently support dynamic shapes and requires `free_dimension_overrides` to be set in config.json as a field within "transformers.js_config". ' +
|
|
9608
|
+
'When `free_dimension_overrides` is not set, you may experience significant performance degradation.'
|
|
9609
|
+
);
|
|
9610
|
+
}
|
|
9588
9611
|
|
|
9589
9612
|
const bufferPromise = (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, modelFileName, true, options);
|
|
9590
9613
|
|
|
@@ -9633,6 +9656,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
9633
9656
|
/** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
|
|
9634
9657
|
const preferredOutputLocation = {};
|
|
9635
9658
|
for (const key in shapes) {
|
|
9659
|
+
// TODO: For now, we keep encoder outputs on the CPU
|
|
9660
|
+
// (otherwise, this causes a memory leak or throws an error "Error: previous buffer is not registered")
|
|
9661
|
+
if (key.includes('encoder')) continue;
|
|
9636
9662
|
preferredOutputLocation[key] = 'gpu-buffer';
|
|
9637
9663
|
}
|
|
9638
9664
|
session_options.preferredOutputLocation = preferredOutputLocation;
|
|
@@ -9788,37 +9814,6 @@ function toI64Tensor(items) {
|
|
|
9788
9814
|
}
|
|
9789
9815
|
}
|
|
9790
9816
|
|
|
9791
|
-
/**
|
|
9792
|
-
* Prepares an attention mask for a sequence of tokens based on configuration options.
|
|
9793
|
-
* @param {Object} self The calling object instance.
|
|
9794
|
-
* @param {Tensor} tokens The input tokens.
|
|
9795
|
-
* @returns {Tensor} The attention mask tensor.
|
|
9796
|
-
* @private
|
|
9797
|
-
*/
|
|
9798
|
-
function prepareAttentionMask(self, tokens) {
|
|
9799
|
-
|
|
9800
|
-
// Prepare attention mask
|
|
9801
|
-
let pad_token_id = self.config.pad_token_id ?? null;
|
|
9802
|
-
let eos_token_id = self.config.eos_token_id ?? null;
|
|
9803
|
-
if ((0,_utils_core_js__WEBPACK_IMPORTED_MODULE_4__.isIntegralNumber)(eos_token_id)) {
|
|
9804
|
-
eos_token_id = [eos_token_id];
|
|
9805
|
-
}
|
|
9806
|
-
|
|
9807
|
-
let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1;
|
|
9808
|
-
let is_pad_token_not_equal_to_eos_token_id = (eos_token_id === null) || !eos_token_id.includes(pad_token_id)
|
|
9809
|
-
|
|
9810
|
-
if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) {
|
|
9811
|
-
let data = BigInt64Array.from(
|
|
9812
|
-
// Note: != so that int matches bigint
|
|
9813
|
-
// @ts-ignore
|
|
9814
|
-
tokens.data.map(x => x != pad_token_id)
|
|
9815
|
-
)
|
|
9816
|
-
return new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor('int64', data, tokens.dims)
|
|
9817
|
-
} else {
|
|
9818
|
-
return (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.ones_like)(tokens);
|
|
9819
|
-
}
|
|
9820
|
-
}
|
|
9821
|
-
|
|
9822
9817
|
/**
|
|
9823
9818
|
* Creates a boolean tensor with a single value.
|
|
9824
9819
|
* @param {boolean} value The value of the tensor.
|
|
@@ -10089,8 +10084,8 @@ function image_text_to_text_prepare_inputs_for_generation(self, ...args) {
|
|
|
10089
10084
|
} else {
|
|
10090
10085
|
return decoder_prepare_inputs_for_generation(self, ...args);
|
|
10091
10086
|
}
|
|
10092
|
-
|
|
10093
10087
|
}
|
|
10088
|
+
|
|
10094
10089
|
//////////////////////////////////////////////////
|
|
10095
10090
|
|
|
10096
10091
|
//////////////////////////////////////////////////
|
|
@@ -10853,13 +10848,12 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
10853
10848
|
// - GenerationMode.BEAM_SEARCH
|
|
10854
10849
|
// - GenerationMode.BEAM_SAMPLE
|
|
10855
10850
|
////////////////////////////////////////////////////
|
|
10856
|
-
let
|
|
10851
|
+
let outputs;
|
|
10857
10852
|
let attentions = {};
|
|
10858
10853
|
while (true) {
|
|
10859
10854
|
// prepare model inputs
|
|
10860
10855
|
model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config);
|
|
10861
|
-
|
|
10862
|
-
const outputs = await this.forward(model_inputs);
|
|
10856
|
+
outputs = await this.forward(model_inputs);
|
|
10863
10857
|
|
|
10864
10858
|
if (generation_config.output_attentions && generation_config.return_dict_in_generate) {
|
|
10865
10859
|
// Get attentions if they are present
|
|
@@ -10906,10 +10900,6 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
10906
10900
|
|
|
10907
10901
|
const stop = prepared_stopping_criteria(all_input_ids);
|
|
10908
10902
|
if (stop.every(x => x)) {
|
|
10909
|
-
if (generation_config.return_dict_in_generate) {
|
|
10910
|
-
// Get past key values without disposing buffers
|
|
10911
|
-
past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, false);
|
|
10912
|
-
}
|
|
10913
10903
|
break;
|
|
10914
10904
|
}
|
|
10915
10905
|
|
|
@@ -10922,6 +10912,9 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
10922
10912
|
streamer.end();
|
|
10923
10913
|
}
|
|
10924
10914
|
|
|
10915
|
+
// Retrieve and dispose all final past key values (including encoder attentions)
|
|
10916
|
+
const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true);
|
|
10917
|
+
|
|
10925
10918
|
// TODO: ensure all_input_ids is padded correctly...
|
|
10926
10919
|
const sequences = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]);
|
|
10927
10920
|
|
|
@@ -10935,6 +10928,12 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
10935
10928
|
// logits,
|
|
10936
10929
|
}
|
|
10937
10930
|
} else {
|
|
10931
|
+
// Dispose all remaining tensors
|
|
10932
|
+
for (const tensor of Object.values(outputs)) {
|
|
10933
|
+
if (tensor.location === 'gpu-buffer') {
|
|
10934
|
+
tensor.dispose();
|
|
10935
|
+
}
|
|
10936
|
+
}
|
|
10938
10937
|
return sequences;
|
|
10939
10938
|
}
|
|
10940
10939
|
}
|
|
@@ -10944,31 +10943,32 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
10944
10943
|
*
|
|
10945
10944
|
* @param {Object} decoderResults The decoder results object.
|
|
10946
10945
|
* @param {Object} pastKeyValues The previous past key values.
|
|
10947
|
-
* @param {boolean} [dispose=true] Whether to dispose of the old gpu buffer.
|
|
10948
10946
|
* @returns {Object} An object containing past key values.
|
|
10949
10947
|
*/
|
|
10950
|
-
getPastKeyValues(decoderResults, pastKeyValues, dispose = true) {
|
|
10948
|
+
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
10951
10949
|
const pkvs = Object.create(null);
|
|
10952
10950
|
|
|
10953
10951
|
for (const name in decoderResults) {
|
|
10954
10952
|
if (name.startsWith('present')) {
|
|
10955
10953
|
const newName = name.replace('present', 'past_key_values');
|
|
10956
|
-
|
|
10957
|
-
if (
|
|
10958
|
-
// Optimization introduced by optimum to reuse past key values.
|
|
10959
|
-
// outputs with the previous past key values.
|
|
10954
|
+
const is_encoder_pkv = name.includes('encoder');
|
|
10955
|
+
if (is_encoder_pkv && pastKeyValues) {
|
|
10956
|
+
// Optimization introduced by optimum to reuse past key values.
|
|
10957
|
+
// So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values.
|
|
10960
10958
|
// https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704
|
|
10961
10959
|
pkvs[newName] = pastKeyValues[newName];
|
|
10962
|
-
} else {
|
|
10963
|
-
if (dispose && pastKeyValues) {
|
|
10964
|
-
// Free old gpu buffer
|
|
10965
|
-
const t = pastKeyValues[newName];
|
|
10966
|
-
if (t.location === 'gpu-buffer') {
|
|
10967
|
-
t.dispose();
|
|
10968
|
-
}
|
|
10969
|
-
}
|
|
10960
|
+
} else { // decoder or using first encoder PKVs
|
|
10970
10961
|
pkvs[newName] = decoderResults[name];
|
|
10971
10962
|
}
|
|
10963
|
+
|
|
10964
|
+
if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) {
|
|
10965
|
+
// - Always dispose decoder PKVs
|
|
10966
|
+
// - Only dispose encoder past key values when requested (after generation)
|
|
10967
|
+
const t = pastKeyValues[newName];
|
|
10968
|
+
if (t.location === 'gpu-buffer') {
|
|
10969
|
+
t.dispose();
|
|
10970
|
+
}
|
|
10971
|
+
}
|
|
10972
10972
|
}
|
|
10973
10973
|
}
|
|
10974
10974
|
return pkvs;
|
|
@@ -23284,7 +23284,7 @@ function clean_up_tokenization(text) {
|
|
|
23284
23284
|
* @returns {string} The text with accents removed.
|
|
23285
23285
|
*/
|
|
23286
23286
|
function remove_accents(text) {
|
|
23287
|
-
return text.replace(/
|
|
23287
|
+
return text.replace(/\p{M}/gu, '');
|
|
23288
23288
|
}
|
|
23289
23289
|
|
|
23290
23290
|
/**
|
|
@@ -23630,18 +23630,18 @@ class Unigram extends TokenizerModel {
|
|
|
23630
23630
|
this.unk_token = this.vocab[config.unk_id];
|
|
23631
23631
|
|
|
23632
23632
|
this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i]));
|
|
23633
|
-
this.
|
|
23633
|
+
this.bos_token = ' '; // beginning of a sentence token
|
|
23634
23634
|
|
|
23635
|
-
this.
|
|
23636
|
-
this.
|
|
23635
|
+
this.bos_token_id = this.tokens_to_ids.get(this.bos_token); // NOTE: may be undefined
|
|
23636
|
+
this.eos_token = moreConfig.eos_token;
|
|
23637
23637
|
|
|
23638
|
-
this.
|
|
23639
|
-
this.
|
|
23638
|
+
this.eos_token_id = this.tokens_to_ids.get(this.eos_token);
|
|
23639
|
+
this.unk_token = this.vocab[this.unk_token_id];
|
|
23640
23640
|
|
|
23641
23641
|
this.minScore = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.min)(this.scores)[0];
|
|
23642
23642
|
|
|
23643
|
-
this.
|
|
23644
|
-
this.scores[this.unk_token_id] = this.
|
|
23643
|
+
this.unk_score = this.minScore - 10.0;
|
|
23644
|
+
this.scores[this.unk_token_id] = this.unk_score;
|
|
23645
23645
|
|
|
23646
23646
|
this.trie = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.CharTrie();
|
|
23647
23647
|
this.trie.extend(this.vocab);
|
|
@@ -23656,26 +23656,27 @@ class Unigram extends TokenizerModel {
|
|
|
23656
23656
|
* @param {TokenLattice} lattice The token lattice to populate with nodes.
|
|
23657
23657
|
*/
|
|
23658
23658
|
populateNodes(lattice) {
|
|
23659
|
-
const
|
|
23660
|
-
const
|
|
23659
|
+
const chars = lattice.chars;
|
|
23660
|
+
const mblen = 1;
|
|
23661
23661
|
let beginPos = 0;
|
|
23662
|
-
while (beginPos <
|
|
23663
|
-
const mblen = 1;
|
|
23662
|
+
while (beginPos < chars.length) {
|
|
23664
23663
|
let hasSingleNode = false;
|
|
23665
|
-
const tokens = [];
|
|
23666
23664
|
|
|
23667
|
-
|
|
23665
|
+
const tokens = [];
|
|
23666
|
+
const sliced = chars.slice(beginPos).join('');
|
|
23667
|
+
const prefixedTokens = this.trie.commonPrefixSearch(sliced);
|
|
23668
|
+
for (const token of prefixedTokens) {
|
|
23668
23669
|
tokens.push(token);
|
|
23669
23670
|
const tokenId = this.tokens_to_ids.get(token);
|
|
23670
23671
|
const tokenScore = this.scores[tokenId];
|
|
23671
|
-
const n = token
|
|
23672
|
+
const n = (0,_utils_core_js__WEBPACK_IMPORTED_MODULE_1__.len)(token);
|
|
23672
23673
|
lattice.insert(beginPos, n, tokenScore, tokenId);
|
|
23673
23674
|
if (!hasSingleNode && n === mblen) {
|
|
23674
23675
|
hasSingleNode = true;
|
|
23675
23676
|
}
|
|
23676
23677
|
}
|
|
23677
23678
|
if (!hasSingleNode) {
|
|
23678
|
-
lattice.insert(beginPos, mblen, this.
|
|
23679
|
+
lattice.insert(beginPos, mblen, this.unk_score, this.unk_token_id);
|
|
23679
23680
|
}
|
|
23680
23681
|
beginPos += mblen;
|
|
23681
23682
|
}
|
|
@@ -23688,7 +23689,7 @@ class Unigram extends TokenizerModel {
|
|
|
23688
23689
|
* @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model.
|
|
23689
23690
|
*/
|
|
23690
23691
|
tokenize(normalized) {
|
|
23691
|
-
const lattice = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.TokenLattice(normalized, this.
|
|
23692
|
+
const lattice = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.TokenLattice(normalized, this.bos_token_id, this.eos_token_id);
|
|
23692
23693
|
this.populateNodes(lattice);
|
|
23693
23694
|
return lattice.tokens();
|
|
23694
23695
|
}
|
|
@@ -24303,7 +24304,8 @@ class BertNormalizer extends Normalizer {
|
|
|
24303
24304
|
* @returns {string} The text with accents removed.
|
|
24304
24305
|
*/
|
|
24305
24306
|
stripAccents(text) {
|
|
24306
|
-
|
|
24307
|
+
// "Mark, Nonspacing" (Mn)
|
|
24308
|
+
return text.normalize('NFD').replace(/\p{Mn}/gu, '');
|
|
24307
24309
|
}
|
|
24308
24310
|
|
|
24309
24311
|
|
|
@@ -25421,7 +25423,7 @@ class Precompiled extends Normalizer {
|
|
|
25421
25423
|
// TODO: detect when a different `this.charsmap` is used.
|
|
25422
25424
|
|
|
25423
25425
|
text = text.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, ''); // Remove control characters
|
|
25424
|
-
text = text.replace(/[\u0009\u000A\u000C\u000D\u1680\
|
|
25426
|
+
text = text.replace(/[\u0009\u000A\u000C\u000D\u00A0\u1680\u2000-\u200F\u2028\u2029\u202F\u205F\u2581\u3000\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
|
|
25425
25427
|
|
|
25426
25428
|
if (text.includes('\uFF5E')) {
|
|
25427
25429
|
// To match the sentencepiece implementation 100%, we must handle a very strange edge-case.
|
|
@@ -28274,6 +28276,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
28274
28276
|
/* harmony export */ escapeRegExp: () => (/* binding */ escapeRegExp),
|
|
28275
28277
|
/* harmony export */ isIntegralNumber: () => (/* binding */ isIntegralNumber),
|
|
28276
28278
|
/* harmony export */ isTypedArray: () => (/* binding */ isTypedArray),
|
|
28279
|
+
/* harmony export */ len: () => (/* binding */ len),
|
|
28277
28280
|
/* harmony export */ mergeArrays: () => (/* binding */ mergeArrays),
|
|
28278
28281
|
/* harmony export */ pick: () => (/* binding */ pick),
|
|
28279
28282
|
/* harmony export */ pop: () => (/* binding */ pop),
|
|
@@ -28430,6 +28433,18 @@ function pick(o, props) {
|
|
|
28430
28433
|
);
|
|
28431
28434
|
}
|
|
28432
28435
|
|
|
28436
|
+
/**
|
|
28437
|
+
* Calculate the length of a string, taking multi-byte characters into account.
|
|
28438
|
+
* This mimics the behavior of Python's `len` function.
|
|
28439
|
+
* @param {string} s The string to calculate the length of.
|
|
28440
|
+
* @returns {number} The length of the string.
|
|
28441
|
+
*/
|
|
28442
|
+
function len(s) {
|
|
28443
|
+
let length = 0;
|
|
28444
|
+
for (const c of s) ++length;
|
|
28445
|
+
return length;
|
|
28446
|
+
}
|
|
28447
|
+
|
|
28433
28448
|
|
|
28434
28449
|
/***/ }),
|
|
28435
28450
|
|
|
@@ -28676,7 +28691,7 @@ class CharTrie {
|
|
|
28676
28691
|
* @param {string[]} texts The strings to add to the trie.
|
|
28677
28692
|
*/
|
|
28678
28693
|
extend(texts) {
|
|
28679
|
-
for (
|
|
28694
|
+
for (const text of texts) {
|
|
28680
28695
|
this.push(text);
|
|
28681
28696
|
}
|
|
28682
28697
|
}
|
|
@@ -28687,7 +28702,7 @@ class CharTrie {
|
|
|
28687
28702
|
*/
|
|
28688
28703
|
push(text) {
|
|
28689
28704
|
let node = this.root;
|
|
28690
|
-
for (
|
|
28705
|
+
for (const ch of text) {
|
|
28691
28706
|
let child = node.children.get(ch);
|
|
28692
28707
|
if (child === undefined) {
|
|
28693
28708
|
child = CharTrieNode.default();
|
|
@@ -28705,12 +28720,14 @@ class CharTrie {
|
|
|
28705
28720
|
*/
|
|
28706
28721
|
*commonPrefixSearch(text) {
|
|
28707
28722
|
let node = this.root;
|
|
28723
|
+
if (node === undefined) return;
|
|
28724
|
+
|
|
28708
28725
|
let prefix = "";
|
|
28709
|
-
for (
|
|
28710
|
-
const ch = text[i];
|
|
28726
|
+
for (const ch of text) {
|
|
28711
28727
|
prefix += ch;
|
|
28712
28728
|
node = node.children.get(ch);
|
|
28713
|
-
if (node
|
|
28729
|
+
if (node === undefined) return;
|
|
28730
|
+
if (node.isLeaf) {
|
|
28714
28731
|
yield prefix;
|
|
28715
28732
|
}
|
|
28716
28733
|
}
|
|
@@ -28752,8 +28769,8 @@ class TokenLattice {
|
|
|
28752
28769
|
* @param {number} eosTokenId The end-of-sequence token ID.
|
|
28753
28770
|
*/
|
|
28754
28771
|
constructor(sentence, bosTokenId, eosTokenId) {
|
|
28755
|
-
this.
|
|
28756
|
-
this.len =
|
|
28772
|
+
this.chars = Array.from(sentence);
|
|
28773
|
+
this.len = this.chars.length;
|
|
28757
28774
|
this.bosTokenId = bosTokenId;
|
|
28758
28775
|
this.eosTokenId = eosTokenId;
|
|
28759
28776
|
this.nodes = [];
|
|
@@ -28787,7 +28804,7 @@ class TokenLattice {
|
|
|
28787
28804
|
/**
|
|
28788
28805
|
* Implements the Viterbi algorithm to compute the most likely sequence of tokens.
|
|
28789
28806
|
*
|
|
28790
|
-
* @returns {TokenLatticeNode[]} The
|
|
28807
|
+
* @returns {TokenLatticeNode[]} The most likely sequence of tokens.
|
|
28791
28808
|
*/
|
|
28792
28809
|
viterbi() {
|
|
28793
28810
|
const len = this.len;
|
|
@@ -28841,11 +28858,11 @@ class TokenLattice {
|
|
|
28841
28858
|
* @returns {string} The array of nodes representing the most likely sequence of tokens.
|
|
28842
28859
|
*/
|
|
28843
28860
|
piece(node) {
|
|
28844
|
-
return this.
|
|
28861
|
+
return this.chars.slice(node.pos, node.pos + node.length).join('');
|
|
28845
28862
|
}
|
|
28846
28863
|
|
|
28847
28864
|
/**
|
|
28848
|
-
* @returns {
|
|
28865
|
+
* @returns {string[]} The most likely sequence of tokens.
|
|
28849
28866
|
*/
|
|
28850
28867
|
tokens() {
|
|
28851
28868
|
const nodes = this.viterbi();
|
|
@@ -28853,7 +28870,7 @@ class TokenLattice {
|
|
|
28853
28870
|
}
|
|
28854
28871
|
|
|
28855
28872
|
/**
|
|
28856
|
-
* @returns {
|
|
28873
|
+
* @returns {number[]} The most likely sequence of token ids.
|
|
28857
28874
|
*/
|
|
28858
28875
|
tokenIds() {
|
|
28859
28876
|
const nodes = this.viterbi();
|
|
@@ -29090,7 +29107,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
29090
29107
|
/**
|
|
29091
29108
|
* @typedef {Object} PretrainedOptions Options for loading a pretrained model.
|
|
29092
29109
|
* @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
|
|
29093
|
-
* @property {
|
|
29110
|
+
* @property {import('../configs.js').PretrainedConfig} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
|
29094
29111
|
* - The model is a model provided by the library (loaded with the *model id* string of a pretrained model).
|
|
29095
29112
|
* - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory.
|
|
29096
29113
|
* @property {string} [cache_dir=null] Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.
|