@huggingface/transformers 3.0.0-alpha.15 → 3.0.0-alpha.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +108 -91
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +459 -511
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +6 -6
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +16 -16
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +6 -6
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +108 -91
- package/dist/transformers.mjs.map +1 -1
- package/package.json +2 -2
- package/src/configs.js +16 -4
- package/src/env.js +1 -1
- package/src/models.js +43 -55
- package/src/tokenizers.js +22 -19
- package/src/utils/core.js +12 -0
- package/src/utils/data-structures.js +13 -11
- package/src/utils/hub.js +1 -1
- package/types/configs.d.ts +25 -3
- package/types/configs.d.ts.map +1 -1
- package/types/models.d.ts +1 -2
- package/types/models.d.ts.map +1 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/core.d.ts +7 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts +6 -6
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
package/README.md
CHANGED
|
@@ -101,7 +101,7 @@ npm i @huggingface/transformers
|
|
|
101
101
|
Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
|
|
102
102
|
```html
|
|
103
103
|
<script type="module">
|
|
104
|
-
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.15';
|
|
104
|
+
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.17';
|
|
105
105
|
</script>
|
|
106
106
|
```
|
|
107
107
|
|
|
@@ -134,7 +134,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa
|
|
|
134
134
|
|
|
135
135
|
|
|
136
136
|
|
|
137
|
-
By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.15/dist/), which should work out-of-the-box. You can customize this as follows:
|
|
137
|
+
By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.17/dist/), which should work out-of-the-box. You can customize this as follows:
|
|
138
138
|
|
|
139
139
|
### Settings
|
|
140
140
|
|
|
Binary file
|
package/dist/transformers.cjs
CHANGED
|
@@ -4327,16 +4327,23 @@ function getKeyValueShapes(config, {
|
|
|
4327
4327
|
class PretrainedConfig {
|
|
4328
4328
|
// NOTE: Typo in original
|
|
4329
4329
|
|
|
4330
|
+
/** @type {string|null} */
|
|
4331
|
+
model_type = null;
|
|
4332
|
+
|
|
4333
|
+
/** @type {boolean} */
|
|
4334
|
+
is_encoder_decoder = false;
|
|
4335
|
+
|
|
4336
|
+
/** @type {number} */
|
|
4330
4337
|
max_position_embeddings;
|
|
4331
4338
|
|
|
4339
|
+
/** @type {TransformersJSConfig} */
|
|
4340
|
+
'transformers.js_config';
|
|
4341
|
+
|
|
4332
4342
|
/**
|
|
4333
4343
|
* Create a new PreTrainedTokenizer instance.
|
|
4334
4344
|
* @param {Object} configJSON The JSON of the config.
|
|
4335
4345
|
*/
|
|
4336
4346
|
constructor(configJSON) {
|
|
4337
|
-
this.model_type = null;
|
|
4338
|
-
this.is_encoder_decoder = false;
|
|
4339
|
-
|
|
4340
4347
|
Object.assign(this, configJSON);
|
|
4341
4348
|
this.normalized_config = getNormalizedConfig(this);
|
|
4342
4349
|
}
|
|
@@ -4388,7 +4395,12 @@ class AutoConfig {
|
|
|
4388
4395
|
/**
|
|
4389
4396
|
* Transformers.js-specific configuration, possibly present in config.json under the key `transformers.js_config`.
|
|
4390
4397
|
* @typedef {Object} TransformersJSConfig
|
|
4391
|
-
* @property {import('./
|
|
4398
|
+
* @property {import('./utils/tensor.js').DataType} [kv_cache_dtype] The data type of the key-value cache.
|
|
4399
|
+
* @property {Record<string, number>} [free_dimension_overrides] Override the free dimensions of the model.
|
|
4400
|
+
* See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
|
|
4401
|
+
* for more information.
|
|
4402
|
+
* @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
|
|
4403
|
+
* @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
|
|
4392
4404
|
*/
|
|
4393
4405
|
|
|
4394
4406
|
|
|
@@ -4437,7 +4449,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
4437
4449
|
|
|
4438
4450
|
|
|
4439
4451
|
|
|
4440
|
-
const VERSION = '3.0.0-alpha.15';
|
|
4452
|
+
const VERSION = '3.0.0-alpha.17';
|
|
4441
4453
|
|
|
4442
4454
|
// Check if various APIs are available (depends on environment)
|
|
4443
4455
|
const IS_BROWSER_ENV = typeof self !== 'undefined';
|
|
@@ -6889,7 +6901,8 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
|
|
|
6889
6901
|
* @private
|
|
6890
6902
|
*/
|
|
6891
6903
|
async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
6892
|
-
|
|
6904
|
+
const custom_config = options.config?.['transformers.js_config'] ?? {};
|
|
6905
|
+
let device = options.device ?? custom_config.device;
|
|
6893
6906
|
if (device && typeof device !== 'string') {
|
|
6894
6907
|
if (device.hasOwnProperty(fileName)) {
|
|
6895
6908
|
device = device[fileName];
|
|
@@ -6907,7 +6920,7 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
6907
6920
|
|
|
6908
6921
|
// If options.dtype is specified, we use it to choose the suffix for the model file.
|
|
6909
6922
|
// Otherwise, we use the default dtype for the device.
|
|
6910
|
-
let dtype = options.dtype;
|
|
6923
|
+
let dtype = options.dtype ?? custom_config.dtype;
|
|
6911
6924
|
if (typeof dtype !== 'string') {
|
|
6912
6925
|
if (dtype && dtype.hasOwnProperty(fileName)) {
|
|
6913
6926
|
dtype = dtype[fileName];
|
|
@@ -6934,6 +6947,16 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
6934
6947
|
// Overwrite `executionProviders` if not specified
|
|
6935
6948
|
session_options.executionProviders ??= executionProviders;
|
|
6936
6949
|
|
|
6950
|
+
// Overwrite `freeDimensionOverrides` if specified in config and not set in session options
|
|
6951
|
+
const free_dimension_overrides = custom_config.free_dimension_overrides;
|
|
6952
|
+
if (free_dimension_overrides) {
|
|
6953
|
+
session_options.freeDimensionOverrides ??= free_dimension_overrides;
|
|
6954
|
+
} else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) {
|
|
6955
|
+
console.warn(
|
|
6956
|
+
'WebNN does not currently support dynamic shapes and requires `free_dimension_overrides` to be set in config.json as a field within "transformers.js_config". ' +
|
|
6957
|
+
'When `free_dimension_overrides` is not set, you may experience significant performance degradation.'
|
|
6958
|
+
);
|
|
6959
|
+
}
|
|
6937
6960
|
|
|
6938
6961
|
const bufferPromise = (0,_utils_hub_js__WEBPACK_IMPORTED_MODULE_5__.getModelFile)(pretrained_model_name_or_path, modelFileName, true, options);
|
|
6939
6962
|
|
|
@@ -6982,6 +7005,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
6982
7005
|
/** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
|
|
6983
7006
|
const preferredOutputLocation = {};
|
|
6984
7007
|
for (const key in shapes) {
|
|
7008
|
+
// TODO: For now, we keep encoder outputs on the CPU
|
|
7009
|
+
// (otherwise, this causes a memory leak or throws an error "Error: previous buffer is not registered")
|
|
7010
|
+
if (key.includes('encoder')) continue;
|
|
6985
7011
|
preferredOutputLocation[key] = 'gpu-buffer';
|
|
6986
7012
|
}
|
|
6987
7013
|
session_options.preferredOutputLocation = preferredOutputLocation;
|
|
@@ -7137,37 +7163,6 @@ function toI64Tensor(items) {
|
|
|
7137
7163
|
}
|
|
7138
7164
|
}
|
|
7139
7165
|
|
|
7140
|
-
/**
|
|
7141
|
-
* Prepares an attention mask for a sequence of tokens based on configuration options.
|
|
7142
|
-
* @param {Object} self The calling object instance.
|
|
7143
|
-
* @param {Tensor} tokens The input tokens.
|
|
7144
|
-
* @returns {Tensor} The attention mask tensor.
|
|
7145
|
-
* @private
|
|
7146
|
-
*/
|
|
7147
|
-
function prepareAttentionMask(self, tokens) {
|
|
7148
|
-
|
|
7149
|
-
// Prepare attention mask
|
|
7150
|
-
let pad_token_id = self.config.pad_token_id ?? null;
|
|
7151
|
-
let eos_token_id = self.config.eos_token_id ?? null;
|
|
7152
|
-
if ((0,_utils_core_js__WEBPACK_IMPORTED_MODULE_4__.isIntegralNumber)(eos_token_id)) {
|
|
7153
|
-
eos_token_id = [eos_token_id];
|
|
7154
|
-
}
|
|
7155
|
-
|
|
7156
|
-
let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1;
|
|
7157
|
-
let is_pad_token_not_equal_to_eos_token_id = (eos_token_id === null) || !eos_token_id.includes(pad_token_id)
|
|
7158
|
-
|
|
7159
|
-
if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) {
|
|
7160
|
-
let data = BigInt64Array.from(
|
|
7161
|
-
// Note: != so that int matches bigint
|
|
7162
|
-
// @ts-ignore
|
|
7163
|
-
tokens.data.map(x => x != pad_token_id)
|
|
7164
|
-
)
|
|
7165
|
-
return new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor('int64', data, tokens.dims)
|
|
7166
|
-
} else {
|
|
7167
|
-
return (0,_utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.ones_like)(tokens);
|
|
7168
|
-
}
|
|
7169
|
-
}
|
|
7170
|
-
|
|
7171
7166
|
/**
|
|
7172
7167
|
* Creates a boolean tensor with a single value.
|
|
7173
7168
|
* @param {boolean} value The value of the tensor.
|
|
@@ -7438,8 +7433,8 @@ function image_text_to_text_prepare_inputs_for_generation(self, ...args) {
|
|
|
7438
7433
|
} else {
|
|
7439
7434
|
return decoder_prepare_inputs_for_generation(self, ...args);
|
|
7440
7435
|
}
|
|
7441
|
-
|
|
7442
7436
|
}
|
|
7437
|
+
|
|
7443
7438
|
//////////////////////////////////////////////////
|
|
7444
7439
|
|
|
7445
7440
|
//////////////////////////////////////////////////
|
|
@@ -8202,13 +8197,12 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8202
8197
|
// - GenerationMode.BEAM_SEARCH
|
|
8203
8198
|
// - GenerationMode.BEAM_SAMPLE
|
|
8204
8199
|
////////////////////////////////////////////////////
|
|
8205
|
-
let past_key_values = null;
|
|
8200
|
+
let outputs;
|
|
8206
8201
|
let attentions = {};
|
|
8207
8202
|
while (true) {
|
|
8208
8203
|
// prepare model inputs
|
|
8209
8204
|
model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config);
|
|
8210
|
-
|
|
8211
|
-
const outputs = await this.forward(model_inputs);
|
|
8205
|
+
outputs = await this.forward(model_inputs);
|
|
8212
8206
|
|
|
8213
8207
|
if (generation_config.output_attentions && generation_config.return_dict_in_generate) {
|
|
8214
8208
|
// Get attentions if they are present
|
|
@@ -8255,10 +8249,6 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8255
8249
|
|
|
8256
8250
|
const stop = prepared_stopping_criteria(all_input_ids);
|
|
8257
8251
|
if (stop.every(x => x)) {
|
|
8258
|
-
if (generation_config.return_dict_in_generate) {
|
|
8259
|
-
// Get past key values without disposing buffers
|
|
8260
|
-
past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, false);
|
|
8261
|
-
}
|
|
8262
8252
|
break;
|
|
8263
8253
|
}
|
|
8264
8254
|
|
|
@@ -8271,6 +8261,9 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8271
8261
|
streamer.end();
|
|
8272
8262
|
}
|
|
8273
8263
|
|
|
8264
|
+
// Retrieve and dispose all final past key values (including encoder attentions)
|
|
8265
|
+
const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true);
|
|
8266
|
+
|
|
8274
8267
|
// TODO: ensure all_input_ids is padded correctly...
|
|
8275
8268
|
const sequences = new _utils_tensor_js__WEBPACK_IMPORTED_MODULE_8__.Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]);
|
|
8276
8269
|
|
|
@@ -8284,6 +8277,12 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8284
8277
|
// logits,
|
|
8285
8278
|
}
|
|
8286
8279
|
} else {
|
|
8280
|
+
// Dispose all remaining tensors
|
|
8281
|
+
for (const tensor of Object.values(outputs)) {
|
|
8282
|
+
if (tensor.location === 'gpu-buffer') {
|
|
8283
|
+
tensor.dispose();
|
|
8284
|
+
}
|
|
8285
|
+
}
|
|
8287
8286
|
return sequences;
|
|
8288
8287
|
}
|
|
8289
8288
|
}
|
|
@@ -8293,31 +8292,32 @@ class PreTrainedModel extends _utils_generic_js__WEBPACK_IMPORTED_MODULE_3__.Cal
|
|
|
8293
8292
|
*
|
|
8294
8293
|
* @param {Object} decoderResults The decoder results object.
|
|
8295
8294
|
* @param {Object} pastKeyValues The previous past key values.
|
|
8296
|
-
* @param {boolean} [dispose=true] Whether to dispose of the old gpu buffer.
|
|
8297
8295
|
* @returns {Object} An object containing past key values.
|
|
8298
8296
|
*/
|
|
8299
|
-
getPastKeyValues(decoderResults, pastKeyValues, dispose = true) {
|
|
8297
|
+
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
8300
8298
|
const pkvs = Object.create(null);
|
|
8301
8299
|
|
|
8302
8300
|
for (const name in decoderResults) {
|
|
8303
8301
|
if (name.startsWith('present')) {
|
|
8304
8302
|
const newName = name.replace('present', 'past_key_values');
|
|
8305
|
-
|
|
8306
|
-
if (
|
|
8307
|
-
// Optimization introduced by optimum to reuse past key values.
|
|
8308
|
-
// outputs with the previous past key values.
|
|
8303
|
+
const is_encoder_pkv = name.includes('encoder');
|
|
8304
|
+
if (is_encoder_pkv && pastKeyValues) {
|
|
8305
|
+
// Optimization introduced by optimum to reuse past key values.
|
|
8306
|
+
// So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values.
|
|
8309
8307
|
// https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704
|
|
8310
8308
|
pkvs[newName] = pastKeyValues[newName];
|
|
8311
|
-
} else {
|
|
8312
|
-
if (dispose && pastKeyValues) {
|
|
8313
|
-
// Free old gpu buffer
|
|
8314
|
-
const t = pastKeyValues[newName];
|
|
8315
|
-
if (t.location === 'gpu-buffer') {
|
|
8316
|
-
t.dispose();
|
|
8317
|
-
}
|
|
8318
|
-
}
|
|
8309
|
+
} else { // decoder or using first encoder PKVs
|
|
8319
8310
|
pkvs[newName] = decoderResults[name];
|
|
8320
8311
|
}
|
|
8312
|
+
|
|
8313
|
+
if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) {
|
|
8314
|
+
// - Always dispose decoder PKVs
|
|
8315
|
+
// - Only dispose encoder past key values when requested (after generation)
|
|
8316
|
+
const t = pastKeyValues[newName];
|
|
8317
|
+
if (t.location === 'gpu-buffer') {
|
|
8318
|
+
t.dispose();
|
|
8319
|
+
}
|
|
8320
|
+
}
|
|
8321
8321
|
}
|
|
8322
8322
|
}
|
|
8323
8323
|
return pkvs;
|
|
@@ -20639,7 +20639,7 @@ function clean_up_tokenization(text) {
|
|
|
20639
20639
|
* @returns {string} The text with accents removed.
|
|
20640
20640
|
*/
|
|
20641
20641
|
function remove_accents(text) {
|
|
20642
|
-
return text.replace(/
|
|
20642
|
+
return text.replace(/\p{M}/gu, '');
|
|
20643
20643
|
}
|
|
20644
20644
|
|
|
20645
20645
|
/**
|
|
@@ -20985,18 +20985,18 @@ class Unigram extends TokenizerModel {
|
|
|
20985
20985
|
this.unk_token = this.vocab[config.unk_id];
|
|
20986
20986
|
|
|
20987
20987
|
this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i]));
|
|
20988
|
-
this.
|
|
20988
|
+
this.bos_token = ' '; // beginning of a sentence token
|
|
20989
20989
|
|
|
20990
|
-
this.
|
|
20991
|
-
this.
|
|
20990
|
+
this.bos_token_id = this.tokens_to_ids.get(this.bos_token); // NOTE: may be undefined
|
|
20991
|
+
this.eos_token = moreConfig.eos_token;
|
|
20992
20992
|
|
|
20993
|
-
this.
|
|
20994
|
-
this.
|
|
20993
|
+
this.eos_token_id = this.tokens_to_ids.get(this.eos_token);
|
|
20994
|
+
this.unk_token = this.vocab[this.unk_token_id];
|
|
20995
20995
|
|
|
20996
20996
|
this.minScore = (0,_utils_maths_js__WEBPACK_IMPORTED_MODULE_3__.min)(this.scores)[0];
|
|
20997
20997
|
|
|
20998
|
-
this.
|
|
20999
|
-
this.scores[this.unk_token_id] = this.
|
|
20998
|
+
this.unk_score = this.minScore - 10.0;
|
|
20999
|
+
this.scores[this.unk_token_id] = this.unk_score;
|
|
21000
21000
|
|
|
21001
21001
|
this.trie = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.CharTrie();
|
|
21002
21002
|
this.trie.extend(this.vocab);
|
|
@@ -21011,26 +21011,27 @@ class Unigram extends TokenizerModel {
|
|
|
21011
21011
|
* @param {TokenLattice} lattice The token lattice to populate with nodes.
|
|
21012
21012
|
*/
|
|
21013
21013
|
populateNodes(lattice) {
|
|
21014
|
-
const
|
|
21015
|
-
const
|
|
21014
|
+
const chars = lattice.chars;
|
|
21015
|
+
const mblen = 1;
|
|
21016
21016
|
let beginPos = 0;
|
|
21017
|
-
while (beginPos <
|
|
21018
|
-
const mblen = 1;
|
|
21017
|
+
while (beginPos < chars.length) {
|
|
21019
21018
|
let hasSingleNode = false;
|
|
21020
|
-
const tokens = [];
|
|
21021
21019
|
|
|
21022
|
-
|
|
21020
|
+
const tokens = [];
|
|
21021
|
+
const sliced = chars.slice(beginPos).join('');
|
|
21022
|
+
const prefixedTokens = this.trie.commonPrefixSearch(sliced);
|
|
21023
|
+
for (const token of prefixedTokens) {
|
|
21023
21024
|
tokens.push(token);
|
|
21024
21025
|
const tokenId = this.tokens_to_ids.get(token);
|
|
21025
21026
|
const tokenScore = this.scores[tokenId];
|
|
21026
|
-
const n = token
|
|
21027
|
+
const n = (0,_utils_core_js__WEBPACK_IMPORTED_MODULE_1__.len)(token);
|
|
21027
21028
|
lattice.insert(beginPos, n, tokenScore, tokenId);
|
|
21028
21029
|
if (!hasSingleNode && n === mblen) {
|
|
21029
21030
|
hasSingleNode = true;
|
|
21030
21031
|
}
|
|
21031
21032
|
}
|
|
21032
21033
|
if (!hasSingleNode) {
|
|
21033
|
-
lattice.insert(beginPos, mblen, this.
|
|
21034
|
+
lattice.insert(beginPos, mblen, this.unk_score, this.unk_token_id);
|
|
21034
21035
|
}
|
|
21035
21036
|
beginPos += mblen;
|
|
21036
21037
|
}
|
|
@@ -21043,7 +21044,7 @@ class Unigram extends TokenizerModel {
|
|
|
21043
21044
|
* @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model.
|
|
21044
21045
|
*/
|
|
21045
21046
|
tokenize(normalized) {
|
|
21046
|
-
const lattice = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.TokenLattice(normalized, this.
|
|
21047
|
+
const lattice = new _utils_data_structures_js__WEBPACK_IMPORTED_MODULE_5__.TokenLattice(normalized, this.bos_token_id, this.eos_token_id);
|
|
21047
21048
|
this.populateNodes(lattice);
|
|
21048
21049
|
return lattice.tokens();
|
|
21049
21050
|
}
|
|
@@ -21658,7 +21659,8 @@ class BertNormalizer extends Normalizer {
|
|
|
21658
21659
|
* @returns {string} The text with accents removed.
|
|
21659
21660
|
*/
|
|
21660
21661
|
stripAccents(text) {
|
|
21661
|
-
|
|
21662
|
+
// "Mark, Nonspacing" (Mn)
|
|
21663
|
+
return text.normalize('NFD').replace(/\p{Mn}/gu, '');
|
|
21662
21664
|
}
|
|
21663
21665
|
|
|
21664
21666
|
|
|
@@ -22776,7 +22778,7 @@ class Precompiled extends Normalizer {
|
|
|
22776
22778
|
// TODO: detect when a different `this.charsmap` is used.
|
|
22777
22779
|
|
|
22778
22780
|
text = text.replace(/[\u0001-\u0008\u000B\u000E-\u001F\u007F\u008F\u009F]/gm, ''); // Remove control characters
|
|
22779
|
-
text = text.replace(/[\u0009\u000A\u000C\u000D\u1680\
|
|
22781
|
+
text = text.replace(/[\u0009\u000A\u000C\u000D\u00A0\u1680\u2000-\u200F\u2028\u2029\u202F\u205F\u2581\u3000\uFEFF\uFFFD]/gm, '\u0020'); // Replace certain characters with a space
|
|
22780
22782
|
|
|
22781
22783
|
if (text.includes('\uFF5E')) {
|
|
22782
22784
|
// To match the sentencepiece implementation 100%, we must handle a very strange edge-case.
|
|
@@ -25632,6 +25634,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
25632
25634
|
/* harmony export */ escapeRegExp: () => (/* binding */ escapeRegExp),
|
|
25633
25635
|
/* harmony export */ isIntegralNumber: () => (/* binding */ isIntegralNumber),
|
|
25634
25636
|
/* harmony export */ isTypedArray: () => (/* binding */ isTypedArray),
|
|
25637
|
+
/* harmony export */ len: () => (/* binding */ len),
|
|
25635
25638
|
/* harmony export */ mergeArrays: () => (/* binding */ mergeArrays),
|
|
25636
25639
|
/* harmony export */ pick: () => (/* binding */ pick),
|
|
25637
25640
|
/* harmony export */ pop: () => (/* binding */ pop),
|
|
@@ -25788,6 +25791,18 @@ function pick(o, props) {
|
|
|
25788
25791
|
);
|
|
25789
25792
|
}
|
|
25790
25793
|
|
|
25794
|
+
/**
|
|
25795
|
+
* Calculate the length of a string, taking multi-byte characters into account.
|
|
25796
|
+
* This mimics the behavior of Python's `len` function.
|
|
25797
|
+
* @param {string} s The string to calculate the length of.
|
|
25798
|
+
* @returns {number} The length of the string.
|
|
25799
|
+
*/
|
|
25800
|
+
function len(s) {
|
|
25801
|
+
let length = 0;
|
|
25802
|
+
for (const c of s) ++length;
|
|
25803
|
+
return length;
|
|
25804
|
+
}
|
|
25805
|
+
|
|
25791
25806
|
|
|
25792
25807
|
/***/ }),
|
|
25793
25808
|
|
|
@@ -26035,7 +26050,7 @@ class CharTrie {
|
|
|
26035
26050
|
* @param {string[]} texts The strings to add to the trie.
|
|
26036
26051
|
*/
|
|
26037
26052
|
extend(texts) {
|
|
26038
|
-
for (
|
|
26053
|
+
for (const text of texts) {
|
|
26039
26054
|
this.push(text);
|
|
26040
26055
|
}
|
|
26041
26056
|
}
|
|
@@ -26046,7 +26061,7 @@ class CharTrie {
|
|
|
26046
26061
|
*/
|
|
26047
26062
|
push(text) {
|
|
26048
26063
|
let node = this.root;
|
|
26049
|
-
for (
|
|
26064
|
+
for (const ch of text) {
|
|
26050
26065
|
let child = node.children.get(ch);
|
|
26051
26066
|
if (child === undefined) {
|
|
26052
26067
|
child = CharTrieNode.default();
|
|
@@ -26064,12 +26079,14 @@ class CharTrie {
|
|
|
26064
26079
|
*/
|
|
26065
26080
|
*commonPrefixSearch(text) {
|
|
26066
26081
|
let node = this.root;
|
|
26082
|
+
if (node === undefined) return;
|
|
26083
|
+
|
|
26067
26084
|
let prefix = "";
|
|
26068
|
-
for (let i = 0; i < text.length; ++i) {
|
|
26069
|
-
const ch = text[i];
|
|
26085
|
+
for (const ch of text) {
|
|
26070
26086
|
prefix += ch;
|
|
26071
26087
|
node = node.children.get(ch);
|
|
26072
|
-
if (node
|
|
26088
|
+
if (node === undefined) return;
|
|
26089
|
+
if (node.isLeaf) {
|
|
26073
26090
|
yield prefix;
|
|
26074
26091
|
}
|
|
26075
26092
|
}
|
|
@@ -26111,8 +26128,8 @@ class TokenLattice {
|
|
|
26111
26128
|
* @param {number} eosTokenId The end-of-sequence token ID.
|
|
26112
26129
|
*/
|
|
26113
26130
|
constructor(sentence, bosTokenId, eosTokenId) {
|
|
26114
|
-
this.
|
|
26115
|
-
this.len =
|
|
26131
|
+
this.chars = Array.from(sentence);
|
|
26132
|
+
this.len = this.chars.length;
|
|
26116
26133
|
this.bosTokenId = bosTokenId;
|
|
26117
26134
|
this.eosTokenId = eosTokenId;
|
|
26118
26135
|
this.nodes = [];
|
|
@@ -26146,7 +26163,7 @@ class TokenLattice {
|
|
|
26146
26163
|
/**
|
|
26147
26164
|
* Implements the Viterbi algorithm to compute the most likely sequence of tokens.
|
|
26148
26165
|
*
|
|
26149
|
-
* @returns {TokenLatticeNode[]} The
|
|
26166
|
+
* @returns {TokenLatticeNode[]} The most likely sequence of tokens.
|
|
26150
26167
|
*/
|
|
26151
26168
|
viterbi() {
|
|
26152
26169
|
const len = this.len;
|
|
@@ -26200,11 +26217,11 @@ class TokenLattice {
|
|
|
26200
26217
|
* @returns {string} The array of nodes representing the most likely sequence of tokens.
|
|
26201
26218
|
*/
|
|
26202
26219
|
piece(node) {
|
|
26203
|
-
return this.
|
|
26220
|
+
return this.chars.slice(node.pos, node.pos + node.length).join('');
|
|
26204
26221
|
}
|
|
26205
26222
|
|
|
26206
26223
|
/**
|
|
26207
|
-
* @returns {
|
|
26224
|
+
* @returns {string[]} The most likely sequence of tokens.
|
|
26208
26225
|
*/
|
|
26209
26226
|
tokens() {
|
|
26210
26227
|
const nodes = this.viterbi();
|
|
@@ -26212,7 +26229,7 @@ class TokenLattice {
|
|
|
26212
26229
|
}
|
|
26213
26230
|
|
|
26214
26231
|
/**
|
|
26215
|
-
* @returns {
|
|
26232
|
+
* @returns {number[]} The most likely sequence of token ids.
|
|
26216
26233
|
*/
|
|
26217
26234
|
tokenIds() {
|
|
26218
26235
|
const nodes = this.viterbi();
|
|
@@ -26453,7 +26470,7 @@ __webpack_require__.r(__webpack_exports__);
|
|
|
26453
26470
|
/**
|
|
26454
26471
|
* @typedef {Object} PretrainedOptions Options for loading a pretrained model.
|
|
26455
26472
|
* @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
|
|
26456
|
-
* @property {
|
|
26473
|
+
* @property {import('../configs.js').PretrainedConfig} [config=null] Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
|
|
26457
26474
|
* - The model is a model provided by the library (loaded with the *model id* string of a pretrained model).
|
|
26458
26475
|
* - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory.
|
|
26459
26476
|
* @property {string} [cache_dir=null] Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used.
|