@huggingface/transformers 3.0.0-alpha.17 → 3.0.0-alpha.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -101,7 +101,7 @@ npm i @huggingface/transformers
101
101
  Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
102
102
  ```html
103
103
  <script type="module">
104
- import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.17';
104
+ import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.19';
105
105
  </script>
106
106
  ```
107
107
 
@@ -134,7 +134,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa
134
134
 
135
135
 
136
136
 
137
- By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.17/dist/), which should work out-of-the-box. You can customize this as follows:
137
+ By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.19/dist/), which should work out-of-the-box. You can customize this as follows:
138
138
 
139
139
  ### Settings
140
140
 
@@ -4449,7 +4449,7 @@ __webpack_require__.r(__webpack_exports__);
4449
4449
 
4450
4450
 
4451
4451
 
4452
- const VERSION = '3.0.0-alpha.17';
4452
+ const VERSION = '3.0.0-alpha.19';
4453
4453
 
4454
4454
  // Check if various APIs are available (depends on environment)
4455
4455
  const IS_BROWSER_ENV = typeof self !== 'undefined';
@@ -7005,9 +7005,6 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
7005
7005
  /** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
7006
7006
  const preferredOutputLocation = {};
7007
7007
  for (const key in shapes) {
7008
- // TODO: For now, we keep encoder outputs on the CPU
7009
- // (otherwise, this causes a memory leak or throws an error "Error: previous buffer is not registered")
7010
- if (key.includes('encoder')) continue;
7011
7008
  preferredOutputLocation[key] = 'gpu-buffer';
7012
7009
  }
7013
7010
  session_options.preferredOutputLocation = preferredOutputLocation;
@@ -21114,7 +21111,7 @@ class BPE extends TokenizerModel {
21114
21111
  * Create a BPE instance.
21115
21112
  * @param {Object} config The configuration object for BPE.
21116
21113
  * @param {Object} config.vocab A mapping of tokens to ids.
21117
- * @param {string[]} config.merges An array of BPE merges as strings.
21114
+ * @param {string[]|[string, string][]} config.merges An array of BPE merges as strings.
21118
21115
  * @param {string} config.unk_token The unknown token used for out of vocabulary words.
21119
21116
  * @param {string} config.end_of_word_suffix The suffix to place at the end of each word.
21120
21117
  * @param {string} [config.continuing_subword_suffix] The suffix to insert between words.
@@ -21124,8 +21121,6 @@ class BPE extends TokenizerModel {
21124
21121
  constructor(config) {
21125
21122
  super(config);
21126
21123
 
21127
- this.BPE_SPLIT_TOKEN = ' ';
21128
-
21129
21124
  /** @type {Map<string, number>} */
21130
21125
  this.tokens_to_ids = objectToMap(config.vocab);
21131
21126
 
@@ -21137,8 +21132,15 @@ class BPE extends TokenizerModel {
21137
21132
  this.vocab[value] = key;
21138
21133
  }
21139
21134
 
21140
- this.bpe_ranks = new Map(config.merges.map((x, i) => [x, i]));
21141
- this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
21135
+ // Tokenizers >= 0.20.0 serializes BPE merges as a [string, string][] instead of a string[],
21136
+ // which resolves the ambiguity for merges containing spaces.
21137
+ const use_new_merge_format = Array.isArray(config.merges[0]);
21138
+
21139
+ /** @type {[string, string][]} */
21140
+ this.merges = use_new_merge_format
21141
+ ? /** @type {[string, string][]} */(config.merges)
21142
+ : (/** @type {string[]} */(config.merges)).map(x => /** @type {[string, string]} */(x.split(' ', 2)));
21143
+ this.bpe_ranks = new Map(this.merges.map((x, i) => [JSON.stringify(x), i]));
21142
21144
 
21143
21145
  this.end_of_word_suffix = config.end_of_word_suffix;
21144
21146
 
@@ -21298,7 +21300,7 @@ class BPE extends TokenizerModel {
21298
21300
  // `score` is a measure of the merge priority: lower means higher priority
21299
21301
 // We use the BPE rank as a measure of priority (i.e., the location of the merge in the merges list)
21300
21302
  // We also add a fractional component to the score to break ties (with the earlier character having higher priority)
21301
- const rank = this.bpe_ranks.get(node.token + this.BPE_SPLIT_TOKEN + node.next.token);
21303
+ const rank = this.bpe_ranks.get(JSON.stringify([node.token, node.next.token]));
21302
21304
  if (rank !== undefined) {
21303
21305
  node.score = rank + node.bias;
21304
21306
  queue.push(node);