@huggingface/transformers 3.0.0-alpha.17 → 3.0.0-alpha.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -101,7 +101,7 @@ npm i @huggingface/transformers
101
101
  Alternatively, you can use it in vanilla JS, without any bundler, by using a CDN or static hosting. For example, using [ES Modules](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Modules), you can import the library with:
102
102
  ```html
103
103
  <script type="module">
104
- import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.17';
104
+ import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.18';
105
105
  </script>
106
106
  ```
107
107
 
@@ -134,7 +134,7 @@ Check out the Transformers.js [template](https://huggingface.co/new-space?templa
134
134
 
135
135
 
136
136
 
137
- By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.17/dist/), which should work out-of-the-box. You can customize this as follows:
137
+ By default, Transformers.js uses [hosted pretrained models](https://huggingface.co/models?library=transformers.js) and [precompiled WASM binaries](https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.0.0-alpha.18/dist/), which should work out-of-the-box. You can customize this as follows:
138
138
 
139
139
  ### Settings
140
140
 
@@ -4449,7 +4449,7 @@ __webpack_require__.r(__webpack_exports__);
4449
4449
 
4450
4450
 
4451
4451
 
4452
- const VERSION = '3.0.0-alpha.17';
4452
+ const VERSION = '3.0.0-alpha.18';
4453
4453
 
4454
4454
  // Check if various APIs are available (depends on environment)
4455
4455
  const IS_BROWSER_ENV = typeof self !== 'undefined';
@@ -21114,7 +21114,7 @@ class BPE extends TokenizerModel {
21114
21114
  * Create a BPE instance.
21115
21115
  * @param {Object} config The configuration object for BPE.
21116
21116
  * @param {Object} config.vocab A mapping of tokens to ids.
21117
- * @param {string[]} config.merges An array of BPE merges as strings.
21117
+ * @param {string[]|[string, string][]} config.merges An array of BPE merges, given either as space-separated strings or as [string, string] pairs.
21118
21118
  * @param {string} config.unk_token The unknown token used for out of vocabulary words.
21119
21119
  * @param {string} config.end_of_word_suffix The suffix to place at the end of each word.
21120
21120
  * @param {string} [config.continuing_subword_suffix] The suffix to insert between words.
@@ -21124,8 +21124,6 @@ class BPE extends TokenizerModel {
21124
21124
  constructor(config) {
21125
21125
  super(config);
21126
21126
 
21127
- this.BPE_SPLIT_TOKEN = ' ';
21128
-
21129
21127
  /** @type {Map<string, number>} */
21130
21128
  this.tokens_to_ids = objectToMap(config.vocab);
21131
21129
 
@@ -21137,8 +21135,15 @@ class BPE extends TokenizerModel {
21137
21135
  this.vocab[value] = key;
21138
21136
  }
21139
21137
 
21140
- this.bpe_ranks = new Map(config.merges.map((x, i) => [x, i]));
21141
- this.merges = config.merges.map(x => x.split(this.BPE_SPLIT_TOKEN));
21138
+ // Tokenizers >= 0.20.0 serializes BPE merges as a [string, string][] instead of a string[],
21139
+ // which resolves the ambiguity for merges containing spaces.
21140
+ const use_new_merge_format = Array.isArray(config.merges[0]);
21141
+
21142
+ /** @type {[string, string][]} */
21143
+ this.merges = use_new_merge_format
21144
+ ? /** @type {[string, string][]} */(config.merges)
21145
+ : (/** @type {string[]} */(config.merges)).map(x => /** @type {[string, string]} */(x.split(' ', 2)));
21146
+ this.bpe_ranks = new Map(this.merges.map((x, i) => [JSON.stringify(x), i]));
21142
21147
 
21143
21148
  this.end_of_word_suffix = config.end_of_word_suffix;
21144
21149
 
@@ -21298,7 +21303,7 @@ class BPE extends TokenizerModel {
21298
21303
  // `score` is a measure of the merge priority: lower means higher priority
21299
21304
  // We use the BPE rank as a measure of priority (i.e., the local of the merge in the merges list)
21300
21305
  // We also add a fractional component to the score to break ties (with the earlier character having higher priority)
21301
- const rank = this.bpe_ranks.get(node.token + this.BPE_SPLIT_TOKEN + node.next.token);
21306
+ const rank = this.bpe_ranks.get(JSON.stringify([node.token, node.next.token]));
21302
21307
  if (rank !== undefined) {
21303
21308
  node.score = rank + node.bias;
21304
21309
  queue.push(node);