@huggingface/transformers 3.0.0-alpha.14 → 3.0.0-alpha.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -6
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +678 -443
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +1107 -825
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +14 -14
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +17 -17
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +52 -52
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +699 -444
- package/dist/transformers.mjs.map +1 -1
- package/package.json +4 -5
- package/src/configs.js +16 -4
- package/src/env.js +4 -4
- package/src/models.js +151 -58
- package/src/pipelines.js +5 -4
- package/src/processors.js +313 -285
- package/src/tokenizers.js +111 -72
- package/src/utils/core.js +12 -0
- package/src/utils/data-structures.js +13 -11
- package/src/utils/hub.js +1 -1
- package/src/utils/maths.js +13 -4
- package/types/configs.d.ts +25 -3
- package/types/configs.d.ts.map +1 -1
- package/types/models.d.ts +63 -2
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts.map +1 -1
- package/types/processors.d.ts +42 -52
- package/types/processors.d.ts.map +1 -1
- package/types/tokenizers.d.ts +23 -1
- package/types/tokenizers.d.ts.map +1 -1
- package/types/utils/core.d.ts +7 -0
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/data-structures.d.ts +6 -6
- package/types/utils/data-structures.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/maths.d.ts.map +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huggingface/transformers",
|
|
3
|
-
"version": "3.0.0-alpha.14",
|
|
3
|
+
"version": "3.0.0-alpha.16",
|
|
4
4
|
"description": "State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!",
|
|
5
5
|
"main": "./src/transformers.js",
|
|
6
6
|
"types": "./types/transformers.d.ts",
|
|
@@ -33,8 +33,7 @@
|
|
|
33
33
|
"typegen": "tsc ./src/transformers.js --allowJs --declaration --emitDeclarationOnly --declarationMap --outDir types",
|
|
34
34
|
"dev": "webpack serve --no-client-overlay",
|
|
35
35
|
"build": "webpack && npm run typegen",
|
|
36
|
-
"
|
|
37
|
-
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose --maxConcurrency 1",
|
|
36
|
+
"test": "node --experimental-vm-modules node_modules/jest/bin/jest.js --verbose",
|
|
38
37
|
"readme": "python ./docs/scripts/build_readme.py",
|
|
39
38
|
"docs-api": "node ./docs/scripts/generate.js",
|
|
40
39
|
"docs-preview": "doc-builder preview transformers.js ./docs/source/ --not_python_module",
|
|
@@ -63,8 +62,8 @@
|
|
|
63
62
|
"homepage": "https://github.com/xenova/transformers.js#readme",
|
|
64
63
|
"dependencies": {
|
|
65
64
|
"@huggingface/jinja": "^0.3.0",
|
|
66
|
-
"onnxruntime-node": "1.19.
|
|
67
|
-
"onnxruntime-web": "1.20.0-dev.
|
|
65
|
+
"onnxruntime-node": "1.19.2",
|
|
66
|
+
"onnxruntime-web": "1.20.0-dev.20240908-de7a02beef",
|
|
68
67
|
"sharp": "^0.33.5"
|
|
69
68
|
},
|
|
70
69
|
"devDependencies": {
|
package/src/configs.js
CHANGED
|
@@ -296,16 +296,23 @@ export function getKeyValueShapes(config, {
|
|
|
296
296
|
export class PretrainedConfig {
|
|
297
297
|
// NOTE: Typo in original
|
|
298
298
|
|
|
299
|
+
/** @type {string|null} */
|
|
300
|
+
model_type = null;
|
|
301
|
+
|
|
302
|
+
/** @type {boolean} */
|
|
303
|
+
is_encoder_decoder = false;
|
|
304
|
+
|
|
305
|
+
/** @type {number} */
|
|
299
306
|
max_position_embeddings;
|
|
300
307
|
|
|
308
|
+
/** @type {TransformersJSConfig} */
|
|
309
|
+
'transformers.js_config';
|
|
310
|
+
|
|
301
311
|
/**
|
|
302
312
|
* Create a new PreTrainedTokenizer instance.
|
|
303
313
|
* @param {Object} configJSON The JSON of the config.
|
|
304
314
|
*/
|
|
305
315
|
constructor(configJSON) {
|
|
306
|
-
this.model_type = null;
|
|
307
|
-
this.is_encoder_decoder = false;
|
|
308
|
-
|
|
309
316
|
Object.assign(this, configJSON);
|
|
310
317
|
this.normalized_config = getNormalizedConfig(this);
|
|
311
318
|
}
|
|
@@ -357,5 +364,10 @@ export class AutoConfig {
|
|
|
357
364
|
/**
|
|
358
365
|
* Transformers.js-specific configuration, possibly present in config.json under the key `transformers.js_config`.
|
|
359
366
|
* @typedef {Object} TransformersJSConfig
|
|
360
|
-
* @property {import('./
|
|
367
|
+
* @property {import('./utils/tensor.js').DataType} [kv_cache_dtype] The data type of the key-value cache.
|
|
368
|
+
* @property {Record<string, number>} [free_dimension_overrides] Override the free dimensions of the model.
|
|
369
|
+
* See https://onnxruntime.ai/docs/tutorials/web/env-flags-and-session-options.html#freedimensionoverrides
|
|
370
|
+
* for more information.
|
|
371
|
+
* @property {import('./utils/devices.js').DeviceType} [device] The default device to use for the model.
|
|
372
|
+
* @property {import('./utils/dtypes.js').DataType} [dtype] The default data type to use for the model.
|
|
361
373
|
*/
|
package/src/env.js
CHANGED
|
@@ -26,7 +26,7 @@ import fs from 'fs';
|
|
|
26
26
|
import path from 'path';
|
|
27
27
|
import url from 'url';
|
|
28
28
|
|
|
29
|
-
const VERSION = '3.0.0-alpha.14';
|
|
29
|
+
const VERSION = '3.0.0-alpha.16';
|
|
30
30
|
|
|
31
31
|
// Check if various APIs are available (depends on environment)
|
|
32
32
|
const IS_BROWSER_ENV = typeof self !== 'undefined';
|
|
@@ -73,19 +73,19 @@ export const apis = Object.freeze({
|
|
|
73
73
|
});
|
|
74
74
|
|
|
75
75
|
const RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
|
|
76
|
-
const
|
|
76
|
+
const dirname__ = RUNNING_LOCALLY
|
|
77
77
|
? path.dirname(path.dirname(url.fileURLToPath(import.meta.url)))
|
|
78
78
|
: './';
|
|
79
79
|
|
|
80
80
|
// Only used for environments with access to file system
|
|
81
81
|
const DEFAULT_CACHE_DIR = RUNNING_LOCALLY
|
|
82
|
-
? path.join(
|
|
82
|
+
? path.join(dirname__, '/.cache/')
|
|
83
83
|
: null;
|
|
84
84
|
|
|
85
85
|
// Set local model path, based on available APIs
|
|
86
86
|
const DEFAULT_LOCAL_MODEL_PATH = '/models/';
|
|
87
87
|
const localModelPath = RUNNING_LOCALLY
|
|
88
|
-
? path.join(
|
|
88
|
+
? path.join(dirname__, DEFAULT_LOCAL_MODEL_PATH)
|
|
89
89
|
: DEFAULT_LOCAL_MODEL_PATH;
|
|
90
90
|
|
|
91
91
|
/**
|
package/src/models.js
CHANGED
|
@@ -146,7 +146,8 @@ const MODEL_CLASS_TO_NAME_MAPPING = new Map();
|
|
|
146
146
|
* @private
|
|
147
147
|
*/
|
|
148
148
|
async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
149
|
-
|
|
149
|
+
const custom_config = options.config?.['transformers.js_config'] ?? {};
|
|
150
|
+
let device = options.device ?? custom_config.device;
|
|
150
151
|
if (device && typeof device !== 'string') {
|
|
151
152
|
if (device.hasOwnProperty(fileName)) {
|
|
152
153
|
device = device[fileName];
|
|
@@ -164,7 +165,7 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
164
165
|
|
|
165
166
|
// If options.dtype is specified, we use it to choose the suffix for the model file.
|
|
166
167
|
// Otherwise, we use the default dtype for the device.
|
|
167
|
-
let dtype = options.dtype;
|
|
168
|
+
let dtype = options.dtype ?? custom_config.dtype;
|
|
168
169
|
if (typeof dtype !== 'string') {
|
|
169
170
|
if (dtype && dtype.hasOwnProperty(fileName)) {
|
|
170
171
|
dtype = dtype[fileName];
|
|
@@ -191,6 +192,16 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
191
192
|
// Overwrite `executionProviders` if not specified
|
|
192
193
|
session_options.executionProviders ??= executionProviders;
|
|
193
194
|
|
|
195
|
+
// Overwrite `freeDimensionOverrides` if specified in config and not set in session options
|
|
196
|
+
const free_dimension_overrides = custom_config.free_dimension_overrides;
|
|
197
|
+
if (free_dimension_overrides) {
|
|
198
|
+
session_options.freeDimensionOverrides ??= free_dimension_overrides;
|
|
199
|
+
} else if (selectedDevice.startsWith('webnn') && !session_options.freeDimensionOverrides) {
|
|
200
|
+
console.warn(
|
|
201
|
+
'WebNN does not currently support dynamic shapes and requires `free_dimension_overrides` to be set in config.json as a field within "transformers.js_config". ' +
|
|
202
|
+
'When `free_dimension_overrides` is not set, you may experience significant performance degradation.'
|
|
203
|
+
);
|
|
204
|
+
}
|
|
194
205
|
|
|
195
206
|
const bufferPromise = getModelFile(pretrained_model_name_or_path, modelFileName, true, options);
|
|
196
207
|
|
|
@@ -239,6 +250,9 @@ async function getSession(pretrained_model_name_or_path, fileName, options) {
|
|
|
239
250
|
/** @type {Record<string, import('onnxruntime-common').Tensor.DataLocation>} */
|
|
240
251
|
const preferredOutputLocation = {};
|
|
241
252
|
for (const key in shapes) {
|
|
253
|
+
// TODO: For now, we keep encoder outputs on the CPU
|
|
254
|
+
// (otherwise, this causes a memory leak or throws an error "Error: previous buffer is not registered")
|
|
255
|
+
if (key.includes('encoder')) continue;
|
|
242
256
|
preferredOutputLocation[key] = 'gpu-buffer';
|
|
243
257
|
}
|
|
244
258
|
session_options.preferredOutputLocation = preferredOutputLocation;
|
|
@@ -394,37 +408,6 @@ function toI64Tensor(items) {
|
|
|
394
408
|
}
|
|
395
409
|
}
|
|
396
410
|
|
|
397
|
-
/**
|
|
398
|
-
* Prepares an attention mask for a sequence of tokens based on configuration options.
|
|
399
|
-
* @param {Object} self The calling object instance.
|
|
400
|
-
* @param {Tensor} tokens The input tokens.
|
|
401
|
-
* @returns {Tensor} The attention mask tensor.
|
|
402
|
-
* @private
|
|
403
|
-
*/
|
|
404
|
-
function prepareAttentionMask(self, tokens) {
|
|
405
|
-
|
|
406
|
-
// Prepare attention mask
|
|
407
|
-
let pad_token_id = self.config.pad_token_id ?? null;
|
|
408
|
-
let eos_token_id = self.config.eos_token_id ?? null;
|
|
409
|
-
if (isIntegralNumber(eos_token_id)) {
|
|
410
|
-
eos_token_id = [eos_token_id];
|
|
411
|
-
}
|
|
412
|
-
|
|
413
|
-
let is_pad_token_in_inputs = tokens.indexOf(pad_token_id) !== -1;
|
|
414
|
-
let is_pad_token_not_equal_to_eos_token_id = (eos_token_id === null) || !eos_token_id.includes(pad_token_id)
|
|
415
|
-
|
|
416
|
-
if (is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id) {
|
|
417
|
-
let data = BigInt64Array.from(
|
|
418
|
-
// Note: != so that int matches bigint
|
|
419
|
-
// @ts-ignore
|
|
420
|
-
tokens.data.map(x => x != pad_token_id)
|
|
421
|
-
)
|
|
422
|
-
return new Tensor('int64', data, tokens.dims)
|
|
423
|
-
} else {
|
|
424
|
-
return ones_like(tokens);
|
|
425
|
-
}
|
|
426
|
-
}
|
|
427
|
-
|
|
428
411
|
/**
|
|
429
412
|
* Creates a boolean tensor with a single value.
|
|
430
413
|
* @param {boolean} value The value of the tensor.
|
|
@@ -695,8 +678,8 @@ function image_text_to_text_prepare_inputs_for_generation(self, ...args) {
|
|
|
695
678
|
} else {
|
|
696
679
|
return decoder_prepare_inputs_for_generation(self, ...args);
|
|
697
680
|
}
|
|
698
|
-
|
|
699
681
|
}
|
|
682
|
+
|
|
700
683
|
//////////////////////////////////////////////////
|
|
701
684
|
|
|
702
685
|
//////////////////////////////////////////////////
|
|
@@ -1459,13 +1442,12 @@ export class PreTrainedModel extends Callable {
|
|
|
1459
1442
|
// - GenerationMode.BEAM_SEARCH
|
|
1460
1443
|
// - GenerationMode.BEAM_SAMPLE
|
|
1461
1444
|
////////////////////////////////////////////////////
|
|
1462
|
-
let
|
|
1445
|
+
let outputs;
|
|
1463
1446
|
let attentions = {};
|
|
1464
1447
|
while (true) {
|
|
1465
1448
|
// prepare model inputs
|
|
1466
1449
|
model_inputs = this.prepare_inputs_for_generation(all_input_ids, model_inputs, generation_config);
|
|
1467
|
-
|
|
1468
|
-
const outputs = await this.forward(model_inputs);
|
|
1450
|
+
outputs = await this.forward(model_inputs);
|
|
1469
1451
|
|
|
1470
1452
|
if (generation_config.output_attentions && generation_config.return_dict_in_generate) {
|
|
1471
1453
|
// Get attentions if they are present
|
|
@@ -1512,10 +1494,6 @@ export class PreTrainedModel extends Callable {
|
|
|
1512
1494
|
|
|
1513
1495
|
const stop = prepared_stopping_criteria(all_input_ids);
|
|
1514
1496
|
if (stop.every(x => x)) {
|
|
1515
|
-
if (generation_config.return_dict_in_generate) {
|
|
1516
|
-
// Get past key values without disposing buffers
|
|
1517
|
-
past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, false);
|
|
1518
|
-
}
|
|
1519
1497
|
break;
|
|
1520
1498
|
}
|
|
1521
1499
|
|
|
@@ -1528,6 +1506,9 @@ export class PreTrainedModel extends Callable {
|
|
|
1528
1506
|
streamer.end();
|
|
1529
1507
|
}
|
|
1530
1508
|
|
|
1509
|
+
// Retrieve and dispose all final past key values (including encoder attentions)
|
|
1510
|
+
const past_key_values = this.getPastKeyValues(outputs, model_inputs.past_key_values, true);
|
|
1511
|
+
|
|
1531
1512
|
// TODO: ensure all_input_ids is padded correctly...
|
|
1532
1513
|
const sequences = new Tensor('int64', all_input_ids.flat(), [all_input_ids.length, all_input_ids[0].length]);
|
|
1533
1514
|
|
|
@@ -1541,6 +1522,12 @@ export class PreTrainedModel extends Callable {
|
|
|
1541
1522
|
// logits,
|
|
1542
1523
|
}
|
|
1543
1524
|
} else {
|
|
1525
|
+
// Dispose all remaining tensors
|
|
1526
|
+
for (const tensor of Object.values(outputs)) {
|
|
1527
|
+
if (tensor.location === 'gpu-buffer') {
|
|
1528
|
+
tensor.dispose();
|
|
1529
|
+
}
|
|
1530
|
+
}
|
|
1544
1531
|
return sequences;
|
|
1545
1532
|
}
|
|
1546
1533
|
}
|
|
@@ -1550,31 +1537,32 @@ export class PreTrainedModel extends Callable {
|
|
|
1550
1537
|
*
|
|
1551
1538
|
* @param {Object} decoderResults The decoder results object.
|
|
1552
1539
|
* @param {Object} pastKeyValues The previous past key values.
|
|
1553
|
-
* @param {boolean} [dispose=true] Whether to dispose of the old gpu buffer.
|
|
1554
1540
|
* @returns {Object} An object containing past key values.
|
|
1555
1541
|
*/
|
|
1556
|
-
getPastKeyValues(decoderResults, pastKeyValues, dispose = true) {
|
|
1542
|
+
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
1557
1543
|
const pkvs = Object.create(null);
|
|
1558
1544
|
|
|
1559
1545
|
for (const name in decoderResults) {
|
|
1560
1546
|
if (name.startsWith('present')) {
|
|
1561
1547
|
const newName = name.replace('present', 'past_key_values');
|
|
1562
|
-
|
|
1563
|
-
if (
|
|
1564
|
-
// Optimization introduced by optimum to reuse past key values.
|
|
1565
|
-
// outputs with the previous past key values.
|
|
1548
|
+
const is_encoder_pkv = name.includes('encoder');
|
|
1549
|
+
if (is_encoder_pkv && pastKeyValues) {
|
|
1550
|
+
// Optimization introduced by optimum to reuse past key values.
|
|
1551
|
+
// So, we just replace the constant outputs (`decoderResults[name]`) with the previous past key values.
|
|
1566
1552
|
// https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704
|
|
1567
1553
|
pkvs[newName] = pastKeyValues[newName];
|
|
1568
|
-
} else {
|
|
1569
|
-
if (dispose && pastKeyValues) {
|
|
1570
|
-
// Free old gpu buffer
|
|
1571
|
-
const t = pastKeyValues[newName];
|
|
1572
|
-
if (t.location === 'gpu-buffer') {
|
|
1573
|
-
t.dispose();
|
|
1574
|
-
}
|
|
1575
|
-
}
|
|
1554
|
+
} else { // decoder or using first encoder PKVs
|
|
1576
1555
|
pkvs[newName] = decoderResults[name];
|
|
1577
1556
|
}
|
|
1557
|
+
|
|
1558
|
+
if (pastKeyValues && (!is_encoder_pkv || disposeEncoderPKVs)) {
|
|
1559
|
+
// - Always dispose decoder PKVs
|
|
1560
|
+
// - Only dispose encoder past key values when requested (after generation)
|
|
1561
|
+
const t = pastKeyValues[newName];
|
|
1562
|
+
if (t.location === 'gpu-buffer') {
|
|
1563
|
+
t.dispose();
|
|
1564
|
+
}
|
|
1565
|
+
}
|
|
1578
1566
|
}
|
|
1579
1567
|
}
|
|
1580
1568
|
return pkvs;
|
|
@@ -3502,6 +3490,18 @@ export class CLIPPreTrainedModel extends PreTrainedModel { }
|
|
|
3502
3490
|
*/
|
|
3503
3491
|
export class CLIPModel extends CLIPPreTrainedModel { }
|
|
3504
3492
|
|
|
3493
|
+
/**
|
|
3494
|
+
* The text model from CLIP without any head or projection on top.
|
|
3495
|
+
*/
|
|
3496
|
+
export class CLIPTextModel extends CLIPPreTrainedModel {
|
|
3497
|
+
/** @type {PreTrainedModel.from_pretrained} */
|
|
3498
|
+
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3499
|
+
// Update default model file name if not provided
|
|
3500
|
+
options.model_file_name ??= 'text_model';
|
|
3501
|
+
return super.from_pretrained(pretrained_model_name_or_path, options);
|
|
3502
|
+
}
|
|
3503
|
+
}
|
|
3504
|
+
|
|
3505
3505
|
/**
|
|
3506
3506
|
* CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output)
|
|
3507
3507
|
*
|
|
@@ -3529,7 +3529,6 @@ export class CLIPModel extends CLIPPreTrainedModel { }
|
|
|
3529
3529
|
* ```
|
|
3530
3530
|
*/
|
|
3531
3531
|
export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
3532
|
-
|
|
3533
3532
|
/** @type {PreTrainedModel.from_pretrained} */
|
|
3534
3533
|
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3535
3534
|
// Update default model file name if not provided
|
|
@@ -3538,6 +3537,18 @@ export class CLIPTextModelWithProjection extends CLIPPreTrainedModel {
|
|
|
3538
3537
|
}
|
|
3539
3538
|
}
|
|
3540
3539
|
|
|
3540
|
+
/**
|
|
3541
|
+
* The vision model from CLIP without any head or projection on top.
|
|
3542
|
+
*/
|
|
3543
|
+
export class CLIPVisionModel extends CLIPPreTrainedModel {
|
|
3544
|
+
/** @type {PreTrainedModel.from_pretrained} */
|
|
3545
|
+
static async from_pretrained(pretrained_model_name_or_path, options = {}) {
|
|
3546
|
+
// Update default model file name if not provided
|
|
3547
|
+
options.model_file_name ??= 'vision_model';
|
|
3548
|
+
return super.from_pretrained(pretrained_model_name_or_path, options);
|
|
3549
|
+
}
|
|
3550
|
+
}
|
|
3551
|
+
|
|
3541
3552
|
/**
|
|
3542
3553
|
* CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
|
|
3543
3554
|
*
|
|
@@ -4204,6 +4215,43 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
|
|
|
4204
4215
|
}
|
|
4205
4216
|
//////////////////////////////////////////////////
|
|
4206
4217
|
|
|
4218
|
+
//////////////////////////////////////////////////
|
|
4219
|
+
export class PvtPreTrainedModel extends PreTrainedModel { }
|
|
4220
|
+
export class PvtModel extends PvtPreTrainedModel { }
|
|
4221
|
+
export class PvtForImageClassification extends PvtPreTrainedModel {
|
|
4222
|
+
/**
|
|
4223
|
+
* @param {any} model_inputs
|
|
4224
|
+
*/
|
|
4225
|
+
async _call(model_inputs) {
|
|
4226
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
4227
|
+
}
|
|
4228
|
+
}
|
|
4229
|
+
//////////////////////////////////////////////////
|
|
4230
|
+
|
|
4231
|
+
//////////////////////////////////////////////////
|
|
4232
|
+
export class ViTMAEPreTrainedModel extends PreTrainedModel { }
|
|
4233
|
+
export class ViTMAEModel extends ViTMAEPreTrainedModel { }
|
|
4234
|
+
//////////////////////////////////////////////////
|
|
4235
|
+
|
|
4236
|
+
|
|
4237
|
+
//////////////////////////////////////////////////
|
|
4238
|
+
export class ViTMSNPreTrainedModel extends PreTrainedModel { }
|
|
4239
|
+
export class ViTMSNModel extends ViTMSNPreTrainedModel { }
|
|
4240
|
+
export class ViTMSNForImageClassification extends ViTMSNPreTrainedModel {
|
|
4241
|
+
/**
|
|
4242
|
+
* @param {any} model_inputs
|
|
4243
|
+
*/
|
|
4244
|
+
async _call(model_inputs) {
|
|
4245
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
4246
|
+
}
|
|
4247
|
+
}
|
|
4248
|
+
//////////////////////////////////////////////////
|
|
4249
|
+
|
|
4250
|
+
//////////////////////////////////////////////////
|
|
4251
|
+
export class GroupViTPreTrainedModel extends PreTrainedModel { }
|
|
4252
|
+
export class GroupViTModel extends GroupViTPreTrainedModel { }
|
|
4253
|
+
//////////////////////////////////////////////////
|
|
4254
|
+
|
|
4207
4255
|
|
|
4208
4256
|
//////////////////////////////////////////////////
|
|
4209
4257
|
export class FastViTPreTrainedModel extends PreTrainedModel { }
|
|
@@ -4616,6 +4664,11 @@ export class SapiensForDepthEstimation extends SapiensPreTrainedModel { }
|
|
|
4616
4664
|
export class SapiensForNormalEstimation extends SapiensPreTrainedModel { }
|
|
4617
4665
|
//////////////////////////////////////////////////
|
|
4618
4666
|
|
|
4667
|
+
//////////////////////////////////////////////////
|
|
4668
|
+
export class MaskFormerPreTrainedModel extends PreTrainedModel { }
|
|
4669
|
+
export class MaskFormerModel extends MaskFormerPreTrainedModel { }
|
|
4670
|
+
export class MaskFormerForInstanceSegmentation extends MaskFormerPreTrainedModel { }
|
|
4671
|
+
//////////////////////////////////////////////////
|
|
4619
4672
|
|
|
4620
4673
|
//////////////////////////////////////////////////
|
|
4621
4674
|
export class GLPNPreTrainedModel extends PreTrainedModel { }
|
|
@@ -6138,6 +6191,7 @@ export class MusicgenForConditionalGeneration extends PreTrainedModel { // NOTE:
|
|
|
6138
6191
|
return audio_values;
|
|
6139
6192
|
}
|
|
6140
6193
|
}
|
|
6194
|
+
//////////////////////////////////////////////////
|
|
6141
6195
|
|
|
6142
6196
|
//////////////////////////////////////////////////
|
|
6143
6197
|
// MobileNetV1 models
|
|
@@ -6231,6 +6285,17 @@ export class MobileNetV4ForImageClassification extends MobileNetV4PreTrainedMode
|
|
|
6231
6285
|
}
|
|
6232
6286
|
//////////////////////////////////////////////////
|
|
6233
6287
|
|
|
6288
|
+
//////////////////////////////////////////////////
|
|
6289
|
+
// Decision Transformer models
|
|
6290
|
+
export class DecisionTransformerPreTrainedModel extends PreTrainedModel { }
|
|
6291
|
+
|
|
6292
|
+
/**
|
|
6293
|
+
* The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL setting.
|
|
6294
|
+
* Refer to the paper for more details: https://arxiv.org/abs/2106.01345
|
|
6295
|
+
*/
|
|
6296
|
+
export class DecisionTransformerModel extends DecisionTransformerPreTrainedModel { }
|
|
6297
|
+
|
|
6298
|
+
//////////////////////////////////////////////////
|
|
6234
6299
|
|
|
6235
6300
|
//////////////////////////////////////////////////
|
|
6236
6301
|
// AutoModels, used to simplify construction of PreTrainedModels
|
|
@@ -6269,7 +6334,7 @@ export class PretrainedMixin {
|
|
|
6269
6334
|
session_options = {},
|
|
6270
6335
|
} = {}) {
|
|
6271
6336
|
|
|
6272
|
-
|
|
6337
|
+
const options = {
|
|
6273
6338
|
progress_callback,
|
|
6274
6339
|
config,
|
|
6275
6340
|
cache_dir,
|
|
@@ -6288,7 +6353,7 @@ export class PretrainedMixin {
|
|
|
6288
6353
|
throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
|
|
6289
6354
|
}
|
|
6290
6355
|
|
|
6291
|
-
for (
|
|
6356
|
+
for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
|
|
6292
6357
|
const modelInfo = MODEL_CLASS_MAPPING.get(options.config.model_type);
|
|
6293
6358
|
if (!modelInfo) {
|
|
6294
6359
|
continue; // Item not found in this mapping
|
|
@@ -6343,6 +6408,10 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
|
|
|
6343
6408
|
['rt_detr', ['RTDetrModel', RTDetrModel]],
|
|
6344
6409
|
['table-transformer', ['TableTransformerModel', TableTransformerModel]],
|
|
6345
6410
|
['vit', ['ViTModel', ViTModel]],
|
|
6411
|
+
['pvt', ['PvtModel', PvtModel]],
|
|
6412
|
+
['vit_msn', ['ViTMSNModel', ViTMSNModel]],
|
|
6413
|
+
['vit_mae', ['ViTMAEModel', ViTMAEModel]],
|
|
6414
|
+
['groupvit', ['GroupViTModel', GroupViTModel]],
|
|
6346
6415
|
['fastvit', ['FastViTModel', FastViTModel]],
|
|
6347
6416
|
['mobilevit', ['MobileViTModel', MobileViTModel]],
|
|
6348
6417
|
['mobilevitv2', ['MobileViTV2Model', MobileViTV2Model]],
|
|
@@ -6365,10 +6434,14 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
|
|
|
6365
6434
|
['hifigan', ['SpeechT5HifiGan', SpeechT5HifiGan]],
|
|
6366
6435
|
['efficientnet', ['EfficientNetModel', EfficientNetModel]],
|
|
6367
6436
|
|
|
6437
|
+
['decision_transformer', ['DecisionTransformerModel', DecisionTransformerModel]],
|
|
6438
|
+
|
|
6368
6439
|
['mobilenet_v1', ['MobileNetV1Model', MobileNetV1Model]],
|
|
6369
6440
|
['mobilenet_v2', ['MobileNetV2Model', MobileNetV2Model]],
|
|
6370
6441
|
['mobilenet_v3', ['MobileNetV3Model', MobileNetV3Model]],
|
|
6371
6442
|
['mobilenet_v4', ['MobileNetV4Model', MobileNetV4Model]],
|
|
6443
|
+
|
|
6444
|
+
['maskformer', ['MaskFormerModel', MaskFormerModel]],
|
|
6372
6445
|
]);
|
|
6373
6446
|
|
|
6374
6447
|
const MODEL_MAPPING_NAMES_ENCODER_DECODER = new Map([
|
|
@@ -6553,6 +6626,8 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
|
|
|
6553
6626
|
|
|
6554
6627
|
const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
|
|
6555
6628
|
['vit', ['ViTForImageClassification', ViTForImageClassification]],
|
|
6629
|
+
['pvt', ['PvtForImageClassification', PvtForImageClassification]],
|
|
6630
|
+
['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
|
|
6556
6631
|
['fastvit', ['FastViTForImageClassification', FastViTForImageClassification]],
|
|
6557
6632
|
['mobilevit', ['MobileViTForImageClassification', MobileViTForImageClassification]],
|
|
6558
6633
|
['mobilevitv2', ['MobileViTV2ForImageClassification', MobileViTV2ForImageClassification]],
|
|
@@ -6585,6 +6660,7 @@ const MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = new Map([
|
|
|
6585
6660
|
]);
|
|
6586
6661
|
|
|
6587
6662
|
const MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = new Map([
|
|
6663
|
+
// TODO: Do not add new models here
|
|
6588
6664
|
['detr', ['DetrForSegmentation', DetrForSegmentation]],
|
|
6589
6665
|
['clipseg', ['CLIPSegForImageSegmentation', CLIPSegForImageSegmentation]],
|
|
6590
6666
|
]);
|
|
@@ -6594,6 +6670,11 @@ const MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = new Map([
|
|
|
6594
6670
|
['sapiens', ['SapiensForSemanticSegmentation', SapiensForSemanticSegmentation]],
|
|
6595
6671
|
]);
|
|
6596
6672
|
|
|
6673
|
+
const MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = new Map([
|
|
6674
|
+
['detr', ['DetrForSegmentation', DetrForSegmentation]],
|
|
6675
|
+
['maskformer', ['MaskFormerForInstanceSegmentation', MaskFormerForInstanceSegmentation]],
|
|
6676
|
+
]);
|
|
6677
|
+
|
|
6597
6678
|
const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
|
|
6598
6679
|
['sam', ['SamModel', SamModel]],
|
|
6599
6680
|
]);
|
|
@@ -6669,6 +6750,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
|
|
|
6669
6750
|
[MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, MODEL_TYPES.ImageTextToText],
|
|
6670
6751
|
[MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
6671
6752
|
[MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
6753
|
+
[MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
6672
6754
|
[MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
6673
6755
|
[MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
6674
6756
|
[MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
|
|
@@ -6871,6 +6953,17 @@ export class AutoModelForSemanticSegmentation extends PretrainedMixin {
|
|
|
6871
6953
|
static MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES];
|
|
6872
6954
|
}
|
|
6873
6955
|
|
|
6956
|
+
/**
|
|
6957
|
+
* Helper class which is used to instantiate pretrained universal image segmentation models with the `from_pretrained` function.
|
|
6958
|
+
* The chosen model class is determined by the type specified in the model config.
|
|
6959
|
+
*
|
|
6960
|
+
* @example
|
|
6961
|
+
* let model = await AutoModelForUniversalSegmentation.from_pretrained('hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation');
|
|
6962
|
+
*/
|
|
6963
|
+
export class AutoModelForUniversalSegmentation extends PretrainedMixin {
|
|
6964
|
+
static MODEL_CLASS_MAPPINGS = [MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES];
|
|
6965
|
+
}
|
|
6966
|
+
|
|
6874
6967
|
/**
|
|
6875
6968
|
* Helper class which is used to instantiate pretrained object detection models with the `from_pretrained` function.
|
|
6876
6969
|
* The chosen model class is determined by the type specified in the model config.
|
package/src/pipelines.js
CHANGED
|
@@ -34,6 +34,7 @@ import {
|
|
|
34
34
|
AutoModelForImageClassification,
|
|
35
35
|
AutoModelForImageSegmentation,
|
|
36
36
|
AutoModelForSemanticSegmentation,
|
|
37
|
+
AutoModelForUniversalSegmentation,
|
|
37
38
|
AutoModelForObjectDetection,
|
|
38
39
|
AutoModelForZeroShotObjectDetection,
|
|
39
40
|
AutoModelForDocumentQuestionAnswering,
|
|
@@ -3045,7 +3046,7 @@ const SUPPORTED_TASKS = Object.freeze({
|
|
|
3045
3046
|
"image-segmentation": {
|
|
3046
3047
|
// no tokenizer
|
|
3047
3048
|
"pipeline": ImageSegmentationPipeline,
|
|
3048
|
-
"model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation],
|
|
3049
|
+
"model": [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
|
|
3049
3050
|
"processor": AutoProcessor,
|
|
3050
3051
|
"default": {
|
|
3051
3052
|
// TODO: replace with original
|
|
@@ -3287,7 +3288,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
|
|
|
3287
3288
|
|
|
3288
3289
|
/**@type {Promise[]} */
|
|
3289
3290
|
const promises = [];
|
|
3290
|
-
for (
|
|
3291
|
+
for (const [name, cls] of mapping.entries()) {
|
|
3291
3292
|
if (!cls) continue;
|
|
3292
3293
|
|
|
3293
3294
|
/**@type {Promise} */
|
|
@@ -3295,7 +3296,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
|
|
|
3295
3296
|
if (Array.isArray(cls)) {
|
|
3296
3297
|
promise = new Promise(async (resolve, reject) => {
|
|
3297
3298
|
let e;
|
|
3298
|
-
for (
|
|
3299
|
+
for (const c of cls) {
|
|
3299
3300
|
if (c === null) {
|
|
3300
3301
|
// If null, we resolve it immediately, meaning the relevant
|
|
3301
3302
|
// class was not found, but it is optional.
|
|
@@ -3333,7 +3334,7 @@ async function loadItems(mapping, model, pretrainedOptions) {
|
|
|
3333
3334
|
await Promise.all(promises);
|
|
3334
3335
|
|
|
3335
3336
|
// Then assign to result
|
|
3336
|
-
for (
|
|
3337
|
+
for (const [name, promise] of Object.entries(result)) {
|
|
3337
3338
|
result[name] = await promise;
|
|
3338
3339
|
}
|
|
3339
3340
|
|