@huggingface/transformers 3.0.2 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/dist/ort-wasm-simd-threaded.jsep.wasm +0 -0
- package/dist/transformers.cjs +16235 -13145
- package/dist/transformers.cjs.map +1 -1
- package/dist/transformers.js +16536 -13437
- package/dist/transformers.js.map +1 -1
- package/dist/transformers.min.cjs +238 -52
- package/dist/transformers.min.cjs.map +1 -1
- package/dist/transformers.min.js +229 -43
- package/dist/transformers.min.js.map +1 -1
- package/dist/transformers.min.mjs +240 -54
- package/dist/transformers.min.mjs.map +1 -1
- package/dist/transformers.mjs +15259 -12171
- package/dist/transformers.mjs.map +1 -1
- package/package.json +4 -4
- package/src/base/feature_extraction_utils.js +54 -0
- package/src/base/image_processors_utils.js +1089 -0
- package/src/base/processing_utils.js +145 -0
- package/src/configs.js +13 -3
- package/src/env.js +1 -1
- package/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +90 -0
- package/src/models/auto/feature_extraction_auto.js +41 -0
- package/src/models/auto/image_processing_auto.js +29 -0
- package/src/models/auto/processing_auto.js +100 -0
- package/src/models/beit/image_processing_beit.js +5 -0
- package/src/models/bit/image_processing_bit.js +5 -0
- package/src/models/chinese_clip/image_processing_chinese_clip.js +5 -0
- package/src/models/clap/feature_extraction_clap.js +159 -0
- package/src/models/clip/image_processing_clip.js +6 -0
- package/src/models/convnext/image_processing_convnext.js +45 -0
- package/src/models/deit/image_processing_deit.js +6 -0
- package/src/models/detr/image_processing_detr.js +52 -0
- package/src/models/donut/image_processing_donut.js +31 -0
- package/src/models/dpt/image_processing_dpt.js +6 -0
- package/src/models/efficientnet/image_processing_efficientnet.js +13 -0
- package/src/models/feature_extractors.js +12 -0
- package/src/models/florence2/processing_florence2.js +128 -0
- package/src/models/glpn/image_processing_glpn.js +5 -0
- package/src/models/image_processors.js +36 -0
- package/src/models/janus/image_processing_janus.js +26 -0
- package/src/models/janus/processing_janus.js +123 -0
- package/src/models/jina_clip/image_processing_jina_clip.js +26 -0
- package/src/models/jina_clip/processing_jina_clip.js +24 -0
- package/src/models/llava_onevision/image_processing_llava_onevision.js +5 -0
- package/src/models/mask2former/image_processing_mask2former.js +5 -0
- package/src/models/maskformer/image_processing_maskformer.js +18 -0
- package/src/models/mgp_str/processing_mgp_str.js +170 -0
- package/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +7 -0
- package/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +7 -0
- package/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +7 -0
- package/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +7 -0
- package/src/models/mobilevit/image_processing_mobilevit.js +6 -0
- package/src/models/nougat/image_processing_nougat.js +5 -0
- package/src/models/owlv2/image_processing_owlv2.js +5 -0
- package/src/models/owlvit/image_processing_owlvit.js +12 -0
- package/src/models/owlvit/processing_owlvit.js +7 -0
- package/src/models/processors.js +11 -0
- package/src/models/pvt/image_processing_pvt.js +5 -0
- package/src/models/pyannote/feature_extraction_pyannote.js +28 -0
- package/src/models/pyannote/processing_pyannote.js +71 -0
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +52 -0
- package/src/models/qwen2_vl/processing_qwen2_vl.js +52 -0
- package/src/models/rt_detr/image_processing_rt_detr.js +12 -0
- package/src/models/sam/image_processing_sam.js +242 -0
- package/src/models/sam/processing_sam.js +20 -0
- package/src/models/sapiens/image_processing_sapiens.js +13 -0
- package/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +180 -0
- package/src/models/segformer/image_processing_segformer.js +13 -0
- package/src/models/siglip/image_processing_siglip.js +5 -0
- package/src/models/speecht5/feature_extraction_speecht5.js +4 -0
- package/src/models/speecht5/processing_speecht5.js +17 -0
- package/src/models/swin2sr/image_processing_swin2sr.js +24 -0
- package/src/models/vit/image_processing_vit.js +7 -0
- package/src/models/vitmatte/image_processing_vitmatte.js +50 -0
- package/src/models/vitpose/image_processing_vitpose.js +89 -0
- package/src/models/wav2vec2/feature_extraction_wav2vec2.js +44 -0
- package/src/models/wav2vec2/processing_wav2vec2.js +15 -0
- package/src/models/wespeaker/feature_extraction_wespeaker.js +100 -0
- package/src/models/whisper/feature_extraction_whisper.js +84 -0
- package/src/models/whisper/processing_whisper.js +21 -0
- package/src/models/yolos/image_processing_yolos.js +12 -0
- package/src/models.js +675 -32
- package/src/pipelines.js +8 -8
- package/src/tokenizers.js +5 -0
- package/src/transformers.js +15 -2
- package/src/utils/constants.js +8 -1
- package/src/utils/core.js +37 -9
- package/src/utils/hub.js +2 -1
- package/src/utils/image.js +68 -17
- package/src/utils/tensor.js +33 -1
- package/types/base/feature_extraction_utils.d.ts +41 -0
- package/types/base/feature_extraction_utils.d.ts.map +1 -0
- package/types/base/image_processors_utils.d.ts +323 -0
- package/types/base/image_processors_utils.d.ts.map +1 -0
- package/types/base/processing_utils.d.ts +80 -0
- package/types/base/processing_utils.d.ts.map +1 -0
- package/types/configs.d.ts +4 -1
- package/types/configs.d.ts.map +1 -1
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts +25 -0
- package/types/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/auto/feature_extraction_auto.d.ts +5 -0
- package/types/models/auto/feature_extraction_auto.d.ts.map +1 -0
- package/types/models/auto/image_processing_auto.d.ts +5 -0
- package/types/models/auto/image_processing_auto.d.ts.map +1 -0
- package/types/models/auto/processing_auto.d.ts +35 -0
- package/types/models/auto/processing_auto.d.ts.map +1 -0
- package/types/models/beit/image_processing_beit.d.ts +4 -0
- package/types/models/beit/image_processing_beit.d.ts.map +1 -0
- package/types/models/bit/image_processing_bit.d.ts +4 -0
- package/types/models/bit/image_processing_bit.d.ts.map +1 -0
- package/types/models/chinese_clip/image_processing_chinese_clip.d.ts +4 -0
- package/types/models/chinese_clip/image_processing_chinese_clip.d.ts.map +1 -0
- package/types/models/clap/feature_extraction_clap.d.ts +57 -0
- package/types/models/clap/feature_extraction_clap.d.ts.map +1 -0
- package/types/models/clip/image_processing_clip.d.ts +6 -0
- package/types/models/clip/image_processing_clip.d.ts.map +1 -0
- package/types/models/convnext/image_processing_convnext.d.ts +12 -0
- package/types/models/convnext/image_processing_convnext.d.ts.map +1 -0
- package/types/models/deit/image_processing_deit.d.ts +6 -0
- package/types/models/deit/image_processing_deit.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +42 -0
- package/types/models/detr/image_processing_detr.d.ts.map +1 -0
- package/types/models/donut/image_processing_donut.d.ts +7 -0
- package/types/models/donut/image_processing_donut.d.ts.map +1 -0
- package/types/models/dpt/image_processing_dpt.d.ts +6 -0
- package/types/models/dpt/image_processing_dpt.d.ts.map +1 -0
- package/types/models/efficientnet/image_processing_efficientnet.d.ts +6 -0
- package/types/models/efficientnet/image_processing_efficientnet.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +10 -0
- package/types/models/feature_extractors.d.ts.map +1 -0
- package/types/models/florence2/processing_florence2.d.ts +39 -0
- package/types/models/florence2/processing_florence2.d.ts.map +1 -0
- package/types/models/glpn/image_processing_glpn.d.ts +4 -0
- package/types/models/glpn/image_processing_glpn.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +36 -0
- package/types/models/image_processors.d.ts.map +1 -0
- package/types/models/janus/image_processing_janus.d.ts +7 -0
- package/types/models/janus/image_processing_janus.d.ts.map +1 -0
- package/types/models/janus/processing_janus.d.ts +77 -0
- package/types/models/janus/processing_janus.d.ts.map +1 -0
- package/types/models/jina_clip/image_processing_jina_clip.d.ts +5 -0
- package/types/models/jina_clip/image_processing_jina_clip.d.ts.map +1 -0
- package/types/models/jina_clip/processing_jina_clip.d.ts +9 -0
- package/types/models/jina_clip/processing_jina_clip.d.ts.map +1 -0
- package/types/models/llava_onevision/image_processing_llava_onevision.d.ts +4 -0
- package/types/models/llava_onevision/image_processing_llava_onevision.d.ts.map +1 -0
- package/types/models/mask2former/image_processing_mask2former.d.ts +4 -0
- package/types/models/mask2former/image_processing_mask2former.d.ts.map +1 -0
- package/types/models/maskformer/image_processing_maskformer.d.ts +22 -0
- package/types/models/maskformer/image_processing_maskformer.d.ts.map +1 -0
- package/types/models/mgp_str/processing_mgp_str.d.ts +64 -0
- package/types/models/mgp_str/processing_mgp_str.d.ts.map +1 -0
- package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts +6 -0
- package/types/models/mobilenet_v1/image_processing_mobilenet_v1.d.ts.map +1 -0
- package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts +6 -0
- package/types/models/mobilenet_v2/image_processing_mobilenet_v2.d.ts.map +1 -0
- package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts +6 -0
- package/types/models/mobilenet_v3/image_processing_mobilenet_v3.d.ts.map +1 -0
- package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts +6 -0
- package/types/models/mobilenet_v4/image_processing_mobilenet_v4.d.ts.map +1 -0
- package/types/models/mobilevit/image_processing_mobilevit.d.ts +6 -0
- package/types/models/mobilevit/image_processing_mobilevit.d.ts.map +1 -0
- package/types/models/nougat/image_processing_nougat.d.ts +4 -0
- package/types/models/nougat/image_processing_nougat.d.ts.map +1 -0
- package/types/models/owlv2/image_processing_owlv2.d.ts +4 -0
- package/types/models/owlv2/image_processing_owlv2.d.ts.map +1 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts +10 -0
- package/types/models/owlvit/image_processing_owlvit.d.ts.map +1 -0
- package/types/models/owlvit/processing_owlvit.d.ts +8 -0
- package/types/models/owlvit/processing_owlvit.d.ts.map +1 -0
- package/types/models/processors.d.ts +12 -0
- package/types/models/processors.d.ts.map +1 -0
- package/types/models/pvt/image_processing_pvt.d.ts +4 -0
- package/types/models/pvt/image_processing_pvt.d.ts.map +1 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts +13 -0
- package/types/models/pyannote/feature_extraction_pyannote.d.ts.map +1 -0
- package/types/models/pyannote/processing_pyannote.d.ts +30 -0
- package/types/models/pyannote/processing_pyannote.d.ts.map +1 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +11 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +17 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -0
- package/types/models/rt_detr/image_processing_rt_detr.d.ts +8 -0
- package/types/models/rt_detr/image_processing_rt_detr.d.ts.map +1 -0
- package/types/models/sam/image_processing_sam.d.ts +103 -0
- package/types/models/sam/image_processing_sam.d.ts.map +1 -0
- package/types/models/sam/processing_sam.d.ts +9 -0
- package/types/models/sam/processing_sam.d.ts.map +1 -0
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts +34 -0
- package/types/models/seamless_m4t/feature_extraction_seamless_m4t.d.ts.map +1 -0
- package/types/models/segformer/image_processing_segformer.d.ts +10 -0
- package/types/models/segformer/image_processing_segformer.d.ts.map +1 -0
- package/types/models/siglip/image_processing_siglip.d.ts +4 -0
- package/types/models/siglip/image_processing_siglip.d.ts.map +1 -0
- package/types/models/speecht5/feature_extraction_speecht5.d.ts +4 -0
- package/types/models/speecht5/feature_extraction_speecht5.d.ts.map +1 -0
- package/types/models/speecht5/processing_speecht5.d.ts +14 -0
- package/types/models/speecht5/processing_speecht5.d.ts.map +1 -0
- package/types/models/swin2sr/image_processing_swin2sr.d.ts +5 -0
- package/types/models/swin2sr/image_processing_swin2sr.d.ts.map +1 -0
- package/types/models/vit/image_processing_vit.d.ts +6 -0
- package/types/models/vit/image_processing_vit.d.ts.map +1 -0
- package/types/models/vitmatte/image_processing_vitmatte.d.ts +12 -0
- package/types/models/vitmatte/image_processing_vitmatte.d.ts.map +1 -0
- package/types/models/vitpose/image_processing_vitpose.d.ts +26 -0
- package/types/models/vitpose/image_processing_vitpose.d.ts.map +1 -0
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts +19 -0
- package/types/models/wav2vec2/feature_extraction_wav2vec2.d.ts.map +1 -0
- package/types/models/wav2vec2/processing_wav2vec2.d.ts +12 -0
- package/types/models/wav2vec2/processing_wav2vec2.d.ts.map +1 -0
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts +23 -0
- package/types/models/wespeaker/feature_extraction_wespeaker.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts +21 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -0
- package/types/models/whisper/processing_whisper.d.ts +17 -0
- package/types/models/whisper/processing_whisper.d.ts.map +1 -0
- package/types/models/yolos/image_processing_yolos.d.ts +10 -0
- package/types/models/yolos/image_processing_yolos.d.ts.map +1 -0
- package/types/models.d.ts +140 -0
- package/types/models.d.ts.map +1 -1
- package/types/pipelines.d.ts +2 -3
- package/types/pipelines.d.ts.map +1 -1
- package/types/tokenizers.d.ts +3 -0
- package/types/tokenizers.d.ts.map +1 -1
- package/types/transformers.d.ts +10 -1
- package/types/utils/constants.d.ts +6 -0
- package/types/utils/constants.d.ts.map +1 -1
- package/types/utils/core.d.ts +58 -3
- package/types/utils/core.d.ts.map +1 -1
- package/types/utils/hub.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +10 -2
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/tensor.d.ts +34 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/processors.js +0 -2655
- package/types/processors.d.ts +0 -924
- package/types/processors.d.ts.map +0 -1
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
} from "../../base/image_processors_utils.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Image processor whose `resize` step depends on the configured
 * `size.shortest_edge`: below 384 it resizes and center-crops, otherwise it
 * resizes straight to a square.
 */
export class ConvNextImageProcessor extends ImageProcessor {
    constructor(config) {
        super(config);

        /**
         * Crop percentage, read from the config with a fallback of 224 / 256.
         * It is only consulted by `resize` when the shortest edge is below 384.
         */
        this.crop_pct = this.config.crop_pct ?? (224 / 256);
    }

    /**
     * Resize (and possibly center-crop) an image according to `this.size.shortest_edge`.
     * @param {any} image The image to resize; it must expose async `resize` and `center_crop` methods.
     * @returns {Promise<any>} The resized image.
     * @throws {Error} If `this.size` has no `shortest_edge` entry.
     */
    async resize(image) {
        const targetEdge = this.size?.shortest_edge;
        if (targetEdge === undefined) {
            throw new Error(`Size dictionary must contain 'shortest_edge' key.`);
        }

        if (!(targetEdge < 384)) {
            // At 384 or larger the image is warped to a square, with no cropping.
            const warped = await image.resize(targetEdge, targetEdge, {
                resample: this.resample,
            });
            return warped;
        }

        // Keep the aspect ratio while scaling the shortest edge to targetEdge / crop_pct ...
        const enlargedEdge = Math.floor(targetEdge / this.crop_pct);
        const [scaledWidth, scaledHeight] = this.get_resize_output_image_size(image, {
            shortest_edge: enlargedEdge,
        });
        const scaled = await image.resize(scaledWidth, scaledHeight, {
            resample: this.resample,
        });

        // ... then cut the central (targetEdge, targetEdge) region out of it.
        const cropped = await scaled.center_crop(targetEdge, targetEdge);
        return cropped;
    }
}
|
|
45
|
+
// Empty subclass: behaves exactly like ConvNextImageProcessor under a second exported name (presumably kept for backward compatibility — confirm).
export class ConvNextFeatureExtractor extends ConvNextImageProcessor { }
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
post_process_object_detection,
|
|
4
|
+
post_process_panoptic_segmentation,
|
|
5
|
+
post_process_instance_segmentation,
|
|
6
|
+
} from "../../base/image_processors_utils.js";
|
|
7
|
+
|
|
8
|
+
import { full } from '../../utils/tensor.js';
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* @typedef {object} DetrFeatureExtractorResultProps
|
|
13
|
+
* @property {import('../../utils/tensor.js').Tensor} pixel_mask
|
|
14
|
+
* @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & DetrFeatureExtractorResultProps} DetrFeatureExtractorResult
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/**
 * Image processor that adds a `pixel_mask` tensor to the base processor's
 * output and exposes the shared post-processing helpers as instance methods.
 */
export class DetrImageProcessor extends ImageProcessor {
    /**
     * Runs the base image processing on the given images and attaches an
     * all-ones pixel mask to the result.
     * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
     * @returns {Promise<DetrFeatureExtractorResult>} The base processor output plus a `pixel_mask` tensor.
     */
    async _call(images) {
        const processed = await super._call(images);

        // TODO support differently-sized images, for now assume all images are the same size.
        // TODO support different mask sizes (not just 64x64)
        // For now the mask is one 64x64 plane of 1s per entry along the first
        // dimension of `pixel_values`.
        const batchSize = processed.pixel_values.dims[0];
        const pixel_mask = full([batchSize, 64, 64], 1n);

        return { ...processed, pixel_mask };
    }

    /**
     * Forwards all arguments to the module-level `post_process_object_detection`.
     * @type {typeof post_process_object_detection}
     */
    post_process_object_detection(...params) {
        return post_process_object_detection(...params);
    }

    /**
     * Forwards all arguments to the module-level `post_process_panoptic_segmentation`.
     * @type {typeof post_process_panoptic_segmentation}
     */
    post_process_panoptic_segmentation(...params) {
        return post_process_panoptic_segmentation(...params);
    }

    /**
     * Forwards all arguments to the module-level `post_process_instance_segmentation`.
     * @type {typeof post_process_instance_segmentation}
     */
    post_process_instance_segmentation(...params) {
        return post_process_instance_segmentation(...params);
    }
}
|
|
51
|
+
|
|
52
|
+
// Empty subclass: exposes DetrImageProcessor's behavior under a second exported name (presumably a legacy name — confirm).
export class DetrFeatureExtractor extends DetrImageProcessor { } // NOTE: extends DetrImageProcessor
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
} from "../../base/image_processors_utils.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Image processor that pads images symmetrically (`center: true`) and picks
 * the padding fill value from the normalization statistics.
 */
export class DonutImageProcessor extends ImageProcessor {
    /**
     * Pads an image, filling the new pixels with `-mean / std` per channel.
     *
     * @param {any} pixelData The pixel data, forwarded unchanged to the base `pad_image`.
     * @param {number[]} imgDims Image dimensions; the third entry is used as the channel count.
     * @param {any} padSize The target pad size, forwarded unchanged to the base `pad_image`.
     * @param {Object} [options={}] Extra options; these override the defaults set here.
     * @returns {any} Whatever the base `pad_image` returns.
     */
    pad_image(pixelData, imgDims, padSize, options = {}) {
        const [, , imageChannels] = imgDims;

        // A scalar mean/std applies to every channel, so expand it to one value per channel.
        let image_mean = this.image_mean;
        if (!Array.isArray(image_mean)) {
            image_mean = new Array(imageChannels).fill(image_mean);
        }

        let image_std = this.image_std;
        if (!Array.isArray(image_std)) {
            // Fill with the std itself. Filling with `image_mean` (by then an
            // array) made every constant value NaN.
            image_std = new Array(imageChannels).fill(image_std);
        }

        const constant_values = image_mean.map((x, i) => - x / image_std[i]);

        return super.pad_image(pixelData, imgDims, padSize, {
            center: true,

            // Since normalization is done after padding, we need to use certain constant values to ensure the same behaviour is observed.
            // For more information, see https://github.com/huggingface/transformers/blob/main/src/transformers/models/donut/image_processing_donut.py#L433-L451
            constant_values,
            ...options,
        });
    }
}
|
|
31
|
+
// Empty subclass: exposes DonutImageProcessor's behavior under a second exported name (presumably a legacy name — confirm).
export class DonutFeatureExtractor extends DonutImageProcessor { }
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
} from "../../base/image_processors_utils.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Image processor that squares each `image_std` entry at construction time
 * when `include_top` is enabled.
 */
export class EfficientNetImageProcessor extends ImageProcessor {
    constructor(config) {
        super(config);

        // `include_top` falls back to true when the config leaves it unset (null/undefined).
        const includeTop = this.config.include_top ?? true;
        this.include_top = includeTop;

        if (includeTop) {
            // Replace each standard deviation by its square.
            this.image_std = this.image_std.map((std) => std * std);
        }
    }
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
|
|
2
|
+
export * from './audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js';
|
|
3
|
+
export * from './clap/feature_extraction_clap.js';
|
|
4
|
+
export * from './pyannote/feature_extraction_pyannote.js';
|
|
5
|
+
export * from './seamless_m4t/feature_extraction_seamless_m4t.js';
|
|
6
|
+
export * from './speecht5/feature_extraction_speecht5.js';
|
|
7
|
+
export * from './wav2vec2/feature_extraction_wav2vec2.js';
|
|
8
|
+
export * from './wespeaker/feature_extraction_wespeaker.js';
|
|
9
|
+
export * from './whisper/feature_extraction_whisper.js';
|
|
10
|
+
|
|
11
|
+
// For legacy support, ImageFeatureExtractor is an alias for ImageProcessor
|
|
12
|
+
export { ImageProcessor as ImageFeatureExtractor } from "../base/image_processors_utils.js";
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
3
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
4
|
+
|
|
5
|
+
/**
 * Processor pairing an image processor with a tokenizer. It also builds task
 * prompts and parses generated text using the task tables stored in the
 * image processor's config.
 */
export class Florence2Processor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static image_processor_class = AutoImageProcessor;

    constructor(config, components) {
        super(config, components);

        // The task tables are read from the image processor's config.
        const {
            tasks_answer_post_processing_type,
            task_prompts_without_inputs,
            task_prompts_with_input,
        } = this.image_processor.config;

        /** @type {Map<string, string>} */
        this.tasks_answer_post_processing_type = new Map(Object.entries(tasks_answer_post_processing_type ?? {}));

        /** @type {Map<string, string>} */
        this.task_prompts_without_inputs = new Map(Object.entries(task_prompts_without_inputs ?? {}));

        /** @type {Map<string, string>} */
        this.task_prompts_with_input = new Map(Object.entries(task_prompts_with_input ?? {}));

        // Each match is an optional label followed by 8 (quad box) or 4 (bbox) `<loc_N>` tokens.
        this.regexes = {
            quad_boxes: /(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
            bboxes: /([^<]+)?<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
        };
        this.size_per_bin = 1000;
    }

    /**
     * Helper function to construct prompts from input texts.
     * Exactly one prompt is produced per input text, in input order.
     * @param {string|string[]} text A single text or a batch of texts.
     * @returns {string[]} The constructed prompts.
     */
    construct_prompts(text) {
        const texts = typeof text === 'string' ? [text] : text;

        const prompts = [];
        for (const t of texts) {
            // 1. fixed task prompts without additional inputs
            if (this.task_prompts_without_inputs.has(t)) {
                prompts.push(this.task_prompts_without_inputs.get(t));
                continue;
            }

            // 2. task prompts with additional inputs (first matching task wins)
            let matched = false;
            for (const [task, prompt] of this.task_prompts_with_input) {
                if (t.includes(task)) {
                    prompts.push(prompt.replaceAll('{input}', t).replaceAll(task, ''));
                    matched = true;
                    break;
                }
            }

            // 3. default prompt: fall back to the raw text.
            // This is tracked per item; comparing `prompts.length` with the
            // batch size pushed a duplicate whenever the batch had more than
            // one text.
            if (!matched) {
                prompts.push(t);
            }
        }
        return prompts;
    }

    /**
     * Post-process the output of the model to each of the task outputs.
     * @param {string} text The text to post-process.
     * @param {string} task The task to post-process the text for.
     * @param {[number, number]} image_size The size of the image. height x width.
     * NOTE(review): even-indexed coordinates are scaled by `image_size[0]` and odd-indexed ones by `image_size[1]`; confirm callers pass the order this expects.
     * @returns {Object} An object with the task as its single key. The value is the cleaned text, or `{ labels, bboxes }` / `{ labels, quad_boxes }`.
     * @throws {Error} If the task's post-processing type is not implemented.
     */
    post_process_generation(text, task, image_size) {
        // Tasks with no entry in the table are treated as plain text.
        const task_answer_post_processing_type = this.tasks_answer_post_processing_type.get(task) ?? 'pure_text';

        // remove the special tokens
        text = text.replaceAll('<s>', '').replaceAll('</s>', '');

        let final_answer;
        switch (task_answer_post_processing_type) {
            case 'pure_text':
                final_answer = text;
                break;

            case 'description_with_bboxes':
            case 'bboxes':
            case 'phrase_grounding':
            case 'ocr': {
                const key = task_answer_post_processing_type === 'ocr' ? 'quad_boxes' : 'bboxes';
                const matches = text.matchAll(this.regexes[key]);
                const labels = [];
                const items = [];
                for (const [_, label, ...locations] of matches) {
                    // Push new label, or duplicate the last label
                    labels.push(label ? label.trim() : labels.at(-1) ?? '');
                    items.push(locations.map((x, i) =>
                        // NOTE: Add 0.5 to use the center position of the bin as the coordinate.
                        (Number(x) + 0.5) / this.size_per_bin * image_size[i % 2])
                    );
                }
                final_answer = { labels, [key]: items };
                break;
            }

            default:
                throw new Error(`Task "${task}" (of type "${task_answer_post_processing_type}") not yet implemented.`);
        }

        return { [task]: final_answer };
    }

    // NOTE: images and text are switched from the python version
    // `images` is required, `text` is optional
    /**
     * Runs the image processor on `images` and, when `text` is given, the
     * tokenizer on `text`, then merges both outputs into one object.
     * @param {any} images The image(s) to process.
     * @param {any} [text=null] Optional text to tokenize.
     * @param {Object} [kwargs={}] Options passed to both the image processor and the tokenizer.
     * @returns {Promise<Object>} The merged outputs; tokenizer keys win on conflict.
     * @throws {Error} If neither images nor text is provided.
     */
    async _call(images, text = null, kwargs = {}) {

        if (!images && !text) {
            throw new Error('Either text or images must be provided');
        }

        const image_inputs = await this.image_processor(images, kwargs);
        const text_inputs = text ? this.tokenizer(text, kwargs) : {};

        return {
            ...image_inputs,
            ...text_inputs,
        };
    }
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
|
|
2
|
+
export * from './beit/image_processing_beit.js'
|
|
3
|
+
export * from './bit/image_processing_bit.js'
|
|
4
|
+
export * from './chinese_clip/image_processing_chinese_clip.js'
|
|
5
|
+
export * from './clip/image_processing_clip.js'
|
|
6
|
+
export * from './convnext/image_processing_convnext.js'
|
|
7
|
+
export * from './deit/image_processing_deit.js'
|
|
8
|
+
export * from './detr/image_processing_detr.js'
|
|
9
|
+
export * from './donut/image_processing_donut.js'
|
|
10
|
+
export * from './dpt/image_processing_dpt.js'
|
|
11
|
+
export * from './efficientnet/image_processing_efficientnet.js'
|
|
12
|
+
export * from './glpn/image_processing_glpn.js'
|
|
13
|
+
export * from './janus/image_processing_janus.js'
|
|
14
|
+
export * from './jina_clip/image_processing_jina_clip.js'
|
|
15
|
+
export * from './llava_onevision/image_processing_llava_onevision.js'
|
|
16
|
+
export * from './mask2former/image_processing_mask2former.js'
|
|
17
|
+
export * from './maskformer/image_processing_maskformer.js'
|
|
18
|
+
export * from './mobilenet_v1/image_processing_mobilenet_v1.js'
|
|
19
|
+
export * from './mobilenet_v2/image_processing_mobilenet_v2.js'
|
|
20
|
+
export * from './mobilenet_v3/image_processing_mobilenet_v3.js'
|
|
21
|
+
export * from './mobilenet_v4/image_processing_mobilenet_v4.js'
|
|
22
|
+
export * from './mobilevit/image_processing_mobilevit.js'
|
|
23
|
+
export * from './nougat/image_processing_nougat.js'
|
|
24
|
+
export * from './owlv2/image_processing_owlv2.js'
|
|
25
|
+
export * from './owlvit/image_processing_owlvit.js'
|
|
26
|
+
export * from './pvt/image_processing_pvt.js'
|
|
27
|
+
export * from './qwen2_vl/image_processing_qwen2_vl.js'
|
|
28
|
+
export * from './rt_detr/image_processing_rt_detr.js'
|
|
29
|
+
export * from './sam/image_processing_sam.js'
|
|
30
|
+
export * from './segformer/image_processing_segformer.js'
|
|
31
|
+
export * from './siglip/image_processing_siglip.js'
|
|
32
|
+
export * from './swin2sr/image_processing_swin2sr.js'
|
|
33
|
+
export * from './vit/image_processing_vit.js'
|
|
34
|
+
export * from './vitmatte/image_processing_vitmatte.js'
|
|
35
|
+
export * from './vitpose/image_processing_vitpose.js'
|
|
36
|
+
export * from './yolos/image_processing_yolos.js'
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
|
|
2
|
+
import {
|
|
3
|
+
ImageProcessor,
|
|
4
|
+
} from "../../base/image_processors_utils.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Image processor that defaults to padding images to a square of side
 * `config.image_size`, centered, filled with the rescaled background color.
 */
export class VLMImageProcessor extends ImageProcessor {
    constructor(config) {
        const { image_size } = config;
        const padDefaults = {
            do_pad: true,
            pad_size: {
                width: image_size,
                height: image_size,
            },
        };

        // Entries in `config` take precedence over the padding defaults.
        super({ ...padDefaults, ...config });

        // Background color multiplied by the rescale factor, one value per channel.
        this.constant_values = this.config.background_color.map((channel) => channel * this.rescale_factor);
    }

    /**
     * Pads an image, defaulting to centered padding with `this.constant_values`.
     * @param {any} pixelData The pixel data, forwarded unchanged to the base `pad_image`.
     * @param {any} imgDims The image dimensions, forwarded unchanged.
     * @param {any} padSize The target pad size, forwarded unchanged.
     * @param {Object} [options] Extra options; these override the defaults set here.
     * @returns {any} Whatever the base `pad_image` returns.
     */
    pad_image(pixelData, imgDims, padSize, options) {
        const mergedOptions = {
            constant_values: this.constant_values,
            center: true,
            ...options,
        };
        return super.pad_image(pixelData, imgDims, padSize, mergedOptions);
    }
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
|
|
2
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
3
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
4
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
5
|
+
import { mergeArrays } from "../../utils/core.js";
|
|
6
|
+
import { Tensor } from "../../utils/tensor.js";
|
|
7
|
+
import { RawImage } from "../../utils/image.js";
|
|
8
|
+
|
|
9
|
+
export class VLChatProcessor extends Processor {
    static image_processor_class = AutoImageProcessor;
    static tokenizer_class = AutoTokenizer;
    static uses_processor_config = true;

    /**
     * Create the processor and copy the image-tag settings from the processor config onto the instance.
     * @param {Object} config Forwarded unchanged to the `Processor` constructor.
     * @param {Object} components Forwarded unchanged to the `Processor` constructor.
     */
    constructor(config, components) {
        super(config, components);

        // Read from `this.config` (set up by the base class), not from the raw argument.
        const {
            image_tag,
            image_start_tag,
            image_end_tag,
            num_image_tokens,
        } = this.config;

        this.image_tag = image_tag;
        this.image_start_tag = image_start_tag;
        this.image_end_tag = image_end_tag;
        this.num_image_tokens = num_image_tokens;
    }

    /**
     * @typedef {Object} MultimodalMessageProperties Extra, image-related fields a chat message may carry.
     * @property {(RawImage | string | URL)[]} [images] Images attached to the message.
     * @typedef {(import('../../tokenizers.js').Message & MultimodalMessageProperties)[]} MultimodalConversation A list of chat messages, each optionally carrying images.
     */

    /**
     * @typedef {Object} VLCChatProcessorResult The tensors produced for the text side of the input.
     * @property {Tensor} input_ids Token IDs, with each image tag expanded into start/placeholder/end IDs.
     * @property {Tensor} attention_mask All-ones mask with the same dims as `input_ids`.
     * @property {Tensor} images_seq_mask `true` at the placeholder positions of `input_ids`, `false` elsewhere.
     * @property {Tensor} images_emb_mask All-`true` mask of dims `[1, num_images, num_image_tokens]`.
     */

    /**
     * Render the conversation with the chat template, expand every image tag into
     * `num_image_tokens` placeholder IDs wrapped in start/end tag IDs, and (when images
     * are present) run the image processor on them.
     *
     * @param {MultimodalConversation} conversation The chat messages to process.
     * @param {Object} options Additional options for processing.
     * @param {RawImage|RawImage[]} [options.images] Images to use; when falsy, images are read from the messages' `images` fields.
     * @param {string} [options.chat_template="default"] The chat template to use.
     * @returns {Promise<VLCChatProcessorResult | VLCChatProcessorResult & import('../../base/image_processors_utils.js').ImageProcessorResult>} The processed input.
     * @throws {Error} If the number of images differs from the number of image tags in the rendered prompt.
     */
    async _call(conversation, {
        images = null,
        chat_template = "default",
    } = {}) {
        // Resolve the list of images: explicit option first, conversation attachments otherwise.
        if (images) {
            images = Array.isArray(images) ? images : [images];
        } else {
            const attached = conversation
                .filter((message) => message.images)
                .flatMap((message) => message.images);
            images = await Promise.all(attached.map((source) => RawImage.read(source)));
        }

        const tokenizer = this.tokenizer;
        const prompt = tokenizer.apply_chat_template(conversation, {
            tokenize: false,
            add_generation_prompt: true,
            chat_template,
        });

        // Every occurrence of the image tag separates two text segments.
        const segments = (/** @type {string} */(prompt)).split(this.image_tag);
        const num_images = segments.length - 1;
        if (images.length !== num_images) {
            throw new Error(`Number of images provided (${images.length}) does not match number of "${this.image_tag}" image tags (${num_images})`);
        }

        const [
            image_placeholder_tag_id,
            image_start_tag_id,
            image_end_tag_id,
        ] = tokenizer.model.convert_tokens_to_ids([
            this.image_tag,
            this.image_start_tag,
            this.image_end_tag,
        ]);

        const encode_segment = (segment) => tokenizer.encode(segment, { add_special_tokens: false });

        const [leading_segment, ...trailing_segments] = segments;
        let input_ids = encode_segment(leading_segment);
        let images_seq_mask = new Array(input_ids.length).fill(false);

        for (const segment of trailing_segments) {
            const placeholder_ids = new Array(this.num_image_tokens).fill(image_placeholder_tag_id);
            const segment_ids = encode_segment(segment);

            // <start> <placeholder * num_image_tokens> <end> followed by the text after the tag.
            input_ids = mergeArrays(
                input_ids,
                [image_start_tag_id],
                placeholder_ids,
                [image_end_tag_id],
                segment_ids,
            );

            // Only the placeholder positions are flagged; start/end tags and text stay false.
            const placeholder_mask = new Array(this.num_image_tokens).fill(true);
            const segment_mask = new Array(segment_ids.length).fill(false);
            images_seq_mask = mergeArrays(
                images_seq_mask,
                [false],
                placeholder_mask,
                [false],
                segment_mask,
            );
        }

        const dims = [1, input_ids.length];
        const text_inputs = {
            input_ids: new Tensor('int64', input_ids, dims),
            attention_mask: new Tensor('int64', new Array(input_ids.length).fill(1), dims),
            images_seq_mask: new Tensor('bool', images_seq_mask, dims),
            images_emb_mask: new Tensor(
                'bool',
                new Array(num_images * this.num_image_tokens).fill(true),
                [1, num_images, this.num_image_tokens],
            ),
        };

        // Text-only input: there is nothing for the image processor to do.
        if (images.length === 0) {
            return text_inputs;
        }

        const image_inputs = await this.image_processor(images);
        // Set the batch_size dimension to 1
        image_inputs.pixel_values.unsqueeze_(0);
        return { ...text_inputs, ...image_inputs };
    }
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
} from "../../base/image_processors_utils.js";
|
|
4
|
+
|
|
5
|
+
export class JinaCLIPImageProcessor extends ImageProcessor {
    /**
     * Translate the custom JinaCLIP preprocessor config into the option names that
     * `ImageProcessor` is constructed with here.
     * @param {Object} config The parsed preprocessor config. `resize_mode`, `fill_color`,
     * `interpolation` and `size` are consumed here; every other key is forwarded as-is.
     */
    constructor(config) {
        // JinaCLIPImageProcessor uses a custom preprocessor_config.json, so we configure it here.
        // `fill_color` is pulled out only so that it is not forwarded to the base class.
        const { resize_mode, fill_color, interpolation, size, ...other } = config;

        // Map the resize mode onto the matching size descriptor.
        let new_size;
        switch (resize_mode) {
            case 'squash':
                new_size = { width: size, height: size };
                break;
            case 'shortest':
                new_size = { shortest_edge: size };
                break;
            default:
                new_size = { longest_edge: size };
                break;
        }

        // NOTE(review): 3 and 2 look like resample-filter codes (bicubic vs. fallback) — confirm against ImageProcessor.
        const uses_bicubic = interpolation === 'bicubic';
        const resample = uses_bicubic ? 3 : 2;

        super({
            ...other,
            size: new_size,
            resample,
            do_center_crop: true,
            crop_size: size,
            do_normalize: true,
        });
    }
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
|
|
2
|
+
import { Processor } from "../../base/processing_utils.js";
|
|
3
|
+
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
4
|
+
import { AutoTokenizer } from "../../tokenizers.js";
|
|
5
|
+
|
|
6
|
+
export class JinaCLIPProcessor extends Processor {
    static tokenizer_class = AutoTokenizer;
    static image_processor_class = AutoImageProcessor;

    /**
     * Tokenize the text and/or process the images, then merge both outputs into one object.
     * When both produce the same key, the image processor's value wins (it is spread last).
     * @param {*} [text=null] Passed to the tokenizer when truthy.
     * @param {*} [images=null] Passed to the image processor when truthy.
     * @param {Object} [kwargs={}] Forwarded as the second argument to both the tokenizer and the image processor.
     * @returns {Promise<Object>} The combined tokenizer and image-processor outputs.
     * @throws {Error} If both `text` and `images` are falsy.
     */
    async _call(text = null, images = null, kwargs = {}) {
        if (!(text || images)) {
            throw new Error('Either text or images must be provided');
        }

        let text_inputs = {};
        if (text) {
            text_inputs = this.tokenizer(text, kwargs);
        }

        let image_inputs = {};
        if (images) {
            image_inputs = await this.image_processor(images, kwargs);
        }

        return { ...text_inputs, ...image_inputs };
    }
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import {
|
|
2
|
+
ImageProcessor,
|
|
3
|
+
post_process_panoptic_segmentation,
|
|
4
|
+
post_process_instance_segmentation,
|
|
5
|
+
} from "../../base/image_processors_utils.js";
|
|
6
|
+
|
|
7
|
+
export class MaskFormerImageProcessor extends ImageProcessor {

    /**
     * Instance-level entry point that forwards all arguments to the shared
     * `post_process_panoptic_segmentation` helper and returns its result.
     * @type {typeof post_process_panoptic_segmentation}
     */
    post_process_panoptic_segmentation(...params) {
        return post_process_panoptic_segmentation(...params);
    }

    /**
     * Instance-level entry point that forwards all arguments to the shared
     * `post_process_instance_segmentation` helper and returns its result.
     * @type {typeof post_process_instance_segmentation}
     */
    post_process_instance_segmentation(...params) {
        return post_process_instance_segmentation(...params);
    }
}
|
|
18
|
+
// Adds no members of its own; everything is inherited from MaskFormerImageProcessor.
// NOTE(review): presumably kept as an alternative/legacy name for the same processor — confirm.
export class MaskFormerFeatureExtractor extends MaskFormerImageProcessor { }
|