parakeet.js 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. package/.gitmodules +3 -0
  2. package/README.md +240 -239
  3. package/examples/hf-spaces-demo/README.md +6 -9
  4. package/examples/hf-spaces-demo/package.json +1 -1
  5. package/examples/hf-spaces-demo/src/App.js +307 -316
  6. package/examples/react-demo/package.json +19 -19
  7. package/examples/react-demo/src/App.jsx +324 -326
  8. package/examples/react-demo-dev/src/App.jsx +23 -24
  9. package/package.json +1 -1
  10. package/publish.ps1 +65 -0
  11. package/src/hub.js +235 -241
  12. package/src/parakeet.js +15 -8
  13. package/src/preprocessor.js +75 -68
  14. package/docs/parakeet-transformers-js/.gitattributes +0 -2
  15. package/docs/parakeet-transformers-js/.prettierignore +0 -8
  16. package/docs/parakeet-transformers-js/.prettierrc +0 -10
  17. package/docs/parakeet-transformers-js/.tmp_features.json +0 -1
  18. package/docs/parakeet-transformers-js/LICENSE +0 -202
  19. package/docs/parakeet-transformers-js/README.md +0 -448
  20. package/docs/parakeet-transformers-js/assets/nemo128.onnx +0 -0
  21. package/docs/parakeet-transformers-js/assets/nemo80.onnx +0 -0
  22. package/docs/parakeet-transformers-js/debug_test.js +0 -84
  23. package/docs/parakeet-transformers-js/dev/inspect_decoder.cjs +0 -9
  24. package/docs/parakeet-transformers-js/dev/inspect_joiner.cjs +0 -9
  25. package/docs/parakeet-transformers-js/dev/js_step_by_step.js +0 -249
  26. package/docs/parakeet-transformers-js/dev/parakeet_cli.js +0 -91
  27. package/docs/parakeet-transformers-js/jest.config.mjs +0 -194
  28. package/docs/parakeet-transformers-js/js_preprocessing.json +0 -225
  29. package/docs/parakeet-transformers-js/js_step_by_step.json +0 -837
  30. package/docs/parakeet-transformers-js/js_step_by_step_v2.json +0 -450
  31. package/docs/parakeet-transformers-js/js_step_by_step_v3.json +0 -450
  32. package/docs/parakeet-transformers-js/js_steps.json +0 -821
  33. package/docs/parakeet-transformers-js/package-lock.json +0 -12251
  34. package/docs/parakeet-transformers-js/package.json +0 -96
  35. package/docs/parakeet-transformers-js/src/audio_features.js +0 -178
  36. package/docs/parakeet-transformers-js/src/backends/onnx.js +0 -210
  37. package/docs/parakeet-transformers-js/src/base/feature_extraction_utils.js +0 -54
  38. package/docs/parakeet-transformers-js/src/base/image_processors_utils.js +0 -1105
  39. package/docs/parakeet-transformers-js/src/base/processing_utils.js +0 -173
  40. package/docs/parakeet-transformers-js/src/configs.js +0 -455
  41. package/docs/parakeet-transformers-js/src/env.js +0 -167
  42. package/docs/parakeet-transformers-js/src/generation/configuration_utils.js +0 -388
  43. package/docs/parakeet-transformers-js/src/generation/logits_process.js +0 -727
  44. package/docs/parakeet-transformers-js/src/generation/logits_sampler.js +0 -204
  45. package/docs/parakeet-transformers-js/src/generation/parameters.js +0 -35
  46. package/docs/parakeet-transformers-js/src/generation/stopping_criteria.js +0 -156
  47. package/docs/parakeet-transformers-js/src/generation/streamers.js +0 -225
  48. package/docs/parakeet-transformers-js/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +0 -85
  49. package/docs/parakeet-transformers-js/src/models/auto/feature_extraction_auto.js +0 -25
  50. package/docs/parakeet-transformers-js/src/models/auto/image_processing_auto.js +0 -29
  51. package/docs/parakeet-transformers-js/src/models/auto/processing_auto.js +0 -85
  52. package/docs/parakeet-transformers-js/src/models/beit/image_processing_beit.js +0 -5
  53. package/docs/parakeet-transformers-js/src/models/bit/image_processing_bit.js +0 -5
  54. package/docs/parakeet-transformers-js/src/models/chinese_clip/image_processing_chinese_clip.js +0 -5
  55. package/docs/parakeet-transformers-js/src/models/clap/feature_extraction_clap.js +0 -159
  56. package/docs/parakeet-transformers-js/src/models/clip/image_processing_clip.js +0 -6
  57. package/docs/parakeet-transformers-js/src/models/convnext/image_processing_convnext.js +0 -46
  58. package/docs/parakeet-transformers-js/src/models/dac/feature_extraction_dac.js +0 -3
  59. package/docs/parakeet-transformers-js/src/models/deit/image_processing_deit.js +0 -6
  60. package/docs/parakeet-transformers-js/src/models/detr/image_processing_detr.js +0 -52
  61. package/docs/parakeet-transformers-js/src/models/donut/image_processing_donut.js +0 -31
  62. package/docs/parakeet-transformers-js/src/models/dpt/image_processing_dpt.js +0 -6
  63. package/docs/parakeet-transformers-js/src/models/efficientnet/image_processing_efficientnet.js +0 -14
  64. package/docs/parakeet-transformers-js/src/models/encodec/feature_extraction_encodec.js +0 -32
  65. package/docs/parakeet-transformers-js/src/models/feature_extractors.js +0 -17
  66. package/docs/parakeet-transformers-js/src/models/florence2/processing_florence2.js +0 -131
  67. package/docs/parakeet-transformers-js/src/models/gemma3n/feature_extraction_gemma3n.js +0 -97
  68. package/docs/parakeet-transformers-js/src/models/gemma3n/processing_gemma3n.js +0 -74
  69. package/docs/parakeet-transformers-js/src/models/glpn/image_processing_glpn.js +0 -5
  70. package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js +0 -29
  71. package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js +0 -101
  72. package/docs/parakeet-transformers-js/src/models/idefics3/image_processing_idefics3.js +0 -232
  73. package/docs/parakeet-transformers-js/src/models/idefics3/processing_idefics3.js +0 -136
  74. package/docs/parakeet-transformers-js/src/models/image_processors.js +0 -40
  75. package/docs/parakeet-transformers-js/src/models/janus/image_processing_janus.js +0 -27
  76. package/docs/parakeet-transformers-js/src/models/janus/processing_janus.js +0 -123
  77. package/docs/parakeet-transformers-js/src/models/jina_clip/image_processing_jina_clip.js +0 -26
  78. package/docs/parakeet-transformers-js/src/models/jina_clip/processing_jina_clip.js +0 -24
  79. package/docs/parakeet-transformers-js/src/models/llava/processing_llava.js +0 -44
  80. package/docs/parakeet-transformers-js/src/models/llava_onevision/image_processing_llava_onevision.js +0 -5
  81. package/docs/parakeet-transformers-js/src/models/mask2former/image_processing_mask2former.js +0 -5
  82. package/docs/parakeet-transformers-js/src/models/maskformer/image_processing_maskformer.js +0 -18
  83. package/docs/parakeet-transformers-js/src/models/mgp_str/processing_mgp_str.js +0 -172
  84. package/docs/parakeet-transformers-js/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +0 -7
  85. package/docs/parakeet-transformers-js/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +0 -7
  86. package/docs/parakeet-transformers-js/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +0 -7
  87. package/docs/parakeet-transformers-js/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +0 -7
  88. package/docs/parakeet-transformers-js/src/models/mobilevit/image_processing_mobilevit.js +0 -6
  89. package/docs/parakeet-transformers-js/src/models/moonshine/feature_extraction_moonshine.js +0 -26
  90. package/docs/parakeet-transformers-js/src/models/moonshine/processing_moonshine.js +0 -20
  91. package/docs/parakeet-transformers-js/src/models/nougat/image_processing_nougat.js +0 -5
  92. package/docs/parakeet-transformers-js/src/models/owlv2/image_processing_owlv2.js +0 -5
  93. package/docs/parakeet-transformers-js/src/models/owlvit/image_processing_owlvit.js +0 -12
  94. package/docs/parakeet-transformers-js/src/models/owlvit/processing_owlvit.js +0 -7
  95. package/docs/parakeet-transformers-js/src/models/paligemma/processing_paligemma.js +0 -83
  96. package/docs/parakeet-transformers-js/src/models/parakeet/feature_extraction_parakeet.js +0 -3
  97. package/docs/parakeet-transformers-js/src/models/parakeet/modeling_parakeet.js +0 -3
  98. package/docs/parakeet-transformers-js/src/models/parakeet/processing_parakeet.js +0 -3
  99. package/docs/parakeet-transformers-js/src/models/parakeet/tokenization_parakeet.js +0 -3
  100. package/docs/parakeet-transformers-js/src/models/phi3_v/image_processing_phi3_v.js +0 -163
  101. package/docs/parakeet-transformers-js/src/models/phi3_v/processing_phi3_v.js +0 -53
  102. package/docs/parakeet-transformers-js/src/models/processors.js +0 -22
  103. package/docs/parakeet-transformers-js/src/models/pvt/image_processing_pvt.js +0 -5
  104. package/docs/parakeet-transformers-js/src/models/pyannote/feature_extraction_pyannote.js +0 -85
  105. package/docs/parakeet-transformers-js/src/models/pyannote/processing_pyannote.js +0 -24
  106. package/docs/parakeet-transformers-js/src/models/qwen2_vl/image_processing_qwen2_vl.js +0 -52
  107. package/docs/parakeet-transformers-js/src/models/qwen2_vl/processing_qwen2_vl.js +0 -53
  108. package/docs/parakeet-transformers-js/src/models/rt_detr/image_processing_rt_detr.js +0 -12
  109. package/docs/parakeet-transformers-js/src/models/sam/image_processing_sam.js +0 -242
  110. package/docs/parakeet-transformers-js/src/models/sam/processing_sam.js +0 -20
  111. package/docs/parakeet-transformers-js/src/models/sapiens/image_processing_sapiens.js +0 -13
  112. package/docs/parakeet-transformers-js/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +0 -175
  113. package/docs/parakeet-transformers-js/src/models/segformer/image_processing_segformer.js +0 -13
  114. package/docs/parakeet-transformers-js/src/models/siglip/image_processing_siglip.js +0 -5
  115. package/docs/parakeet-transformers-js/src/models/smolvlm/image_processing_smolvlm.js +0 -2
  116. package/docs/parakeet-transformers-js/src/models/smolvlm/processing_smolvlm.js +0 -2
  117. package/docs/parakeet-transformers-js/src/models/snac/feature_extraction_snac.js +0 -3
  118. package/docs/parakeet-transformers-js/src/models/speecht5/feature_extraction_speecht5.js +0 -4
  119. package/docs/parakeet-transformers-js/src/models/speecht5/processing_speecht5.js +0 -17
  120. package/docs/parakeet-transformers-js/src/models/swin2sr/image_processing_swin2sr.js +0 -24
  121. package/docs/parakeet-transformers-js/src/models/ultravox/processing_ultravox.js +0 -54
  122. package/docs/parakeet-transformers-js/src/models/vit/image_processing_vit.js +0 -7
  123. package/docs/parakeet-transformers-js/src/models/vitmatte/image_processing_vitmatte.js +0 -50
  124. package/docs/parakeet-transformers-js/src/models/vitpose/image_processing_vitpose.js +0 -89
  125. package/docs/parakeet-transformers-js/src/models/wav2vec2/feature_extraction_wav2vec2.js +0 -44
  126. package/docs/parakeet-transformers-js/src/models/wav2vec2/processing_wav2vec2.js +0 -17
  127. package/docs/parakeet-transformers-js/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +0 -17
  128. package/docs/parakeet-transformers-js/src/models/wespeaker/feature_extraction_wespeaker.js +0 -95
  129. package/docs/parakeet-transformers-js/src/models/whisper/common_whisper.js +0 -157
  130. package/docs/parakeet-transformers-js/src/models/whisper/feature_extraction_whisper.js +0 -92
  131. package/docs/parakeet-transformers-js/src/models/whisper/generation_whisper.js +0 -89
  132. package/docs/parakeet-transformers-js/src/models/whisper/processing_whisper.js +0 -21
  133. package/docs/parakeet-transformers-js/src/models/yolos/image_processing_yolos.js +0 -12
  134. package/docs/parakeet-transformers-js/src/models.js +0 -8644
  135. package/docs/parakeet-transformers-js/src/ops/registry.js +0 -133
  136. package/docs/parakeet-transformers-js/src/ort_env.js +0 -8
  137. package/docs/parakeet-transformers-js/src/parakeet.js +0 -792
  138. package/docs/parakeet-transformers-js/src/pipelines.js +0 -3540
  139. package/docs/parakeet-transformers-js/src/processors.js +0 -16
  140. package/docs/parakeet-transformers-js/src/tokenizers.js +0 -4432
  141. package/docs/parakeet-transformers-js/src/transformers.js +0 -50
  142. package/docs/parakeet-transformers-js/src/utils/audio.js +0 -893
  143. package/docs/parakeet-transformers-js/src/utils/constants.js +0 -9
  144. package/docs/parakeet-transformers-js/src/utils/core.js +0 -259
  145. package/docs/parakeet-transformers-js/src/utils/data-structures.js +0 -574
  146. package/docs/parakeet-transformers-js/src/utils/devices.js +0 -22
  147. package/docs/parakeet-transformers-js/src/utils/dtypes.js +0 -63
  148. package/docs/parakeet-transformers-js/src/utils/generic.js +0 -35
  149. package/docs/parakeet-transformers-js/src/utils/hub.js +0 -780
  150. package/docs/parakeet-transformers-js/src/utils/image.js +0 -834
  151. package/docs/parakeet-transformers-js/src/utils/maths.js +0 -1061
  152. package/docs/parakeet-transformers-js/src/utils/tensor.js +0 -1539
  153. package/docs/parakeet-transformers-js/src/utils/video.js +0 -128
  154. package/docs/parakeet-transformers-js/test/decoder.test.js +0 -114
  155. package/docs/parakeet-transformers-js/test/encoder.test.js +0 -108
  156. package/docs/parakeet-transformers-js/test/preprocessor.test.js +0 -85
  157. package/docs/parakeet-transformers-js/test/tokenizer.test.js +0 -24
  158. package/docs/parakeet-transformers-js/test/transcribe.js +0 -89
  159. package/docs/parakeet-transformers-js/tsconfig.json +0 -21
  160. package/docs/parakeet-transformers-js/webpack.config.js +0 -223
@@ -1,131 +0,0 @@
1
- import { Processor } from "../../base/processing_utils.js";
2
- import { AutoImageProcessor } from "../auto/image_processing_auto.js";
3
- import { AutoTokenizer } from "../../tokenizers.js";
4
-
5
- export class Florence2Processor extends Processor {
6
- static tokenizer_class = AutoTokenizer
7
- static image_processor_class = AutoImageProcessor
8
-
9
- constructor(config, components, chat_template) {
10
- super(config, components, chat_template);
11
-
12
- const {
13
- // @ts-expect-error TS2339
14
- tasks_answer_post_processing_type,
15
- // @ts-expect-error TS2339
16
- task_prompts_without_inputs,
17
- // @ts-expect-error TS2339
18
- task_prompts_with_input,
19
- } = this.image_processor.config;
20
-
21
- /** @type {Map<string, string>} */
22
- this.tasks_answer_post_processing_type = new Map(Object.entries(tasks_answer_post_processing_type ?? {}));
23
-
24
- /** @type {Map<string, string>} */
25
- this.task_prompts_without_inputs = new Map(Object.entries(task_prompts_without_inputs ?? {}));
26
-
27
- /** @type {Map<string, string>} */
28
- this.task_prompts_with_input = new Map(Object.entries(task_prompts_with_input ?? {}));
29
-
30
- this.regexes = {
31
- quad_boxes: /(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
32
- bboxes: /([^<]+)?<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
33
- }
34
- this.size_per_bin = 1000;
35
- }
36
-
37
- /**
38
- * Helper function to construct prompts from input texts
39
- * @param {string|string[]} text
40
- * @returns {string[]}
41
- */
42
- construct_prompts(text) {
43
- if (typeof text === 'string') {
44
- text = [text];
45
- }
46
-
47
- const prompts = [];
48
- for (const t of text) {
49
- // 1. fixed task prompts without additional inputs
50
- if (this.task_prompts_without_inputs.has(t)) {
51
- prompts.push(this.task_prompts_without_inputs.get(t));
52
- }
53
- // 2. task prompts with additional inputs
54
- else {
55
- for (const [task, prompt] of this.task_prompts_with_input) {
56
- if (t.includes(task)) {
57
- prompts.push(prompt.replaceAll('{input}', t).replaceAll(task, ''));
58
- break;
59
- }
60
- }
61
-
62
- // 3. default prompt
63
- if (prompts.length !== text.length) {
64
- prompts.push(t);
65
- }
66
- }
67
- }
68
- return prompts;
69
- }
70
-
71
- /**
72
- * Post-process the output of the model to each of the task outputs.
73
- * @param {string} text The text to post-process.
74
- * @param {string} task The task to post-process the text for.
75
- * @param {[number, number]} image_size The size of the image. height x width.
76
- */
77
- post_process_generation(text, task, image_size) {
78
- const task_answer_post_processing_type = this.tasks_answer_post_processing_type.get(task) ?? 'pure_text';
79
-
80
- // remove the special tokens
81
- text = text.replaceAll('<s>', '').replaceAll('</s>', '');
82
-
83
- let final_answer;
84
- switch (task_answer_post_processing_type) {
85
- case 'pure_text':
86
- final_answer = text;
87
- break;
88
-
89
- case 'description_with_bboxes':
90
- case 'bboxes':
91
- case 'phrase_grounding':
92
- case 'ocr':
93
- const key = task_answer_post_processing_type === 'ocr' ? 'quad_boxes' : 'bboxes';
94
- const matches = text.matchAll(this.regexes[key]);
95
- const labels = [];
96
- const items = [];
97
- for (const [_, label, ...locations] of matches) {
98
- // Push new label, or duplicate the last label
99
- labels.push(label ? label.trim() : labels.at(-1) ?? '');
100
- items.push(locations.map((x, i) =>
101
- // NOTE: Add 0.5 to use the center position of the bin as the coordinate.
102
- (Number(x) + 0.5) / this.size_per_bin * image_size[i % 2])
103
- );
104
- }
105
- final_answer = { labels, [key]: items };
106
- break;
107
-
108
- default:
109
- throw new Error(`Task "${task}" (of type "${task_answer_post_processing_type}") not yet implemented.`);
110
- }
111
-
112
- return { [task]: final_answer }
113
- }
114
-
115
- // NOTE: images and text are switched from the python version
116
- // `images` is required, `text` is optional
117
- async _call(images, text=null, kwargs = {}) {
118
-
119
- if (!images && !text){
120
- throw new Error('Either text or images must be provided');
121
- }
122
-
123
- const image_inputs = await this.image_processor(images, kwargs);
124
- const text_inputs = text ? this.tokenizer(this.construct_prompts(text), kwargs) : {};
125
-
126
- return {
127
- ...image_inputs,
128
- ...text_inputs,
129
- }
130
- }
131
- }
@@ -1,97 +0,0 @@
1
- import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
2
- import { full, Tensor } from '../../utils/tensor.js';
3
- import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
4
-
5
- export class Gemma3nAudioFeatureExtractor extends FeatureExtractor {
6
-
7
- constructor(config) {
8
- super(config);
9
-
10
- const {
11
- fft_length, feature_size, min_frequency, max_frequency, sampling_rate, frame_length
12
- } = this.config;
13
-
14
- const mel_filters = mel_filter_bank(
15
- Math.floor(1 + fft_length / 2), // num_frequency_bins
16
- feature_size, // num_mel_filters
17
- min_frequency, // min_frequency
18
- max_frequency, // max_frequency
19
- sampling_rate, // sampling_rate
20
- null, // norm
21
- "htk", // mel_scale
22
- false, // triangularize_in_mel_space
23
- );
24
- this.mel_filters = mel_filters;
25
-
26
- this.window = window_function(frame_length, 'hann')
27
- }
28
-
29
- /**
30
- * Computes the log-Mel spectrogram of the provided audio waveform.
31
- * @param {Float32Array|Float64Array} waveform The audio waveform to process.
32
- * @param {number} max_length The maximum number of frames to return.
33
- * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
34
- */
35
- async _extract_fbank_features(waveform, max_length) {
36
- // NOTE: We don't pad/truncate since that is passed in as `max_num_frames`
37
- return spectrogram(
38
- waveform,
39
- this.window, // window
40
- this.config.frame_length, // frame_length
41
- this.config.hop_length, // hop_length
42
- {
43
- fft_length: this.config.fft_length,
44
- center: false,
45
- onesided: true,
46
- preemphasis: this.config.preemphasis,
47
- preemphasis_htk_flavor: this.config.preemphasis_htk_flavor,
48
- mel_filters: this.mel_filters,
49
- log_mel: 'log',
50
- mel_floor: this.config.mel_floor,
51
- remove_dc_offset: false,
52
-
53
- // Custom
54
- transpose: true,
55
- }
56
- )
57
- }
58
-
59
- /**
60
- * Asynchronously extracts features from a given audio using the provided configuration.
61
- * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
62
- * @param {Object} options Optional parameters for feature extraction.
63
- * @param {number} [options.max_length=480_000] If provided, defines the maximum length of the audio to allow.
64
- * Audio longer than this will be truncated if `truncation=True`.
65
- * @param {boolean} [options.truncation=true] Whether or not to truncate audio above `max_length`.
66
- * @param {boolean} [options.padding=true] Whether to pad the sequence to a multiple of `pad_to_multiple_of`.
67
- * @param {number} [options.pad_to_multiple_of=128] The number to pad the sequence to a multiple of.
68
- * @returns {Promise<{ input_features: Tensor, input_features_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention masks as Tensors.
69
- */
70
- async _call(audio, {
71
- max_length = 480_000,
72
- truncation=true,
73
- padding = true,
74
- pad_to_multiple_of = 128,
75
- } = {}) {
76
- validate_audio_inputs(audio, 'Gemma3nAudioFeatureExtractor');
77
- if (truncation && audio.length > max_length) {
78
- audio = audio.slice(0, max_length);
79
- }
80
- if (padding && audio.length % pad_to_multiple_of !== 0) {
81
- const padding_length = pad_to_multiple_of - (audio.length % pad_to_multiple_of);
82
- const padded_audio = new Float64Array(audio.length + padding_length);
83
- padded_audio.set(audio);
84
- if (this.config.padding_value !== 0) {
85
- padded_audio.fill(this.config.padding_value, audio.length);
86
- }
87
- audio = padded_audio;
88
- }
89
-
90
- const features = await this._extract_fbank_features(audio, this.config.max_length);
91
- const padded_attention_mask = full([1, features.dims[0]], true);
92
- return {
93
- input_features: features.unsqueeze_(0),
94
- input_features_mask: padded_attention_mask,
95
- }
96
- }
97
- }
@@ -1,74 +0,0 @@
1
-
2
- import { Processor } from "../../base/processing_utils.js";
3
- import { AutoImageProcessor } from "../auto/image_processing_auto.js";
4
- import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
5
- import { AutoTokenizer } from "../../tokenizers.js";
6
- import { RawImage } from "../../utils/image.js";
7
- import { RawAudio } from "../../utils/audio.js";
8
-
9
- export class Gemma3nProcessor extends Processor {
10
- static image_processor_class = AutoImageProcessor;
11
- static feature_extractor_class = AutoFeatureExtractor;
12
- static tokenizer_class = AutoTokenizer;
13
- static uses_processor_config = true;
14
- static uses_chat_template_file = true;
15
-
16
- constructor(config, components, chat_template) {
17
- super(config, components, chat_template);
18
- this.audio_seq_length = this.config.audio_seq_length;
19
- this.image_seq_length = this.config.image_seq_length;
20
-
21
- const {
22
- // Audio tokens
23
- audio_token_id, boa_token, audio_token, eoa_token,
24
-
25
- // Image tokens
26
- image_token_id, boi_token, image_token, eoi_token
27
- } = this.tokenizer.config;
28
-
29
- this.audio_token_id = audio_token_id
30
- this.boa_token = boa_token
31
- this.audio_token = audio_token
32
- const audio_tokens_expanded = audio_token.repeat(this.audio_seq_length);
33
- this.full_audio_sequence = `\n\n${boa_token}${audio_tokens_expanded}${eoa_token}\n\n`
34
-
35
- this.image_token_id = image_token_id
36
- this.boi_token = boi_token
37
- this.image_token = image_token
38
- const image_tokens_expanded = image_token.repeat(this.image_seq_length);
39
- this.full_image_sequence = `\n\n${boi_token}${image_tokens_expanded}${eoi_token}\n\n`
40
- }
41
-
42
- /**
43
- *
44
- * @param {string|string[]} text
45
- * @param {RawImage|RawImage[]|RawImage[][]} images
46
- * @param {RawAudio|RawAudio[]|RawAudio[][]} audio
47
- * @returns {Promise<any>}
48
- */
49
- async _call(text, images = null, audio = null, options = {}) {
50
-
51
- if (typeof text === 'string') {
52
- text = [text];
53
- }
54
-
55
- let audio_inputs;
56
- if (audio) {
57
- audio_inputs = await this.feature_extractor(audio, options);
58
-
59
- text = text.map(prompt => prompt.replaceAll(this.audio_token, this.full_audio_sequence));
60
- }
61
- let image_inputs;
62
- if (images) {
63
- image_inputs = await this.image_processor(images, options);
64
- text = text.map(prompt => prompt.replaceAll(this.image_token, this.full_image_sequence));
65
- }
66
-
67
- let text_inputs = this.tokenizer(text, options);
68
- return {
69
- ...text_inputs,
70
- ...image_inputs,
71
- ...audio_inputs,
72
- }
73
- }
74
- }
@@ -1,5 +0,0 @@
1
- import {
2
- ImageProcessor,
3
- } from "../../base/image_processors_utils.js";
4
-
5
- export class GLPNFeatureExtractor extends ImageProcessor { }
@@ -1,29 +0,0 @@
1
-
2
- import {
3
- ImageProcessor,
4
- } from "../../base/image_processors_utils.js";
5
- import { ones } from '../../utils/tensor.js';
6
-
7
-
8
- /**
9
- * @typedef {object} GroundingDinoFeatureExtractorResultProps
10
- * @property {import('../../utils/tensor.js').Tensor} pixel_mask
11
- * @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
12
- */
13
-
14
- export class GroundingDinoImageProcessor extends ImageProcessor {
15
- /**
16
- * Calls the feature extraction process on an array of images, preprocesses
17
- * each image, and concatenates the resulting features into a single Tensor.
18
- * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
19
- * @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
20
- */
21
- async _call(images) {
22
- const result = await super._call(images);
23
-
24
- const dims = result.pixel_values.dims;
25
- const pixel_mask = ones([dims[0], dims[2], dims[3]]);
26
-
27
- return { ...result, pixel_mask };
28
- }
29
- }
@@ -1,101 +0,0 @@
1
- import { Processor } from "../../base/processing_utils.js";
2
- import { AutoImageProcessor } from "../auto/image_processing_auto.js";
3
- import { AutoTokenizer } from "../../tokenizers.js";
4
- import { center_to_corners_format } from "../../base/image_processors_utils.js";
5
-
6
- /**
7
- * Get token ids of phrases from posmaps and input_ids.
8
- * @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`.
9
- * @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`.
10
- */
11
- function get_phrases_from_posmap(posmaps, input_ids) {
12
-
13
- const left_idx = 0;
14
- const right_idx = posmaps.dims.at(-1) - 1;
15
-
16
- const posmaps_list = posmaps.tolist();
17
- posmaps_list.fill(false, 0, left_idx + 1);
18
- posmaps_list.fill(false, right_idx);
19
-
20
- const input_ids_list = input_ids.tolist();
21
- return posmaps_list
22
- .map((val, idx) => val ? idx : null)
23
- .filter(idx => idx !== null)
24
- .map(i => input_ids_list[i]);
25
- }
26
-
27
- export class GroundingDinoProcessor extends Processor {
28
- static tokenizer_class = AutoTokenizer
29
- static image_processor_class = AutoImageProcessor
30
-
31
- /**
32
- * @typedef {import('../../utils/image.js').RawImage} RawImage
33
- */
34
- /**
35
- *
36
- * @param {RawImage|RawImage[]|RawImage[][]} images
37
- * @param {string|string[]} text
38
- * @returns {Promise<any>}
39
- */
40
- async _call(images, text, options = {}) {
41
-
42
- const image_inputs = images ? await this.image_processor(images, options) : {};
43
- const text_inputs = text ? this.tokenizer(text, options) : {};
44
-
45
- return {
46
- ...text_inputs,
47
- ...image_inputs,
48
- }
49
- }
50
- post_process_grounded_object_detection(outputs, input_ids, {
51
- box_threshold = 0.25,
52
- text_threshold = 0.25,
53
- target_sizes = null
54
- } = {}) {
55
- const { logits, pred_boxes } = outputs;
56
- const batch_size = logits.dims[0];
57
-
58
- if (target_sizes !== null && target_sizes.length !== batch_size) {
59
- throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
60
- }
61
- const num_queries = logits.dims.at(1);
62
-
63
- const probs = logits.sigmoid(); // (batch_size, num_queries, 256)
64
- const scores = probs.max(-1).tolist(); // (batch_size, num_queries)
65
-
66
- // Convert to [x0, y0, x1, y1] format
67
- const boxes = pred_boxes.tolist() // (batch_size, num_queries, 4)
68
- .map(batch => batch.map(box => center_to_corners_format(box)));
69
-
70
- const results = [];
71
- for (let i = 0; i < batch_size; ++i) {
72
- const target_size = target_sizes !== null ? target_sizes[i] : null;
73
-
74
- // Convert from relative [0, 1] to absolute [0, height] coordinates
75
- if (target_size !== null) {
76
- boxes[i] = boxes[i].map(box => box.map((x, j) => x * target_size[(j + 1) % 2]));
77
- }
78
-
79
- const batch_scores = scores[i];
80
- const final_scores = [];
81
- const final_phrases = [];
82
- const final_boxes = [];
83
- for (let j = 0; j < num_queries; ++j) {
84
- const score = batch_scores[j];
85
- if (score <= box_threshold) {
86
- continue;
87
- }
88
- const box = boxes[i][j];
89
- const prob = probs[i][j];
90
-
91
- final_scores.push(score);
92
- final_boxes.push(box);
93
-
94
- const phrases = get_phrases_from_posmap(prob.gt(text_threshold), input_ids[i]);
95
- final_phrases.push(phrases);
96
- }
97
- results.push({ scores: final_scores, boxes: final_boxes, labels: this.batch_decode(final_phrases) });
98
- }
99
- return results;
100
- }
101
- }
@@ -1,232 +0,0 @@
1
-
2
-
3
- import {
4
- ImageProcessor,
5
- } from "../../base/image_processors_utils.js";
6
- import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";
7
-
8
- export class Idefics3ImageProcessor extends ImageProcessor {
9
- constructor(config) {
10
- super(config);
11
-
12
- this.do_image_splitting = config.do_image_splitting ?? true;
13
- this.max_image_size = config.max_image_size;
14
- }
15
-
16
- /**
17
- * @typedef {import('../../utils/image.js').RawImage} RawImage
18
- * @typedef {import('../../utils/tensor.js').Tensor} Tensor
19
- */
20
-
21
- /**
22
- * Calculate size to resize images to, to be multiples of `vision_encoder_max_size` while preserving the aspect ratio.
23
- * @param {Tensor} pixel_values Tensor of the image to resize.
24
- * @param {number} vision_encoder_max_size Maximum size of the output image. If the image is larger than this size,
25
- * it will be split into patches of this size, and the original image will be concatenated with the patches, resized to max_size.
26
- */
27
- get_resize_for_vision_encoder(pixel_values, vision_encoder_max_size) {
28
- let [height, width] = pixel_values.dims.slice(-2);
29
-
30
- const aspect_ratio = width / height;
31
- if (width >= height) {
32
- width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
33
- height = Math.floor(width / aspect_ratio);
34
- height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
35
- } else {
36
- height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
37
- width = Math.floor(height * aspect_ratio);
38
- width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
39
- }
40
- return { height, width };
41
- }
42
-
43
- /** @param {RawImage|RawImage[]|RawImage[][]} images */
44
- async _call(images, {
45
- do_image_splitting = null,
46
- return_row_col_info = false,
47
- } = {}) {
48
-
49
- /** @type {RawImage[][]} */
50
- let batched_2d_images;
51
- if (!Array.isArray(images)) {
52
- batched_2d_images = [[images]];
53
- } else {
54
- if (images.length === 0 || !images[0]) {
55
- throw new Error("No images provided.");
56
- }
57
- if (!Array.isArray(images[0])) {
58
- batched_2d_images = [/** @type {RawImage[]} */(images)];
59
- } else {
60
- batched_2d_images = /** @type {RawImage[][]} */(images);
61
- }
62
- }
63
-
64
- // List of tensors, each with shape [patches, channels, height, width]
65
- let all_pixel_values = [];
66
- let images_list_rows = [];
67
- let images_list_cols = [];
68
-
69
- const original_sizes = [];
70
- const reshaped_input_sizes = [];
71
- for (const image_batch of batched_2d_images) {
72
-
73
- let images_list = await Promise.all(image_batch.map(x => this.preprocess(x)));
74
-
75
- // Original sizes of images
76
- original_sizes.push(...images_list.map(x => x.original_size));
77
-
78
- // Reshaped sizes of images, before padding or cropping
79
- reshaped_input_sizes.push(...images_list.map(x => x.reshaped_input_size));
80
-
81
- // Convert images to 4D tensors for easier processing
82
- images_list.forEach(x => x.pixel_values.unsqueeze_(0));
83
-
84
- const { longest_edge } = this.max_image_size;
85
-
86
- /** @type {Tensor[]} */
87
- let images_tensor;
88
- if (do_image_splitting ?? this.do_image_splitting) {
89
- let image_rows = new Array(images_list.length);
90
- let image_cols = new Array(images_list.length);
91
-
92
- // We first resize both height and width of each image to the nearest max_image_size multiple, disregarding the aspect ratio
93
- images_tensor = await Promise.all(images_list.map(async (x, i) => {
94
- const new_size = this.get_resize_for_vision_encoder(x.pixel_values, longest_edge);
95
-
96
- const resized = await interpolate_4d(x.pixel_values, {
97
- size: [new_size.height, new_size.width],
98
- });
99
-
100
- const { frames, num_splits_h, num_splits_w } = await this.split_image(resized, this.max_image_size);
101
- image_rows[i] = num_splits_h;
102
- image_cols[i] = num_splits_w;
103
- return cat(frames, 0);
104
- }));
105
-
106
- images_list_rows.push(image_rows);
107
- images_list_cols.push(image_cols);
108
-
109
- } else {
110
- /** @type {[number, number]} */
111
- const size = [longest_edge, longest_edge];
112
- images_tensor = await Promise.all(
113
- images_list.map(x => interpolate_4d(x.pixel_values, { size }))
114
- );
115
-
116
- images_list_rows.push(new Array(images_list.length).fill(0));
117
- images_list_cols.push(new Array(images_list.length).fill(0));
118
- }
119
-
120
- all_pixel_values.push(cat(images_tensor, 0));
121
- }
122
-
123
- const batch_size = all_pixel_values.length;
124
- const [n, c, h, w] = all_pixel_values[0].dims;
125
-
126
- // Stack pixel values
127
- let pixel_values;
128
- let pixel_attention_mask;
129
- if (batch_size === 1) {
130
- pixel_values = all_pixel_values[0].unsqueeze_(0);
131
- pixel_attention_mask = full([batch_size, n, h, w], true);
132
- } else {
133
- // Add padding (if necessary) to images with less patches than the maximum number of patches
134
- const max_num_patches = Math.max(...all_pixel_values.map(x => x.dims.at(0)));
135
-
136
- pixel_attention_mask = full([batch_size, max_num_patches, h, w], true);
137
- const pixel_attention_mask_data = pixel_attention_mask.data;
138
- const pixel_attention_mask_stride = max_num_patches * h * w;
139
- for (let i = 0; i < batch_size; ++i) {
140
- const num_patches = all_pixel_values[i].dims[0];
141
- if (num_patches < max_num_patches) {
142
- all_pixel_values[i] = cat([
143
- all_pixel_values[i],
144
- full([max_num_patches - num_patches, c, h, w], 0),
145
- ], 0);
146
-
147
- const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
148
- const end_offset = (i + 1) * pixel_attention_mask_stride;
149
-
150
- // @ts-ignore
151
- pixel_attention_mask_data.fill(false, start_offset, end_offset);
152
- }
153
- }
154
- pixel_values = stack(all_pixel_values, 0);
155
- }
156
-
157
- return {
158
- pixel_values,
159
- pixel_attention_mask,
160
-
161
- original_sizes,
162
- reshaped_input_sizes,
163
- ...(
164
- return_row_col_info
165
- ? { rows: images_list_rows, cols: images_list_cols }
166
- : {}
167
- ),
168
- }
169
- }
170
-
171
- async split_image(pixel_values, { longest_edge }) {
172
- const max_height = longest_edge;
173
- const max_width = longest_edge;
174
-
175
- const frames = [];
176
-
177
- const [height, width] = pixel_values.dims.slice(-2);
178
-
179
- let num_splits_h = 0, num_splits_w = 0;
180
-
181
- if (height > max_height || width > max_width) {
182
- // Calculate the number of splits
183
- num_splits_h = Math.ceil(height / max_height);
184
- num_splits_w = Math.ceil(width / max_width);
185
-
186
- // Calculate the optimal width and height for the sub-images
187
- const optimal_height = Math.ceil(height / num_splits_h);
188
- const optimal_width = Math.ceil(width / num_splits_w);
189
-
190
- // Iterate through each row and column
191
- for (let r = 0; r < num_splits_h; ++r) {
192
- for (let c = 0; c < num_splits_w; ++c) {
193
- let start_x, start_y, end_x, end_y;
194
- if (r === num_splits_h - 1) { // At bottom
195
- start_y = height - optimal_height;
196
- end_y = height;
197
- } else {
198
- start_y = r * optimal_height;
199
- end_y = (r + 1) * optimal_height;
200
- }
201
- if (c === num_splits_w - 1) { // At right
202
- start_x = width - optimal_width;
203
- end_x = width;
204
- } else {
205
- start_x = c * optimal_width;
206
- end_x = (c + 1) * optimal_width;
207
- }
208
-
209
- const starts = [start_y, start_x];
210
- const ends = [end_y, end_x];
211
-
212
- const patch = await slice(pixel_values, starts, ends, [2, 3]);
213
- frames.push(patch);
214
- }
215
- }
216
-
217
- // Resize the global image to match max dimensions for memory efficiency
218
- const global_image_height = max_height;
219
- const global_image_width = max_width;
220
-
221
- if (height !== global_image_height || width !== global_image_width) {
222
- pixel_values = await interpolate_4d(pixel_values, {
223
- size: [global_image_height, global_image_width],
224
- })
225
- }
226
- }
227
-
228
- frames.push(pixel_values);
229
-
230
- return { frames, num_splits_h, num_splits_w };
231
- }
232
- }