parakeet.js 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitmodules +3 -0
- package/README.md +240 -239
- package/examples/hf-spaces-demo/README.md +6 -9
- package/examples/hf-spaces-demo/package.json +1 -1
- package/examples/hf-spaces-demo/src/App.js +307 -316
- package/examples/react-demo/package.json +19 -19
- package/examples/react-demo/src/App.jsx +324 -326
- package/examples/react-demo-dev/src/App.jsx +23 -24
- package/package.json +1 -1
- package/publish.ps1 +65 -0
- package/src/hub.js +235 -241
- package/src/parakeet.js +15 -8
- package/src/preprocessor.js +75 -68
- package/docs/parakeet-transformers-js/.gitattributes +0 -2
- package/docs/parakeet-transformers-js/.prettierignore +0 -8
- package/docs/parakeet-transformers-js/.prettierrc +0 -10
- package/docs/parakeet-transformers-js/.tmp_features.json +0 -1
- package/docs/parakeet-transformers-js/LICENSE +0 -202
- package/docs/parakeet-transformers-js/README.md +0 -448
- package/docs/parakeet-transformers-js/assets/nemo128.onnx +0 -0
- package/docs/parakeet-transformers-js/assets/nemo80.onnx +0 -0
- package/docs/parakeet-transformers-js/debug_test.js +0 -84
- package/docs/parakeet-transformers-js/dev/inspect_decoder.cjs +0 -9
- package/docs/parakeet-transformers-js/dev/inspect_joiner.cjs +0 -9
- package/docs/parakeet-transformers-js/dev/js_step_by_step.js +0 -249
- package/docs/parakeet-transformers-js/dev/parakeet_cli.js +0 -91
- package/docs/parakeet-transformers-js/jest.config.mjs +0 -194
- package/docs/parakeet-transformers-js/js_preprocessing.json +0 -225
- package/docs/parakeet-transformers-js/js_step_by_step.json +0 -837
- package/docs/parakeet-transformers-js/js_step_by_step_v2.json +0 -450
- package/docs/parakeet-transformers-js/js_step_by_step_v3.json +0 -450
- package/docs/parakeet-transformers-js/js_steps.json +0 -821
- package/docs/parakeet-transformers-js/package-lock.json +0 -12251
- package/docs/parakeet-transformers-js/package.json +0 -96
- package/docs/parakeet-transformers-js/src/audio_features.js +0 -178
- package/docs/parakeet-transformers-js/src/backends/onnx.js +0 -210
- package/docs/parakeet-transformers-js/src/base/feature_extraction_utils.js +0 -54
- package/docs/parakeet-transformers-js/src/base/image_processors_utils.js +0 -1105
- package/docs/parakeet-transformers-js/src/base/processing_utils.js +0 -173
- package/docs/parakeet-transformers-js/src/configs.js +0 -455
- package/docs/parakeet-transformers-js/src/env.js +0 -167
- package/docs/parakeet-transformers-js/src/generation/configuration_utils.js +0 -388
- package/docs/parakeet-transformers-js/src/generation/logits_process.js +0 -727
- package/docs/parakeet-transformers-js/src/generation/logits_sampler.js +0 -204
- package/docs/parakeet-transformers-js/src/generation/parameters.js +0 -35
- package/docs/parakeet-transformers-js/src/generation/stopping_criteria.js +0 -156
- package/docs/parakeet-transformers-js/src/generation/streamers.js +0 -225
- package/docs/parakeet-transformers-js/src/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.js +0 -85
- package/docs/parakeet-transformers-js/src/models/auto/feature_extraction_auto.js +0 -25
- package/docs/parakeet-transformers-js/src/models/auto/image_processing_auto.js +0 -29
- package/docs/parakeet-transformers-js/src/models/auto/processing_auto.js +0 -85
- package/docs/parakeet-transformers-js/src/models/beit/image_processing_beit.js +0 -5
- package/docs/parakeet-transformers-js/src/models/bit/image_processing_bit.js +0 -5
- package/docs/parakeet-transformers-js/src/models/chinese_clip/image_processing_chinese_clip.js +0 -5
- package/docs/parakeet-transformers-js/src/models/clap/feature_extraction_clap.js +0 -159
- package/docs/parakeet-transformers-js/src/models/clip/image_processing_clip.js +0 -6
- package/docs/parakeet-transformers-js/src/models/convnext/image_processing_convnext.js +0 -46
- package/docs/parakeet-transformers-js/src/models/dac/feature_extraction_dac.js +0 -3
- package/docs/parakeet-transformers-js/src/models/deit/image_processing_deit.js +0 -6
- package/docs/parakeet-transformers-js/src/models/detr/image_processing_detr.js +0 -52
- package/docs/parakeet-transformers-js/src/models/donut/image_processing_donut.js +0 -31
- package/docs/parakeet-transformers-js/src/models/dpt/image_processing_dpt.js +0 -6
- package/docs/parakeet-transformers-js/src/models/efficientnet/image_processing_efficientnet.js +0 -14
- package/docs/parakeet-transformers-js/src/models/encodec/feature_extraction_encodec.js +0 -32
- package/docs/parakeet-transformers-js/src/models/feature_extractors.js +0 -17
- package/docs/parakeet-transformers-js/src/models/florence2/processing_florence2.js +0 -131
- package/docs/parakeet-transformers-js/src/models/gemma3n/feature_extraction_gemma3n.js +0 -97
- package/docs/parakeet-transformers-js/src/models/gemma3n/processing_gemma3n.js +0 -74
- package/docs/parakeet-transformers-js/src/models/glpn/image_processing_glpn.js +0 -5
- package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js +0 -29
- package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js +0 -101
- package/docs/parakeet-transformers-js/src/models/idefics3/image_processing_idefics3.js +0 -232
- package/docs/parakeet-transformers-js/src/models/idefics3/processing_idefics3.js +0 -136
- package/docs/parakeet-transformers-js/src/models/image_processors.js +0 -40
- package/docs/parakeet-transformers-js/src/models/janus/image_processing_janus.js +0 -27
- package/docs/parakeet-transformers-js/src/models/janus/processing_janus.js +0 -123
- package/docs/parakeet-transformers-js/src/models/jina_clip/image_processing_jina_clip.js +0 -26
- package/docs/parakeet-transformers-js/src/models/jina_clip/processing_jina_clip.js +0 -24
- package/docs/parakeet-transformers-js/src/models/llava/processing_llava.js +0 -44
- package/docs/parakeet-transformers-js/src/models/llava_onevision/image_processing_llava_onevision.js +0 -5
- package/docs/parakeet-transformers-js/src/models/mask2former/image_processing_mask2former.js +0 -5
- package/docs/parakeet-transformers-js/src/models/maskformer/image_processing_maskformer.js +0 -18
- package/docs/parakeet-transformers-js/src/models/mgp_str/processing_mgp_str.js +0 -172
- package/docs/parakeet-transformers-js/src/models/mobilenet_v1/image_processing_mobilenet_v1.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v2/image_processing_mobilenet_v2.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v3/image_processing_mobilenet_v3.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilenet_v4/image_processing_mobilenet_v4.js +0 -7
- package/docs/parakeet-transformers-js/src/models/mobilevit/image_processing_mobilevit.js +0 -6
- package/docs/parakeet-transformers-js/src/models/moonshine/feature_extraction_moonshine.js +0 -26
- package/docs/parakeet-transformers-js/src/models/moonshine/processing_moonshine.js +0 -20
- package/docs/parakeet-transformers-js/src/models/nougat/image_processing_nougat.js +0 -5
- package/docs/parakeet-transformers-js/src/models/owlv2/image_processing_owlv2.js +0 -5
- package/docs/parakeet-transformers-js/src/models/owlvit/image_processing_owlvit.js +0 -12
- package/docs/parakeet-transformers-js/src/models/owlvit/processing_owlvit.js +0 -7
- package/docs/parakeet-transformers-js/src/models/paligemma/processing_paligemma.js +0 -83
- package/docs/parakeet-transformers-js/src/models/parakeet/feature_extraction_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/modeling_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/processing_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/parakeet/tokenization_parakeet.js +0 -3
- package/docs/parakeet-transformers-js/src/models/phi3_v/image_processing_phi3_v.js +0 -163
- package/docs/parakeet-transformers-js/src/models/phi3_v/processing_phi3_v.js +0 -53
- package/docs/parakeet-transformers-js/src/models/processors.js +0 -22
- package/docs/parakeet-transformers-js/src/models/pvt/image_processing_pvt.js +0 -5
- package/docs/parakeet-transformers-js/src/models/pyannote/feature_extraction_pyannote.js +0 -85
- package/docs/parakeet-transformers-js/src/models/pyannote/processing_pyannote.js +0 -24
- package/docs/parakeet-transformers-js/src/models/qwen2_vl/image_processing_qwen2_vl.js +0 -52
- package/docs/parakeet-transformers-js/src/models/qwen2_vl/processing_qwen2_vl.js +0 -53
- package/docs/parakeet-transformers-js/src/models/rt_detr/image_processing_rt_detr.js +0 -12
- package/docs/parakeet-transformers-js/src/models/sam/image_processing_sam.js +0 -242
- package/docs/parakeet-transformers-js/src/models/sam/processing_sam.js +0 -20
- package/docs/parakeet-transformers-js/src/models/sapiens/image_processing_sapiens.js +0 -13
- package/docs/parakeet-transformers-js/src/models/seamless_m4t/feature_extraction_seamless_m4t.js +0 -175
- package/docs/parakeet-transformers-js/src/models/segformer/image_processing_segformer.js +0 -13
- package/docs/parakeet-transformers-js/src/models/siglip/image_processing_siglip.js +0 -5
- package/docs/parakeet-transformers-js/src/models/smolvlm/image_processing_smolvlm.js +0 -2
- package/docs/parakeet-transformers-js/src/models/smolvlm/processing_smolvlm.js +0 -2
- package/docs/parakeet-transformers-js/src/models/snac/feature_extraction_snac.js +0 -3
- package/docs/parakeet-transformers-js/src/models/speecht5/feature_extraction_speecht5.js +0 -4
- package/docs/parakeet-transformers-js/src/models/speecht5/processing_speecht5.js +0 -17
- package/docs/parakeet-transformers-js/src/models/swin2sr/image_processing_swin2sr.js +0 -24
- package/docs/parakeet-transformers-js/src/models/ultravox/processing_ultravox.js +0 -54
- package/docs/parakeet-transformers-js/src/models/vit/image_processing_vit.js +0 -7
- package/docs/parakeet-transformers-js/src/models/vitmatte/image_processing_vitmatte.js +0 -50
- package/docs/parakeet-transformers-js/src/models/vitpose/image_processing_vitpose.js +0 -89
- package/docs/parakeet-transformers-js/src/models/wav2vec2/feature_extraction_wav2vec2.js +0 -44
- package/docs/parakeet-transformers-js/src/models/wav2vec2/processing_wav2vec2.js +0 -17
- package/docs/parakeet-transformers-js/src/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.js +0 -17
- package/docs/parakeet-transformers-js/src/models/wespeaker/feature_extraction_wespeaker.js +0 -95
- package/docs/parakeet-transformers-js/src/models/whisper/common_whisper.js +0 -157
- package/docs/parakeet-transformers-js/src/models/whisper/feature_extraction_whisper.js +0 -92
- package/docs/parakeet-transformers-js/src/models/whisper/generation_whisper.js +0 -89
- package/docs/parakeet-transformers-js/src/models/whisper/processing_whisper.js +0 -21
- package/docs/parakeet-transformers-js/src/models/yolos/image_processing_yolos.js +0 -12
- package/docs/parakeet-transformers-js/src/models.js +0 -8644
- package/docs/parakeet-transformers-js/src/ops/registry.js +0 -133
- package/docs/parakeet-transformers-js/src/ort_env.js +0 -8
- package/docs/parakeet-transformers-js/src/parakeet.js +0 -792
- package/docs/parakeet-transformers-js/src/pipelines.js +0 -3540
- package/docs/parakeet-transformers-js/src/processors.js +0 -16
- package/docs/parakeet-transformers-js/src/tokenizers.js +0 -4432
- package/docs/parakeet-transformers-js/src/transformers.js +0 -50
- package/docs/parakeet-transformers-js/src/utils/audio.js +0 -893
- package/docs/parakeet-transformers-js/src/utils/constants.js +0 -9
- package/docs/parakeet-transformers-js/src/utils/core.js +0 -259
- package/docs/parakeet-transformers-js/src/utils/data-structures.js +0 -574
- package/docs/parakeet-transformers-js/src/utils/devices.js +0 -22
- package/docs/parakeet-transformers-js/src/utils/dtypes.js +0 -63
- package/docs/parakeet-transformers-js/src/utils/generic.js +0 -35
- package/docs/parakeet-transformers-js/src/utils/hub.js +0 -780
- package/docs/parakeet-transformers-js/src/utils/image.js +0 -834
- package/docs/parakeet-transformers-js/src/utils/maths.js +0 -1061
- package/docs/parakeet-transformers-js/src/utils/tensor.js +0 -1539
- package/docs/parakeet-transformers-js/src/utils/video.js +0 -128
- package/docs/parakeet-transformers-js/test/decoder.test.js +0 -114
- package/docs/parakeet-transformers-js/test/encoder.test.js +0 -108
- package/docs/parakeet-transformers-js/test/preprocessor.test.js +0 -85
- package/docs/parakeet-transformers-js/test/tokenizer.test.js +0 -24
- package/docs/parakeet-transformers-js/test/transcribe.js +0 -89
- package/docs/parakeet-transformers-js/tsconfig.json +0 -21
- package/docs/parakeet-transformers-js/webpack.config.js +0 -223
|
@@ -1,131 +0,0 @@
|
|
|
1
|
-
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
-
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
3
|
-
import { AutoTokenizer } from "../../tokenizers.js";
|
|
4
|
-
|
|
5
|
-
export class Florence2Processor extends Processor {
|
|
6
|
-
static tokenizer_class = AutoTokenizer
|
|
7
|
-
static image_processor_class = AutoImageProcessor
|
|
8
|
-
|
|
9
|
-
constructor(config, components, chat_template) {
|
|
10
|
-
super(config, components, chat_template);
|
|
11
|
-
|
|
12
|
-
const {
|
|
13
|
-
// @ts-expect-error TS2339
|
|
14
|
-
tasks_answer_post_processing_type,
|
|
15
|
-
// @ts-expect-error TS2339
|
|
16
|
-
task_prompts_without_inputs,
|
|
17
|
-
// @ts-expect-error TS2339
|
|
18
|
-
task_prompts_with_input,
|
|
19
|
-
} = this.image_processor.config;
|
|
20
|
-
|
|
21
|
-
/** @type {Map<string, string>} */
|
|
22
|
-
this.tasks_answer_post_processing_type = new Map(Object.entries(tasks_answer_post_processing_type ?? {}));
|
|
23
|
-
|
|
24
|
-
/** @type {Map<string, string>} */
|
|
25
|
-
this.task_prompts_without_inputs = new Map(Object.entries(task_prompts_without_inputs ?? {}));
|
|
26
|
-
|
|
27
|
-
/** @type {Map<string, string>} */
|
|
28
|
-
this.task_prompts_with_input = new Map(Object.entries(task_prompts_with_input ?? {}));
|
|
29
|
-
|
|
30
|
-
this.regexes = {
|
|
31
|
-
quad_boxes: /(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
|
|
32
|
-
bboxes: /([^<]+)?<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>/gm,
|
|
33
|
-
}
|
|
34
|
-
this.size_per_bin = 1000;
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
/**
|
|
38
|
-
* Helper function to construct prompts from input texts
|
|
39
|
-
* @param {string|string[]} text
|
|
40
|
-
* @returns {string[]}
|
|
41
|
-
*/
|
|
42
|
-
construct_prompts(text) {
|
|
43
|
-
if (typeof text === 'string') {
|
|
44
|
-
text = [text];
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
const prompts = [];
|
|
48
|
-
for (const t of text) {
|
|
49
|
-
// 1. fixed task prompts without additional inputs
|
|
50
|
-
if (this.task_prompts_without_inputs.has(t)) {
|
|
51
|
-
prompts.push(this.task_prompts_without_inputs.get(t));
|
|
52
|
-
}
|
|
53
|
-
// 2. task prompts with additional inputs
|
|
54
|
-
else {
|
|
55
|
-
for (const [task, prompt] of this.task_prompts_with_input) {
|
|
56
|
-
if (t.includes(task)) {
|
|
57
|
-
prompts.push(prompt.replaceAll('{input}', t).replaceAll(task, ''));
|
|
58
|
-
break;
|
|
59
|
-
}
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
// 3. default prompt
|
|
63
|
-
if (prompts.length !== text.length) {
|
|
64
|
-
prompts.push(t);
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
return prompts;
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
/**
|
|
72
|
-
* Post-process the output of the model to each of the task outputs.
|
|
73
|
-
* @param {string} text The text to post-process.
|
|
74
|
-
* @param {string} task The task to post-process the text for.
|
|
75
|
-
* @param {[number, number]} image_size The size of the image. height x width.
|
|
76
|
-
*/
|
|
77
|
-
post_process_generation(text, task, image_size) {
|
|
78
|
-
const task_answer_post_processing_type = this.tasks_answer_post_processing_type.get(task) ?? 'pure_text';
|
|
79
|
-
|
|
80
|
-
// remove the special tokens
|
|
81
|
-
text = text.replaceAll('<s>', '').replaceAll('</s>', '');
|
|
82
|
-
|
|
83
|
-
let final_answer;
|
|
84
|
-
switch (task_answer_post_processing_type) {
|
|
85
|
-
case 'pure_text':
|
|
86
|
-
final_answer = text;
|
|
87
|
-
break;
|
|
88
|
-
|
|
89
|
-
case 'description_with_bboxes':
|
|
90
|
-
case 'bboxes':
|
|
91
|
-
case 'phrase_grounding':
|
|
92
|
-
case 'ocr':
|
|
93
|
-
const key = task_answer_post_processing_type === 'ocr' ? 'quad_boxes' : 'bboxes';
|
|
94
|
-
const matches = text.matchAll(this.regexes[key]);
|
|
95
|
-
const labels = [];
|
|
96
|
-
const items = [];
|
|
97
|
-
for (const [_, label, ...locations] of matches) {
|
|
98
|
-
// Push new label, or duplicate the last label
|
|
99
|
-
labels.push(label ? label.trim() : labels.at(-1) ?? '');
|
|
100
|
-
items.push(locations.map((x, i) =>
|
|
101
|
-
// NOTE: Add 0.5 to use the center position of the bin as the coordinate.
|
|
102
|
-
(Number(x) + 0.5) / this.size_per_bin * image_size[i % 2])
|
|
103
|
-
);
|
|
104
|
-
}
|
|
105
|
-
final_answer = { labels, [key]: items };
|
|
106
|
-
break;
|
|
107
|
-
|
|
108
|
-
default:
|
|
109
|
-
throw new Error(`Task "${task}" (of type "${task_answer_post_processing_type}") not yet implemented.`);
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
return { [task]: final_answer }
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
// NOTE: images and text are switched from the python version
|
|
116
|
-
// `images` is required, `text` is optional
|
|
117
|
-
async _call(images, text=null, kwargs = {}) {
|
|
118
|
-
|
|
119
|
-
if (!images && !text){
|
|
120
|
-
throw new Error('Either text or images must be provided');
|
|
121
|
-
}
|
|
122
|
-
|
|
123
|
-
const image_inputs = await this.image_processor(images, kwargs);
|
|
124
|
-
const text_inputs = text ? this.tokenizer(this.construct_prompts(text), kwargs) : {};
|
|
125
|
-
|
|
126
|
-
return {
|
|
127
|
-
...image_inputs,
|
|
128
|
-
...text_inputs,
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
}
|
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
|
|
2
|
-
import { full, Tensor } from '../../utils/tensor.js';
|
|
3
|
-
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';
|
|
4
|
-
|
|
5
|
-
export class Gemma3nAudioFeatureExtractor extends FeatureExtractor {
|
|
6
|
-
|
|
7
|
-
constructor(config) {
|
|
8
|
-
super(config);
|
|
9
|
-
|
|
10
|
-
const {
|
|
11
|
-
fft_length, feature_size, min_frequency, max_frequency, sampling_rate, frame_length
|
|
12
|
-
} = this.config;
|
|
13
|
-
|
|
14
|
-
const mel_filters = mel_filter_bank(
|
|
15
|
-
Math.floor(1 + fft_length / 2), // num_frequency_bins
|
|
16
|
-
feature_size, // num_mel_filters
|
|
17
|
-
min_frequency, // min_frequency
|
|
18
|
-
max_frequency, // max_frequency
|
|
19
|
-
sampling_rate, // sampling_rate
|
|
20
|
-
null, // norm
|
|
21
|
-
"htk", // mel_scale
|
|
22
|
-
false, // triangularize_in_mel_space
|
|
23
|
-
);
|
|
24
|
-
this.mel_filters = mel_filters;
|
|
25
|
-
|
|
26
|
-
this.window = window_function(frame_length, 'hann')
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Computes the log-Mel spectrogram of the provided audio waveform.
|
|
31
|
-
* @param {Float32Array|Float64Array} waveform The audio waveform to process.
|
|
32
|
-
* @param {number} max_length The maximum number of frames to return.
|
|
33
|
-
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
34
|
-
*/
|
|
35
|
-
async _extract_fbank_features(waveform, max_length) {
|
|
36
|
-
// NOTE: We don't pad/truncate since that is passed in as `max_num_frames`
|
|
37
|
-
return spectrogram(
|
|
38
|
-
waveform,
|
|
39
|
-
this.window, // window
|
|
40
|
-
this.config.frame_length, // frame_length
|
|
41
|
-
this.config.hop_length, // hop_length
|
|
42
|
-
{
|
|
43
|
-
fft_length: this.config.fft_length,
|
|
44
|
-
center: false,
|
|
45
|
-
onesided: true,
|
|
46
|
-
preemphasis: this.config.preemphasis,
|
|
47
|
-
preemphasis_htk_flavor: this.config.preemphasis_htk_flavor,
|
|
48
|
-
mel_filters: this.mel_filters,
|
|
49
|
-
log_mel: 'log',
|
|
50
|
-
mel_floor: this.config.mel_floor,
|
|
51
|
-
remove_dc_offset: false,
|
|
52
|
-
|
|
53
|
-
// Custom
|
|
54
|
-
transpose: true,
|
|
55
|
-
}
|
|
56
|
-
)
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
/**
|
|
60
|
-
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
61
|
-
* @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
|
|
62
|
-
* @param {Object} options Optional parameters for feature extraction.
|
|
63
|
-
* @param {number} [options.max_length=480_000] If provided, defines the maximum length of the audio to allow.
|
|
64
|
-
* Audio longer than this will be truncated if `truncation=True`.
|
|
65
|
-
* @param {boolean} [options.truncation=true] Whether or not to truncate audio above `max_length`.
|
|
66
|
-
* @param {boolean} [options.padding=true] Whether to pad the sequence to a multiple of `pad_to_multiple_of`.
|
|
67
|
-
* @param {number} [options.pad_to_multiple_of=128] The number to pad the sequence to a multiple of.
|
|
68
|
-
* @returns {Promise<{ input_features: Tensor, input_features_mask: Tensor }>} A Promise resolving to an object containing the extracted input features and attention masks as Tensors.
|
|
69
|
-
*/
|
|
70
|
-
async _call(audio, {
|
|
71
|
-
max_length = 480_000,
|
|
72
|
-
truncation=true,
|
|
73
|
-
padding = true,
|
|
74
|
-
pad_to_multiple_of = 128,
|
|
75
|
-
} = {}) {
|
|
76
|
-
validate_audio_inputs(audio, 'Gemma3nAudioFeatureExtractor');
|
|
77
|
-
if (truncation && audio.length > max_length) {
|
|
78
|
-
audio = audio.slice(0, max_length);
|
|
79
|
-
}
|
|
80
|
-
if (padding && audio.length % pad_to_multiple_of !== 0) {
|
|
81
|
-
const padding_length = pad_to_multiple_of - (audio.length % pad_to_multiple_of);
|
|
82
|
-
const padded_audio = new Float64Array(audio.length + padding_length);
|
|
83
|
-
padded_audio.set(audio);
|
|
84
|
-
if (this.config.padding_value !== 0) {
|
|
85
|
-
padded_audio.fill(this.config.padding_value, audio.length);
|
|
86
|
-
}
|
|
87
|
-
audio = padded_audio;
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
const features = await this._extract_fbank_features(audio, this.config.max_length);
|
|
91
|
-
const padded_attention_mask = full([1, features.dims[0]], true);
|
|
92
|
-
return {
|
|
93
|
-
input_features: features.unsqueeze_(0),
|
|
94
|
-
input_features_mask: padded_attention_mask,
|
|
95
|
-
}
|
|
96
|
-
}
|
|
97
|
-
}
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
import { Processor } from "../../base/processing_utils.js";
|
|
3
|
-
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
4
|
-
import { AutoFeatureExtractor } from "../auto/feature_extraction_auto.js";
|
|
5
|
-
import { AutoTokenizer } from "../../tokenizers.js";
|
|
6
|
-
import { RawImage } from "../../utils/image.js";
|
|
7
|
-
import { RawAudio } from "../../utils/audio.js";
|
|
8
|
-
|
|
9
|
-
export class Gemma3nProcessor extends Processor {
|
|
10
|
-
static image_processor_class = AutoImageProcessor;
|
|
11
|
-
static feature_extractor_class = AutoFeatureExtractor;
|
|
12
|
-
static tokenizer_class = AutoTokenizer;
|
|
13
|
-
static uses_processor_config = true;
|
|
14
|
-
static uses_chat_template_file = true;
|
|
15
|
-
|
|
16
|
-
constructor(config, components, chat_template) {
|
|
17
|
-
super(config, components, chat_template);
|
|
18
|
-
this.audio_seq_length = this.config.audio_seq_length;
|
|
19
|
-
this.image_seq_length = this.config.image_seq_length;
|
|
20
|
-
|
|
21
|
-
const {
|
|
22
|
-
// Audio tokens
|
|
23
|
-
audio_token_id, boa_token, audio_token, eoa_token,
|
|
24
|
-
|
|
25
|
-
// Image tokens
|
|
26
|
-
image_token_id, boi_token, image_token, eoi_token
|
|
27
|
-
} = this.tokenizer.config;
|
|
28
|
-
|
|
29
|
-
this.audio_token_id = audio_token_id
|
|
30
|
-
this.boa_token = boa_token
|
|
31
|
-
this.audio_token = audio_token
|
|
32
|
-
const audio_tokens_expanded = audio_token.repeat(this.audio_seq_length);
|
|
33
|
-
this.full_audio_sequence = `\n\n${boa_token}${audio_tokens_expanded}${eoa_token}\n\n`
|
|
34
|
-
|
|
35
|
-
this.image_token_id = image_token_id
|
|
36
|
-
this.boi_token = boi_token
|
|
37
|
-
this.image_token = image_token
|
|
38
|
-
const image_tokens_expanded = image_token.repeat(this.image_seq_length);
|
|
39
|
-
this.full_image_sequence = `\n\n${boi_token}${image_tokens_expanded}${eoi_token}\n\n`
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
/**
|
|
43
|
-
*
|
|
44
|
-
* @param {string|string[]} text
|
|
45
|
-
* @param {RawImage|RawImage[]|RawImage[][]} images
|
|
46
|
-
* @param {RawAudio|RawAudio[]|RawAudio[][]} audio
|
|
47
|
-
* @returns {Promise<any>}
|
|
48
|
-
*/
|
|
49
|
-
async _call(text, images = null, audio = null, options = {}) {
|
|
50
|
-
|
|
51
|
-
if (typeof text === 'string') {
|
|
52
|
-
text = [text];
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
let audio_inputs;
|
|
56
|
-
if (audio) {
|
|
57
|
-
audio_inputs = await this.feature_extractor(audio, options);
|
|
58
|
-
|
|
59
|
-
text = text.map(prompt => prompt.replaceAll(this.audio_token, this.full_audio_sequence));
|
|
60
|
-
}
|
|
61
|
-
let image_inputs;
|
|
62
|
-
if (images) {
|
|
63
|
-
image_inputs = await this.image_processor(images, options);
|
|
64
|
-
text = text.map(prompt => prompt.replaceAll(this.image_token, this.full_image_sequence));
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
let text_inputs = this.tokenizer(text, options);
|
|
68
|
-
return {
|
|
69
|
-
...text_inputs,
|
|
70
|
-
...image_inputs,
|
|
71
|
-
...audio_inputs,
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
}
|
package/docs/parakeet-transformers-js/src/models/grounding_dino/image_processing_grounding_dino.js
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
import {
|
|
3
|
-
ImageProcessor,
|
|
4
|
-
} from "../../base/image_processors_utils.js";
|
|
5
|
-
import { ones } from '../../utils/tensor.js';
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
/**
|
|
9
|
-
* @typedef {object} GroundingDinoFeatureExtractorResultProps
|
|
10
|
-
* @property {import('../../utils/tensor.js').Tensor} pixel_mask
|
|
11
|
-
* @typedef {import('../../base/image_processors_utils.js').ImageProcessorResult & GroundingDinoFeatureExtractorResultProps} GroundingDinoFeatureExtractorResult
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
export class GroundingDinoImageProcessor extends ImageProcessor {
|
|
15
|
-
/**
|
|
16
|
-
* Calls the feature extraction process on an array of images, preprocesses
|
|
17
|
-
* each image, and concatenates the resulting features into a single Tensor.
|
|
18
|
-
* @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
|
|
19
|
-
* @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
|
|
20
|
-
*/
|
|
21
|
-
async _call(images) {
|
|
22
|
-
const result = await super._call(images);
|
|
23
|
-
|
|
24
|
-
const dims = result.pixel_values.dims;
|
|
25
|
-
const pixel_mask = ones([dims[0], dims[2], dims[3]]);
|
|
26
|
-
|
|
27
|
-
return { ...result, pixel_mask };
|
|
28
|
-
}
|
|
29
|
-
}
|
package/docs/parakeet-transformers-js/src/models/grounding_dino/processing_grounding_dino.js
DELETED
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
import { Processor } from "../../base/processing_utils.js";
|
|
2
|
-
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
|
|
3
|
-
import { AutoTokenizer } from "../../tokenizers.js";
|
|
4
|
-
import { center_to_corners_format } from "../../base/image_processors_utils.js";
|
|
5
|
-
|
|
6
|
-
/**
|
|
7
|
-
* Get token ids of phrases from posmaps and input_ids.
|
|
8
|
-
* @param {import('../../utils/tensor.js').Tensor} posmaps A boolean tensor of unbatched text-thresholded logits related to the detected bounding boxes of shape `(hidden_size, )`.
|
|
9
|
-
* @param {import('../../utils/tensor.js').Tensor} input_ids A tensor of token ids of shape `(sequence_length, )`.
|
|
10
|
-
*/
|
|
11
|
-
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
12
|
-
|
|
13
|
-
const left_idx = 0;
|
|
14
|
-
const right_idx = posmaps.dims.at(-1) - 1;
|
|
15
|
-
|
|
16
|
-
const posmaps_list = posmaps.tolist();
|
|
17
|
-
posmaps_list.fill(false, 0, left_idx + 1);
|
|
18
|
-
posmaps_list.fill(false, right_idx);
|
|
19
|
-
|
|
20
|
-
const input_ids_list = input_ids.tolist();
|
|
21
|
-
return posmaps_list
|
|
22
|
-
.map((val, idx) => val ? idx : null)
|
|
23
|
-
.filter(idx => idx !== null)
|
|
24
|
-
.map(i => input_ids_list[i]);
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
export class GroundingDinoProcessor extends Processor {
|
|
28
|
-
static tokenizer_class = AutoTokenizer
|
|
29
|
-
static image_processor_class = AutoImageProcessor
|
|
30
|
-
|
|
31
|
-
/**
|
|
32
|
-
* @typedef {import('../../utils/image.js').RawImage} RawImage
|
|
33
|
-
*/
|
|
34
|
-
/**
|
|
35
|
-
*
|
|
36
|
-
* @param {RawImage|RawImage[]|RawImage[][]} images
|
|
37
|
-
* @param {string|string[]} text
|
|
38
|
-
* @returns {Promise<any>}
|
|
39
|
-
*/
|
|
40
|
-
async _call(images, text, options = {}) {
|
|
41
|
-
|
|
42
|
-
const image_inputs = images ? await this.image_processor(images, options) : {};
|
|
43
|
-
const text_inputs = text ? this.tokenizer(text, options) : {};
|
|
44
|
-
|
|
45
|
-
return {
|
|
46
|
-
...text_inputs,
|
|
47
|
-
...image_inputs,
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
post_process_grounded_object_detection(outputs, input_ids, {
|
|
51
|
-
box_threshold = 0.25,
|
|
52
|
-
text_threshold = 0.25,
|
|
53
|
-
target_sizes = null
|
|
54
|
-
} = {}) {
|
|
55
|
-
const { logits, pred_boxes } = outputs;
|
|
56
|
-
const batch_size = logits.dims[0];
|
|
57
|
-
|
|
58
|
-
if (target_sizes !== null && target_sizes.length !== batch_size) {
|
|
59
|
-
throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits")
|
|
60
|
-
}
|
|
61
|
-
const num_queries = logits.dims.at(1);
|
|
62
|
-
|
|
63
|
-
const probs = logits.sigmoid(); // (batch_size, num_queries, 256)
|
|
64
|
-
const scores = probs.max(-1).tolist(); // (batch_size, num_queries)
|
|
65
|
-
|
|
66
|
-
// Convert to [x0, y0, x1, y1] format
|
|
67
|
-
const boxes = pred_boxes.tolist() // (batch_size, num_queries, 4)
|
|
68
|
-
.map(batch => batch.map(box => center_to_corners_format(box)));
|
|
69
|
-
|
|
70
|
-
const results = [];
|
|
71
|
-
for (let i = 0; i < batch_size; ++i) {
|
|
72
|
-
const target_size = target_sizes !== null ? target_sizes[i] : null;
|
|
73
|
-
|
|
74
|
-
// Convert from relative [0, 1] to absolute [0, height] coordinates
|
|
75
|
-
if (target_size !== null) {
|
|
76
|
-
boxes[i] = boxes[i].map(box => box.map((x, j) => x * target_size[(j + 1) % 2]));
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
const batch_scores = scores[i];
|
|
80
|
-
const final_scores = [];
|
|
81
|
-
const final_phrases = [];
|
|
82
|
-
const final_boxes = [];
|
|
83
|
-
for (let j = 0; j < num_queries; ++j) {
|
|
84
|
-
const score = batch_scores[j];
|
|
85
|
-
if (score <= box_threshold) {
|
|
86
|
-
continue;
|
|
87
|
-
}
|
|
88
|
-
const box = boxes[i][j];
|
|
89
|
-
const prob = probs[i][j];
|
|
90
|
-
|
|
91
|
-
final_scores.push(score);
|
|
92
|
-
final_boxes.push(box);
|
|
93
|
-
|
|
94
|
-
const phrases = get_phrases_from_posmap(prob.gt(text_threshold), input_ids[i]);
|
|
95
|
-
final_phrases.push(phrases);
|
|
96
|
-
}
|
|
97
|
-
results.push({ scores: final_scores, boxes: final_boxes, labels: this.batch_decode(final_phrases) });
|
|
98
|
-
}
|
|
99
|
-
return results;
|
|
100
|
-
}
|
|
101
|
-
}
|
|
@@ -1,232 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
import {
|
|
4
|
-
ImageProcessor,
|
|
5
|
-
} from "../../base/image_processors_utils.js";
|
|
6
|
-
import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";
|
|
7
|
-
|
|
8
|
-
/**
 * Image processor for Idefics3-style vision-language models.
 *
 * Preprocesses one or more images into padded/stacked `pixel_values` plus a
 * `pixel_attention_mask`, optionally splitting large images into a grid of
 * patches (plus a resized global view) so every patch fits within
 * `max_image_size.longest_edge`.
 */
export class Idefics3ImageProcessor extends ImageProcessor {
    /**
     * @param {Object} config Processor configuration.
     *   Reads `config.do_image_splitting` (defaults to `true` when nullish) and
     *   `config.max_image_size` (expected to contain a `longest_edge` property,
     *   as destructured in `_call` — confirm against the model's config files).
     */
    constructor(config) {
        super(config);

        // Default behavior is to split oversized images into patches.
        this.do_image_splitting = config.do_image_splitting ?? true;
        // NOTE(review): assumed shape is { longest_edge: number }; `_call` and
        // `split_image` both destructure `longest_edge` from it.
        this.max_image_size = config.max_image_size;
    }

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     * @typedef {import('../../utils/tensor.js').Tensor} Tensor
     */

    /**
     * Calculate size to resize images to, to be multiples of `vision_encoder_max_size` while preserving the aspect ratio.
     *
     * Rounds the longer side UP to the next multiple, derives the shorter side
     * from the original aspect ratio, then rounds that up to a multiple too —
     * so the result may slightly distort the aspect ratio.
     * @param {Tensor} pixel_values Tensor of the image to resize.
     * @param {number} vision_encoder_max_size Maximum size of the output image. If the image is larger than this size,
     * it will be split into patches of this size, and the original image will be concatenated with the patches, resized to max_size.
     * @returns {{height: number, width: number}} Target size, each a multiple of `vision_encoder_max_size`.
     */
    get_resize_for_vision_encoder(pixel_values, vision_encoder_max_size) {
        // Last two dims are treated as (height, width).
        let [height, width] = pixel_values.dims.slice(-2);

        const aspect_ratio = width / height;
        if (width >= height) {
            // Landscape (or square): snap width first, then derive height.
            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
            height = Math.floor(width / aspect_ratio);
            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
        } else {
            // Portrait: snap height first, then derive width.
            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
            width = Math.floor(height * aspect_ratio);
            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
        }
        return { height, width };
    }

    /**
     * Preprocess images into model inputs.
     *
     * Accepts a single image, a flat list (one batch), or a list of lists
     * (multiple batches); all forms are normalized to `RawImage[][]`.
     *
     * @param {RawImage|RawImage[]|RawImage[][]} images
     * @param {Object} [options]
     * @param {boolean|null} [options.do_image_splitting=null] Per-call override of
     *   `this.do_image_splitting`; when `null`/`undefined` the constructor value is used.
     * @param {boolean} [options.return_row_col_info=false] When true, the returned
     *   object also includes `rows`/`cols` (per-batch split-grid counts).
     * @returns {Promise<Object>} `{ pixel_values, pixel_attention_mask, original_sizes, reshaped_input_sizes, [rows, cols] }`
     * @throws {Error} If an empty array (or an array with a falsy first element) is passed.
     */
    /** @param {RawImage|RawImage[]|RawImage[][]} images */
    async _call(images, {
        do_image_splitting = null,
        return_row_col_info = false,
    } = {}) {

        // Normalize every accepted input shape to a 2D batch structure.
        /** @type {RawImage[][]} */
        let batched_2d_images;
        if (!Array.isArray(images)) {
            batched_2d_images = [[images]];
        } else {
            if (images.length === 0 || !images[0]) {
                throw new Error("No images provided.");
            }
            if (!Array.isArray(images[0])) {
                batched_2d_images = [/** @type {RawImage[]} */(images)];
            } else {
                batched_2d_images = /** @type {RawImage[][]} */(images);
            }
        }

        // List of tensors, each with shape [patches, channels, height, width]
        let all_pixel_values = [];
        // Per batch: number of grid rows/cols each image was split into
        // (filled with 0 when splitting is disabled).
        let images_list_rows = [];
        let images_list_cols = [];

        const original_sizes = [];
        const reshaped_input_sizes = [];
        for (const image_batch of batched_2d_images) {

            // Base preprocessing (resize/normalize/etc.) from the parent ImageProcessor.
            let images_list = await Promise.all(image_batch.map(x => this.preprocess(x)));

            // Original sizes of images
            original_sizes.push(...images_list.map(x => x.original_size));

            // Reshaped sizes of images, before padding or cropping
            reshaped_input_sizes.push(...images_list.map(x => x.reshaped_input_size));

            // Convert images to 4D tensors for easier processing
            // (in-place unsqueeze: [c, h, w] -> [1, c, h, w]).
            images_list.forEach(x => x.pixel_values.unsqueeze_(0));

            const { longest_edge } = this.max_image_size;

            /** @type {Tensor[]} */
            let images_tensor;
            if (do_image_splitting ?? this.do_image_splitting) {
                let image_rows = new Array(images_list.length);
                let image_cols = new Array(images_list.length);

                // We first resize both height and width of each image to the nearest max_image_size multiple, disregarding the aspect ratio
                images_tensor = await Promise.all(images_list.map(async (x, i) => {
                    const new_size = this.get_resize_for_vision_encoder(x.pixel_values, longest_edge);

                    const resized = await interpolate_4d(x.pixel_values, {
                        size: [new_size.height, new_size.width],
                    });

                    // Split into grid patches + a trailing global view; record grid shape.
                    const { frames, num_splits_h, num_splits_w } = await this.split_image(resized, this.max_image_size);
                    image_rows[i] = num_splits_h;
                    image_cols[i] = num_splits_w;
                    // Concatenate this image's patches along the patch dimension.
                    return cat(frames, 0);
                }));

                images_list_rows.push(image_rows);
                images_list_cols.push(image_cols);

            } else {
                // No splitting: every image is simply resized to a square of
                // side `longest_edge`, and grid counts are reported as 0.
                /** @type {[number, number]} */
                const size = [longest_edge, longest_edge];
                images_tensor = await Promise.all(
                    images_list.map(x => interpolate_4d(x.pixel_values, { size }))
                );

                images_list_rows.push(new Array(images_list.length).fill(0));
                images_list_cols.push(new Array(images_list.length).fill(0));
            }

            // All patches of all images in this batch, stacked along dim 0.
            all_pixel_values.push(cat(images_tensor, 0));
        }

        const batch_size = all_pixel_values.length;
        // NOTE(review): h/w (and c) are taken from the first batch only — this
        // assumes every batch ends up with identical patch dimensions. Confirm
        // this holds for mixed-size inputs.
        const [n, c, h, w] = all_pixel_values[0].dims;

        // Stack pixel values
        let pixel_values;
        let pixel_attention_mask;
        if (batch_size === 1) {
            // Single batch: no padding needed, mask is all-true.
            pixel_values = all_pixel_values[0].unsqueeze_(0);
            pixel_attention_mask = full([batch_size, n, h, w], true);
        } else {
            // Add padding (if necessary) to images with less patches than the maximum number of patches
            const max_num_patches = Math.max(...all_pixel_values.map(x => x.dims.at(0)));

            pixel_attention_mask = full([batch_size, max_num_patches, h, w], true);
            const pixel_attention_mask_data = pixel_attention_mask.data;
            // Flat elements per batch entry in the mask buffer.
            const pixel_attention_mask_stride = max_num_patches * h * w;
            for (let i = 0; i < batch_size; ++i) {
                const num_patches = all_pixel_values[i].dims[0];
                if (num_patches < max_num_patches) {
                    // Zero-pad the pixel tensor up to max_num_patches...
                    all_pixel_values[i] = cat([
                        all_pixel_values[i],
                        full([max_num_patches - num_patches, c, h, w], 0),
                    ], 0);

                    // ...and mark the padded patch positions as false in the mask
                    // by writing directly into the flat underlying buffer.
                    const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
                    const end_offset = (i + 1) * pixel_attention_mask_stride;

                    // @ts-ignore
                    pixel_attention_mask_data.fill(false, start_offset, end_offset);
                }
            }
            pixel_values = stack(all_pixel_values, 0);
        }

        return {
            pixel_values,
            pixel_attention_mask,

            original_sizes,
            reshaped_input_sizes,
            // Grid info is opt-in to keep the default output minimal.
            ...(
                return_row_col_info
                    ? { rows: images_list_rows, cols: images_list_cols }
                    : {}
            ),
        }
    }

    /**
     * Split an image tensor into a grid of patches no larger than
     * `longest_edge` per side, appending a global view as the final frame.
     *
     * If the image already fits within `longest_edge` in both dimensions, no
     * splitting occurs: the returned frames contain only the original tensor
     * and both split counts are 0.
     *
     * @param {Tensor} pixel_values 4D image tensor; last two dims are (height, width).
     * @param {{longest_edge: number}} options Maximum side length for each patch.
     * @returns {Promise<{frames: Tensor[], num_splits_h: number, num_splits_w: number}>}
     *   Patch tensors (grid patches in row-major order, then the global view)
     *   and the grid dimensions.
     */
    async split_image(pixel_values, { longest_edge }) {
        const max_height = longest_edge;
        const max_width = longest_edge;

        const frames = [];

        const [height, width] = pixel_values.dims.slice(-2);

        let num_splits_h = 0, num_splits_w = 0;

        if (height > max_height || width > max_width) {
            // Calculate the number of splits
            num_splits_h = Math.ceil(height / max_height);
            num_splits_w = Math.ceil(width / max_width);

            // Calculate the optimal width and height for the sub-images
            const optimal_height = Math.ceil(height / num_splits_h);
            const optimal_width = Math.ceil(width / num_splits_w);

            // Iterate through each row and column
            for (let r = 0; r < num_splits_h; ++r) {
                for (let c = 0; c < num_splits_w; ++c) {
                    let start_x, start_y, end_x, end_y;
                    // Edge patches are anchored to the image border so they keep
                    // the full optimal size (they may overlap the previous patch
                    // when the dimension doesn't divide evenly).
                    if (r === num_splits_h - 1) { // At bottom
                        start_y = height - optimal_height;
                        end_y = height;
                    } else {
                        start_y = r * optimal_height;
                        end_y = (r + 1) * optimal_height;
                    }
                    if (c === num_splits_w - 1) { // At right
                        start_x = width - optimal_width;
                        end_x = width;
                    } else {
                        start_x = c * optimal_width;
                        end_x = (c + 1) * optimal_width;
                    }

                    const starts = [start_y, start_x];
                    const ends = [end_y, end_x];

                    // Slice along dims 2 and 3 (height, width).
                    const patch = await slice(pixel_values, starts, ends, [2, 3]);
                    frames.push(patch);
                }
            }

            // Resize the global image to match max dimensions for memory efficiency
            const global_image_height = max_height;
            const global_image_width = max_width;

            if (height !== global_image_height || width !== global_image_width) {
                pixel_values = await interpolate_4d(pixel_values, {
                    size: [global_image_height, global_image_width],
                })
            }
        }

        // The (possibly resized) global view is always the last frame.
        frames.push(pixel_values);

        return { frames, num_splits_h, num_splits_w };
    }
}
|