@huggingface/inference 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +162 -69
- package/dist/index.js +162 -69
- package/dist/src/providers/fal-ai.d.ts.map +1 -1
- package/dist/src/providers/replicate.d.ts.map +1 -1
- package/dist/src/tasks/audio/audioClassification.d.ts +4 -18
- package/dist/src/tasks/audio/audioClassification.d.ts.map +1 -1
- package/dist/src/tasks/audio/audioToAudio.d.ts +10 -9
- package/dist/src/tasks/audio/audioToAudio.d.ts.map +1 -1
- package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts +3 -12
- package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts.map +1 -1
- package/dist/src/tasks/audio/textToSpeech.d.ts +4 -8
- package/dist/src/tasks/audio/textToSpeech.d.ts.map +1 -1
- package/dist/src/tasks/audio/utils.d.ts +11 -0
- package/dist/src/tasks/audio/utils.d.ts.map +1 -0
- package/dist/src/tasks/cv/imageClassification.d.ts +3 -17
- package/dist/src/tasks/cv/imageClassification.d.ts.map +1 -1
- package/dist/src/tasks/cv/imageSegmentation.d.ts +3 -21
- package/dist/src/tasks/cv/imageSegmentation.d.ts.map +1 -1
- package/dist/src/tasks/cv/imageToImage.d.ts +3 -49
- package/dist/src/tasks/cv/imageToImage.d.ts.map +1 -1
- package/dist/src/tasks/cv/imageToText.d.ts +3 -12
- package/dist/src/tasks/cv/imageToText.d.ts.map +1 -1
- package/dist/src/tasks/cv/objectDetection.d.ts +3 -26
- package/dist/src/tasks/cv/objectDetection.d.ts.map +1 -1
- package/dist/src/tasks/cv/textToImage.d.ts +3 -38
- package/dist/src/tasks/cv/textToImage.d.ts.map +1 -1
- package/dist/src/tasks/cv/textToVideo.d.ts +6 -0
- package/dist/src/tasks/cv/textToVideo.d.ts.map +1 -0
- package/dist/src/tasks/cv/utils.d.ts +11 -0
- package/dist/src/tasks/cv/utils.d.ts.map +1 -0
- package/dist/src/tasks/cv/zeroShotImageClassification.d.ts +7 -15
- package/dist/src/tasks/cv/zeroShotImageClassification.d.ts.map +1 -1
- package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts +5 -28
- package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts +5 -20
- package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/nlp/fillMask.d.ts +2 -21
- package/dist/src/tasks/nlp/fillMask.d.ts.map +1 -1
- package/dist/src/tasks/nlp/questionAnswering.d.ts +3 -25
- package/dist/src/tasks/nlp/questionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/nlp/sentenceSimilarity.d.ts +2 -13
- package/dist/src/tasks/nlp/sentenceSimilarity.d.ts.map +1 -1
- package/dist/src/tasks/nlp/summarization.d.ts +2 -42
- package/dist/src/tasks/nlp/summarization.d.ts.map +1 -1
- package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts +3 -31
- package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/nlp/textClassification.d.ts +2 -16
- package/dist/src/tasks/nlp/textClassification.d.ts.map +1 -1
- package/dist/src/tasks/nlp/tokenClassification.d.ts +2 -45
- package/dist/src/tasks/nlp/tokenClassification.d.ts.map +1 -1
- package/dist/src/tasks/nlp/translation.d.ts +2 -13
- package/dist/src/tasks/nlp/translation.d.ts.map +1 -1
- package/dist/src/tasks/nlp/zeroShotClassification.d.ts +2 -22
- package/dist/src/tasks/nlp/zeroShotClassification.d.ts.map +1 -1
- package/dist/src/types.d.ts +4 -0
- package/dist/src/types.d.ts.map +1 -1
- package/package.json +2 -2
- package/src/providers/fal-ai.ts +4 -0
- package/src/providers/replicate.ts +3 -0
- package/src/tasks/audio/audioClassification.ts +7 -22
- package/src/tasks/audio/audioToAudio.ts +43 -23
- package/src/tasks/audio/automaticSpeechRecognition.ts +35 -23
- package/src/tasks/audio/textToSpeech.ts +8 -14
- package/src/tasks/audio/utils.ts +18 -0
- package/src/tasks/cv/imageClassification.ts +5 -20
- package/src/tasks/cv/imageSegmentation.ts +5 -24
- package/src/tasks/cv/imageToImage.ts +4 -52
- package/src/tasks/cv/imageToText.ts +6 -15
- package/src/tasks/cv/objectDetection.ts +5 -30
- package/src/tasks/cv/textToImage.ts +14 -50
- package/src/tasks/cv/textToVideo.ts +67 -0
- package/src/tasks/cv/utils.ts +13 -0
- package/src/tasks/cv/zeroShotImageClassification.ts +32 -31
- package/src/tasks/multimodal/documentQuestionAnswering.ts +25 -43
- package/src/tasks/multimodal/visualQuestionAnswering.ts +20 -36
- package/src/tasks/nlp/fillMask.ts +2 -22
- package/src/tasks/nlp/questionAnswering.ts +22 -36
- package/src/tasks/nlp/sentenceSimilarity.ts +12 -15
- package/src/tasks/nlp/summarization.ts +2 -43
- package/src/tasks/nlp/tableQuestionAnswering.ts +25 -41
- package/src/tasks/nlp/textClassification.ts +3 -18
- package/src/tasks/nlp/tokenClassification.ts +2 -47
- package/src/tasks/nlp/translation.ts +3 -17
- package/src/tasks/nlp/zeroShotClassification.ts +2 -24
- package/src/types.ts +7 -1
package/dist/index.cjs
CHANGED
```diff
@@ -119,6 +119,10 @@ var FAL_AI_SUPPORTED_MODEL_IDS = {
   },
   "automatic-speech-recognition": {
     "openai/whisper-large-v3": "fal-ai/whisper"
+  },
+  "text-to-video": {
+    "genmo/mochi-1-preview": "fal-ai/mochi-v1",
+    "tencent/HunyuanVideo": "fal-ai/hunyuan-video"
   }
 };
 
```
```diff
@@ -131,6 +135,9 @@ var REPLICATE_SUPPORTED_MODEL_IDS = {
   },
   "text-to-speech": {
     "OuteAI/OuteTTS-0.3-500M": "jbilcke/oute-tts:39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26"
+  },
+  "text-to-video": {
+    "genmo/mochi-1-preview": "genmoai/mochi-1:1944af04d098ef69bed7f9d335d102e652203f268ec4aaa2d836f6217217e460"
   }
 };
 
```
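Both provider maps now carry a `"text-to-video"` entry, backing the new `textToVideo` task (`src/tasks/cv/textToVideo.ts`, added in this release). A minimal sketch of how a caller might exercise the mapping — the `textToVideo` method name follows the package's task-per-method convention, but the exact option shape here is an assumption, not something this diff confirms:

```ts
import { HfInference } from "@huggingface/inference";

const client = new HfInference("hf_xxx"); // your Hugging Face access token

async function demo(): Promise<Blob> {
  // Assumed call shape: the model id on the left-hand side of the map above
  // is what you pass; the provider-specific id on the right is resolved internally.
  return await client.textToVideo({
    provider: "fal-ai", // or "replicate" for genmoai/mochi-1
    model: "genmo/mochi-1-preview", // mapped to "fal-ai/mochi-v1"
    inputs: "A cat chasing a laser pointer across a wooden floor",
  });
}
```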
```diff
@@ -596,9 +603,42 @@ var InferenceOutputError = class extends TypeError {
   }
 };
 
+// src/utils/pick.ts
+function pick(o, props) {
+  return Object.assign(
+    {},
+    ...props.map((prop) => {
+      if (o[prop] !== void 0) {
+        return { [prop]: o[prop] };
+      }
+    })
+  );
+}
+
+// src/utils/typedInclude.ts
+function typedInclude(arr, v) {
+  return arr.includes(v);
+}
+
+// src/utils/omit.ts
+function omit(o, props) {
+  const propsArr = Array.isArray(props) ? props : [props];
+  const letsKeep = Object.keys(o).filter((prop) => !typedInclude(propsArr, prop));
+  return pick(o, letsKeep);
+}
+
+// src/tasks/audio/utils.ts
+function preparePayload(args) {
+  return "data" in args ? args : {
+    ...omit(args, "inputs"),
+    data: args.inputs
+  };
+}
+
 // src/tasks/audio/audioClassification.ts
 async function audioClassification(args, options) {
-  const res = await request(args, {
+  const payload = preparePayload(args);
+  const res = await request(payload, {
     ...options,
     taskHint: "audio-classification"
   });
```
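The three new utilities are plain object helpers: `pick` copies only the listed keys and skips `undefined` values, `omit` keeps everything else, and `preparePayload` renames the user-facing `inputs` field to the legacy `data` field the Inference API endpoint expects. A typed re-implementation of the same behavior, as a standalone sketch (not the package's exports):

```ts
function pick<T extends object, K extends keyof T>(o: T, props: K[]): Pick<T, K> {
  // Object.assign ignores undefined sources, so keys with undefined values are skipped.
  return Object.assign(
    {},
    ...props.map((prop) => (o[prop] !== undefined ? { [prop]: o[prop] } : undefined))
  );
}

function omit<T extends object, K extends keyof T>(o: T, props: K | K[]): Omit<T, K> {
  const propsArr: (keyof T)[] = Array.isArray(props) ? props : [props];
  const keep = (Object.keys(o) as (keyof T)[]).filter((p) => !propsArr.includes(p));
  return pick(o, keep) as Omit<T, K>;
}

// preparePayload: { inputs, ...rest } becomes { data, ...rest }.
const payload = { ...omit({ inputs: "hello", model: "x" }, "inputs"), data: "hello" };
console.log(payload); // { model: "x", data: "hello" }
```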
```diff
@@ -624,15 +664,8 @@ function base64FromBytes(arr) {
 
 // src/tasks/audio/automaticSpeechRecognition.ts
 async function automaticSpeechRecognition(args, options) {
-  if (args.provider === "fal-ai") {
-    const contentType = args.data instanceof Blob ? args.data.type : "audio/mpeg";
-    const base64audio = base64FromBytes(
-      new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
-    );
-    args.audio_url = `data:${contentType};base64,${base64audio}`;
-    delete args.data;
-  }
-  const res = await request(args, {
+  const payload = await buildPayload(args);
+  const res = await request(payload, {
     ...options,
     taskHint: "automatic-speech-recognition"
   });
```
```diff
@@ -642,6 +675,32 @@ async function automaticSpeechRecognition(args, options) {
   }
   return res;
 }
+var FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
+async function buildPayload(args) {
+  if (args.provider === "fal-ai") {
+    const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : void 0;
+    const contentType = blob?.type;
+    if (!contentType) {
+      throw new Error(
+        `Unable to determine the input's content-type. Make sure your are passing a Blob when using provider fal-ai.`
+      );
+    }
+    if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
+      throw new Error(
+        `Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
+          ", "
+        )}`
+      );
+    }
+    const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
+    return {
+      ..."data" in args ? omit(args, "data") : omit(args, "inputs"),
+      audio_url: `data:${contentType};base64,${base64audio}`
+    };
+  } else {
+    return preparePayload(args);
+  }
+}
 
 // src/tasks/audio/textToSpeech.ts
 async function textToSpeech(args, options) {
```
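For fal-ai, `buildPayload` ships the audio inline as a base64 `data:` URL under `audio_url` instead of posting raw bytes. The conversion in isolation, as a hedged sketch (the `blobToAudioUrl` helper name is mine; the bundle uses its own `base64FromBytes`):

```ts
// Build the `data:<content-type>;base64,<payload>` URL that buildPayload sends to fal-ai.
async function blobToAudioUrl(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer());
  let binary = "";
  for (const byte of bytes) {
    binary += String.fromCharCode(byte); // same role as the bundle's base64FromBytes
  }
  return `data:${blob.type};base64,${btoa(binary)}`;
}

// Only audio/mpeg, audio/mp4, audio/wav and audio/x-wav pass the validation above.
blobToAudioUrl(new Blob([new Uint8Array([1, 2, 3])], { type: "audio/wav" })).then(console.log);
```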
```diff
@@ -649,6 +708,9 @@ async function textToSpeech(args, options) {
     ...options,
     taskHint: "text-to-speech"
   });
+  if (res instanceof Blob) {
+    return res;
+  }
   if (res && typeof res === "object") {
     if ("output" in res) {
       if (typeof res.output === "string") {
```
```diff
@@ -662,31 +724,39 @@ async function textToSpeech(args, options) {
       }
     }
   }
-  const isValidOutput = res && res instanceof Blob;
-  if (!isValidOutput) {
-    throw new InferenceOutputError("Expected Blob");
-  }
-  return res;
+  throw new InferenceOutputError("Expected Blob or object with output");
 }
 
 // src/tasks/audio/audioToAudio.ts
 async function audioToAudio(args, options) {
-  const res = await request(args, {
+  const payload = preparePayload(args);
+  const res = await request(payload, {
     ...options,
     taskHint: "audio-to-audio"
   });
-  const isValidOutput = Array.isArray(res) && res.every(
-    (elem) => typeof elem === "object" && elem && "label" in elem && typeof elem.label === "string" && "content-type" in elem && typeof elem["content-type"] === "string" && "blob" in elem && typeof elem.blob === "string"
-  );
-  if (!isValidOutput) {
-    throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>");
+  return validateOutput(res);
+}
+function validateOutput(output) {
+  if (!Array.isArray(output)) {
+    throw new InferenceOutputError("Expected Array");
   }
-  return res;
+  if (!output.every((elem) => {
+    return typeof elem === "object" && elem && "label" in elem && typeof elem.label === "string" && "content-type" in elem && typeof elem["content-type"] === "string" && "blob" in elem && typeof elem.blob === "string";
+  })) {
+    throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>");
+  }
+  return output;
+}
+
+// src/tasks/cv/utils.ts
+function preparePayload2(args) {
+  return "data" in args ? args : { ...omit(args, "inputs"), data: args.inputs };
 }
 
 // src/tasks/cv/imageClassification.ts
 async function imageClassification(args, options) {
-  const res = await request(args, {
+  const payload = preparePayload2(args);
+  const res = await request(payload, {
     ...options,
     taskHint: "image-classification"
   });
```
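`preparePayload2` is the computer-vision twin of the audio helper: both accept either the documented `{ inputs }` shape or the legacy `{ data }` shape and normalize to `data` before the request goes out. The shared pattern, with simplified types of my own:

```ts
type BinaryInput = Blob | ArrayBuffer;
type LegacyArgs = { data: BinaryInput; model?: string };
type NamedArgs = { inputs: BinaryInput; model?: string };

// Normalize either calling convention to the legacy `data` field.
function preparePayload(args: LegacyArgs | NamedArgs): LegacyArgs {
  if ("data" in args) {
    return args; // already in wire format
  }
  const { inputs, ...rest } = args;
  return { ...rest, data: inputs };
}
```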
```diff
@@ -699,7 +769,8 @@ async function imageClassification(args, options) {
 
 // src/tasks/cv/imageSegmentation.ts
 async function imageSegmentation(args, options) {
-  const res = await request(args, {
+  const payload = preparePayload2(args);
+  const res = await request(payload, {
     ...options,
     taskHint: "image-segmentation"
   });
```
```diff
@@ -712,7 +783,8 @@ async function imageSegmentation(args, options) {
 
 // src/tasks/cv/imageToText.ts
 async function imageToText(args, options) {
-  const res = (await request(args, {
+  const payload = preparePayload2(args);
+  const res = (await request(payload, {
     ...options,
     taskHint: "image-to-text"
   }))?.[0];
```
```diff
@@ -724,7 +796,8 @@ async function imageToText(args, options) {
 
 // src/tasks/cv/objectDetection.ts
 async function objectDetection(args, options) {
-  const res = await request(args, {
+  const payload = preparePayload2(args);
+  const res = await request(payload, {
     ...options,
     taskHint: "object-detection"
   });
```
```diff
@@ -741,15 +814,13 @@ async function objectDetection(args, options) {
 
 // src/tasks/cv/textToImage.ts
 async function textToImage(args, options) {
-  if (args.provider === "together" || args.provider === "fal-ai" || args.provider === "replicate") {
-    args.prompt = args.inputs;
-    delete args.inputs;
-    args.response_format = "base64";
-    if (args.provider === "replicate") {
-      delete args.response_format;
-    }
-  }
-  const res = await request(args, {
+  const payload = args.provider === "together" || args.provider === "fal-ai" || args.provider === "replicate" ? {
+    ...omit(args, ["inputs", "parameters"]),
+    ...args.parameters,
+    ...args.provider !== "replicate" ? { response_format: "base64" } : void 0,
+    prompt: args.inputs
+  } : args;
+  const res = await request(payload, {
     ...options,
     taskHint: "text-to-image"
   });
```
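`textToImage` no longer mutates `args`; for together, fal-ai and replicate it assembles a flat provider payload in a single expression. The same mapping written out as a standalone function (the types are a simplification of mine, not the package's):

```ts
type TextToImageArgs = {
  provider?: "together" | "fal-ai" | "replicate" | string;
  model?: string;
  inputs: string;
  parameters?: Record<string, unknown>;
};

// Flatten { inputs, parameters } into the provider's { prompt, ...parameters } shape.
function toProviderPayload(args: TextToImageArgs): Record<string, unknown> {
  const { inputs, parameters, ...rest } = args;
  return {
    ...rest,
    ...parameters,
    // replicate returns binary output; the other providers are asked for base64
    ...(args.provider !== "replicate" ? { response_format: "base64" } : undefined),
    prompt: inputs,
  };
}

console.log(toProviderPayload({ provider: "fal-ai", inputs: "an astronaut", parameters: { seed: 42 } }));
// { provider: "fal-ai", seed: 42, response_format: "base64", prompt: "an astronaut" }
```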
```diff
@@ -806,18 +877,30 @@ async function imageToImage(args, options) {
 }
 
 // src/tasks/cv/zeroShotImageClassification.ts
-async function zeroShotImageClassification(args, options) {
-  const reqArgs = {
-    ...args,
-    inputs: {
-      image: base64FromBytes(
-        new Uint8Array(
-          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+async function preparePayload3(args) {
+  if (args.inputs instanceof Blob) {
+    return {
+      ...args,
+      inputs: {
+        image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer()))
+      }
+    };
+  } else {
+    return {
+      ...args,
+      inputs: {
+        image: base64FromBytes(
+          new Uint8Array(
+            args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+          )
         )
-      )
-    }
-  };
-  const res = await request(reqArgs, {
+      }
+    };
+  }
+}
+async function zeroShotImageClassification(args, options) {
+  const payload = await preparePayload3(args);
+  const res = await request(payload, {
     ...options,
     taskHint: "zero-shot-image-classification"
   });
```
```diff
@@ -906,17 +989,19 @@ async function questionAnswering(args, options) {
     ...options,
     taskHint: "question-answering"
   });
-  const isValidOutput = typeof res === "object" && !!res && typeof res.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
+  const isValidOutput = Array.isArray(res) ? res.every(
+    (elem) => typeof elem === "object" && !!elem && typeof elem.answer === "string" && typeof elem.end === "number" && typeof elem.score === "number" && typeof elem.start === "number"
+  ) : typeof res === "object" && !!res && typeof res.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
   if (!isValidOutput) {
-    throw new InferenceOutputError("Expected {answer: string, end: number, score: number, start: number}");
+    throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
   }
-  return res;
+  return Array.isArray(res) ? res[0] : res;
 }
 
 // src/tasks/nlp/sentenceSimilarity.ts
 async function sentenceSimilarity(args, options) {
   const defaultTask = args.model ? await getDefaultTask(args.model, args.accessToken, options) : void 0;
-  const res = await request(args, {
+  const res = await request(prepareInput(args), {
     ...options,
     taskHint: "sentence-similarity",
     ...defaultTask === "feature-extraction" && { forceTask: "sentence-similarity" }
```
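`questionAnswering` (and `tableQuestionAnswering` below) now tolerate the server returning either a bare object or a one-element array, and always hand a single element back to the caller. The normalization pattern in isolation:

```ts
// Accept `T | T[]` from the API and always return a single element.
function firstOrSelf<T>(res: T | T[]): T {
  return Array.isArray(res) ? res[0] : res;
}

const fromObject = firstOrSelf({ answer: "Paris", score: 0.98, start: 0, end: 5 });
const fromArray = firstOrSelf([{ answer: "Paris", score: 0.98, start: 0, end: 5 }]);
// Both calls yield the same { answer, score, start, end } shape.
```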
```diff
@@ -927,6 +1012,13 @@ async function sentenceSimilarity(args, options) {
   }
   return res;
 }
+function prepareInput(args) {
+  return {
+    ...omit(args, ["inputs", "parameters"]),
+    inputs: { ...omit(args.inputs, "sourceSentence") },
+    parameters: { source_sentence: args.inputs.sourceSentence, ...args.parameters }
+  };
+}
 
 // src/tasks/nlp/summarization.ts
 async function summarization(args, options) {
```
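`prepareInput` lifts `sourceSentence` out of `inputs` and into `parameters.source_sentence`, matching the field naming the serverless API expects, without mutating the caller's object. Roughly, with simplified types of my own:

```ts
type SentenceSimilarityArgs = {
  model?: string;
  inputs: { sourceSentence: string; sentences: string[] };
  parameters?: Record<string, unknown>;
};

function prepareInput(args: SentenceSimilarityArgs) {
  const { inputs, parameters, ...rest } = args;
  const { sourceSentence, ...restInputs } = inputs;
  return {
    ...rest,
    inputs: restInputs, // { sentences: [...] }
    parameters: { source_sentence: sourceSentence, ...parameters },
  };
}

console.log(prepareInput({
  inputs: { sourceSentence: "That is happy", sentences: ["That is glad", "Dogs bark"] },
}));
// { inputs: { sentences: [...] }, parameters: { source_sentence: "That is happy" } }
```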
```diff
@@ -947,13 +1039,18 @@ async function tableQuestionAnswering(args, options) {
     ...options,
     taskHint: "table-question-answering"
   });
-  const isValidOutput = typeof res === "object" && !!res && "aggregator" in res && typeof res.aggregator === "string" && "answer" in res && typeof res.answer === "string" && "cells" in res && Array.isArray(res.cells) && res.cells.every((x) => typeof x === "string") && "coordinates" in res && Array.isArray(res.coordinates) && res.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number"));
+  const isValidOutput = Array.isArray(res) ? res.every((elem) => validate(elem)) : validate(res);
   if (!isValidOutput) {
     throw new InferenceOutputError(
       "Expected {aggregator: string, answer: string, cells: string[], coordinates: number[][]}"
     );
   }
-  return res;
+  return Array.isArray(res) ? res[0] : res;
+}
+function validate(elem) {
+  return typeof elem === "object" && !!elem && "aggregator" in elem && typeof elem.aggregator === "string" && "answer" in elem && typeof elem.answer === "string" && "cells" in elem && Array.isArray(elem.cells) && elem.cells.every((x) => typeof x === "string") && "coordinates" in elem && Array.isArray(elem.coordinates) && elem.coordinates.every(
+    (coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")
+  );
 }
 
 // src/tasks/nlp/textClassification.ts
```
```diff
@@ -1096,11 +1193,7 @@ async function documentQuestionAnswering(args, options) {
     inputs: {
       question: args.inputs.question,
       // convert Blob or ArrayBuffer to base64
-      image: base64FromBytes(
-        new Uint8Array(
-          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
-        )
-      )
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
     }
   };
   const res = toArray(
```
```diff
@@ -1108,12 +1201,14 @@ async function documentQuestionAnswering(args, options) {
     ...options,
     taskHint: "document-question-answering"
   })
-  )?.[0];
-  const isValidOutput = typeof res?.answer === "string" && (typeof res.end === "number" || typeof res.end === "undefined") && (typeof res.score === "number" || typeof res.score === "undefined") && (typeof res.start === "number" || typeof res.start === "undefined");
+  );
+  const isValidOutput = Array.isArray(res) && res.every(
+    (elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && (typeof elem.end === "number" || typeof elem.end === "undefined") && (typeof elem.score === "number" || typeof elem.score === "undefined") && (typeof elem.start === "number" || typeof elem.start === "undefined")
+  );
   if (!isValidOutput) {
     throw new InferenceOutputError("Expected Array<{answer: string, end?: number, score?: number, start?: number}>");
   }
-  return res;
+  return res[0];
 }
 
 // src/tasks/multimodal/visualQuestionAnswering.ts
```
```diff
@@ -1123,22 +1218,20 @@ async function visualQuestionAnswering(args, options) {
     inputs: {
       question: args.inputs.question,
       // convert Blob or ArrayBuffer to base64
-      image: base64FromBytes(
-        new Uint8Array(
-          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
-        )
-      )
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
     }
   };
-  const res = (await request(reqArgs, {
+  const res = await request(reqArgs, {
     ...options,
     taskHint: "visual-question-answering"
-  }))?.[0];
-  const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+  });
+  const isValidOutput = Array.isArray(res) && res.every(
+    (elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && typeof elem.score === "number"
+  );
   if (!isValidOutput) {
     throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
   }
-  return res;
+  return res[0];
 }
 
 // src/tasks/tabular/tabularRegression.ts
```