@huggingface/inference 3.0.1 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +162 -69
- package/dist/index.js +162 -69
- package/dist/src/providers/fal-ai.d.ts.map +1 -1
- package/dist/src/providers/replicate.d.ts.map +1 -1
- package/dist/src/tasks/audio/audioClassification.d.ts +4 -18
- package/dist/src/tasks/audio/audioClassification.d.ts.map +1 -1
- package/dist/src/tasks/audio/audioToAudio.d.ts +10 -9
- package/dist/src/tasks/audio/audioToAudio.d.ts.map +1 -1
- package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts +3 -12
- package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts.map +1 -1
- package/dist/src/tasks/audio/textToSpeech.d.ts +4 -8
- package/dist/src/tasks/audio/textToSpeech.d.ts.map +1 -1
- package/dist/src/tasks/audio/utils.d.ts +11 -0
- package/dist/src/tasks/audio/utils.d.ts.map +1 -0
- package/dist/src/tasks/cv/imageClassification.d.ts +3 -17
- package/dist/src/tasks/cv/imageClassification.d.ts.map +1 -1
- package/dist/src/tasks/cv/imageSegmentation.d.ts +3 -21
- package/dist/src/tasks/cv/imageSegmentation.d.ts.map +1 -1
- package/dist/src/tasks/cv/imageToImage.d.ts +3 -49
- package/dist/src/tasks/cv/imageToImage.d.ts.map +1 -1
- package/dist/src/tasks/cv/imageToText.d.ts +3 -12
- package/dist/src/tasks/cv/imageToText.d.ts.map +1 -1
- package/dist/src/tasks/cv/objectDetection.d.ts +3 -26
- package/dist/src/tasks/cv/objectDetection.d.ts.map +1 -1
- package/dist/src/tasks/cv/textToImage.d.ts +3 -38
- package/dist/src/tasks/cv/textToImage.d.ts.map +1 -1
- package/dist/src/tasks/cv/textToVideo.d.ts +6 -0
- package/dist/src/tasks/cv/textToVideo.d.ts.map +1 -0
- package/dist/src/tasks/cv/utils.d.ts +11 -0
- package/dist/src/tasks/cv/utils.d.ts.map +1 -0
- package/dist/src/tasks/cv/zeroShotImageClassification.d.ts +7 -15
- package/dist/src/tasks/cv/zeroShotImageClassification.d.ts.map +1 -1
- package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts +5 -28
- package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts +5 -20
- package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/nlp/fillMask.d.ts +2 -21
- package/dist/src/tasks/nlp/fillMask.d.ts.map +1 -1
- package/dist/src/tasks/nlp/questionAnswering.d.ts +3 -25
- package/dist/src/tasks/nlp/questionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/nlp/sentenceSimilarity.d.ts +2 -13
- package/dist/src/tasks/nlp/sentenceSimilarity.d.ts.map +1 -1
- package/dist/src/tasks/nlp/summarization.d.ts +2 -42
- package/dist/src/tasks/nlp/summarization.d.ts.map +1 -1
- package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts +3 -31
- package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts.map +1 -1
- package/dist/src/tasks/nlp/textClassification.d.ts +2 -16
- package/dist/src/tasks/nlp/textClassification.d.ts.map +1 -1
- package/dist/src/tasks/nlp/tokenClassification.d.ts +2 -45
- package/dist/src/tasks/nlp/tokenClassification.d.ts.map +1 -1
- package/dist/src/tasks/nlp/translation.d.ts +2 -13
- package/dist/src/tasks/nlp/translation.d.ts.map +1 -1
- package/dist/src/tasks/nlp/zeroShotClassification.d.ts +2 -22
- package/dist/src/tasks/nlp/zeroShotClassification.d.ts.map +1 -1
- package/dist/src/types.d.ts +4 -0
- package/dist/src/types.d.ts.map +1 -1
- package/package.json +2 -2
- package/src/providers/fal-ai.ts +4 -0
- package/src/providers/replicate.ts +3 -0
- package/src/tasks/audio/audioClassification.ts +7 -22
- package/src/tasks/audio/audioToAudio.ts +43 -23
- package/src/tasks/audio/automaticSpeechRecognition.ts +35 -23
- package/src/tasks/audio/textToSpeech.ts +8 -14
- package/src/tasks/audio/utils.ts +18 -0
- package/src/tasks/cv/imageClassification.ts +5 -20
- package/src/tasks/cv/imageSegmentation.ts +5 -24
- package/src/tasks/cv/imageToImage.ts +4 -52
- package/src/tasks/cv/imageToText.ts +6 -15
- package/src/tasks/cv/objectDetection.ts +5 -30
- package/src/tasks/cv/textToImage.ts +14 -50
- package/src/tasks/cv/textToVideo.ts +67 -0
- package/src/tasks/cv/utils.ts +13 -0
- package/src/tasks/cv/zeroShotImageClassification.ts +32 -31
- package/src/tasks/multimodal/documentQuestionAnswering.ts +25 -43
- package/src/tasks/multimodal/visualQuestionAnswering.ts +20 -36
- package/src/tasks/nlp/fillMask.ts +2 -22
- package/src/tasks/nlp/questionAnswering.ts +22 -36
- package/src/tasks/nlp/sentenceSimilarity.ts +12 -15
- package/src/tasks/nlp/summarization.ts +2 -43
- package/src/tasks/nlp/tableQuestionAnswering.ts +25 -41
- package/src/tasks/nlp/textClassification.ts +3 -18
- package/src/tasks/nlp/tokenClassification.ts +2 -47
- package/src/tasks/nlp/translation.ts +3 -17
- package/src/tasks/nlp/zeroShotClassification.ts +2 -24
- package/src/types.ts +7 -1
package/dist/index.js
CHANGED
|
@@ -61,6 +61,10 @@ var FAL_AI_SUPPORTED_MODEL_IDS = {
|
|
|
61
61
|
},
|
|
62
62
|
"automatic-speech-recognition": {
|
|
63
63
|
"openai/whisper-large-v3": "fal-ai/whisper"
|
|
64
|
+
},
|
|
65
|
+
"text-to-video": {
|
|
66
|
+
"genmo/mochi-1-preview": "fal-ai/mochi-v1",
|
|
67
|
+
"tencent/HunyuanVideo": "fal-ai/hunyuan-video"
|
|
64
68
|
}
|
|
65
69
|
};
|
|
66
70
|
|
|
@@ -73,6 +77,9 @@ var REPLICATE_SUPPORTED_MODEL_IDS = {
|
|
|
73
77
|
},
|
|
74
78
|
"text-to-speech": {
|
|
75
79
|
"OuteAI/OuteTTS-0.3-500M": "jbilcke/oute-tts:39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26"
|
|
80
|
+
},
|
|
81
|
+
"text-to-video": {
|
|
82
|
+
"genmo/mochi-1-preview": "genmoai/mochi-1:1944af04d098ef69bed7f9d335d102e652203f268ec4aaa2d836f6217217e460"
|
|
76
83
|
}
|
|
77
84
|
};
|
|
78
85
|
|
|
@@ -538,9 +545,42 @@ var InferenceOutputError = class extends TypeError {
|
|
|
538
545
|
}
|
|
539
546
|
};
|
|
540
547
|
|
|
548
|
+
// src/utils/pick.ts
|
|
549
|
+
function pick(o, props) {
|
|
550
|
+
return Object.assign(
|
|
551
|
+
{},
|
|
552
|
+
...props.map((prop) => {
|
|
553
|
+
if (o[prop] !== void 0) {
|
|
554
|
+
return { [prop]: o[prop] };
|
|
555
|
+
}
|
|
556
|
+
})
|
|
557
|
+
);
|
|
558
|
+
}
|
|
559
|
+
|
|
560
|
+
// src/utils/typedInclude.ts
|
|
561
|
+
function typedInclude(arr, v) {
|
|
562
|
+
return arr.includes(v);
|
|
563
|
+
}
|
|
564
|
+
|
|
565
|
+
// src/utils/omit.ts
|
|
566
|
+
function omit(o, props) {
|
|
567
|
+
const propsArr = Array.isArray(props) ? props : [props];
|
|
568
|
+
const letsKeep = Object.keys(o).filter((prop) => !typedInclude(propsArr, prop));
|
|
569
|
+
return pick(o, letsKeep);
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
// src/tasks/audio/utils.ts
|
|
573
|
+
function preparePayload(args) {
|
|
574
|
+
return "data" in args ? args : {
|
|
575
|
+
...omit(args, "inputs"),
|
|
576
|
+
data: args.inputs
|
|
577
|
+
};
|
|
578
|
+
}
|
|
579
|
+
|
|
541
580
|
// src/tasks/audio/audioClassification.ts
|
|
542
581
|
async function audioClassification(args, options) {
|
|
543
|
-
const
|
|
582
|
+
const payload = preparePayload(args);
|
|
583
|
+
const res = await request(payload, {
|
|
544
584
|
...options,
|
|
545
585
|
taskHint: "audio-classification"
|
|
546
586
|
});
|
|
@@ -566,15 +606,8 @@ function base64FromBytes(arr) {
|
|
|
566
606
|
|
|
567
607
|
// src/tasks/audio/automaticSpeechRecognition.ts
|
|
568
608
|
async function automaticSpeechRecognition(args, options) {
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
const base64audio = base64FromBytes(
|
|
572
|
-
new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
|
|
573
|
-
);
|
|
574
|
-
args.audio_url = `data:${contentType};base64,${base64audio}`;
|
|
575
|
-
delete args.data;
|
|
576
|
-
}
|
|
577
|
-
const res = await request(args, {
|
|
609
|
+
const payload = await buildPayload(args);
|
|
610
|
+
const res = await request(payload, {
|
|
578
611
|
...options,
|
|
579
612
|
taskHint: "automatic-speech-recognition"
|
|
580
613
|
});
|
|
@@ -584,6 +617,32 @@ async function automaticSpeechRecognition(args, options) {
|
|
|
584
617
|
}
|
|
585
618
|
return res;
|
|
586
619
|
}
|
|
620
|
+
var FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
|
|
621
|
+
async function buildPayload(args) {
|
|
622
|
+
if (args.provider === "fal-ai") {
|
|
623
|
+
const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : void 0;
|
|
624
|
+
const contentType = blob?.type;
|
|
625
|
+
if (!contentType) {
|
|
626
|
+
throw new Error(
|
|
627
|
+
`Unable to determine the input's content-type. Make sure your are passing a Blob when using provider fal-ai.`
|
|
628
|
+
);
|
|
629
|
+
}
|
|
630
|
+
if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
|
|
631
|
+
throw new Error(
|
|
632
|
+
`Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
|
|
633
|
+
", "
|
|
634
|
+
)}`
|
|
635
|
+
);
|
|
636
|
+
}
|
|
637
|
+
const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
|
|
638
|
+
return {
|
|
639
|
+
..."data" in args ? omit(args, "data") : omit(args, "inputs"),
|
|
640
|
+
audio_url: `data:${contentType};base64,${base64audio}`
|
|
641
|
+
};
|
|
642
|
+
} else {
|
|
643
|
+
return preparePayload(args);
|
|
644
|
+
}
|
|
645
|
+
}
|
|
587
646
|
|
|
588
647
|
// src/tasks/audio/textToSpeech.ts
|
|
589
648
|
async function textToSpeech(args, options) {
|
|
@@ -591,6 +650,9 @@ async function textToSpeech(args, options) {
|
|
|
591
650
|
...options,
|
|
592
651
|
taskHint: "text-to-speech"
|
|
593
652
|
});
|
|
653
|
+
if (res instanceof Blob) {
|
|
654
|
+
return res;
|
|
655
|
+
}
|
|
594
656
|
if (res && typeof res === "object") {
|
|
595
657
|
if ("output" in res) {
|
|
596
658
|
if (typeof res.output === "string") {
|
|
@@ -604,31 +666,39 @@ async function textToSpeech(args, options) {
|
|
|
604
666
|
}
|
|
605
667
|
}
|
|
606
668
|
}
|
|
607
|
-
|
|
608
|
-
if (!isValidOutput) {
|
|
609
|
-
throw new InferenceOutputError("Expected Blob");
|
|
610
|
-
}
|
|
611
|
-
return res;
|
|
669
|
+
throw new InferenceOutputError("Expected Blob or object with output");
|
|
612
670
|
}
|
|
613
671
|
|
|
614
672
|
// src/tasks/audio/audioToAudio.ts
|
|
615
673
|
async function audioToAudio(args, options) {
|
|
616
|
-
const
|
|
674
|
+
const payload = preparePayload(args);
|
|
675
|
+
const res = await request(payload, {
|
|
617
676
|
...options,
|
|
618
677
|
taskHint: "audio-to-audio"
|
|
619
678
|
});
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
if (!
|
|
624
|
-
throw new InferenceOutputError("Expected Array
|
|
679
|
+
return validateOutput(res);
|
|
680
|
+
}
|
|
681
|
+
function validateOutput(output) {
|
|
682
|
+
if (!Array.isArray(output)) {
|
|
683
|
+
throw new InferenceOutputError("Expected Array");
|
|
625
684
|
}
|
|
626
|
-
|
|
685
|
+
if (!output.every((elem) => {
|
|
686
|
+
return typeof elem === "object" && elem && "label" in elem && typeof elem.label === "string" && "content-type" in elem && typeof elem["content-type"] === "string" && "blob" in elem && typeof elem.blob === "string";
|
|
687
|
+
})) {
|
|
688
|
+
throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>");
|
|
689
|
+
}
|
|
690
|
+
return output;
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
// src/tasks/cv/utils.ts
|
|
694
|
+
function preparePayload2(args) {
|
|
695
|
+
return "data" in args ? args : { ...omit(args, "inputs"), data: args.inputs };
|
|
627
696
|
}
|
|
628
697
|
|
|
629
698
|
// src/tasks/cv/imageClassification.ts
|
|
630
699
|
async function imageClassification(args, options) {
|
|
631
|
-
const
|
|
700
|
+
const payload = preparePayload2(args);
|
|
701
|
+
const res = await request(payload, {
|
|
632
702
|
...options,
|
|
633
703
|
taskHint: "image-classification"
|
|
634
704
|
});
|
|
@@ -641,7 +711,8 @@ async function imageClassification(args, options) {
|
|
|
641
711
|
|
|
642
712
|
// src/tasks/cv/imageSegmentation.ts
|
|
643
713
|
async function imageSegmentation(args, options) {
|
|
644
|
-
const
|
|
714
|
+
const payload = preparePayload2(args);
|
|
715
|
+
const res = await request(payload, {
|
|
645
716
|
...options,
|
|
646
717
|
taskHint: "image-segmentation"
|
|
647
718
|
});
|
|
@@ -654,7 +725,8 @@ async function imageSegmentation(args, options) {
|
|
|
654
725
|
|
|
655
726
|
// src/tasks/cv/imageToText.ts
|
|
656
727
|
async function imageToText(args, options) {
|
|
657
|
-
const
|
|
728
|
+
const payload = preparePayload2(args);
|
|
729
|
+
const res = (await request(payload, {
|
|
658
730
|
...options,
|
|
659
731
|
taskHint: "image-to-text"
|
|
660
732
|
}))?.[0];
|
|
@@ -666,7 +738,8 @@ async function imageToText(args, options) {
|
|
|
666
738
|
|
|
667
739
|
// src/tasks/cv/objectDetection.ts
|
|
668
740
|
async function objectDetection(args, options) {
|
|
669
|
-
const
|
|
741
|
+
const payload = preparePayload2(args);
|
|
742
|
+
const res = await request(payload, {
|
|
670
743
|
...options,
|
|
671
744
|
taskHint: "object-detection"
|
|
672
745
|
});
|
|
@@ -683,15 +756,13 @@ async function objectDetection(args, options) {
|
|
|
683
756
|
|
|
684
757
|
// src/tasks/cv/textToImage.ts
|
|
685
758
|
async function textToImage(args, options) {
|
|
686
|
-
|
|
687
|
-
args
|
|
688
|
-
|
|
689
|
-
args.
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
}
|
|
694
|
-
const res = await request(args, {
|
|
759
|
+
const payload = args.provider === "together" || args.provider === "fal-ai" || args.provider === "replicate" ? {
|
|
760
|
+
...omit(args, ["inputs", "parameters"]),
|
|
761
|
+
...args.parameters,
|
|
762
|
+
...args.provider !== "replicate" ? { response_format: "base64" } : void 0,
|
|
763
|
+
prompt: args.inputs
|
|
764
|
+
} : args;
|
|
765
|
+
const res = await request(payload, {
|
|
695
766
|
...options,
|
|
696
767
|
taskHint: "text-to-image"
|
|
697
768
|
});
|
|
@@ -748,18 +819,30 @@ async function imageToImage(args, options) {
|
|
|
748
819
|
}
|
|
749
820
|
|
|
750
821
|
// src/tasks/cv/zeroShotImageClassification.ts
|
|
751
|
-
async function
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
new Uint8Array(
|
|
757
|
-
|
|
822
|
+
async function preparePayload3(args) {
|
|
823
|
+
if (args.inputs instanceof Blob) {
|
|
824
|
+
return {
|
|
825
|
+
...args,
|
|
826
|
+
inputs: {
|
|
827
|
+
image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer()))
|
|
828
|
+
}
|
|
829
|
+
};
|
|
830
|
+
} else {
|
|
831
|
+
return {
|
|
832
|
+
...args,
|
|
833
|
+
inputs: {
|
|
834
|
+
image: base64FromBytes(
|
|
835
|
+
new Uint8Array(
|
|
836
|
+
args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
|
|
837
|
+
)
|
|
758
838
|
)
|
|
759
|
-
|
|
760
|
-
}
|
|
761
|
-
}
|
|
762
|
-
|
|
839
|
+
}
|
|
840
|
+
};
|
|
841
|
+
}
|
|
842
|
+
}
|
|
843
|
+
async function zeroShotImageClassification(args, options) {
|
|
844
|
+
const payload = await preparePayload3(args);
|
|
845
|
+
const res = await request(payload, {
|
|
763
846
|
...options,
|
|
764
847
|
taskHint: "zero-shot-image-classification"
|
|
765
848
|
});
|
|
@@ -848,17 +931,19 @@ async function questionAnswering(args, options) {
|
|
|
848
931
|
...options,
|
|
849
932
|
taskHint: "question-answering"
|
|
850
933
|
});
|
|
851
|
-
const isValidOutput =
|
|
934
|
+
const isValidOutput = Array.isArray(res) ? res.every(
|
|
935
|
+
(elem) => typeof elem === "object" && !!elem && typeof elem.answer === "string" && typeof elem.end === "number" && typeof elem.score === "number" && typeof elem.start === "number"
|
|
936
|
+
) : typeof res === "object" && !!res && typeof res.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
|
|
852
937
|
if (!isValidOutput) {
|
|
853
|
-
throw new InferenceOutputError("Expected {answer: string, end: number, score: number, start: number}");
|
|
938
|
+
throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
|
|
854
939
|
}
|
|
855
|
-
return res;
|
|
940
|
+
return Array.isArray(res) ? res[0] : res;
|
|
856
941
|
}
|
|
857
942
|
|
|
858
943
|
// src/tasks/nlp/sentenceSimilarity.ts
|
|
859
944
|
async function sentenceSimilarity(args, options) {
|
|
860
945
|
const defaultTask = args.model ? await getDefaultTask(args.model, args.accessToken, options) : void 0;
|
|
861
|
-
const res = await request(args, {
|
|
946
|
+
const res = await request(prepareInput(args), {
|
|
862
947
|
...options,
|
|
863
948
|
taskHint: "sentence-similarity",
|
|
864
949
|
...defaultTask === "feature-extraction" && { forceTask: "sentence-similarity" }
|
|
@@ -869,6 +954,13 @@ async function sentenceSimilarity(args, options) {
|
|
|
869
954
|
}
|
|
870
955
|
return res;
|
|
871
956
|
}
|
|
957
|
+
function prepareInput(args) {
|
|
958
|
+
return {
|
|
959
|
+
...omit(args, ["inputs", "parameters"]),
|
|
960
|
+
inputs: { ...omit(args.inputs, "sourceSentence") },
|
|
961
|
+
parameters: { source_sentence: args.inputs.sourceSentence, ...args.parameters }
|
|
962
|
+
};
|
|
963
|
+
}
|
|
872
964
|
|
|
873
965
|
// src/tasks/nlp/summarization.ts
|
|
874
966
|
async function summarization(args, options) {
|
|
@@ -889,13 +981,18 @@ async function tableQuestionAnswering(args, options) {
|
|
|
889
981
|
...options,
|
|
890
982
|
taskHint: "table-question-answering"
|
|
891
983
|
});
|
|
892
|
-
const isValidOutput =
|
|
984
|
+
const isValidOutput = Array.isArray(res) ? res.every((elem) => validate(elem)) : validate(res);
|
|
893
985
|
if (!isValidOutput) {
|
|
894
986
|
throw new InferenceOutputError(
|
|
895
987
|
"Expected {aggregator: string, answer: string, cells: string[], coordinates: number[][]}"
|
|
896
988
|
);
|
|
897
989
|
}
|
|
898
|
-
return res;
|
|
990
|
+
return Array.isArray(res) ? res[0] : res;
|
|
991
|
+
}
|
|
992
|
+
function validate(elem) {
|
|
993
|
+
return typeof elem === "object" && !!elem && "aggregator" in elem && typeof elem.aggregator === "string" && "answer" in elem && typeof elem.answer === "string" && "cells" in elem && Array.isArray(elem.cells) && elem.cells.every((x) => typeof x === "string") && "coordinates" in elem && Array.isArray(elem.coordinates) && elem.coordinates.every(
|
|
994
|
+
(coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")
|
|
995
|
+
);
|
|
899
996
|
}
|
|
900
997
|
|
|
901
998
|
// src/tasks/nlp/textClassification.ts
|
|
@@ -1038,11 +1135,7 @@ async function documentQuestionAnswering(args, options) {
|
|
|
1038
1135
|
inputs: {
|
|
1039
1136
|
question: args.inputs.question,
|
|
1040
1137
|
// convert Blob or ArrayBuffer to base64
|
|
1041
|
-
image: base64FromBytes(
|
|
1042
|
-
new Uint8Array(
|
|
1043
|
-
args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
|
|
1044
|
-
)
|
|
1045
|
-
)
|
|
1138
|
+
image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
|
|
1046
1139
|
}
|
|
1047
1140
|
};
|
|
1048
1141
|
const res = toArray(
|
|
@@ -1050,12 +1143,14 @@ async function documentQuestionAnswering(args, options) {
|
|
|
1050
1143
|
...options,
|
|
1051
1144
|
taskHint: "document-question-answering"
|
|
1052
1145
|
})
|
|
1053
|
-
)
|
|
1054
|
-
const isValidOutput =
|
|
1146
|
+
);
|
|
1147
|
+
const isValidOutput = Array.isArray(res) && res.every(
|
|
1148
|
+
(elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && (typeof elem.end === "number" || typeof elem.end === "undefined") && (typeof elem.score === "number" || typeof elem.score === "undefined") && (typeof elem.start === "number" || typeof elem.start === "undefined")
|
|
1149
|
+
);
|
|
1055
1150
|
if (!isValidOutput) {
|
|
1056
1151
|
throw new InferenceOutputError("Expected Array<{answer: string, end?: number, score?: number, start?: number}>");
|
|
1057
1152
|
}
|
|
1058
|
-
return res;
|
|
1153
|
+
return res[0];
|
|
1059
1154
|
}
|
|
1060
1155
|
|
|
1061
1156
|
// src/tasks/multimodal/visualQuestionAnswering.ts
|
|
@@ -1065,22 +1160,20 @@ async function visualQuestionAnswering(args, options) {
|
|
|
1065
1160
|
inputs: {
|
|
1066
1161
|
question: args.inputs.question,
|
|
1067
1162
|
// convert Blob or ArrayBuffer to base64
|
|
1068
|
-
image: base64FromBytes(
|
|
1069
|
-
new Uint8Array(
|
|
1070
|
-
args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
|
|
1071
|
-
)
|
|
1072
|
-
)
|
|
1163
|
+
image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
|
|
1073
1164
|
}
|
|
1074
1165
|
};
|
|
1075
|
-
const res =
|
|
1166
|
+
const res = await request(reqArgs, {
|
|
1076
1167
|
...options,
|
|
1077
1168
|
taskHint: "visual-question-answering"
|
|
1078
|
-
})
|
|
1079
|
-
const isValidOutput =
|
|
1169
|
+
});
|
|
1170
|
+
const isValidOutput = Array.isArray(res) && res.every(
|
|
1171
|
+
(elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && typeof elem.score === "number"
|
|
1172
|
+
);
|
|
1080
1173
|
if (!isValidOutput) {
|
|
1081
1174
|
throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
|
|
1082
1175
|
}
|
|
1083
|
-
return res;
|
|
1176
|
+
return res[0];
|
|
1084
1177
|
}
|
|
1085
1178
|
|
|
1086
1179
|
// src/tasks/tabular/tabularRegression.ts
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"fal-ai.d.ts","sourceRoot":"","sources":["../../../src/providers/fal-ai.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,mBAAmB,oBAAoB,CAAC;AAErD,KAAK,OAAO,GAAG,MAAM,CAAC;AAEtB,eAAO,MAAM,0BAA0B,EAAE,eAAe,CAAC,OAAO,
|
|
1
|
+
{"version":3,"file":"fal-ai.d.ts","sourceRoot":"","sources":["../../../src/providers/fal-ai.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,mBAAmB,oBAAoB,CAAC;AAErD,KAAK,OAAO,GAAG,MAAM,CAAC;AAEtB,eAAO,MAAM,0BAA0B,EAAE,eAAe,CAAC,OAAO,CAoB/D,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"replicate.d.ts","sourceRoot":"","sources":["../../../src/providers/replicate.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,sBAAsB,8BAA8B,CAAC;AAElE,KAAK,WAAW,GAAG,MAAM,CAAC;AAE1B,eAAO,MAAM,6BAA6B,EAAE,eAAe,CAAC,WAAW,
|
|
1
|
+
{"version":3,"file":"replicate.d.ts","sourceRoot":"","sources":["../../../src/providers/replicate.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,sBAAsB,8BAA8B,CAAC;AAElE,KAAK,WAAW,GAAG,MAAM,CAAC;AAE1B,eAAO,MAAM,6BAA6B,EAAE,eAAe,CAAC,WAAW,CAYtE,CAAC"}
|
|
@@ -1,24 +1,10 @@
|
|
|
1
|
+
import type { AudioClassificationInput, AudioClassificationOutput } from "@huggingface/tasks";
|
|
1
2
|
import type { BaseArgs, Options } from "../../types";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
* Binary audio data
|
|
5
|
-
*/
|
|
6
|
-
data: Blob | ArrayBuffer;
|
|
7
|
-
};
|
|
8
|
-
export interface AudioClassificationOutputValue {
|
|
9
|
-
/**
|
|
10
|
-
* The label for the class (model specific)
|
|
11
|
-
*/
|
|
12
|
-
label: string;
|
|
13
|
-
/**
|
|
14
|
-
* A float that represents how likely it is that the audio file belongs to this class.
|
|
15
|
-
*/
|
|
16
|
-
score: number;
|
|
17
|
-
}
|
|
18
|
-
export type AudioClassificationReturn = AudioClassificationOutputValue[];
|
|
3
|
+
import type { LegacyAudioInput } from "./utils";
|
|
4
|
+
export type AudioClassificationArgs = BaseArgs & (AudioClassificationInput | LegacyAudioInput);
|
|
19
5
|
/**
|
|
20
6
|
* This task reads some audio input and outputs the likelihood of classes.
|
|
21
7
|
* Recommended model: superb/hubert-large-superb-er
|
|
22
8
|
*/
|
|
23
|
-
export declare function audioClassification(args: AudioClassificationArgs, options?: Options): Promise<
|
|
9
|
+
export declare function audioClassification(args: AudioClassificationArgs, options?: Options): Promise<AudioClassificationOutput>;
|
|
24
10
|
//# sourceMappingURL=audioClassification.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"audioClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioClassification.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"audioClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioClassification.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,wBAAwB,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAE9F,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAGhD,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG,CAAC,wBAAwB,GAAG,gBAAgB,CAAC,CAAC;AAE/F;;;GAGG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAYpC"}
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import type { BaseArgs, Options } from "../../types";
|
|
2
|
-
|
|
2
|
+
import type { LegacyAudioInput } from "./utils";
|
|
3
|
+
export type AudioToAudioArgs = (BaseArgs & {
|
|
3
4
|
/**
|
|
4
5
|
* Binary audio data
|
|
5
6
|
*/
|
|
6
|
-
|
|
7
|
-
};
|
|
8
|
-
export interface
|
|
7
|
+
inputs: Blob;
|
|
8
|
+
}) | LegacyAudioInput;
|
|
9
|
+
export interface AudioToAudioOutputElem {
|
|
9
10
|
/**
|
|
10
11
|
* The label for the audio output (model specific)
|
|
11
12
|
*/
|
|
@@ -13,16 +14,16 @@ export interface AudioToAudioOutputValue {
|
|
|
13
14
|
/**
|
|
14
15
|
* Base64 encoded audio output.
|
|
15
16
|
*/
|
|
17
|
+
audio: Blob;
|
|
18
|
+
}
|
|
19
|
+
export interface AudioToAudioOutput {
|
|
16
20
|
blob: string;
|
|
17
|
-
/**
|
|
18
|
-
* Content-type for blob, e.g. audio/flac
|
|
19
|
-
*/
|
|
20
21
|
"content-type": string;
|
|
22
|
+
label: string;
|
|
21
23
|
}
|
|
22
|
-
export type AudioToAudioReturn = AudioToAudioOutputValue[];
|
|
23
24
|
/**
|
|
24
25
|
* This task reads some audio input and outputs one or multiple audio files.
|
|
25
26
|
* Example model: speechbrain/sepformer-wham does audio source separation.
|
|
26
27
|
*/
|
|
27
|
-
export declare function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<
|
|
28
|
+
export declare function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioOutput[]>;
|
|
28
29
|
//# sourceMappingURL=audioToAudio.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"audioToAudio.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioToAudio.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;
|
|
1
|
+
{"version":3,"file":"audioToAudio.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioToAudio.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAGhD,MAAM,MAAM,gBAAgB,GACzB,CAAC,QAAQ,GAAG;IACZ;;OAEG;IACH,MAAM,EAAE,IAAI,CAAC;CACZ,CAAC,GACF,gBAAgB,CAAC;AAEpB,MAAM,WAAW,sBAAsB;IACtC;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,KAAK,EAAE,IAAI,CAAC;CACZ;AAED,MAAM,WAAW,kBAAkB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,cAAc,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE,MAAM,CAAC;CACd;AAED;;;GAGG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,kBAAkB,EAAE,CAAC,CAQ3G"}
|
|
@@ -1,16 +1,7 @@
|
|
|
1
|
+
import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";
|
|
1
2
|
import type { BaseArgs, Options } from "../../types";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
* Binary audio data
|
|
5
|
-
*/
|
|
6
|
-
data: Blob | ArrayBuffer;
|
|
7
|
-
};
|
|
8
|
-
export interface AutomaticSpeechRecognitionOutput {
|
|
9
|
-
/**
|
|
10
|
-
* The text that was recognized from the audio
|
|
11
|
-
*/
|
|
12
|
-
text: string;
|
|
13
|
-
}
|
|
3
|
+
import type { LegacyAudioInput } from "./utils";
|
|
4
|
+
export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput);
|
|
14
5
|
/**
|
|
15
6
|
* This task reads some audio input and outputs the said words within the audio files.
|
|
16
7
|
* Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"automaticSpeechRecognition.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/automaticSpeechRecognition.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"automaticSpeechRecognition.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/automaticSpeechRecognition.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,+BAA+B,EAAE,gCAAgC,EAAE,MAAM,oBAAoB,CAAC;AAE5G,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAe,MAAM,aAAa,CAAC;AAGlE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAIhD,MAAM,MAAM,8BAA8B,GAAG,QAAQ,GAAG,CAAC,+BAA+B,GAAG,gBAAgB,CAAC,CAAC;AAC7G;;;GAGG;AACH,wBAAsB,0BAA0B,CAC/C,IAAI,EAAE,8BAA8B,EACpC,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,gCAAgC,CAAC,CAW3C"}
|
|
@@ -1,14 +1,10 @@
|
|
|
1
|
+
import type { TextToSpeechInput } from "@huggingface/tasks";
|
|
1
2
|
import type { BaseArgs, Options } from "../../types";
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* The text to generate an audio from
|
|
5
|
-
*/
|
|
6
|
-
inputs: string;
|
|
7
|
-
};
|
|
8
|
-
export type TextToSpeechOutput = Blob;
|
|
3
|
+
type TextToSpeechArgs = BaseArgs & TextToSpeechInput;
|
|
9
4
|
/**
|
|
10
5
|
* This task synthesize an audio of a voice pronouncing a given text.
|
|
11
6
|
* Recommended model: espnet/kan-bayashi_ljspeech_vits
|
|
12
7
|
*/
|
|
13
|
-
export declare function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<
|
|
8
|
+
export declare function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<Blob>;
|
|
9
|
+
export {};
|
|
14
10
|
//# sourceMappingURL=textToSpeech.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"textToSpeech.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/textToSpeech.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"textToSpeech.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/textToSpeech.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAE5D,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,KAAK,gBAAgB,GAAG,QAAQ,GAAG,iBAAiB,CAAC;AAKrD;;;GAGG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,CAsB3F"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { BaseArgs, RequestArgs } from "../../types";
|
|
2
|
+
/**
|
|
3
|
+
* @deprecated
|
|
4
|
+
*/
|
|
5
|
+
export interface LegacyAudioInput {
|
|
6
|
+
data: Blob | ArrayBuffer;
|
|
7
|
+
}
|
|
8
|
+
export declare function preparePayload(args: BaseArgs & ({
|
|
9
|
+
inputs: Blob;
|
|
10
|
+
} | LegacyAudioInput)): RequestArgs;
|
|
11
|
+
//# sourceMappingURL=utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/utils.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAGzD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAChC,IAAI,EAAE,IAAI,GAAG,WAAW,CAAC;CACzB;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC;IAAE,MAAM,EAAE,IAAI,CAAA;CAAE,GAAG,gBAAgB,CAAC,GAAG,WAAW,CAOlG"}
|
|
@@ -1,21 +1,7 @@
|
|
|
1
|
+
import type { ImageClassificationInput, ImageClassificationOutput } from "@huggingface/tasks";
|
|
1
2
|
import type { BaseArgs, Options } from "../../types";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
* Binary image data
|
|
5
|
-
*/
|
|
6
|
-
data: Blob | ArrayBuffer;
|
|
7
|
-
};
|
|
8
|
-
export interface ImageClassificationOutputValue {
|
|
9
|
-
/**
|
|
10
|
-
* The label for the class (model specific)
|
|
11
|
-
*/
|
|
12
|
-
label: string;
|
|
13
|
-
/**
|
|
14
|
-
* A float that represents how likely it is that the image file belongs to this class.
|
|
15
|
-
*/
|
|
16
|
-
score: number;
|
|
17
|
-
}
|
|
18
|
-
export type ImageClassificationOutput = ImageClassificationOutputValue[];
|
|
3
|
+
import { type LegacyImageInput } from "./utils";
|
|
4
|
+
export type ImageClassificationArgs = BaseArgs & (ImageClassificationInput | LegacyImageInput);
|
|
19
5
|
/**
|
|
20
6
|
* This task reads some image input and outputs the likelihood of classes.
|
|
21
7
|
* Recommended model: google/vit-base-patch16-224
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"imageClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/cv/imageClassification.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"imageClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/cv/imageClassification.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,wBAAwB,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAE9F,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,EAAkB,KAAK,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAEhE,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG,CAAC,wBAAwB,GAAG,gBAAgB,CAAC,CAAC;AAE/F;;;GAGG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAYpC"}
|
|
@@ -1,25 +1,7 @@
|
|
|
1
|
+
import type { ImageSegmentationInput, ImageSegmentationOutput } from "@huggingface/tasks";
|
|
1
2
|
import type { BaseArgs, Options } from "../../types";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
* Binary image data
|
|
5
|
-
*/
|
|
6
|
-
data: Blob | ArrayBuffer;
|
|
7
|
-
};
|
|
8
|
-
export interface ImageSegmentationOutputValue {
|
|
9
|
-
/**
|
|
10
|
-
* The label for the class (model specific) of a segment.
|
|
11
|
-
*/
|
|
12
|
-
label: string;
|
|
13
|
-
/**
|
|
14
|
-
* A str (base64 str of a single channel black-and-white img) representing the mask of a segment.
|
|
15
|
-
*/
|
|
16
|
-
mask: string;
|
|
17
|
-
/**
|
|
18
|
-
* A float that represents how likely it is that the detected object belongs to the given class.
|
|
19
|
-
*/
|
|
20
|
-
score: number;
|
|
21
|
-
}
|
|
22
|
-
export type ImageSegmentationOutput = ImageSegmentationOutputValue[];
|
|
3
|
+
import { type LegacyImageInput } from "./utils";
|
|
4
|
+
export type ImageSegmentationArgs = BaseArgs & (ImageSegmentationInput | LegacyImageInput);
|
|
23
5
|
/**
|
|
24
6
|
* This task reads some image input and outputs the likelihood of classes & bounding boxes of detected objects.
|
|
25
7
|
* Recommended model: facebook/detr-resnet-50-panoptic
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"imageSegmentation.d.ts","sourceRoot":"","sources":["../../../../src/tasks/cv/imageSegmentation.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"imageSegmentation.d.ts","sourceRoot":"","sources":["../../../../src/tasks/cv/imageSegmentation.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,sBAAsB,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAC;AAE1F,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,EAAkB,KAAK,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAEhE,MAAM,MAAM,qBAAqB,GAAG,QAAQ,GAAG,CAAC,sBAAsB,GAAG,gBAAgB,CAAC,CAAC;AAE3F;;;GAGG;AACH,wBAAsB,iBAAiB,CACtC,IAAI,EAAE,qBAAqB,EAC3B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,uBAAuB,CAAC,CAalC"}
|