@huggingface/inference 3.0.1 → 3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/dist/index.cjs +195 -69
  2. package/dist/index.js +194 -69
  3. package/dist/src/providers/fal-ai.d.ts.map +1 -1
  4. package/dist/src/providers/replicate.d.ts.map +1 -1
  5. package/dist/src/tasks/audio/audioClassification.d.ts +4 -18
  6. package/dist/src/tasks/audio/audioClassification.d.ts.map +1 -1
  7. package/dist/src/tasks/audio/audioToAudio.d.ts +10 -9
  8. package/dist/src/tasks/audio/audioToAudio.d.ts.map +1 -1
  9. package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts +3 -12
  10. package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts.map +1 -1
  11. package/dist/src/tasks/audio/textToSpeech.d.ts +4 -8
  12. package/dist/src/tasks/audio/textToSpeech.d.ts.map +1 -1
  13. package/dist/src/tasks/audio/utils.d.ts +11 -0
  14. package/dist/src/tasks/audio/utils.d.ts.map +1 -0
  15. package/dist/src/tasks/cv/imageClassification.d.ts +3 -17
  16. package/dist/src/tasks/cv/imageClassification.d.ts.map +1 -1
  17. package/dist/src/tasks/cv/imageSegmentation.d.ts +3 -21
  18. package/dist/src/tasks/cv/imageSegmentation.d.ts.map +1 -1
  19. package/dist/src/tasks/cv/imageToImage.d.ts +3 -49
  20. package/dist/src/tasks/cv/imageToImage.d.ts.map +1 -1
  21. package/dist/src/tasks/cv/imageToText.d.ts +3 -12
  22. package/dist/src/tasks/cv/imageToText.d.ts.map +1 -1
  23. package/dist/src/tasks/cv/objectDetection.d.ts +3 -26
  24. package/dist/src/tasks/cv/objectDetection.d.ts.map +1 -1
  25. package/dist/src/tasks/cv/textToImage.d.ts +3 -38
  26. package/dist/src/tasks/cv/textToImage.d.ts.map +1 -1
  27. package/dist/src/tasks/cv/textToVideo.d.ts +6 -0
  28. package/dist/src/tasks/cv/textToVideo.d.ts.map +1 -0
  29. package/dist/src/tasks/cv/utils.d.ts +11 -0
  30. package/dist/src/tasks/cv/utils.d.ts.map +1 -0
  31. package/dist/src/tasks/cv/zeroShotImageClassification.d.ts +7 -15
  32. package/dist/src/tasks/cv/zeroShotImageClassification.d.ts.map +1 -1
  33. package/dist/src/tasks/index.d.ts +1 -0
  34. package/dist/src/tasks/index.d.ts.map +1 -1
  35. package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts +5 -28
  36. package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts.map +1 -1
  37. package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts +5 -20
  38. package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts.map +1 -1
  39. package/dist/src/tasks/nlp/fillMask.d.ts +2 -21
  40. package/dist/src/tasks/nlp/fillMask.d.ts.map +1 -1
  41. package/dist/src/tasks/nlp/questionAnswering.d.ts +3 -25
  42. package/dist/src/tasks/nlp/questionAnswering.d.ts.map +1 -1
  43. package/dist/src/tasks/nlp/sentenceSimilarity.d.ts +2 -13
  44. package/dist/src/tasks/nlp/sentenceSimilarity.d.ts.map +1 -1
  45. package/dist/src/tasks/nlp/summarization.d.ts +2 -42
  46. package/dist/src/tasks/nlp/summarization.d.ts.map +1 -1
  47. package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts +3 -31
  48. package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts.map +1 -1
  49. package/dist/src/tasks/nlp/textClassification.d.ts +2 -16
  50. package/dist/src/tasks/nlp/textClassification.d.ts.map +1 -1
  51. package/dist/src/tasks/nlp/tokenClassification.d.ts +2 -45
  52. package/dist/src/tasks/nlp/tokenClassification.d.ts.map +1 -1
  53. package/dist/src/tasks/nlp/translation.d.ts +2 -13
  54. package/dist/src/tasks/nlp/translation.d.ts.map +1 -1
  55. package/dist/src/tasks/nlp/zeroShotClassification.d.ts +2 -22
  56. package/dist/src/tasks/nlp/zeroShotClassification.d.ts.map +1 -1
  57. package/dist/src/types.d.ts +4 -0
  58. package/dist/src/types.d.ts.map +1 -1
  59. package/package.json +2 -2
  60. package/src/providers/fal-ai.ts +4 -0
  61. package/src/providers/replicate.ts +3 -0
  62. package/src/tasks/audio/audioClassification.ts +7 -22
  63. package/src/tasks/audio/audioToAudio.ts +43 -23
  64. package/src/tasks/audio/automaticSpeechRecognition.ts +35 -23
  65. package/src/tasks/audio/textToSpeech.ts +8 -14
  66. package/src/tasks/audio/utils.ts +18 -0
  67. package/src/tasks/cv/imageClassification.ts +5 -20
  68. package/src/tasks/cv/imageSegmentation.ts +5 -24
  69. package/src/tasks/cv/imageToImage.ts +4 -52
  70. package/src/tasks/cv/imageToText.ts +6 -15
  71. package/src/tasks/cv/objectDetection.ts +5 -30
  72. package/src/tasks/cv/textToImage.ts +14 -50
  73. package/src/tasks/cv/textToVideo.ts +67 -0
  74. package/src/tasks/cv/utils.ts +13 -0
  75. package/src/tasks/cv/zeroShotImageClassification.ts +32 -31
  76. package/src/tasks/index.ts +1 -0
  77. package/src/tasks/multimodal/documentQuestionAnswering.ts +25 -43
  78. package/src/tasks/multimodal/visualQuestionAnswering.ts +20 -36
  79. package/src/tasks/nlp/fillMask.ts +2 -22
  80. package/src/tasks/nlp/questionAnswering.ts +22 -36
  81. package/src/tasks/nlp/sentenceSimilarity.ts +12 -15
  82. package/src/tasks/nlp/summarization.ts +2 -43
  83. package/src/tasks/nlp/tableQuestionAnswering.ts +25 -41
  84. package/src/tasks/nlp/textClassification.ts +3 -18
  85. package/src/tasks/nlp/tokenClassification.ts +2 -47
  86. package/src/tasks/nlp/translation.ts +3 -17
  87. package/src/tasks/nlp/zeroShotClassification.ts +2 -24
  88. package/src/types.ts +7 -1
package/dist/index.js CHANGED
@@ -33,6 +33,7 @@ __export(tasks_exports, {
33
33
  textGenerationStream: () => textGenerationStream,
34
34
  textToImage: () => textToImage,
35
35
  textToSpeech: () => textToSpeech,
36
+ textToVideo: () => textToVideo,
36
37
  tokenClassification: () => tokenClassification,
37
38
  translation: () => translation,
38
39
  visualQuestionAnswering: () => visualQuestionAnswering,
@@ -61,6 +62,10 @@ var FAL_AI_SUPPORTED_MODEL_IDS = {
61
62
  },
62
63
  "automatic-speech-recognition": {
63
64
  "openai/whisper-large-v3": "fal-ai/whisper"
65
+ },
66
+ "text-to-video": {
67
+ "genmo/mochi-1-preview": "fal-ai/mochi-v1",
68
+ "tencent/HunyuanVideo": "fal-ai/hunyuan-video"
64
69
  }
65
70
  };
66
71
 
@@ -73,6 +78,9 @@ var REPLICATE_SUPPORTED_MODEL_IDS = {
73
78
  },
74
79
  "text-to-speech": {
75
80
  "OuteAI/OuteTTS-0.3-500M": "jbilcke/oute-tts:39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26"
81
+ },
82
+ "text-to-video": {
83
+ "genmo/mochi-1-preview": "genmoai/mochi-1:1944af04d098ef69bed7f9d335d102e652203f268ec4aaa2d836f6217217e460"
76
84
  }
77
85
  };
78
86
 
@@ -538,9 +546,42 @@ var InferenceOutputError = class extends TypeError {
538
546
  }
539
547
  };
540
548
 
549
+ // src/utils/pick.ts
550
+ function pick(o, props) {
551
+ return Object.assign(
552
+ {},
553
+ ...props.map((prop) => {
554
+ if (o[prop] !== void 0) {
555
+ return { [prop]: o[prop] };
556
+ }
557
+ })
558
+ );
559
+ }
560
+
561
+ // src/utils/typedInclude.ts
562
+ function typedInclude(arr, v) {
563
+ return arr.includes(v);
564
+ }
565
+
566
+ // src/utils/omit.ts
567
+ function omit(o, props) {
568
+ const propsArr = Array.isArray(props) ? props : [props];
569
+ const letsKeep = Object.keys(o).filter((prop) => !typedInclude(propsArr, prop));
570
+ return pick(o, letsKeep);
571
+ }
572
+
573
+ // src/tasks/audio/utils.ts
574
+ function preparePayload(args) {
575
+ return "data" in args ? args : {
576
+ ...omit(args, "inputs"),
577
+ data: args.inputs
578
+ };
579
+ }
580
+
541
581
  // src/tasks/audio/audioClassification.ts
542
582
  async function audioClassification(args, options) {
543
- const res = await request(args, {
583
+ const payload = preparePayload(args);
584
+ const res = await request(payload, {
544
585
  ...options,
545
586
  taskHint: "audio-classification"
546
587
  });
@@ -566,15 +607,8 @@ function base64FromBytes(arr) {
566
607
 
567
608
  // src/tasks/audio/automaticSpeechRecognition.ts
568
609
  async function automaticSpeechRecognition(args, options) {
569
- if (args.provider === "fal-ai") {
570
- const contentType = args.data instanceof Blob ? args.data.type : "audio/mpeg";
571
- const base64audio = base64FromBytes(
572
- new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
573
- );
574
- args.audio_url = `data:${contentType};base64,${base64audio}`;
575
- delete args.data;
576
- }
577
- const res = await request(args, {
610
+ const payload = await buildPayload(args);
611
+ const res = await request(payload, {
578
612
  ...options,
579
613
  taskHint: "automatic-speech-recognition"
580
614
  });
@@ -584,6 +618,32 @@ async function automaticSpeechRecognition(args, options) {
584
618
  }
585
619
  return res;
586
620
  }
621
+ var FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
622
+ async function buildPayload(args) {
623
+ if (args.provider === "fal-ai") {
624
+ const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : void 0;
625
+ const contentType = blob?.type;
626
+ if (!contentType) {
627
+ throw new Error(
628
+ `Unable to determine the input's content-type. Make sure your are passing a Blob when using provider fal-ai.`
629
+ );
630
+ }
631
+ if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
632
+ throw new Error(
633
+ `Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
634
+ ", "
635
+ )}`
636
+ );
637
+ }
638
+ const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
639
+ return {
640
+ ..."data" in args ? omit(args, "data") : omit(args, "inputs"),
641
+ audio_url: `data:${contentType};base64,${base64audio}`
642
+ };
643
+ } else {
644
+ return preparePayload(args);
645
+ }
646
+ }
587
647
 
588
648
  // src/tasks/audio/textToSpeech.ts
589
649
  async function textToSpeech(args, options) {
@@ -591,6 +651,9 @@ async function textToSpeech(args, options) {
591
651
  ...options,
592
652
  taskHint: "text-to-speech"
593
653
  });
654
+ if (res instanceof Blob) {
655
+ return res;
656
+ }
594
657
  if (res && typeof res === "object") {
595
658
  if ("output" in res) {
596
659
  if (typeof res.output === "string") {
@@ -604,31 +667,39 @@ async function textToSpeech(args, options) {
604
667
  }
605
668
  }
606
669
  }
607
- const isValidOutput = res && res instanceof Blob;
608
- if (!isValidOutput) {
609
- throw new InferenceOutputError("Expected Blob");
610
- }
611
- return res;
670
+ throw new InferenceOutputError("Expected Blob or object with output");
612
671
  }
613
672
 
614
673
  // src/tasks/audio/audioToAudio.ts
615
674
  async function audioToAudio(args, options) {
616
- const res = await request(args, {
675
+ const payload = preparePayload(args);
676
+ const res = await request(payload, {
617
677
  ...options,
618
678
  taskHint: "audio-to-audio"
619
679
  });
620
- const isValidOutput = Array.isArray(res) && res.every(
621
- (x) => typeof x.label === "string" && typeof x.blob === "string" && typeof x["content-type"] === "string"
622
- );
623
- if (!isValidOutput) {
624
- throw new InferenceOutputError("Expected Array<{label: string, blob: string, content-type: string}>");
680
+ return validateOutput(res);
681
+ }
682
+ function validateOutput(output) {
683
+ if (!Array.isArray(output)) {
684
+ throw new InferenceOutputError("Expected Array");
625
685
  }
626
- return res;
686
+ if (!output.every((elem) => {
687
+ return typeof elem === "object" && elem && "label" in elem && typeof elem.label === "string" && "content-type" in elem && typeof elem["content-type"] === "string" && "blob" in elem && typeof elem.blob === "string";
688
+ })) {
689
+ throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>");
690
+ }
691
+ return output;
692
+ }
693
+
694
+ // src/tasks/cv/utils.ts
695
+ function preparePayload2(args) {
696
+ return "data" in args ? args : { ...omit(args, "inputs"), data: args.inputs };
627
697
  }
628
698
 
629
699
  // src/tasks/cv/imageClassification.ts
630
700
  async function imageClassification(args, options) {
631
- const res = await request(args, {
701
+ const payload = preparePayload2(args);
702
+ const res = await request(payload, {
632
703
  ...options,
633
704
  taskHint: "image-classification"
634
705
  });
@@ -641,7 +712,8 @@ async function imageClassification(args, options) {
641
712
 
642
713
  // src/tasks/cv/imageSegmentation.ts
643
714
  async function imageSegmentation(args, options) {
644
- const res = await request(args, {
715
+ const payload = preparePayload2(args);
716
+ const res = await request(payload, {
645
717
  ...options,
646
718
  taskHint: "image-segmentation"
647
719
  });
@@ -654,7 +726,8 @@ async function imageSegmentation(args, options) {
654
726
 
655
727
  // src/tasks/cv/imageToText.ts
656
728
  async function imageToText(args, options) {
657
- const res = (await request(args, {
729
+ const payload = preparePayload2(args);
730
+ const res = (await request(payload, {
658
731
  ...options,
659
732
  taskHint: "image-to-text"
660
733
  }))?.[0];
@@ -666,7 +739,8 @@ async function imageToText(args, options) {
666
739
 
667
740
  // src/tasks/cv/objectDetection.ts
668
741
  async function objectDetection(args, options) {
669
- const res = await request(args, {
742
+ const payload = preparePayload2(args);
743
+ const res = await request(payload, {
670
744
  ...options,
671
745
  taskHint: "object-detection"
672
746
  });
@@ -683,15 +757,13 @@ async function objectDetection(args, options) {
683
757
 
684
758
  // src/tasks/cv/textToImage.ts
685
759
  async function textToImage(args, options) {
686
- if (args.provider === "together" || args.provider === "fal-ai") {
687
- args.prompt = args.inputs;
688
- delete args.inputs;
689
- args.response_format = "base64";
690
- } else if (args.provider === "replicate") {
691
- args.prompt = args.inputs;
692
- delete args.inputs;
693
- }
694
- const res = await request(args, {
760
+ const payload = args.provider === "together" || args.provider === "fal-ai" || args.provider === "replicate" ? {
761
+ ...omit(args, ["inputs", "parameters"]),
762
+ ...args.parameters,
763
+ ...args.provider !== "replicate" ? { response_format: "base64" } : void 0,
764
+ prompt: args.inputs
765
+ } : args;
766
+ const res = await request(payload, {
695
767
  ...options,
696
768
  taskHint: "text-to-image"
697
769
  });
@@ -748,18 +820,30 @@ async function imageToImage(args, options) {
748
820
  }
749
821
 
750
822
  // src/tasks/cv/zeroShotImageClassification.ts
751
- async function zeroShotImageClassification(args, options) {
752
- const reqArgs = {
753
- ...args,
754
- inputs: {
755
- image: base64FromBytes(
756
- new Uint8Array(
757
- args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
823
+ async function preparePayload3(args) {
824
+ if (args.inputs instanceof Blob) {
825
+ return {
826
+ ...args,
827
+ inputs: {
828
+ image: base64FromBytes(new Uint8Array(await args.inputs.arrayBuffer()))
829
+ }
830
+ };
831
+ } else {
832
+ return {
833
+ ...args,
834
+ inputs: {
835
+ image: base64FromBytes(
836
+ new Uint8Array(
837
+ args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
838
+ )
758
839
  )
759
- )
760
- }
761
- };
762
- const res = await request(reqArgs, {
840
+ }
841
+ };
842
+ }
843
+ }
844
+ async function zeroShotImageClassification(args, options) {
845
+ const payload = await preparePayload3(args);
846
+ const res = await request(payload, {
763
847
  ...options,
764
848
  taskHint: "zero-shot-image-classification"
765
849
  });
@@ -770,6 +854,36 @@ async function zeroShotImageClassification(args, options) {
770
854
  return res;
771
855
  }
772
856
 
857
+ // src/tasks/cv/textToVideo.ts
858
+ var SUPPORTED_PROVIDERS = ["fal-ai", "replicate"];
859
+ async function textToVideo(args, options) {
860
+ if (!args.provider || !typedInclude(SUPPORTED_PROVIDERS, args.provider)) {
861
+ throw new Error(
862
+ `textToVideo inference is only supported for the following providers: ${SUPPORTED_PROVIDERS.join(", ")}`
863
+ );
864
+ }
865
+ const payload = args.provider === "fal-ai" || args.provider === "replicate" ? { ...omit(args, ["inputs", "parameters"]), ...args.parameters, prompt: args.inputs } : args;
866
+ const res = await request(payload, {
867
+ ...options,
868
+ taskHint: "text-to-video"
869
+ });
870
+ if (args.provider === "fal-ai") {
871
+ const isValidOutput = typeof res === "object" && !!res && "video" in res && typeof res.video === "object" && !!res.video && "url" in res.video && typeof res.video.url === "string" && isUrl(res.video.url);
872
+ if (!isValidOutput) {
873
+ throw new InferenceOutputError("Expected { video: { url: string } }");
874
+ }
875
+ const urlResponse = await fetch(res.video.url);
876
+ return await urlResponse.blob();
877
+ } else {
878
+ const isValidOutput = typeof res === "object" && !!res && "output" in res && typeof res.output === "string" && isUrl(res.output);
879
+ if (!isValidOutput) {
880
+ throw new InferenceOutputError("Expected { output: string }");
881
+ }
882
+ const urlResponse = await fetch(res.output);
883
+ return await urlResponse.blob();
884
+ }
885
+ }
886
+
773
887
  // src/lib/getDefaultTask.ts
774
888
  var taskCache = /* @__PURE__ */ new Map();
775
889
  var CACHE_DURATION = 10 * 60 * 1e3;
@@ -848,17 +962,19 @@ async function questionAnswering(args, options) {
848
962
  ...options,
849
963
  taskHint: "question-answering"
850
964
  });
851
- const isValidOutput = typeof res === "object" && !!res && typeof res.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
965
+ const isValidOutput = Array.isArray(res) ? res.every(
966
+ (elem) => typeof elem === "object" && !!elem && typeof elem.answer === "string" && typeof elem.end === "number" && typeof elem.score === "number" && typeof elem.start === "number"
967
+ ) : typeof res === "object" && !!res && typeof res.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
852
968
  if (!isValidOutput) {
853
- throw new InferenceOutputError("Expected {answer: string, end: number, score: number, start: number}");
969
+ throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
854
970
  }
855
- return res;
971
+ return Array.isArray(res) ? res[0] : res;
856
972
  }
857
973
 
858
974
  // src/tasks/nlp/sentenceSimilarity.ts
859
975
  async function sentenceSimilarity(args, options) {
860
976
  const defaultTask = args.model ? await getDefaultTask(args.model, args.accessToken, options) : void 0;
861
- const res = await request(args, {
977
+ const res = await request(prepareInput(args), {
862
978
  ...options,
863
979
  taskHint: "sentence-similarity",
864
980
  ...defaultTask === "feature-extraction" && { forceTask: "sentence-similarity" }
@@ -869,6 +985,13 @@ async function sentenceSimilarity(args, options) {
869
985
  }
870
986
  return res;
871
987
  }
988
+ function prepareInput(args) {
989
+ return {
990
+ ...omit(args, ["inputs", "parameters"]),
991
+ inputs: { ...omit(args.inputs, "sourceSentence") },
992
+ parameters: { source_sentence: args.inputs.sourceSentence, ...args.parameters }
993
+ };
994
+ }
872
995
 
873
996
  // src/tasks/nlp/summarization.ts
874
997
  async function summarization(args, options) {
@@ -889,13 +1012,18 @@ async function tableQuestionAnswering(args, options) {
889
1012
  ...options,
890
1013
  taskHint: "table-question-answering"
891
1014
  });
892
- const isValidOutput = typeof res?.aggregator === "string" && typeof res.answer === "string" && Array.isArray(res.cells) && res.cells.every((x) => typeof x === "string") && Array.isArray(res.coordinates) && res.coordinates.every((coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number"));
1015
+ const isValidOutput = Array.isArray(res) ? res.every((elem) => validate(elem)) : validate(res);
893
1016
  if (!isValidOutput) {
894
1017
  throw new InferenceOutputError(
895
1018
  "Expected {aggregator: string, answer: string, cells: string[], coordinates: number[][]}"
896
1019
  );
897
1020
  }
898
- return res;
1021
+ return Array.isArray(res) ? res[0] : res;
1022
+ }
1023
+ function validate(elem) {
1024
+ return typeof elem === "object" && !!elem && "aggregator" in elem && typeof elem.aggregator === "string" && "answer" in elem && typeof elem.answer === "string" && "cells" in elem && Array.isArray(elem.cells) && elem.cells.every((x) => typeof x === "string") && "coordinates" in elem && Array.isArray(elem.coordinates) && elem.coordinates.every(
1025
+ (coord) => Array.isArray(coord) && coord.every((x) => typeof x === "number")
1026
+ );
899
1027
  }
900
1028
 
901
1029
  // src/tasks/nlp/textClassification.ts
@@ -1038,11 +1166,7 @@ async function documentQuestionAnswering(args, options) {
1038
1166
  inputs: {
1039
1167
  question: args.inputs.question,
1040
1168
  // convert Blob or ArrayBuffer to base64
1041
- image: base64FromBytes(
1042
- new Uint8Array(
1043
- args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
1044
- )
1045
- )
1169
+ image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
1046
1170
  }
1047
1171
  };
1048
1172
  const res = toArray(
@@ -1050,12 +1174,14 @@ async function documentQuestionAnswering(args, options) {
1050
1174
  ...options,
1051
1175
  taskHint: "document-question-answering"
1052
1176
  })
1053
- )?.[0];
1054
- const isValidOutput = typeof res?.answer === "string" && (typeof res.end === "number" || typeof res.end === "undefined") && (typeof res.score === "number" || typeof res.score === "undefined") && (typeof res.start === "number" || typeof res.start === "undefined");
1177
+ );
1178
+ const isValidOutput = Array.isArray(res) && res.every(
1179
+ (elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && (typeof elem.end === "number" || typeof elem.end === "undefined") && (typeof elem.score === "number" || typeof elem.score === "undefined") && (typeof elem.start === "number" || typeof elem.start === "undefined")
1180
+ );
1055
1181
  if (!isValidOutput) {
1056
1182
  throw new InferenceOutputError("Expected Array<{answer: string, end?: number, score?: number, start?: number}>");
1057
1183
  }
1058
- return res;
1184
+ return res[0];
1059
1185
  }
1060
1186
 
1061
1187
  // src/tasks/multimodal/visualQuestionAnswering.ts
@@ -1065,22 +1191,20 @@ async function visualQuestionAnswering(args, options) {
1065
1191
  inputs: {
1066
1192
  question: args.inputs.question,
1067
1193
  // convert Blob or ArrayBuffer to base64
1068
- image: base64FromBytes(
1069
- new Uint8Array(
1070
- args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
1071
- )
1072
- )
1194
+ image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
1073
1195
  }
1074
1196
  };
1075
- const res = (await request(reqArgs, {
1197
+ const res = await request(reqArgs, {
1076
1198
  ...options,
1077
1199
  taskHint: "visual-question-answering"
1078
- }))?.[0];
1079
- const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
1200
+ });
1201
+ const isValidOutput = Array.isArray(res) && res.every(
1202
+ (elem) => typeof elem === "object" && !!elem && typeof elem?.answer === "string" && typeof elem.score === "number"
1203
+ );
1080
1204
  if (!isValidOutput) {
1081
1205
  throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
1082
1206
  }
1083
- return res;
1207
+ return res[0];
1084
1208
  }
1085
1209
 
1086
1210
  // src/tasks/tabular/tabularRegression.ts
@@ -1186,6 +1310,7 @@ export {
1186
1310
  textGenerationStream,
1187
1311
  textToImage,
1188
1312
  textToSpeech,
1313
+ textToVideo,
1189
1314
  tokenClassification,
1190
1315
  translation,
1191
1316
  visualQuestionAnswering,
@@ -1 +1 @@
1
- {"version":3,"file":"fal-ai.d.ts","sourceRoot":"","sources":["../../../src/providers/fal-ai.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,mBAAmB,oBAAoB,CAAC;AAErD,KAAK,OAAO,GAAG,MAAM,CAAC;AAEtB,eAAO,MAAM,0BAA0B,EAAE,eAAe,CAAC,OAAO,CAgB/D,CAAC"}
1
+ {"version":3,"file":"fal-ai.d.ts","sourceRoot":"","sources":["../../../src/providers/fal-ai.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,mBAAmB,oBAAoB,CAAC;AAErD,KAAK,OAAO,GAAG,MAAM,CAAC;AAEtB,eAAO,MAAM,0BAA0B,EAAE,eAAe,CAAC,OAAO,CAoB/D,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"replicate.d.ts","sourceRoot":"","sources":["../../../src/providers/replicate.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,sBAAsB,8BAA8B,CAAC;AAElE,KAAK,WAAW,GAAG,MAAM,CAAC;AAE1B,eAAO,MAAM,6BAA6B,EAAE,eAAe,CAAC,WAAW,CAStE,CAAC"}
1
+ {"version":3,"file":"replicate.d.ts","sourceRoot":"","sources":["../../../src/providers/replicate.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,SAAS,CAAC;AAE/C,eAAO,MAAM,sBAAsB,8BAA8B,CAAC;AAElE,KAAK,WAAW,GAAG,MAAM,CAAC;AAE1B,eAAO,MAAM,6BAA6B,EAAE,eAAe,CAAC,WAAW,CAYtE,CAAC"}
@@ -1,24 +1,10 @@
1
+ import type { AudioClassificationInput, AudioClassificationOutput } from "@huggingface/tasks";
1
2
  import type { BaseArgs, Options } from "../../types";
2
- export type AudioClassificationArgs = BaseArgs & {
3
- /**
4
- * Binary audio data
5
- */
6
- data: Blob | ArrayBuffer;
7
- };
8
- export interface AudioClassificationOutputValue {
9
- /**
10
- * The label for the class (model specific)
11
- */
12
- label: string;
13
- /**
14
- * A float that represents how likely it is that the audio file belongs to this class.
15
- */
16
- score: number;
17
- }
18
- export type AudioClassificationReturn = AudioClassificationOutputValue[];
3
+ import type { LegacyAudioInput } from "./utils";
4
+ export type AudioClassificationArgs = BaseArgs & (AudioClassificationInput | LegacyAudioInput);
19
5
  /**
20
6
  * This task reads some audio input and outputs the likelihood of classes.
21
7
  * Recommended model: superb/hubert-large-superb-er
22
8
  */
23
- export declare function audioClassification(args: AudioClassificationArgs, options?: Options): Promise<AudioClassificationReturn>;
9
+ export declare function audioClassification(args: AudioClassificationArgs, options?: Options): Promise<AudioClassificationOutput>;
24
10
  //# sourceMappingURL=audioClassification.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"audioClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioClassification.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG;IAChD;;OAEG;IACH,IAAI,EAAE,IAAI,GAAG,WAAW,CAAC;CACzB,CAAC;AAEF,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;CACd;AAED,MAAM,MAAM,yBAAyB,GAAG,8BAA8B,EAAE,CAAC;AAEzE;;;GAGG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAWpC"}
1
+ {"version":3,"file":"audioClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioClassification.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,wBAAwB,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAE9F,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAGhD,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG,CAAC,wBAAwB,GAAG,gBAAgB,CAAC,CAAC;AAE/F;;;GAGG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAYpC"}
@@ -1,11 +1,12 @@
1
1
  import type { BaseArgs, Options } from "../../types";
2
- export type AudioToAudioArgs = BaseArgs & {
2
+ import type { LegacyAudioInput } from "./utils";
3
+ export type AudioToAudioArgs = (BaseArgs & {
3
4
  /**
4
5
  * Binary audio data
5
6
  */
6
- data: Blob | ArrayBuffer;
7
- };
8
- export interface AudioToAudioOutputValue {
7
+ inputs: Blob;
8
+ }) | LegacyAudioInput;
9
+ export interface AudioToAudioOutputElem {
9
10
  /**
10
11
  * The label for the audio output (model specific)
11
12
  */
@@ -13,16 +14,16 @@ export interface AudioToAudioOutputValue {
13
14
  /**
14
15
  * Base64 encoded audio output.
15
16
  */
17
+ audio: Blob;
18
+ }
19
+ export interface AudioToAudioOutput {
16
20
  blob: string;
17
- /**
18
- * Content-type for blob, e.g. audio/flac
19
- */
20
21
  "content-type": string;
22
+ label: string;
21
23
  }
22
- export type AudioToAudioReturn = AudioToAudioOutputValue[];
23
24
  /**
24
25
  * This task reads some audio input and outputs one or multiple audio files.
25
26
  * Example model: speechbrain/sepformer-wham does audio source separation.
26
27
  */
27
- export declare function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioReturn>;
28
+ export declare function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioOutput[]>;
28
29
  //# sourceMappingURL=audioToAudio.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"audioToAudio.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioToAudio.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,MAAM,MAAM,gBAAgB,GAAG,QAAQ,GAAG;IACzC;;OAEG;IACH,IAAI,EAAE,IAAI,GAAG,WAAW,CAAC;CACzB,CAAC;AAEF,MAAM,WAAW,uBAAuB;IACvC;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;IAEb;;OAEG;IACH,cAAc,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,MAAM,kBAAkB,GAAG,uBAAuB,EAAE,CAAC;AAE3D;;;GAGG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAczG"}
1
+ {"version":3,"file":"audioToAudio.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/audioToAudio.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAGhD,MAAM,MAAM,gBAAgB,GACzB,CAAC,QAAQ,GAAG;IACZ;;OAEG;IACH,MAAM,EAAE,IAAI,CAAC;CACZ,CAAC,GACF,gBAAgB,CAAC;AAEpB,MAAM,WAAW,sBAAsB;IACtC;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IAEd;;OAEG;IACH,KAAK,EAAE,IAAI,CAAC;CACZ;AAED,MAAM,WAAW,kBAAkB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,cAAc,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE,MAAM,CAAC;CACd;AAED;;;GAGG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,kBAAkB,EAAE,CAAC,CAQ3G"}
@@ -1,16 +1,7 @@
1
+ import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";
1
2
  import type { BaseArgs, Options } from "../../types";
2
- export type AutomaticSpeechRecognitionArgs = BaseArgs & {
3
- /**
4
- * Binary audio data
5
- */
6
- data: Blob | ArrayBuffer;
7
- };
8
- export interface AutomaticSpeechRecognitionOutput {
9
- /**
10
- * The text that was recognized from the audio
11
- */
12
- text: string;
13
- }
3
+ import type { LegacyAudioInput } from "./utils";
4
+ export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput);
14
5
  /**
15
6
  * This task reads some audio input and outputs the said words within the audio files.
16
7
  * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -1 +1 @@
1
- {"version":3,"file":"automaticSpeechRecognition.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/automaticSpeechRecognition.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAe,MAAM,aAAa,CAAC;AAIlE,MAAM,MAAM,8BAA8B,GAAG,QAAQ,GAAG;IACvD;;OAEG;IACH,IAAI,EAAE,IAAI,GAAG,WAAW,CAAC;CACzB,CAAC;AAEF,MAAM,WAAW,gCAAgC;IAChD;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;CACb;AAED;;;GAGG;AACH,wBAAsB,0BAA0B,CAC/C,IAAI,EAAE,8BAA8B,EACpC,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,gCAAgC,CAAC,CAkB3C"}
1
+ {"version":3,"file":"automaticSpeechRecognition.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/automaticSpeechRecognition.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,+BAA+B,EAAE,gCAAgC,EAAE,MAAM,oBAAoB,CAAC;AAE5G,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAe,MAAM,aAAa,CAAC;AAGlE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAIhD,MAAM,MAAM,8BAA8B,GAAG,QAAQ,GAAG,CAAC,+BAA+B,GAAG,gBAAgB,CAAC,CAAC;AAC7G;;;GAGG;AACH,wBAAsB,0BAA0B,CAC/C,IAAI,EAAE,8BAA8B,EACpC,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,gCAAgC,CAAC,CAW3C"}
@@ -1,14 +1,10 @@
1
+ import type { TextToSpeechInput } from "@huggingface/tasks";
1
2
  import type { BaseArgs, Options } from "../../types";
2
- export type TextToSpeechArgs = BaseArgs & {
3
- /**
4
- * The text to generate an audio from
5
- */
6
- inputs: string;
7
- };
8
- export type TextToSpeechOutput = Blob;
3
+ type TextToSpeechArgs = BaseArgs & TextToSpeechInput;
9
4
  /**
10
5
  * This task synthesize an audio of a voice pronouncing a given text.
11
6
  * Recommended model: espnet/kan-bayashi_ljspeech_vits
12
7
  */
13
- export declare function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<TextToSpeechOutput>;
8
+ export declare function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<Blob>;
9
+ export {};
14
10
  //# sourceMappingURL=textToSpeech.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"textToSpeech.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/textToSpeech.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,MAAM,MAAM,gBAAgB,GAAG,QAAQ,GAAG;IACzC;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;CACf,CAAC;AAEF,MAAM,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAItC;;;GAGG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,kBAAkB,CAAC,CAuBzG"}
1
+ {"version":3,"file":"textToSpeech.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/textToSpeech.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAE5D,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,KAAK,gBAAgB,GAAG,QAAQ,GAAG,iBAAiB,CAAC;AAKrD;;;GAGG;AACH,wBAAsB,YAAY,CAAC,IAAI,EAAE,gBAAgB,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,CAsB3F"}
@@ -0,0 +1,11 @@
1
+ import type { BaseArgs, RequestArgs } from "../../types";
2
+ /**
3
+ * @deprecated
4
+ */
5
+ export interface LegacyAudioInput {
6
+ data: Blob | ArrayBuffer;
7
+ }
8
+ export declare function preparePayload(args: BaseArgs & ({
9
+ inputs: Blob;
10
+ } | LegacyAudioInput)): RequestArgs;
11
+ //# sourceMappingURL=utils.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../../../../src/tasks/audio/utils.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAGzD;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAChC,IAAI,EAAE,IAAI,GAAG,WAAW,CAAC;CACzB;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,QAAQ,GAAG,CAAC;IAAE,MAAM,EAAE,IAAI,CAAA;CAAE,GAAG,gBAAgB,CAAC,GAAG,WAAW,CAOlG"}
@@ -1,21 +1,7 @@
1
+ import type { ImageClassificationInput, ImageClassificationOutput } from "@huggingface/tasks";
1
2
  import type { BaseArgs, Options } from "../../types";
2
- export type ImageClassificationArgs = BaseArgs & {
3
- /**
4
- * Binary image data
5
- */
6
- data: Blob | ArrayBuffer;
7
- };
8
- export interface ImageClassificationOutputValue {
9
- /**
10
- * The label for the class (model specific)
11
- */
12
- label: string;
13
- /**
14
- * A float that represents how likely it is that the image file belongs to this class.
15
- */
16
- score: number;
17
- }
18
- export type ImageClassificationOutput = ImageClassificationOutputValue[];
3
+ import { type LegacyImageInput } from "./utils";
4
+ export type ImageClassificationArgs = BaseArgs & (ImageClassificationInput | LegacyImageInput);
19
5
  /**
20
6
  * This task reads some image input and outputs the likelihood of classes.
21
7
  * Recommended model: google/vit-base-patch16-224
@@ -1 +1 @@
1
- {"version":3,"file":"imageClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/cv/imageClassification.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG;IAChD;;OAEG;IACH,IAAI,EAAE,IAAI,GAAG,WAAW,CAAC;CACzB,CAAC;AAEF,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;CACd;AAED,MAAM,MAAM,yBAAyB,GAAG,8BAA8B,EAAE,CAAC;AAEzE;;;GAGG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAWpC"}
1
+ {"version":3,"file":"imageClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/cv/imageClassification.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,wBAAwB,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAE9F,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAErD,OAAO,EAAkB,KAAK,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAEhE,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG,CAAC,wBAAwB,GAAG,gBAAgB,CAAC,CAAC;AAE/F;;;GAGG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAYpC"}