@huggingface/inference 3.0.1 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/dist/index.cjs +162 -69
  2. package/dist/index.js +162 -69
  3. package/dist/src/providers/fal-ai.d.ts.map +1 -1
  4. package/dist/src/providers/replicate.d.ts.map +1 -1
  5. package/dist/src/tasks/audio/audioClassification.d.ts +4 -18
  6. package/dist/src/tasks/audio/audioClassification.d.ts.map +1 -1
  7. package/dist/src/tasks/audio/audioToAudio.d.ts +10 -9
  8. package/dist/src/tasks/audio/audioToAudio.d.ts.map +1 -1
  9. package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts +3 -12
  10. package/dist/src/tasks/audio/automaticSpeechRecognition.d.ts.map +1 -1
  11. package/dist/src/tasks/audio/textToSpeech.d.ts +4 -8
  12. package/dist/src/tasks/audio/textToSpeech.d.ts.map +1 -1
  13. package/dist/src/tasks/audio/utils.d.ts +11 -0
  14. package/dist/src/tasks/audio/utils.d.ts.map +1 -0
  15. package/dist/src/tasks/cv/imageClassification.d.ts +3 -17
  16. package/dist/src/tasks/cv/imageClassification.d.ts.map +1 -1
  17. package/dist/src/tasks/cv/imageSegmentation.d.ts +3 -21
  18. package/dist/src/tasks/cv/imageSegmentation.d.ts.map +1 -1
  19. package/dist/src/tasks/cv/imageToImage.d.ts +3 -49
  20. package/dist/src/tasks/cv/imageToImage.d.ts.map +1 -1
  21. package/dist/src/tasks/cv/imageToText.d.ts +3 -12
  22. package/dist/src/tasks/cv/imageToText.d.ts.map +1 -1
  23. package/dist/src/tasks/cv/objectDetection.d.ts +3 -26
  24. package/dist/src/tasks/cv/objectDetection.d.ts.map +1 -1
  25. package/dist/src/tasks/cv/textToImage.d.ts +3 -38
  26. package/dist/src/tasks/cv/textToImage.d.ts.map +1 -1
  27. package/dist/src/tasks/cv/textToVideo.d.ts +6 -0
  28. package/dist/src/tasks/cv/textToVideo.d.ts.map +1 -0
  29. package/dist/src/tasks/cv/utils.d.ts +11 -0
  30. package/dist/src/tasks/cv/utils.d.ts.map +1 -0
  31. package/dist/src/tasks/cv/zeroShotImageClassification.d.ts +7 -15
  32. package/dist/src/tasks/cv/zeroShotImageClassification.d.ts.map +1 -1
  33. package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts +5 -28
  34. package/dist/src/tasks/multimodal/documentQuestionAnswering.d.ts.map +1 -1
  35. package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts +5 -20
  36. package/dist/src/tasks/multimodal/visualQuestionAnswering.d.ts.map +1 -1
  37. package/dist/src/tasks/nlp/fillMask.d.ts +2 -21
  38. package/dist/src/tasks/nlp/fillMask.d.ts.map +1 -1
  39. package/dist/src/tasks/nlp/questionAnswering.d.ts +3 -25
  40. package/dist/src/tasks/nlp/questionAnswering.d.ts.map +1 -1
  41. package/dist/src/tasks/nlp/sentenceSimilarity.d.ts +2 -13
  42. package/dist/src/tasks/nlp/sentenceSimilarity.d.ts.map +1 -1
  43. package/dist/src/tasks/nlp/summarization.d.ts +2 -42
  44. package/dist/src/tasks/nlp/summarization.d.ts.map +1 -1
  45. package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts +3 -31
  46. package/dist/src/tasks/nlp/tableQuestionAnswering.d.ts.map +1 -1
  47. package/dist/src/tasks/nlp/textClassification.d.ts +2 -16
  48. package/dist/src/tasks/nlp/textClassification.d.ts.map +1 -1
  49. package/dist/src/tasks/nlp/tokenClassification.d.ts +2 -45
  50. package/dist/src/tasks/nlp/tokenClassification.d.ts.map +1 -1
  51. package/dist/src/tasks/nlp/translation.d.ts +2 -13
  52. package/dist/src/tasks/nlp/translation.d.ts.map +1 -1
  53. package/dist/src/tasks/nlp/zeroShotClassification.d.ts +2 -22
  54. package/dist/src/tasks/nlp/zeroShotClassification.d.ts.map +1 -1
  55. package/dist/src/types.d.ts +4 -0
  56. package/dist/src/types.d.ts.map +1 -1
  57. package/package.json +2 -2
  58. package/src/providers/fal-ai.ts +4 -0
  59. package/src/providers/replicate.ts +3 -0
  60. package/src/tasks/audio/audioClassification.ts +7 -22
  61. package/src/tasks/audio/audioToAudio.ts +43 -23
  62. package/src/tasks/audio/automaticSpeechRecognition.ts +35 -23
  63. package/src/tasks/audio/textToSpeech.ts +8 -14
  64. package/src/tasks/audio/utils.ts +18 -0
  65. package/src/tasks/cv/imageClassification.ts +5 -20
  66. package/src/tasks/cv/imageSegmentation.ts +5 -24
  67. package/src/tasks/cv/imageToImage.ts +4 -52
  68. package/src/tasks/cv/imageToText.ts +6 -15
  69. package/src/tasks/cv/objectDetection.ts +5 -30
  70. package/src/tasks/cv/textToImage.ts +14 -50
  71. package/src/tasks/cv/textToVideo.ts +67 -0
  72. package/src/tasks/cv/utils.ts +13 -0
  73. package/src/tasks/cv/zeroShotImageClassification.ts +32 -31
  74. package/src/tasks/multimodal/documentQuestionAnswering.ts +25 -43
  75. package/src/tasks/multimodal/visualQuestionAnswering.ts +20 -36
  76. package/src/tasks/nlp/fillMask.ts +2 -22
  77. package/src/tasks/nlp/questionAnswering.ts +22 -36
  78. package/src/tasks/nlp/sentenceSimilarity.ts +12 -15
  79. package/src/tasks/nlp/summarization.ts +2 -43
  80. package/src/tasks/nlp/tableQuestionAnswering.ts +25 -41
  81. package/src/tasks/nlp/textClassification.ts +3 -18
  82. package/src/tasks/nlp/tokenClassification.ts +2 -47
  83. package/src/tasks/nlp/translation.ts +3 -17
  84. package/src/tasks/nlp/zeroShotClassification.ts +2 -24
  85. package/src/types.ts +7 -1
@@ -1,49 +1,6 @@
1
+ import type { TokenClassificationInput, TokenClassificationOutput } from "@huggingface/tasks";
1
2
  import type { BaseArgs, Options } from "../../types";
2
- export type TokenClassificationArgs = BaseArgs & {
3
- /**
4
- * A string to be classified
5
- */
6
- inputs: string;
7
- parameters?: {
8
- /**
9
- * (Default: simple). There are several aggregation strategies:
10
- *
11
- * none: Every token gets classified without further aggregation.
12
- *
13
- * simple: Entities are grouped according to the default schema (B-, I- tags get merged when the tag is similar).
14
- *
15
- * first: Same as the simple strategy except words cannot end up with different tags. Words will use the tag of the first token when there is ambiguity.
16
- *
17
- * average: Same as the simple strategy except words cannot end up with different tags. Scores are averaged across tokens and then the maximum label is applied.
18
- *
19
- * max: Same as the simple strategy except words cannot end up with different tags. Word entity will be the token with the maximum score.
20
- */
21
- aggregation_strategy?: "none" | "simple" | "first" | "average" | "max";
22
- };
23
- };
24
- export interface TokenClassificationOutputValue {
25
- /**
26
- * The offset stringwise where the answer is located. Useful to disambiguate if word occurs multiple times.
27
- */
28
- end: number;
29
- /**
30
- * The type for the entity being recognized (model specific).
31
- */
32
- entity_group: string;
33
- /**
34
- * How likely the entity was recognized.
35
- */
36
- score: number;
37
- /**
38
- * The offset stringwise where the answer is located. Useful to disambiguate if word occurs multiple times.
39
- */
40
- start: number;
41
- /**
42
- * The string that was captured
43
- */
44
- word: string;
45
- }
46
- export type TokenClassificationOutput = TokenClassificationOutputValue[];
3
+ export type TokenClassificationArgs = BaseArgs & TokenClassificationInput;
47
4
  /**
48
5
  * Usually used for sentence parsing, either grammatical, or Named Entity Recognition (NER) to understand keywords contained within text. Recommended model: dbmdz/bert-large-cased-finetuned-conll03-english
49
6
  */
@@ -1 +1 @@
1
- {"version":3,"file":"tokenClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/nlp/tokenClassification.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAIrD,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG;IAChD;;OAEG;IACH,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE;QACZ;;;;;;;;;;;;WAYG;QACH,oBAAoB,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,OAAO,GAAG,SAAS,GAAG,KAAK,CAAC;KACvE,CAAC;CACF,CAAC;AAEF,MAAM,WAAW,8BAA8B;IAC9C;;OAEG;IACH,GAAG,EAAE,MAAM,CAAC;IACZ;;OAEG;IACH,YAAY,EAAE,MAAM,CAAC;IACrB;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,KAAK,EAAE,MAAM,CAAC;IACd;;OAEG;IACH,IAAI,EAAE,MAAM,CAAC;CACb;AAED,MAAM,MAAM,yBAAyB,GAAG,8BAA8B,EAAE,CAAC;AAEzE;;GAEG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAuBpC"}
1
+ {"version":3,"file":"tokenClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/nlp/tokenClassification.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,wBAAwB,EAAE,yBAAyB,EAAE,MAAM,oBAAoB,CAAC;AAE9F,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAIrD,MAAM,MAAM,uBAAuB,GAAG,QAAQ,GAAG,wBAAwB,CAAC;AAE1E;;GAEG;AACH,wBAAsB,mBAAmB,CACxC,IAAI,EAAE,uBAAuB,EAC7B,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,yBAAyB,CAAC,CAuBpC"}
@@ -1,17 +1,6 @@
1
+ import type { TranslationInput, TranslationOutput } from "@huggingface/tasks";
1
2
  import type { BaseArgs, Options } from "../../types";
2
- export type TranslationArgs = BaseArgs & {
3
- /**
4
- * A string to be translated
5
- */
6
- inputs: string | string[];
7
- };
8
- export interface TranslationOutputValue {
9
- /**
10
- * The string after translation
11
- */
12
- translation_text: string;
13
- }
14
- export type TranslationOutput = TranslationOutputValue | TranslationOutputValue[];
3
+ export type TranslationArgs = BaseArgs & TranslationInput;
15
4
  /**
16
5
  * This task is well known to translate text from one language to another. Recommended model: Helsinki-NLP/opus-mt-ru-en.
17
6
  */
@@ -1 +1 @@
1
- {"version":3,"file":"translation.d.ts","sourceRoot":"","sources":["../../../../src/tasks/nlp/translation.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,MAAM,MAAM,eAAe,GAAG,QAAQ,GAAG;IACxC;;OAEG;IACH,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;CAC1B,CAAC;AAEF,MAAM,WAAW,sBAAsB;IACtC;;OAEG;IACH,gBAAgB,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,MAAM,iBAAiB,GAAG,sBAAsB,GAAG,sBAAsB,EAAE,CAAC;AAElF;;GAEG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,eAAe,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAUtG"}
1
+ {"version":3,"file":"translation.d.ts","sourceRoot":"","sources":["../../../../src/tasks/nlp/translation.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,iBAAiB,EAAE,MAAM,oBAAoB,CAAC;AAE9E,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAGrD,MAAM,MAAM,eAAe,GAAG,QAAQ,GAAG,gBAAgB,CAAC;AAC1D;;GAEG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,eAAe,EAAE,OAAO,CAAC,EAAE,OAAO,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAUtG"}
@@ -1,26 +1,6 @@
1
+ import type { ZeroShotClassificationInput, ZeroShotClassificationOutput } from "@huggingface/tasks";
1
2
  import type { BaseArgs, Options } from "../../types";
2
- export type ZeroShotClassificationArgs = BaseArgs & {
3
- /**
4
- * a string or list of strings
5
- */
6
- inputs: string | string[];
7
- parameters: {
8
- /**
9
- * a list of strings that are potential classes for inputs. (max 10 candidate_labels, for more, simply run multiple requests, results are going to be misleading if using too many candidate_labels anyway. If you want to keep the exact same, you can simply run multi_label=True and do the scaling on your end.
10
- */
11
- candidate_labels: string[];
12
- /**
13
- * (Default: false) Boolean that is set to True if classes can overlap
14
- */
15
- multi_label?: boolean;
16
- };
17
- };
18
- export interface ZeroShotClassificationOutputValue {
19
- labels: string[];
20
- scores: number[];
21
- sequence: string;
22
- }
23
- export type ZeroShotClassificationOutput = ZeroShotClassificationOutputValue[];
3
+ export type ZeroShotClassificationArgs = BaseArgs & ZeroShotClassificationInput;
24
4
  /**
25
5
  * This task is super useful to try out classification with zero code, you simply pass a sentence/paragraph and the possible labels for that sentence, and you get a result. Recommended model: facebook/bart-large-mnli.
26
6
  */
@@ -1 +1 @@
1
- {"version":3,"file":"zeroShotClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/nlp/zeroShotClassification.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAIrD,MAAM,MAAM,0BAA0B,GAAG,QAAQ,GAAG;IACnD;;OAEG;IACH,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC;IAC1B,UAAU,EAAE;QACX;;WAEG;QACH,gBAAgB,EAAE,MAAM,EAAE,CAAC;QAC3B;;WAEG;QACH,WAAW,CAAC,EAAE,OAAO,CAAC;KACtB,CAAC;CACF,CAAC;AAEF,MAAM,WAAW,iCAAiC;IACjD,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,MAAM,4BAA4B,GAAG,iCAAiC,EAAE,CAAC;AAE/E;;GAEG;AACH,wBAAsB,sBAAsB,CAC3C,IAAI,EAAE,0BAA0B,EAChC,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,4BAA4B,CAAC,CAqBvC"}
1
+ {"version":3,"file":"zeroShotClassification.d.ts","sourceRoot":"","sources":["../../../../src/tasks/nlp/zeroShotClassification.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,2BAA2B,EAAE,4BAA4B,EAAE,MAAM,oBAAoB,CAAC;AAEpG,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,aAAa,CAAC;AAIrD,MAAM,MAAM,0BAA0B,GAAG,QAAQ,GAAG,2BAA2B,CAAC;AAEhF;;GAEG;AACH,wBAAsB,sBAAsB,CAC3C,IAAI,EAAE,0BAA0B,EAChC,OAAO,CAAC,EAAE,OAAO,GACf,OAAO,CAAC,4BAA4B,CAAC,CAqBvC"}
@@ -76,6 +76,10 @@ export type RequestArgs = BaseArgs & ({
76
76
  data: Blob | ArrayBuffer;
77
77
  } | {
78
78
  inputs: unknown;
79
+ } | {
80
+ prompt: string;
81
+ } | {
82
+ audio_url: string;
79
83
  } | ChatCompletionInput) & {
80
84
  parameters?: Record<string, unknown>;
81
85
  accessToken?: string;
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AAE9D;;GAEG;AACH,MAAM,MAAM,OAAO,GAAG,MAAM,CAAC;AAE7B,MAAM,WAAW,OAAO;IACvB;;OAEG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB;;OAEG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;;OAEG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;OAEG;IACH,OAAO,CAAC,EAAE,OAAO,CAAC;IAElB;;OAEG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB;;OAEG;IACH,KAAK,CAAC,EAAE,OAAO,KAAK,CAAC;IACrB;;OAEG;IACH,MAAM,CAAC,EAAE,WAAW,CAAC;IAErB;;OAEG;IACH,kBAAkB,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC;CACtC;AAED,MAAM,MAAM,aAAa,GAAG,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;AAE3D,eAAO,MAAM,mBAAmB,2EAA4E,CAAC;AAC7G,MAAM,MAAM,iBAAiB,GAAG,CAAC,OAAO,mBAAmB,CAAC,CAAC,MAAM,CAAC,CAAC;AAErE,MAAM,WAAW,QAAQ;IACxB;;;;;;OAMG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;;;;;OAOG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;;OAIG;IACH,QAAQ,CAAC,EAAE,iBAAiB,CAAC;CAC7B;AAED,MAAM,MAAM,WAAW,GAAG,QAAQ,GACjC,CAAC;IAAE,IAAI,EAAE,IAAI,GAAG,WAAW,CAAA;CAAE,GAAG;IAAE,MAAM,EAAE,OAAO,CAAA;CAAE,GAAG,mBAAmB,CAAC,GAAG;IAC5E,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACrC,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB,CAAC"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AACvD,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,oBAAoB,CAAC;AAE9D;;GAEG;AACH,MAAM,MAAM,OAAO,GAAG,MAAM,CAAC;AAE7B,MAAM,WAAW,OAAO;IACvB;;OAEG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB;;OAEG;IACH,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB;;OAEG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B;;OAEG;IACH,OAAO,CAAC,EAAE,OAAO,CAAC;IAElB;;OAEG;IACH,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB;;OAEG;IACH,KAAK,CAAC,EAAE,OAAO,KAAK,CAAC;IACrB;;OAEG;IACH,MAAM,CAAC,EAAE,WAAW,CAAC;IAErB;;OAEG;IACH,kBAAkB,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC;CACtC;AAED,MAAM,MAAM,aAAa,GAAG,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;AAE3D,eAAO,MAAM,mBAAmB,2EAA4E,CAAC;AAC7G,MAAM,MAAM,iBAAiB,GAAG,CAAC,OAAO,mBAAmB,CAAC,CAAC,MAAM,CAAC,CAAC;AAErE,MAAM,WAAW,QAAQ;IACxB;;;;;;OAMG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;;;;;OAOG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;IAEhB;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB;;;;OAIG;IACH,QAAQ,CAAC,EAAE,iBAAiB,CAAC;CAC7B;AAED,MAAM,MAAM,WAAW,GAAG,QAAQ,GACjC,CACG;IAAE,IAAI,EAAE,IAAI,GAAG,WAAW,CAAA;CAAE,GAC5B;IAAE,MAAM,EAAE,OAAO,CAAA;CAAE,GACnB;IAAE,MAAM,EAAE,MAAM,CAAA;CAAE,GAClB;IAAE,SAAS,EAAE,MAAM,CAAA;CAAE,GACrB,mBAAmB,CACrB,GAAG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACrC,WAAW,CAAC,EAAE,MAAM,CAAC;CACrB,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huggingface/inference",
3
- "version": "3.0.1",
3
+ "version": "3.1.0",
4
4
  "packageManager": "pnpm@8.10.5",
5
5
  "license": "MIT",
6
6
  "author": "Tim Mikeladze <tim.mikeladze@gmail.com>",
@@ -39,7 +39,7 @@
39
39
  },
40
40
  "type": "module",
41
41
  "dependencies": {
42
- "@huggingface/tasks": "^0.13.17"
42
+ "@huggingface/tasks": "^0.14.0"
43
43
  },
44
44
  "devDependencies": {
45
45
  "@types/node": "18.13.0"
@@ -20,4 +20,8 @@ export const FAL_AI_SUPPORTED_MODEL_IDS: ProviderMapping<FalAiId> = {
20
20
  "automatic-speech-recognition": {
21
21
  "openai/whisper-large-v3": "fal-ai/whisper",
22
22
  },
23
+ "text-to-video": {
24
+ "genmo/mochi-1-preview": "fal-ai/mochi-v1",
25
+ "tencent/HunyuanVideo": "fal-ai/hunyuan-video",
26
+ },
23
27
  };
@@ -13,4 +13,7 @@ export const REPLICATE_SUPPORTED_MODEL_IDS: ProviderMapping<ReplicateId> = {
13
13
  "text-to-speech": {
14
14
  "OuteAI/OuteTTS-0.3-500M": "jbilcke/oute-tts:39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26",
15
15
  },
16
+ "text-to-video": {
17
+ "genmo/mochi-1-preview": "genmoai/mochi-1:1944af04d098ef69bed7f9d335d102e652203f268ec4aaa2d836f6217217e460",
18
+ },
16
19
  };
@@ -1,27 +1,11 @@
1
+ import type { AudioClassificationInput, AudioClassificationOutput } from "@huggingface/tasks";
1
2
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
3
  import type { BaseArgs, Options } from "../../types";
3
4
  import { request } from "../custom/request";
5
+ import type { LegacyAudioInput } from "./utils";
6
+ import { preparePayload } from "./utils";
4
7
 
5
- export type AudioClassificationArgs = BaseArgs & {
6
- /**
7
- * Binary audio data
8
- */
9
- data: Blob | ArrayBuffer;
10
- };
11
-
12
- export interface AudioClassificationOutputValue {
13
- /**
14
- * The label for the class (model specific)
15
- */
16
- label: string;
17
-
18
- /**
19
- * A float that represents how likely it is that the audio file belongs to this class.
20
- */
21
- score: number;
22
- }
23
-
24
- export type AudioClassificationReturn = AudioClassificationOutputValue[];
8
+ export type AudioClassificationArgs = BaseArgs & (AudioClassificationInput | LegacyAudioInput);
25
9
 
26
10
  /**
27
11
  * This task reads some audio input and outputs the likelihood of classes.
@@ -30,8 +14,9 @@ export type AudioClassificationReturn = AudioClassificationOutputValue[];
30
14
  export async function audioClassification(
31
15
  args: AudioClassificationArgs,
32
16
  options?: Options
33
- ): Promise<AudioClassificationReturn> {
34
- const res = await request<AudioClassificationReturn>(args, {
17
+ ): Promise<AudioClassificationOutput> {
18
+ const payload = preparePayload(args);
19
+ const res = await request<AudioClassificationOutput>(payload, {
35
20
  ...options,
36
21
  taskHint: "audio-classification",
37
22
  });
@@ -1,15 +1,19 @@
1
1
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
2
  import type { BaseArgs, Options } from "../../types";
3
3
  import { request } from "../custom/request";
4
+ import type { LegacyAudioInput } from "./utils";
5
+ import { preparePayload } from "./utils";
4
6
 
5
- export type AudioToAudioArgs = BaseArgs & {
6
- /**
7
- * Binary audio data
8
- */
9
- data: Blob | ArrayBuffer;
10
- };
7
+ export type AudioToAudioArgs =
8
+ | (BaseArgs & {
9
+ /**
10
+ * Binary audio data
11
+ */
12
+ inputs: Blob;
13
+ })
14
+ | LegacyAudioInput;
11
15
 
12
- export interface AudioToAudioOutputValue {
16
+ export interface AudioToAudioOutputElem {
13
17
  /**
14
18
  * The label for the audio output (model specific)
15
19
  */
@@ -18,32 +22,48 @@ export interface AudioToAudioOutputValue {
18
22
  /**
19
23
  * Base64 encoded audio output.
20
24
  */
21
- blob: string;
25
+ audio: Blob;
26
+ }
22
27
 
23
- /**
24
- * Content-type for blob, e.g. audio/flac
25
- */
28
+ export interface AudioToAudioOutput {
29
+ blob: string;
26
30
  "content-type": string;
31
+ label: string;
27
32
  }
28
33
 
29
- export type AudioToAudioReturn = AudioToAudioOutputValue[];
30
-
31
34
  /**
32
35
  * This task reads some audio input and outputs one or multiple audio files.
33
36
  * Example model: speechbrain/sepformer-wham does audio source separation.
34
37
  */
35
- export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioReturn> {
36
- const res = await request<AudioToAudioReturn>(args, {
38
+ export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioOutput[]> {
39
+ const payload = preparePayload(args);
40
+ const res = await request<AudioToAudioOutput>(payload, {
37
41
  ...options,
38
42
  taskHint: "audio-to-audio",
39
43
  });
40
- const isValidOutput =
41
- Array.isArray(res) &&
42
- res.every(
43
- (x) => typeof x.label === "string" && typeof x.blob === "string" && typeof x["content-type"] === "string"
44
- );
45
- if (!isValidOutput) {
46
- throw new InferenceOutputError("Expected Array<{label: string, blob: string, content-type: string}>");
44
+
45
+ return validateOutput(res);
46
+ }
47
+
48
+ function validateOutput(output: unknown): AudioToAudioOutput[] {
49
+ if (!Array.isArray(output)) {
50
+ throw new InferenceOutputError("Expected Array");
51
+ }
52
+ if (
53
+ !output.every((elem): elem is AudioToAudioOutput => {
54
+ return (
55
+ typeof elem === "object" &&
56
+ elem &&
57
+ "label" in elem &&
58
+ typeof elem.label === "string" &&
59
+ "content-type" in elem &&
60
+ typeof elem["content-type"] === "string" &&
61
+ "blob" in elem &&
62
+ typeof elem.blob === "string"
63
+ );
64
+ })
65
+ ) {
66
+ throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>");
47
67
  }
48
- return res;
68
+ return output;
49
69
  }
@@ -1,22 +1,13 @@
1
+ import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";
1
2
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
3
  import type { BaseArgs, Options, RequestArgs } from "../../types";
3
4
  import { base64FromBytes } from "../../utils/base64FromBytes";
4
5
  import { request } from "../custom/request";
6
+ import type { LegacyAudioInput } from "./utils";
7
+ import { preparePayload } from "./utils";
8
+ import { omit } from "../../utils/omit";
5
9
 
6
- export type AutomaticSpeechRecognitionArgs = BaseArgs & {
7
- /**
8
- * Binary audio data
9
- */
10
- data: Blob | ArrayBuffer;
11
- };
12
-
13
- export interface AutomaticSpeechRecognitionOutput {
14
- /**
15
- * The text that was recognized from the audio
16
- */
17
- text: string;
18
- }
19
-
10
+ export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput);
20
11
  /**
21
12
  * This task reads some audio input and outputs the said words within the audio files.
22
13
  * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -25,15 +16,8 @@ export async function automaticSpeechRecognition(
25
16
  args: AutomaticSpeechRecognitionArgs,
26
17
  options?: Options
27
18
  ): Promise<AutomaticSpeechRecognitionOutput> {
28
- if (args.provider === "fal-ai") {
29
- const contentType = args.data instanceof Blob ? args.data.type : "audio/mpeg";
30
- const base64audio = base64FromBytes(
31
- new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
32
- );
33
- (args as RequestArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`;
34
- delete (args as RequestArgs & { data: unknown }).data;
35
- }
36
- const res = await request<AutomaticSpeechRecognitionOutput>(args, {
19
+ const payload = await buildPayload(args);
20
+ const res = await request<AutomaticSpeechRecognitionOutput>(payload, {
37
21
  ...options,
38
22
  taskHint: "automatic-speech-recognition",
39
23
  });
@@ -43,3 +27,31 @@ export async function automaticSpeechRecognition(
43
27
  }
44
28
  return res;
45
29
  }
30
+
31
+ const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
32
+
33
+ async function buildPayload(args: AutomaticSpeechRecognitionArgs): Promise<RequestArgs> {
34
+ if (args.provider === "fal-ai") {
35
+ const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined;
36
+ const contentType = blob?.type;
37
+ if (!contentType) {
38
+ throw new Error(
39
+ `Unable to determine the input's content-type. Make sure your are passing a Blob when using provider fal-ai.`
40
+ );
41
+ }
42
+ if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
43
+ throw new Error(
44
+ `Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
45
+ ", "
46
+ )}`
47
+ );
48
+ }
49
+ const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
50
+ return {
51
+ ...("data" in args ? omit(args, "data") : omit(args, "inputs")),
52
+ audio_url: `data:${contentType};base64,${base64audio}`,
53
+ };
54
+ } else {
55
+ return preparePayload(args);
56
+ }
57
+ }
@@ -1,15 +1,10 @@
1
+ import type { TextToSpeechInput } from "@huggingface/tasks";
1
2
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
3
  import type { BaseArgs, Options } from "../../types";
3
4
  import { request } from "../custom/request";
4
5
 
5
- export type TextToSpeechArgs = BaseArgs & {
6
- /**
7
- * The text to generate an audio from
8
- */
9
- inputs: string;
10
- };
6
+ type TextToSpeechArgs = BaseArgs & TextToSpeechInput;
11
7
 
12
- export type TextToSpeechOutput = Blob;
13
8
  interface OutputUrlTextToSpeechGeneration {
14
9
  output: string | string[];
15
10
  }
@@ -17,11 +12,14 @@ interface OutputUrlTextToSpeechGeneration {
17
12
  * This task synthesize an audio of a voice pronouncing a given text.
18
13
  * Recommended model: espnet/kan-bayashi_ljspeech_vits
19
14
  */
20
- export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<TextToSpeechOutput> {
21
- const res = await request<TextToSpeechOutput | OutputUrlTextToSpeechGeneration>(args, {
15
+ export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<Blob> {
16
+ const res = await request<Blob | OutputUrlTextToSpeechGeneration>(args, {
22
17
  ...options,
23
18
  taskHint: "text-to-speech",
24
19
  });
20
+ if (res instanceof Blob) {
21
+ return res;
22
+ }
25
23
  if (res && typeof res === "object") {
26
24
  if ("output" in res) {
27
25
  if (typeof res.output === "string") {
@@ -35,9 +33,5 @@ export async function textToSpeech(args: TextToSpeechArgs, options?: Options): P
35
33
  }
36
34
  }
37
35
  }
38
- const isValidOutput = res && res instanceof Blob;
39
- if (!isValidOutput) {
40
- throw new InferenceOutputError("Expected Blob");
41
- }
42
- return res;
36
+ throw new InferenceOutputError("Expected Blob or object with output");
43
37
  }
@@ -0,0 +1,18 @@
1
+ import type { BaseArgs, RequestArgs } from "../../types";
2
+ import { omit } from "../../utils/omit";
3
+
4
+ /**
5
+ * @deprecated
6
+ */
7
+ export interface LegacyAudioInput {
8
+ data: Blob | ArrayBuffer;
9
+ }
10
+
11
+ export function preparePayload(args: BaseArgs & ({ inputs: Blob } | LegacyAudioInput)): RequestArgs {
12
+ return "data" in args
13
+ ? args
14
+ : {
15
+ ...omit(args, "inputs"),
16
+ data: args.inputs,
17
+ };
18
+ }
@@ -1,26 +1,10 @@
1
+ import type { ImageClassificationInput, ImageClassificationOutput } from "@huggingface/tasks";
1
2
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
3
  import type { BaseArgs, Options } from "../../types";
3
4
  import { request } from "../custom/request";
5
+ import { preparePayload, type LegacyImageInput } from "./utils";
4
6
 
5
- export type ImageClassificationArgs = BaseArgs & {
6
- /**
7
- * Binary image data
8
- */
9
- data: Blob | ArrayBuffer;
10
- };
11
-
12
- export interface ImageClassificationOutputValue {
13
- /**
14
- * The label for the class (model specific)
15
- */
16
- label: string;
17
- /**
18
- * A float that represents how likely it is that the image file belongs to this class.
19
- */
20
- score: number;
21
- }
22
-
23
- export type ImageClassificationOutput = ImageClassificationOutputValue[];
7
+ export type ImageClassificationArgs = BaseArgs & (ImageClassificationInput | LegacyImageInput);
24
8
 
25
9
  /**
26
10
  * This task reads some image input and outputs the likelihood of classes.
@@ -30,7 +14,8 @@ export async function imageClassification(
30
14
  args: ImageClassificationArgs,
31
15
  options?: Options
32
16
  ): Promise<ImageClassificationOutput> {
33
- const res = await request<ImageClassificationOutput>(args, {
17
+ const payload = preparePayload(args);
18
+ const res = await request<ImageClassificationOutput>(payload, {
34
19
  ...options,
35
20
  taskHint: "image-classification",
36
21
  });
@@ -1,30 +1,10 @@
1
+ import type { ImageSegmentationInput, ImageSegmentationOutput } from "@huggingface/tasks";
1
2
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
3
  import type { BaseArgs, Options } from "../../types";
3
4
  import { request } from "../custom/request";
5
+ import { preparePayload, type LegacyImageInput } from "./utils";
4
6
 
5
- export type ImageSegmentationArgs = BaseArgs & {
6
- /**
7
- * Binary image data
8
- */
9
- data: Blob | ArrayBuffer;
10
- };
11
-
12
- export interface ImageSegmentationOutputValue {
13
- /**
14
- * The label for the class (model specific) of a segment.
15
- */
16
- label: string;
17
- /**
18
- * A str (base64 str of a single channel black-and-white img) representing the mask of a segment.
19
- */
20
- mask: string;
21
- /**
22
- * A float that represents how likely it is that the detected object belongs to the given class.
23
- */
24
- score: number;
25
- }
26
-
27
- export type ImageSegmentationOutput = ImageSegmentationOutputValue[];
7
+ export type ImageSegmentationArgs = BaseArgs & (ImageSegmentationInput | LegacyImageInput);
28
8
 
29
9
  /**
30
10
  * This task reads some image input and outputs the likelihood of classes & bounding boxes of detected objects.
@@ -34,7 +14,8 @@ export async function imageSegmentation(
34
14
  args: ImageSegmentationArgs,
35
15
  options?: Options
36
16
  ): Promise<ImageSegmentationOutput> {
37
- const res = await request<ImageSegmentationOutput>(args, {
17
+ const payload = preparePayload(args);
18
+ const res = await request<ImageSegmentationOutput>(payload, {
38
19
  ...options,
39
20
  taskHint: "image-segmentation",
40
21
  });
@@ -1,64 +1,16 @@
1
+ import type { ImageToImageInput } from "@huggingface/tasks";
1
2
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
3
  import type { BaseArgs, Options, RequestArgs } from "../../types";
3
4
  import { base64FromBytes } from "../../utils/base64FromBytes";
4
5
  import { request } from "../custom/request";
5
6
 
6
- export type ImageToImageArgs = BaseArgs & {
7
- /**
8
- * The initial image condition
9
- *
10
- **/
11
- inputs: Blob | ArrayBuffer;
12
-
13
- parameters?: {
14
- /**
15
- * The text prompt to guide the image generation.
16
- */
17
- prompt?: string;
18
- /**
19
- * strengh param only works for SD img2img and alt diffusion img2img models
20
- * Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
21
- * will be used as a starting point, adding more noise to it the larger the `strength`. The number of
22
- * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
23
- * be maximum and the denoising process will run for the full number of iterations specified in
24
- * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
25
- **/
26
- strength?: number;
27
- /**
28
- * An optional negative prompt for the image generation
29
- */
30
- negative_prompt?: string;
31
- /**
32
- * The height in pixels of the generated image
33
- */
34
- height?: number;
35
- /**
36
- * The width in pixels of the generated image
37
- */
38
- width?: number;
39
- /**
40
- * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
41
- */
42
- num_inference_steps?: number;
43
- /**
44
- * Guidance scale: Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
45
- */
46
- guidance_scale?: number;
47
- /**
48
- * guess_mode only works for ControlNet models, defaults to False In this mode, the ControlNet encoder will try best to recognize the content of the input image even if
49
- * you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended.
50
- */
51
- guess_mode?: boolean;
52
- };
53
- };
54
-
55
- export type ImageToImageOutput = Blob;
7
+ export type ImageToImageArgs = BaseArgs & ImageToImageInput;
56
8
 
57
9
  /**
58
10
  * This task reads some text input and outputs an image.
59
11
  * Recommended model: lllyasviel/sd-controlnet-depth
60
12
  */
61
- export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<ImageToImageOutput> {
13
+ export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<Blob> {
62
14
  let reqArgs: RequestArgs;
63
15
  if (!args.parameters) {
64
16
  reqArgs = {
@@ -74,7 +26,7 @@ export async function imageToImage(args: ImageToImageArgs, options?: Options): P
74
26
  ),
75
27
  };
76
28
  }
77
- const res = await request<ImageToImageOutput>(reqArgs, {
29
+ const res = await request<Blob>(reqArgs, {
78
30
  ...options,
79
31
  taskHint: "image-to-image",
80
32
  });
@@ -1,27 +1,18 @@
1
+ import type { ImageToTextInput, ImageToTextOutput } from "@huggingface/tasks";
1
2
  import { InferenceOutputError } from "../../lib/InferenceOutputError";
2
3
  import type { BaseArgs, Options } from "../../types";
3
4
  import { request } from "../custom/request";
5
+ import type { LegacyImageInput } from "./utils";
6
+ import { preparePayload } from "./utils";
4
7
 
5
- export type ImageToTextArgs = BaseArgs & {
6
- /**
7
- * Binary image data
8
- */
9
- data: Blob | ArrayBuffer;
10
- };
11
-
12
- export interface ImageToTextOutput {
13
- /**
14
- * The generated caption
15
- */
16
- generated_text: string;
17
- }
18
-
8
+ export type ImageToTextArgs = BaseArgs & (ImageToTextInput | LegacyImageInput);
19
9
  /**
20
10
  * This task reads some image input and outputs the text caption.
21
11
  */
22
12
  export async function imageToText(args: ImageToTextArgs, options?: Options): Promise<ImageToTextOutput> {
13
+ const payload = preparePayload(args);
23
14
  const res = (
24
- await request<[ImageToTextOutput]>(args, {
15
+ await request<[ImageToTextOutput]>(payload, {
25
16
  ...options,
26
17
  taskHint: "image-to-text",
27
18
  })