@huggingface/inference 2.0.0-rc2 → 2.1.0

This diff shows the content of publicly released package versions as published to their respective public registries; it is provided for informational purposes only.
package/README.md CHANGED
@@ -124,8 +124,8 @@ await hf.sentenceSimilarity({
  })
 
  await hf.featureExtraction({
- model: "sentence-transformers/distilbert-base-nli-mean-tokens",
- inputs: "That is a happy person",
+ model: "sentence-transformers/distilbert-base-nli-mean-tokens",
+ inputs: "That is a happy person",
  });
 
  // Audio
@@ -170,6 +170,24 @@ await hf.imageToText({
  model: 'nlpconnect/vit-gpt2-image-captioning'
  })
 
+ // Multimodal
+
+ await hf.visualQuestionAnswering({
+   model: 'dandelin/vilt-b32-finetuned-vqa',
+   inputs: {
+     question: 'How many cats are lying down?',
+     image: await (await fetch('https://placekitten.com/300/300')).blob()
+   }
+ })
+
+ await hf.documentQuestionAnswering({
+   model: 'impira/layoutlm-document-qa',
+   inputs: {
+     question: 'Invoice number?',
+     image: await (await fetch('https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png')).blob(),
+   }
+ })
+
  // Custom call, for models with custom parameters / outputs
  await hf.request({
  model: 'my-custom-model',
@@ -227,6 +245,10 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
  - [x] Text to image
  - [x] Image to text
 
+ ### Multimodal
+ - [x] Document question answering
+ - [x] Visual question answering
+
  ## Tree-shaking
 
  You can import the functions you need directly from the module, rather than using the `HfInference` class:
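The tree-shaking note in the README also applies to the new multimodal tasks, which are exported as standalone functions. A minimal sketch using a direct import (the access token value is a placeholder):

import { visualQuestionAnswering } from "@huggingface/inference";

const { answer, score } = await visualQuestionAnswering({
  accessToken: "hf_...", // placeholder
  model: "dandelin/vilt-b32-finetuned-vqa",
  inputs: {
    question: "How many cats are lying down?",
    image: await (await fetch("https://placekitten.com/300/300")).blob(),
  },
});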
package/dist/index.js CHANGED
@@ -25,6 +25,7 @@ __export(src_exports, {
  audioClassification: () => audioClassification,
  automaticSpeechRecognition: () => automaticSpeechRecognition,
  conversational: () => conversational,
+ documentQuestionAnswering: () => documentQuestionAnswering,
  featureExtraction: () => featureExtraction,
  fillMask: () => fillMask,
  imageClassification: () => imageClassification,
@@ -43,6 +44,7 @@ __export(src_exports, {
  textToImage: () => textToImage,
  tokenClassification: () => tokenClassification,
  translation: () => translation,
+ visualQuestionAnswering: () => visualQuestionAnswering,
  zeroShotClassification: () => zeroShotClassification
  });
  module.exports = __toCommonJS(src_exports);
@@ -53,6 +55,7 @@ __export(tasks_exports, {
  audioClassification: () => audioClassification,
  automaticSpeechRecognition: () => automaticSpeechRecognition,
  conversational: () => conversational,
+ documentQuestionAnswering: () => documentQuestionAnswering,
  featureExtraction: () => featureExtraction,
  fillMask: () => fillMask,
  imageClassification: () => imageClassification,
@@ -71,6 +74,7 @@ __export(tasks_exports, {
  textToImage: () => textToImage,
  tokenClassification: () => tokenClassification,
  translation: () => translation,
+ visualQuestionAnswering: () => visualQuestionAnswering,
  zeroShotClassification: () => zeroShotClassification
  });
 
@@ -96,7 +100,7 @@ function makeRequestOptions(args, options) {
  headers["X-Load-Model"] = "0";
  }
  }
- const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+ const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
  const info = {
  headers,
  method: "POST",
@@ -539,6 +543,55 @@ async function zeroShotClassification(args, options) {
  return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // src/tasks/multimodal/documentQuestionAnswering.ts
+ async function documentQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+   }
+   return res;
+ }
+
+ // src/tasks/multimodal/visualQuestionAnswering.ts
+ async function visualQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+   }
+   return res;
+ }
+
  // src/HfInference.ts
  var HfInference = class {
  accessToken;
@@ -585,6 +638,7 @@ var HfInferenceEndpoint = class {
  audioClassification,
  automaticSpeechRecognition,
  conversational,
+ documentQuestionAnswering,
  featureExtraction,
  fillMask,
  imageClassification,
@@ -603,5 +657,6 @@ var HfInferenceEndpoint = class {
  textToImage,
  tokenClassification,
  translation,
+ visualQuestionAnswering,
  zeroShotClassification
  });
package/dist/index.mjs CHANGED
@@ -10,6 +10,7 @@ __export(tasks_exports, {
  audioClassification: () => audioClassification,
  automaticSpeechRecognition: () => automaticSpeechRecognition,
  conversational: () => conversational,
+ documentQuestionAnswering: () => documentQuestionAnswering,
  featureExtraction: () => featureExtraction,
  fillMask: () => fillMask,
  imageClassification: () => imageClassification,
@@ -28,6 +29,7 @@ __export(tasks_exports, {
  textToImage: () => textToImage,
  tokenClassification: () => tokenClassification,
  translation: () => translation,
+ visualQuestionAnswering: () => visualQuestionAnswering,
  zeroShotClassification: () => zeroShotClassification
  });
 
@@ -53,7 +55,7 @@ function makeRequestOptions(args, options) {
  headers["X-Load-Model"] = "0";
  }
  }
- const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+ const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
  const info = {
  headers,
  method: "POST",
@@ -496,6 +498,55 @@ async function zeroShotClassification(args, options) {
  return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // src/tasks/multimodal/documentQuestionAnswering.ts
+ async function documentQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+   }
+   return res;
+ }
+
+ // src/tasks/multimodal/visualQuestionAnswering.ts
+ async function visualQuestionAnswering(args, options) {
+   const reqArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+     }
+   };
+   const res = (await request(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+   }
+   return res;
+ }
+
  // src/HfInference.ts
  var HfInference = class {
  accessToken;
@@ -541,6 +592,7 @@ export {
  audioClassification,
  automaticSpeechRecognition,
  conversational,
+ documentQuestionAnswering,
  featureExtraction,
  fillMask,
  imageClassification,
@@ -559,5 +611,6 @@ export {
  textToImage,
  tokenClassification,
  translation,
+ visualQuestionAnswering,
  zeroShotClassification
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@huggingface/inference",
- "version": "2.0.0-rc2",
+ "version": "2.1.0",
  "license": "MIT",
  "author": "Tim Mikeladze <tim.mikeladze@gmail.com>",
  "description": "Typescript wrapper for the Hugging Face Inference API",
@@ -38,7 +38,7 @@ export function makeRequestOptions(
  }
  }
 
- const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+ const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
  const info: RequestInit = {
  headers,
  method: "POST",
@@ -27,3 +27,7 @@ export * from "./nlp/textGenerationStream";
  export * from "./nlp/tokenClassification";
  export * from "./nlp/translation";
  export * from "./nlp/zeroShotClassification";
+
+ // Multimodal tasks
+ export * from "./multimodal/documentQuestionAnswering";
+ export * from "./multimodal/visualQuestionAnswering";
@@ -0,0 +1,63 @@
+ import { InferenceOutputError } from "../../lib/InferenceOutputError";
+ import type { BaseArgs, Options } from "../../types";
+ import { request } from "../custom/request";
+ import type { RequestArgs } from "../../types";
+ import { base64FromBytes } from "../../../../shared/src/base64FromBytes";
+
+ export type DocumentQuestionAnsweringArgs = BaseArgs & {
+   inputs: {
+     /**
+      * Raw image
+      *
+      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
+      **/
+     image: Blob;
+     question: string;
+   };
+ };
+
+ export interface DocumentQuestionAnsweringOutput {
+   /**
+    * A string that’s the answer within the document.
+    */
+   answer: string;
+   /**
+    * ?
+    */
+   end: number;
+   /**
+    * A float that represents how likely that the answer is correct
+    */
+   score: number;
+   /**
+    * ?
+    */
+   start: number;
+ }
+
+ /**
+  * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa.
+  */
+ export async function documentQuestionAnswering(
+   args: DocumentQuestionAnsweringArgs,
+   options?: Options
+ ): Promise<DocumentQuestionAnsweringOutput> {
+   const reqArgs: RequestArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+     },
+   } as RequestArgs;
+   const res = (await request<[DocumentQuestionAnsweringOutput]>(reqArgs, options))?.[0];
+   const isValidOutput =
+     typeof res?.answer === "string" &&
+     typeof res.end === "number" &&
+     typeof res.score === "number" &&
+     typeof res.start === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+   }
+   return res;
+ }
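A minimal usage sketch for the function added above, reusing the invoice image from the README example (the access token is a placeholder):

import { documentQuestionAnswering } from "@huggingface/inference";

const image = await (
  await fetch("https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png")
).blob();

const { answer, score, start, end } = await documentQuestionAnswering({
  accessToken: "hf_...", // placeholder
  model: "impira/layoutlm-document-qa",
  inputs: { question: "Invoice number?", image },
});
// answer and score plus the numeric start and end fields declared by DocumentQuestionAnsweringOutput.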
@@ -0,0 +1,50 @@
+ import { InferenceOutputError } from "../../lib/InferenceOutputError";
+ import type { BaseArgs, Options, RequestArgs } from "../../types";
+ import { request } from "../custom/request";
+ import { base64FromBytes } from "../../../../shared/src/base64FromBytes";
+
+ export type VisualQuestionAnsweringArgs = BaseArgs & {
+   inputs: {
+     /**
+      * Raw image
+      *
+      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
+      **/
+     image: Blob;
+     question: string;
+   };
+ };
+
+ export interface VisualQuestionAnsweringOutput {
+   /**
+    * A string that’s the answer to a visual question.
+    */
+   answer: string;
+   /**
+    * Answer correctness score.
+    */
+   score: number;
+ }
+
+ /**
+  * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa.
+  */
+ export async function visualQuestionAnswering(
+   args: VisualQuestionAnsweringArgs,
+   options?: Options
+ ): Promise<VisualQuestionAnsweringOutput> {
+   const reqArgs: RequestArgs = {
+     ...args,
+     inputs: {
+       question: args.inputs.question,
+       // convert Blob to base64
+       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+     },
+   } as RequestArgs;
+   const res = (await request<[VisualQuestionAnsweringOutput]>(reqArgs, options))?.[0];
+   const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+   }
+   return res;
+ }
@@ -4,12 +4,10 @@ import { request } from "../custom/request";
 
  export type SentenceSimilarityArgs = BaseArgs & {
    /**
-    * The inputs vary based on the model. For example when using sentence-transformers/paraphrase-xlm-r-multilingual-v1 the inputs will look like this:
+    * The inputs vary based on the model.
     *
-    * inputs: &#123;
-    *   "source_sentence": "That is a happy person",
-    *   "sentences": ["That is a happy dog", "That is a very happy person", "Today is a sunny day"]
-    * &#125;
+    * For example when using sentence-transformers/paraphrase-xlm-r-multilingual-v1 the inputs will have a `source_sentence` string and
+    * a `sentences` array of strings
     */
    inputs: Record<string, unknown> | Record<string, unknown>[];
  };
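Since the docstring now describes the inputs shape only in words, here is what such a call looks like in practice; a sketch based on the example removed from the comment (the access token is a placeholder):

import { sentenceSimilarity } from "@huggingface/inference";

const scores = await sentenceSimilarity({
  accessToken: "hf_...", // placeholder
  model: "sentence-transformers/paraphrase-xlm-r-multilingual-v1",
  inputs: {
    source_sentence: "That is a happy person",
    sentences: ["That is a happy dog", "That is a very happy person", "Today is a sunny day"],
  },
});
// scores holds one similarity value per entry in `sentences`.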