@huggingface/inference 2.0.0-rc2 → 2.1.0
- package/README.md +24 -2
- package/dist/index.js +56 -1
- package/dist/index.mjs +54 -1
- package/package.json +1 -1
- package/src/lib/makeRequestOptions.ts +1 -1
- package/src/tasks/index.ts +4 -0
- package/src/tasks/multimodal/documentQuestionAnswering.ts +63 -0
- package/src/tasks/multimodal/visualQuestionAnswering.ts +50 -0
- package/src/tasks/nlp/sentenceSimilarity.ts +3 -5
package/README.md
CHANGED

@@ -124,8 +124,8 @@ await hf.sentenceSimilarity({
 })
 
 await hf.featureExtraction({
-
-
+  model: "sentence-transformers/distilbert-base-nli-mean-tokens",
+  inputs: "That is a happy person",
 });
 
 // Audio

@@ -170,6 +170,24 @@ await hf.imageToText({
   model: 'nlpconnect/vit-gpt2-image-captioning'
 })
 
+// Multimodal
+
+await hf.visualQuestionAnswering({
+  model: 'dandelin/vilt-b32-finetuned-vqa',
+  inputs: {
+    question: 'How many cats are lying down?',
+    image: await (await fetch('https://placekitten.com/300/300')).blob()
+  }
+})
+
+await hf.documentQuestionAnswering({
+  model: 'impira/layoutlm-document-qa',
+  inputs: {
+    question: 'Invoice number?',
+    image: await (await fetch('https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png')).blob(),
+  }
+})
+
 // Custom call, for models with custom parameters / outputs
 await hf.request({
   model: 'my-custom-model',

@@ -227,6 +245,10 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
 - [x] Text to image
 - [x] Image to text
 
+### Multimodal
+- [x] Document question answering
+- [x] Visual question answering
+
 ## Tree-shaking
 
 You can import the functions you need directly from the module, rather than using the `HfInference` class:
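
As the Tree-shaking section referenced at the end of the hunk notes, the added tasks are also exported individually. A minimal sketch of calling the new multimodal tasks through those direct exports, reusing the model and inputs from the README example above; the access token value is a placeholder:

import { visualQuestionAnswering } from "@huggingface/inference";

// Standalone task functions take the same inputs as the HfInference methods,
// with the access token passed on the args object.
const accessToken = "hf_..."; // placeholder

const vqa = await visualQuestionAnswering({
  accessToken,
  model: "dandelin/vilt-b32-finetuned-vqa",
  inputs: {
    question: "How many cats are lying down?",
    image: await (await fetch("https://placekitten.com/300/300")).blob(),
  },
});
console.log(vqa.answer, vqa.score);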
package/dist/index.js
CHANGED

@@ -25,6 +25,7 @@ __export(src_exports, {
   audioClassification: () => audioClassification,
   automaticSpeechRecognition: () => automaticSpeechRecognition,
   conversational: () => conversational,
+  documentQuestionAnswering: () => documentQuestionAnswering,
   featureExtraction: () => featureExtraction,
   fillMask: () => fillMask,
   imageClassification: () => imageClassification,

@@ -43,6 +44,7 @@ __export(src_exports, {
   textToImage: () => textToImage,
   tokenClassification: () => tokenClassification,
   translation: () => translation,
+  visualQuestionAnswering: () => visualQuestionAnswering,
   zeroShotClassification: () => zeroShotClassification
 });
 module.exports = __toCommonJS(src_exports);

@@ -53,6 +55,7 @@ __export(tasks_exports, {
   audioClassification: () => audioClassification,
   automaticSpeechRecognition: () => automaticSpeechRecognition,
   conversational: () => conversational,
+  documentQuestionAnswering: () => documentQuestionAnswering,
   featureExtraction: () => featureExtraction,
   fillMask: () => fillMask,
   imageClassification: () => imageClassification,

@@ -71,6 +74,7 @@ __export(tasks_exports, {
   textToImage: () => textToImage,
   tokenClassification: () => tokenClassification,
   translation: () => translation,
+  visualQuestionAnswering: () => visualQuestionAnswering,
   zeroShotClassification: () => zeroShotClassification
 });

@@ -96,7 +100,7 @@ function makeRequestOptions(args, options) {
       headers["X-Load-Model"] = "0";
     }
   }
-  const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+  const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
   const info = {
     headers,
     method: "POST",

@@ -539,6 +543,55 @@ async function zeroShotClassification(args, options) {
   return res;
 }
 
+// ../shared/src/base64FromBytes.ts
+function base64FromBytes(arr) {
+  if (globalThis.Buffer) {
+    return globalThis.Buffer.from(arr).toString("base64");
+  } else {
+    const bin = [];
+    arr.forEach((byte) => {
+      bin.push(String.fromCharCode(byte));
+    });
+    return globalThis.btoa(bin.join(""));
+  }
+}
+
+// src/tasks/multimodal/documentQuestionAnswering.ts
+async function documentQuestionAnswering(args, options) {
+  const reqArgs = {
+    ...args,
+    inputs: {
+      question: args.inputs.question,
+      // convert Blob to base64
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+    }
+  };
+  const res = (await request(reqArgs, options))?.[0];
+  const isValidOutput = typeof res?.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+  }
+  return res;
+}
+
+// src/tasks/multimodal/visualQuestionAnswering.ts
+async function visualQuestionAnswering(args, options) {
+  const reqArgs = {
+    ...args,
+    inputs: {
+      question: args.inputs.question,
+      // convert Blob to base64
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+    }
+  };
+  const res = (await request(reqArgs, options))?.[0];
+  const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+  }
+  return res;
+}
+
 // src/HfInference.ts
 var HfInference = class {
   accessToken;

@@ -585,6 +638,7 @@ var HfInferenceEndpoint = class {
   audioClassification,
   automaticSpeechRecognition,
   conversational,
+  documentQuestionAnswering,
   featureExtraction,
   fillMask,
   imageClassification,

@@ -603,5 +657,6 @@ var HfInferenceEndpoint = class {
   textToImage,
   tokenClassification,
   translation,
+  visualQuestionAnswering,
   zeroShotClassification
 });
package/dist/index.mjs
CHANGED

@@ -10,6 +10,7 @@ __export(tasks_exports, {
   audioClassification: () => audioClassification,
   automaticSpeechRecognition: () => automaticSpeechRecognition,
   conversational: () => conversational,
+  documentQuestionAnswering: () => documentQuestionAnswering,
   featureExtraction: () => featureExtraction,
   fillMask: () => fillMask,
   imageClassification: () => imageClassification,

@@ -28,6 +29,7 @@ __export(tasks_exports, {
   textToImage: () => textToImage,
   tokenClassification: () => tokenClassification,
   translation: () => translation,
+  visualQuestionAnswering: () => visualQuestionAnswering,
   zeroShotClassification: () => zeroShotClassification
 });

@@ -53,7 +55,7 @@ function makeRequestOptions(args, options) {
       headers["X-Load-Model"] = "0";
     }
   }
-  const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+  const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
   const info = {
     headers,
     method: "POST",

@@ -496,6 +498,55 @@ async function zeroShotClassification(args, options) {
   return res;
 }
 
+// ../shared/src/base64FromBytes.ts
+function base64FromBytes(arr) {
+  if (globalThis.Buffer) {
+    return globalThis.Buffer.from(arr).toString("base64");
+  } else {
+    const bin = [];
+    arr.forEach((byte) => {
+      bin.push(String.fromCharCode(byte));
+    });
+    return globalThis.btoa(bin.join(""));
+  }
+}
+
+// src/tasks/multimodal/documentQuestionAnswering.ts
+async function documentQuestionAnswering(args, options) {
+  const reqArgs = {
+    ...args,
+    inputs: {
+      question: args.inputs.question,
+      // convert Blob to base64
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+    }
+  };
+  const res = (await request(reqArgs, options))?.[0];
+  const isValidOutput = typeof res?.answer === "string" && typeof res.end === "number" && typeof res.score === "number" && typeof res.start === "number";
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+  }
+  return res;
+}
+
+// src/tasks/multimodal/visualQuestionAnswering.ts
+async function visualQuestionAnswering(args, options) {
+  const reqArgs = {
+    ...args,
+    inputs: {
+      question: args.inputs.question,
+      // convert Blob to base64
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+    }
+  };
+  const res = (await request(reqArgs, options))?.[0];
+  const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+  }
+  return res;
+}
+
 // src/HfInference.ts
 var HfInference = class {
   accessToken;

@@ -541,6 +592,7 @@ export {
   audioClassification,
   automaticSpeechRecognition,
   conversational,
+  documentQuestionAnswering,
   featureExtraction,
   fillMask,
   imageClassification,

@@ -559,5 +611,6 @@ export {
   textToImage,
   tokenClassification,
   translation,
+  visualQuestionAnswering,
   zeroShotClassification
 };
package/package.json
CHANGED

package/src/lib/makeRequestOptions.ts
CHANGED

@@ -38,7 +38,7 @@ export function makeRequestOptions(
     }
   }
 
-  const url = /^http(s?):/.test(model) ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
+  const url = /^http(s?):/.test(model) || model.startsWith("/") ? model : `${HF_INFERENCE_API_BASE_URL}${model}`;
   const info: RequestInit = {
     headers,
     method: "POST",
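
The one-line change above means a `model` value that starts with "/" is now used as the request URL as-is, instead of being prefixed with `HF_INFERENCE_API_BASE_URL`. A small sketch of what that enables; the relative proxy route below is a hypothetical example, not something shipped by the package:

import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_..."); // placeholder token

// Before this change, a leading-slash model value would have been prefixed with
// the Inference API base URL; now the relative path is kept, so the request can
// be routed through a same-origin proxy endpoint (hypothetical route below).
const out = await hf.textGeneration({
  model: "/api/hf-proxy/gpt2",
  inputs: "The answer to the universe is",
});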
package/src/tasks/index.ts
CHANGED

@@ -27,3 +27,7 @@ export * from "./nlp/textGenerationStream";
 export * from "./nlp/tokenClassification";
 export * from "./nlp/translation";
 export * from "./nlp/zeroShotClassification";
+
+// Multimodal tasks
+export * from "./multimodal/documentQuestionAnswering";
+export * from "./multimodal/visualQuestionAnswering";
package/src/tasks/multimodal/documentQuestionAnswering.ts
ADDED

@@ -0,0 +1,63 @@
+import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import type { BaseArgs, Options } from "../../types";
+import { request } from "../custom/request";
+import type { RequestArgs } from "../../types";
+import { base64FromBytes } from "../../../../shared/src/base64FromBytes";
+
+export type DocumentQuestionAnsweringArgs = BaseArgs & {
+  inputs: {
+    /**
+     * Raw image
+     *
+     * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
+     **/
+    image: Blob;
+    question: string;
+  };
+};
+
+export interface DocumentQuestionAnsweringOutput {
+  /**
+   * A string that’s the answer within the document.
+   */
+  answer: string;
+  /**
+   * ?
+   */
+  end: number;
+  /**
+   * A float that represents how likely that the answer is correct
+   */
+  score: number;
+  /**
+   * ?
+   */
+  start: number;
+}
+
+/**
+ * Answers a question on a document image. Recommended model: impira/layoutlm-document-qa.
+ */
+export async function documentQuestionAnswering(
+  args: DocumentQuestionAnsweringArgs,
+  options?: Options
+): Promise<DocumentQuestionAnsweringOutput> {
+  const reqArgs: RequestArgs = {
+    ...args,
+    inputs: {
+      question: args.inputs.question,
+      // convert Blob to base64
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+    },
+  } as RequestArgs;
+  const res = (await request<[DocumentQuestionAnsweringOutput]>(reqArgs, options))?.[0];
+  const isValidOutput =
+    typeof res?.answer === "string" &&
+    typeof res.end === "number" &&
+    typeof res.score === "number" &&
+    typeof res.start === "number";
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{answer: string, end: number, score: number, start: number}>");
+  }
+  return res;
+}
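
The `image` field accepts any `Blob`, per the JSDoc above, and the task base64-encodes it before sending the request. A minimal Node sketch of the `new Blob([buffer])` route mentioned in that comment, assuming a local `invoice.png` and a Node version where `Blob` is a global (otherwise it can be imported from `node:buffer`):

import { readFile } from "node:fs/promises";
import { documentQuestionAnswering } from "@huggingface/inference";

// Wrap the raw bytes of a local file in a Blob; documentQuestionAnswering
// converts it to base64 internally before calling the Inference API.
const buffer = await readFile("./invoice.png");

const res = await documentQuestionAnswering({
  accessToken: "hf_...", // placeholder
  model: "impira/layoutlm-document-qa",
  inputs: {
    question: "Invoice number?",
    image: new Blob([buffer]),
  },
});
console.log(res.answer, res.score);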
package/src/tasks/multimodal/visualQuestionAnswering.ts
ADDED

@@ -0,0 +1,50 @@
+import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import type { BaseArgs, Options, RequestArgs } from "../../types";
+import { request } from "../custom/request";
+import { base64FromBytes } from "../../../../shared/src/base64FromBytes";
+
+export type VisualQuestionAnsweringArgs = BaseArgs & {
+  inputs: {
+    /**
+     * Raw image
+     *
+     * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
+     **/
+    image: Blob;
+    question: string;
+  };
+};
+
+export interface VisualQuestionAnsweringOutput {
+  /**
+   * A string that’s the answer to a visual question.
+   */
+  answer: string;
+  /**
+   * Answer correctness score.
+   */
+  score: number;
+}
+
+/**
+ * Answers a question on an image. Recommended model: dandelin/vilt-b32-finetuned-vqa.
+ */
+export async function visualQuestionAnswering(
+  args: VisualQuestionAnsweringArgs,
+  options?: Options
+): Promise<VisualQuestionAnsweringOutput> {
+  const reqArgs: RequestArgs = {
+    ...args,
+    inputs: {
+      question: args.inputs.question,
+      // convert Blob to base64
+      image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+    },
+  } as RequestArgs;
+  const res = (await request<[VisualQuestionAnsweringOutput]>(reqArgs, options))?.[0];
+  const isValidOutput = typeof res?.answer === "string" && typeof res.score === "number";
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Array<{answer: string, score: number}>");
+  }
+  return res;
+}
package/src/tasks/nlp/sentenceSimilarity.ts
CHANGED

@@ -4,12 +4,10 @@ import { request } from "../custom/request";
 
 export type SentenceSimilarityArgs = BaseArgs & {
   /**
-   * The inputs vary based on the model.
+   * The inputs vary based on the model.
    *
-   *
-   *
-   * "sentences": ["That is a happy dog", "That is a very happy person", "Today is a sunny day"]
-   * }
+   * For example when using sentence-transformers/paraphrase-xlm-r-multilingual-v1 the inputs will have a `source_sentence` string and
+   * a `sentences` array of strings
    */
   inputs: Record<string, unknown> | Record<string, unknown>[];
 };