@huggingface/inference 2.2.2 → 2.3.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/README.md +12 -1
- package/dist/index.js +57 -21
- package/dist/index.mjs +56 -21
- package/package.json +2 -1
- package/src/tasks/cv/imageToImage.ts +83 -0
- package/src/tasks/index.ts +1 -0
- package/src/tasks/multimodal/documentQuestionAnswering.ts +7 -3
- package/src/tasks/multimodal/visualQuestionAnswering.ts +7 -3
package/README.md CHANGED
@@ -170,6 +170,14 @@ await hf.imageToText({
   model: 'nlpconnect/vit-gpt2-image-captioning'
 })
 
+await hf.imageToImage({
+  inputs: readFileSync("test/stormtrooper_depth.png"),
+  parameters: {
+    prompt: "elmo's lecture",
+  },
+  model: "lllyasviel/sd-controlnet-depth",
+});
+
 // Multimodal
 
 await hf.visualQuestionAnswering({
@@ -260,12 +268,15 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
 - [x] Image segmentation
 - [x] Text to image
 - [x] Image to text - [demo](https://huggingface.co/spaces/huggingfacejs/image-to-text)
+- [x] Image to Image
 
 ### Multimodal
+
 - [x] Document question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 - [x] Visual question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 
 ### Tabular
+
 - [x] Tabular regression
 
 ## Tree-shaking
@@ -288,7 +299,7 @@ This will enable tree-shaking by your bundler.
 ## Running tests
 
 ```console
-HF_ACCESS_TOKEN="your access token"
+HF_ACCESS_TOKEN="your access token" pnpm run test
 ```
 
 ## Finding appropriate models
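The new README example above reads the conditioning image with `readFileSync` in Node. A minimal browser-flavoured sketch of the same call, assuming an image URL and an access token (both illustrative, not part of the diff):

```ts
// Hedged browser sketch: fetch the depth map as a Blob and pass it to the new imageToImage task.
import { HfInference } from "@huggingface/inference";

const hf = new HfInference("hf_..."); // your access token

const depthMap = await (await fetch("https://example.com/stormtrooper_depth.png")).blob();

const result: Blob = await hf.imageToImage({
  inputs: depthMap,
  parameters: { prompt: "elmo's lecture" },
  model: "lllyasviel/sd-controlnet-depth",
});

// The task resolves to an image Blob, e.g. usable as an object URL.
document.querySelector("img")!.src = URL.createObjectURL(result);
```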
package/dist/index.js CHANGED
@@ -30,6 +30,7 @@ __export(src_exports, {
   fillMask: () => fillMask,
   imageClassification: () => imageClassification,
   imageSegmentation: () => imageSegmentation,
+  imageToImage: () => imageToImage,
   imageToText: () => imageToText,
   objectDetection: () => objectDetection,
   questionAnswering: () => questionAnswering,
@@ -62,6 +63,7 @@ __export(tasks_exports, {
   fillMask: () => fillMask,
   imageClassification: () => imageClassification,
   imageSegmentation: () => imageSegmentation,
+  imageToImage: () => imageToImage,
   imageToText: () => imageToText,
   objectDetection: () => objectDetection,
   questionAnswering: () => questionAnswering,
@@ -397,6 +399,48 @@ async function textToImage(args, options) {
   return res;
 }
 
+// ../shared/src/base64FromBytes.ts
+function base64FromBytes(arr) {
+  if (globalThis.Buffer) {
+    return globalThis.Buffer.from(arr).toString("base64");
+  } else {
+    const bin = [];
+    arr.forEach((byte) => {
+      bin.push(String.fromCharCode(byte));
+    });
+    return globalThis.btoa(bin.join(""));
+  }
+}
+
+// ../shared/src/isBackend.ts
+var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
+var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
+
+// src/tasks/cv/imageToImage.ts
+async function imageToImage(args, options) {
+  let reqArgs;
+  if (!args.parameters) {
+    reqArgs = {
+      accessToken: args.accessToken,
+      model: args.model,
+      data: args.inputs
+    };
+  } else {
+    reqArgs = {
+      ...args,
+      inputs: base64FromBytes(
+        new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+      )
+    };
+  }
+  const res = await request(reqArgs, options);
+  const isValidOutput = res && res instanceof Blob;
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Blob");
+  }
+  return res;
+}
+
 // src/tasks/nlp/conversational.ts
 async function conversational(args, options) {
   const res = await request(args, options);
@@ -561,31 +605,18 @@ async function zeroShotClassification(args, options) {
   return res;
 }
 
-// ../shared/src/base64FromBytes.ts
-function base64FromBytes(arr) {
-  if (globalThis.Buffer) {
-    return globalThis.Buffer.from(arr).toString("base64");
-  } else {
-    const bin = [];
-    arr.forEach((byte) => {
-      bin.push(String.fromCharCode(byte));
-    });
-    return globalThis.btoa(bin.join(""));
-  }
-}
-
-// ../shared/src/isBackend.ts
-var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
-var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
-
 // src/tasks/multimodal/documentQuestionAnswering.ts
 async function documentQuestionAnswering(args, options) {
   const reqArgs = {
     ...args,
     inputs: {
       question: args.inputs.question,
-      // convert Blob to base64
-      image: base64FromBytes(
+      // convert Blob or ArrayBuffer to base64
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      )
     }
   };
   const res = toArray(
@@ -604,8 +635,12 @@ async function visualQuestionAnswering(args, options) {
     ...args,
     inputs: {
       question: args.inputs.question,
-      // convert Blob to base64
-      image: base64FromBytes(
+      // convert Blob or ArrayBuffer to base64
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      )
     }
   };
   const res = (await request(reqArgs, options))?.[0];
@@ -677,6 +712,7 @@ var HfInferenceEndpoint = class {
   fillMask,
   imageClassification,
   imageSegmentation,
+  imageToImage,
   imageToText,
   objectDetection,
   questionAnswering,
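The bundled `imageToImage` above builds the request two ways: without `parameters` it forwards the raw image bytes as the request body (`data`), and with `parameters` it base64-encodes the image via `base64FromBytes` and sends it inside a JSON payload (`inputs`). A rough sketch of both paths from the caller's side, assuming Node 18+, an ES module, and the `HF_ACCESS_TOKEN` variable used by the test suite:

```ts
// Hedged sketch of the two request-building paths in imageToImage (illustrative, not part of the diff).
import { readFileSync, writeFileSync } from "node:fs";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_ACCESS_TOKEN);
// `new Blob([buffer])` is the documented way to wrap a Node buffer for these tasks.
const image = new Blob([readFileSync("test/stormtrooper_depth.png")]);

// Path 1: no `parameters`, so the Blob is sent as-is (raw binary body via `data`).
const plain = await hf.imageToImage({
  model: "lllyasviel/sd-controlnet-depth",
  inputs: image,
});

// Path 2: `parameters` present, so `inputs` is base64-encoded and sent inside a JSON payload.
const guided = await hf.imageToImage({
  model: "lllyasviel/sd-controlnet-depth",
  inputs: image,
  parameters: { prompt: "elmo's lecture" },
});

// Either way the task resolves to a Blob (otherwise InferenceOutputError("Expected Blob") is thrown).
writeFileSync("plain.png", Buffer.from(await plain.arrayBuffer()));
writeFileSync("guided.png", Buffer.from(await guided.arrayBuffer()));
```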
package/dist/index.mjs CHANGED
@@ -15,6 +15,7 @@ __export(tasks_exports, {
   fillMask: () => fillMask,
   imageClassification: () => imageClassification,
   imageSegmentation: () => imageSegmentation,
+  imageToImage: () => imageToImage,
   imageToText: () => imageToText,
   objectDetection: () => objectDetection,
   questionAnswering: () => questionAnswering,
@@ -350,6 +351,48 @@ async function textToImage(args, options) {
   return res;
 }
 
+// ../shared/src/base64FromBytes.ts
+function base64FromBytes(arr) {
+  if (globalThis.Buffer) {
+    return globalThis.Buffer.from(arr).toString("base64");
+  } else {
+    const bin = [];
+    arr.forEach((byte) => {
+      bin.push(String.fromCharCode(byte));
+    });
+    return globalThis.btoa(bin.join(""));
+  }
+}
+
+// ../shared/src/isBackend.ts
+var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
+var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
+
+// src/tasks/cv/imageToImage.ts
+async function imageToImage(args, options) {
+  let reqArgs;
+  if (!args.parameters) {
+    reqArgs = {
+      accessToken: args.accessToken,
+      model: args.model,
+      data: args.inputs
+    };
+  } else {
+    reqArgs = {
+      ...args,
+      inputs: base64FromBytes(
+        new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+      )
+    };
+  }
+  const res = await request(reqArgs, options);
+  const isValidOutput = res && res instanceof Blob;
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Blob");
+  }
+  return res;
+}
+
 // src/tasks/nlp/conversational.ts
 async function conversational(args, options) {
   const res = await request(args, options);
@@ -514,31 +557,18 @@ async function zeroShotClassification(args, options) {
   return res;
 }
 
-// ../shared/src/base64FromBytes.ts
-function base64FromBytes(arr) {
-  if (globalThis.Buffer) {
-    return globalThis.Buffer.from(arr).toString("base64");
-  } else {
-    const bin = [];
-    arr.forEach((byte) => {
-      bin.push(String.fromCharCode(byte));
-    });
-    return globalThis.btoa(bin.join(""));
-  }
-}
-
-// ../shared/src/isBackend.ts
-var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
-var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
-
 // src/tasks/multimodal/documentQuestionAnswering.ts
 async function documentQuestionAnswering(args, options) {
   const reqArgs = {
     ...args,
     inputs: {
       question: args.inputs.question,
-      // convert Blob to base64
-      image: base64FromBytes(
+      // convert Blob or ArrayBuffer to base64
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      )
     }
   };
   const res = toArray(
@@ -557,8 +587,12 @@ async function visualQuestionAnswering(args, options) {
     ...args,
     inputs: {
       question: args.inputs.question,
-      // convert Blob to base64
-      image: base64FromBytes(
+      // convert Blob or ArrayBuffer to base64
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      )
     }
   };
   const res = (await request(reqArgs, options))?.[0];
@@ -629,6 +663,7 @@ export {
   fillMask,
   imageClassification,
   imageSegmentation,
+  imageToImage,
   imageToText,
   objectDetection,
   questionAnswering,
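The ESM bundle also re-exports `imageToImage` as a standalone function (see the `export { ... }` hunk above), so bundlers can tree-shake the other tasks when it is imported directly. A hedged sketch, assuming the access token is passed in the call args via an environment variable:

```ts
// Sketch of the standalone, tree-shakable export; the env-var token handling is an assumption.
import { readFileSync } from "node:fs";
import { imageToImage } from "@huggingface/inference";

const result = await imageToImage({
  accessToken: process.env.HF_ACCESS_TOKEN,
  model: "lllyasviel/sd-controlnet-depth",
  inputs: new Blob([readFileSync("test/stormtrooper_depth.png")]),
  parameters: { prompt: "elmo's lecture" },
});

console.log(result.type, result.size); // MIME type and byte size of the generated image Blob
```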
package/src/tasks/cv/imageToImage.ts ADDED
@@ -0,0 +1,83 @@
+import { InferenceOutputError } from "../../lib/InferenceOutputError";
+import type { BaseArgs, Options, RequestArgs } from "../../types";
+import { request } from "../custom/request";
+import { base64FromBytes } from "@huggingface/shared";
+
+export type ImageToImageArgs = BaseArgs & {
+  /**
+   * The initial image condition
+   *
+   **/
+  inputs: Blob | ArrayBuffer;
+
+  parameters?: {
+    /**
+     * The text prompt to guide the image generation.
+     */
+    prompt?: string;
+    /**
+     * The strength param only works for SD img2img and alt diffusion img2img models.
+     * Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+     * will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+     * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+     * be maximum and the denoising process will run for the full number of iterations specified in
+     * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+     **/
+    strength?: number;
+    /**
+     * An optional negative prompt for the image generation
+     */
+    negative_prompt?: string;
+    /**
+     * The height in pixels of the generated image
+     */
+    height?: number;
+    /**
+     * The width in pixels of the generated image
+     */
+    width?: number;
+    /**
+     * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
+     */
+    num_inference_steps?: number;
+    /**
+     * Guidance scale: a higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
+     */
+    guidance_scale?: number;
+    /**
+     * guess_mode only works for ControlNet models and defaults to False. In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
+     * you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+     */
+    guess_mode?: boolean;
+  };
+};
+
+export type ImageToImageOutput = Blob;
+
+/**
+ * This task takes an input image and a text prompt and outputs a new image.
+ * Recommended model: lllyasviel/sd-controlnet-depth
+ */
+export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<ImageToImageOutput> {
+  let reqArgs: RequestArgs;
+  if (!args.parameters) {
+    reqArgs = {
+      accessToken: args.accessToken,
+      model: args.model,
+      data: args.inputs,
+    };
+  } else {
+    reqArgs = {
+      ...args,
+      inputs: base64FromBytes(
+        new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+      ),
+    };
+  }
+  const res = await request<ImageToImageOutput>(reqArgs, options);
+  const isValidOutput = res && res instanceof Blob;
+  if (!isValidOutput) {
+    throw new InferenceOutputError("Expected Blob");
+  }
+  return res;
+}
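`ImageToImageArgs` above declares several optional diffusion parameters. A hedged example passing a few of them; the values are illustrative, not recommendations from the diff:

```ts
// Illustrative use of the optional parameters declared in ImageToImageArgs.
import { readFileSync } from "node:fs";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_ACCESS_TOKEN);

const out = await hf.imageToImage({
  model: "lllyasviel/sd-controlnet-depth",
  inputs: new Blob([readFileSync("test/stormtrooper_depth.png")]),
  parameters: {
    prompt: "elmo's lecture",
    negative_prompt: "blurry, low quality",
    num_inference_steps: 25,
    guidance_scale: 4.0, // the guess_mode note above suggests 3.0 to 5.0 for ControlNet
    guess_mode: false,
  },
});

console.log(out.size); // byte size of the returned image Blob
```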
package/src/tasks/index.ts CHANGED
@@ -13,6 +13,7 @@ export * from "./cv/imageSegmentation";
 export * from "./cv/imageToText";
 export * from "./cv/objectDetection";
 export * from "./cv/textToImage";
+export * from "./cv/imageToImage";
 
 // Natural Language Processing tasks
 export * from "./nlp/conversational";
package/src/tasks/multimodal/documentQuestionAnswering.ts CHANGED
@@ -12,7 +12,7 @@ export type DocumentQuestionAnsweringArgs = BaseArgs & {
      *
      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
      **/
-    image: Blob;
+    image: Blob | ArrayBuffer;
     question: string;
   };
 };
@@ -47,8 +47,12 @@ export async function documentQuestionAnswering(
     ...args,
     inputs: {
       question: args.inputs.question,
-      // convert Blob to base64
-      image: base64FromBytes(
+      // convert Blob or ArrayBuffer to base64
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      ),
     },
   } as RequestArgs;
   const res = toArray(
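Because `image` now also accepts an `ArrayBuffer`, callers can pass raw bytes without wrapping them in a Blob. A hedged sketch; the file name and model are assumptions:

```ts
// documentQuestionAnswering with an ArrayBuffer image (newly allowed by this change).
import { readFileSync } from "node:fs";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_ACCESS_TOKEN);
// Convert the Node buffer into a plain ArrayBuffer.
const image = await new Blob([readFileSync("invoice.png")]).arrayBuffer();

const answer = await hf.documentQuestionAnswering({
  model: "impira/layoutlm-document-qa", // assumed model for this task
  inputs: {
    question: "What is the invoice total?",
    image,
  },
});

console.log(answer);
```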
package/src/tasks/multimodal/visualQuestionAnswering.ts CHANGED
@@ -10,7 +10,7 @@ export type VisualQuestionAnsweringArgs = BaseArgs & {
      *
      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
      **/
-    image: Blob;
+    image: Blob | ArrayBuffer;
     question: string;
   };
 };
@@ -37,8 +37,12 @@ export async function visualQuestionAnswering(
     ...args,
     inputs: {
       question: args.inputs.question,
-      // convert Blob to base64
-      image: base64FromBytes(
+      // convert Blob or ArrayBuffer to base64
+      image: base64FromBytes(
+        new Uint8Array(
+          args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+        )
+      ),
     },
   } as RequestArgs;
   const res = (await request<[VisualQuestionAnsweringOutput]>(reqArgs, options))?.[0];