@huggingface/inference 2.2.2 → 2.3.0

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
package/README.md CHANGED
@@ -170,6 +170,14 @@ await hf.imageToText({
    model: 'nlpconnect/vit-gpt2-image-captioning'
  })
 
+ await hf.imageToImage({
+   inputs: readFileSync("test/stormtrooper_depth.png"),
+   parameters: {
+     prompt: "elmo's lecture",
+   },
+   model: "lllyasviel/sd-controlnet-depth",
+ });
+
  // Multimodal
 
  await hf.visualQuestionAnswering({
@@ -260,12 +268,15 @@ const { generated_text } = await gpt2.textGeneration({inputs: 'The answer to the
  - [x] Image segmentation
  - [x] Text to image
  - [x] Image to text - [demo](https://huggingface.co/spaces/huggingfacejs/image-to-text)
+ - [x] Image to Image
 
  ### Multimodal
+
  - [x] Document question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
  - [x] Visual question answering - [demo](https://huggingface.co/spaces/huggingfacejs/doc-vis-qa)
 
  ### Tabular
+
  - [x] Tabular regression
 
  ## Tree-shaking
@@ -288,7 +299,7 @@ This will enable tree-shaking by your bundler.
  ## Running tests
 
  ```console
- HF_ACCESS_TOKEN="your access token" npm run test
+ HF_ACCESS_TOKEN="your access token" pnpm run test
  ```
 
  ## Finding appropriate models
package/dist/index.js CHANGED
@@ -30,6 +30,7 @@ __export(src_exports, {
    fillMask: () => fillMask,
    imageClassification: () => imageClassification,
    imageSegmentation: () => imageSegmentation,
+   imageToImage: () => imageToImage,
    imageToText: () => imageToText,
    objectDetection: () => objectDetection,
    questionAnswering: () => questionAnswering,
@@ -62,6 +63,7 @@ __export(tasks_exports, {
    fillMask: () => fillMask,
    imageClassification: () => imageClassification,
    imageSegmentation: () => imageSegmentation,
+   imageToImage: () => imageToImage,
    imageToText: () => imageToText,
    objectDetection: () => objectDetection,
    questionAnswering: () => questionAnswering,
@@ -397,6 +399,48 @@ async function textToImage(args, options) {
    return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // ../shared/src/isBackend.ts
+ var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
+ var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
+
+ // src/tasks/cv/imageToImage.ts
+ async function imageToImage(args, options) {
+   let reqArgs;
+   if (!args.parameters) {
+     reqArgs = {
+       accessToken: args.accessToken,
+       model: args.model,
+       data: args.inputs
+     };
+   } else {
+     reqArgs = {
+       ...args,
+       inputs: base64FromBytes(
+         new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+       )
+     };
+   }
+   const res = await request(reqArgs, options);
+   const isValidOutput = res && res instanceof Blob;
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Blob");
+   }
+   return res;
+ }
+
  // src/tasks/nlp/conversational.ts
  async function conversational(args, options) {
    const res = await request(args, options);
@@ -561,31 +605,18 @@ async function zeroShotClassification(args, options) {
    return res;
  }
 
- // ../shared/src/base64FromBytes.ts
- function base64FromBytes(arr) {
-   if (globalThis.Buffer) {
-     return globalThis.Buffer.from(arr).toString("base64");
-   } else {
-     const bin = [];
-     arr.forEach((byte) => {
-       bin.push(String.fromCharCode(byte));
-     });
-     return globalThis.btoa(bin.join(""));
-   }
- }
-
- // ../shared/src/isBackend.ts
- var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
- var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
-
  // src/tasks/multimodal/documentQuestionAnswering.ts
  async function documentQuestionAnswering(args, options) {
    const reqArgs = {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = toArray(
@@ -604,8 +635,12 @@ async function visualQuestionAnswering(args, options) {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = (await request(reqArgs, options))?.[0];
@@ -677,6 +712,7 @@ var HfInferenceEndpoint = class {
    fillMask,
    imageClassification,
    imageSegmentation,
+   imageToImage,
    imageToText,
    objectDetection,
    questionAnswering,
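
Worth noting: the bundle now inlines `base64FromBytes`, which prefers Node's `Buffer` and falls back to `btoa` in browsers and web workers. A minimal sketch of the equivalence it relies on (the byte values below are illustrative, not from the package):

```ts
// Illustrative check of the two encoding paths used by base64FromBytes.
const bytes = new Uint8Array([72, 101, 108, 108, 111]); // "Hello"

// Node path (globalThis.Buffer is defined):
const viaBuffer = Buffer.from(bytes).toString("base64");

// Browser / web worker path:
const viaBtoa = btoa(Array.from(bytes, (b) => String.fromCharCode(b)).join(""));

console.log(viaBuffer, viaBuffer === viaBtoa); // "SGVsbG8=" true
```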
package/dist/index.mjs CHANGED
@@ -15,6 +15,7 @@ __export(tasks_exports, {
    fillMask: () => fillMask,
    imageClassification: () => imageClassification,
    imageSegmentation: () => imageSegmentation,
+   imageToImage: () => imageToImage,
    imageToText: () => imageToText,
    objectDetection: () => objectDetection,
    questionAnswering: () => questionAnswering,
@@ -350,6 +351,48 @@ async function textToImage(args, options) {
    return res;
  }
 
+ // ../shared/src/base64FromBytes.ts
+ function base64FromBytes(arr) {
+   if (globalThis.Buffer) {
+     return globalThis.Buffer.from(arr).toString("base64");
+   } else {
+     const bin = [];
+     arr.forEach((byte) => {
+       bin.push(String.fromCharCode(byte));
+     });
+     return globalThis.btoa(bin.join(""));
+   }
+ }
+
+ // ../shared/src/isBackend.ts
+ var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
+ var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
+
+ // src/tasks/cv/imageToImage.ts
+ async function imageToImage(args, options) {
+   let reqArgs;
+   if (!args.parameters) {
+     reqArgs = {
+       accessToken: args.accessToken,
+       model: args.model,
+       data: args.inputs
+     };
+   } else {
+     reqArgs = {
+       ...args,
+       inputs: base64FromBytes(
+         new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+       )
+     };
+   }
+   const res = await request(reqArgs, options);
+   const isValidOutput = res && res instanceof Blob;
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Blob");
+   }
+   return res;
+ }
+
  // src/tasks/nlp/conversational.ts
  async function conversational(args, options) {
    const res = await request(args, options);
@@ -514,31 +557,18 @@ async function zeroShotClassification(args, options) {
    return res;
  }
 
- // ../shared/src/base64FromBytes.ts
- function base64FromBytes(arr) {
-   if (globalThis.Buffer) {
-     return globalThis.Buffer.from(arr).toString("base64");
-   } else {
-     const bin = [];
-     arr.forEach((byte) => {
-       bin.push(String.fromCharCode(byte));
-     });
-     return globalThis.btoa(bin.join(""));
-   }
- }
-
- // ../shared/src/isBackend.ts
- var isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
- var isWebWorker = typeof self === "object" && self.constructor && self.constructor.name === "DedicatedWorkerGlobalScope";
-
  // src/tasks/multimodal/documentQuestionAnswering.ts
  async function documentQuestionAnswering(args, options) {
    const reqArgs = {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = toArray(
@@ -557,8 +587,12 @@ async function visualQuestionAnswering(args, options) {
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer()))
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       )
      }
    };
    const res = (await request(reqArgs, options))?.[0];
@@ -629,6 +663,7 @@ export {
    fillMask,
    imageClassification,
    imageSegmentation,
+   imageToImage,
    imageToText,
    objectDetection,
    questionAnswering,
package/package.json CHANGED
@@ -1,6 +1,7 @@
  {
    "name": "@huggingface/inference",
-   "version": "2.2.2",
+   "version": "2.3.0",
+   "packageManager": "pnpm@8.3.1",
    "license": "MIT",
    "author": "Tim Mikeladze <tim.mikeladze@gmail.com>",
    "description": "Typescript wrapper for the Hugging Face Inference API",
package/src/tasks/cv/imageToImage.ts ADDED
@@ -0,0 +1,83 @@
+ import { InferenceOutputError } from "../../lib/InferenceOutputError";
+ import type { BaseArgs, Options, RequestArgs } from "../../types";
+ import { request } from "../custom/request";
+ import { base64FromBytes } from "@huggingface/shared";
+
+ export type ImageToImageArgs = BaseArgs & {
+   /**
+    * The initial image condition
+    *
+    **/
+   inputs: Blob | ArrayBuffer;
+
+   parameters?: {
+     /**
+      * The text prompt to guide the image generation.
+      */
+     prompt?: string;
+     /**
+      * The strength param only works for SD img2img and alt diffusion img2img models.
+      * Conceptually, it indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+      * will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+      * denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+      * be maximum and the denoising process will run for the full number of iterations specified in
+      * `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+      **/
+     strength?: number;
+     /**
+      * An optional negative prompt for the image generation
+      */
+     negative_prompt?: string;
+     /**
+      * The height in pixels of the generated image
+      */
+     height?: number;
+     /**
+      * The width in pixels of the generated image
+      */
+     width?: number;
+     /**
+      * The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.
+      */
+     num_inference_steps?: number;
+     /**
+      * Guidance scale: a higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality.
+      */
+     guidance_scale?: number;
+     /**
+      * guess_mode only works for ControlNet models and defaults to false. In this mode, the ControlNet encoder will try its best to recognize the content of the input image even if
+      * you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
+      */
+     guess_mode?: boolean;
+   };
+ };
+
+ export type ImageToImageOutput = Blob;
+
+ /**
+  * This task takes an input image and outputs a new image conditioned on it.
+  * Recommended model: lllyasviel/sd-controlnet-depth
+  */
+ export async function imageToImage(args: ImageToImageArgs, options?: Options): Promise<ImageToImageOutput> {
+   let reqArgs: RequestArgs;
+   if (!args.parameters) {
+     reqArgs = {
+       accessToken: args.accessToken,
+       model: args.model,
+       data: args.inputs,
+     };
+   } else {
+     reqArgs = {
+       ...args,
+       inputs: base64FromBytes(
+         new Uint8Array(args.inputs instanceof ArrayBuffer ? args.inputs : await args.inputs.arrayBuffer())
+       ),
+     };
+   }
+   const res = await request<ImageToImageOutput>(reqArgs, options);
+   const isValidOutput = res && res instanceof Blob;
+   if (!isValidOutput) {
+     throw new InferenceOutputError("Expected Blob");
+   }
+   return res;
+ }
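
For context, a minimal usage sketch of the new task as exposed on the client. Node.js 18+, an access token in `HF_ACCESS_TOKEN`, and the depth-map file path are assumptions for illustration; the argument shapes come from the type above:

```ts
import { readFileSync } from "node:fs";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_ACCESS_TOKEN);
const depthMap = new Blob([readFileSync("test/stormtrooper_depth.png")]); // assumed local test image

// Without `parameters`, the raw image bytes are sent as the request body.
const plain = await hf.imageToImage({
  inputs: depthMap,
  model: "lllyasviel/sd-controlnet-depth",
});

// With `parameters`, the image is base64-encoded and sent as JSON alongside the prompt.
const guided = await hf.imageToImage({
  inputs: depthMap,
  parameters: { prompt: "elmo's lecture" },
  model: "lllyasviel/sd-controlnet-depth",
});
// Both calls resolve to a Blob containing the generated image.
console.log(plain.size, guided.size);
```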
package/src/tasks/index.ts CHANGED
@@ -13,6 +13,7 @@ export * from "./cv/imageSegmentation";
  export * from "./cv/imageToText";
  export * from "./cv/objectDetection";
  export * from "./cv/textToImage";
+ export * from "./cv/imageToImage";
 
  // Natural Language Processing tasks
  export * from "./nlp/conversational";
package/src/tasks/multimodal/documentQuestionAnswering.ts CHANGED
@@ -12,7 +12,7 @@ export type DocumentQuestionAnsweringArgs = BaseArgs & {
      *
      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
      **/
-     image: Blob;
+     image: Blob | ArrayBuffer;
      question: string;
    };
  };
@@ -47,8 +47,12 @@ export async function documentQuestionAnswering(
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       ),
      },
    } as RequestArgs;
    const res = toArray(
package/src/tasks/multimodal/visualQuestionAnswering.ts CHANGED
@@ -10,7 +10,7 @@ export type VisualQuestionAnsweringArgs = BaseArgs & {
      *
      * You can use native `File` in browsers, or `new Blob([buffer])` in node, or for a base64 image `new Blob([btoa(base64String)])`, or even `await (await fetch('...)).blob()`
      **/
-     image: Blob;
+     image: Blob | ArrayBuffer;
      question: string;
    };
  };
@@ -37,8 +37,12 @@ export async function visualQuestionAnswering(
      ...args,
      inputs: {
        question: args.inputs.question,
-       // convert Blob to base64
-       image: base64FromBytes(new Uint8Array(await args.inputs.image.arrayBuffer())),
+       // convert Blob or ArrayBuffer to base64
+       image: base64FromBytes(
+         new Uint8Array(
+           args.inputs.image instanceof ArrayBuffer ? args.inputs.image : await args.inputs.image.arrayBuffer()
+         )
+       ),
      },
    } as RequestArgs;
    const res = (await request<[VisualQuestionAnsweringOutput]>(reqArgs, options))?.[0];
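
Finally, a hedged sketch of what the widened `image: Blob | ArrayBuffer` type allows on the caller side. The model names, file paths, and questions are illustrative assumptions; only the argument shapes come from the diff:

```ts
import { readFileSync } from "node:fs";
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_ACCESS_TOKEN);

// An ArrayBuffer is now accepted directly; it is base64-encoded internally before the request is sent.
const invoice: ArrayBuffer = await new Blob([readFileSync("test/invoice.png")]).arrayBuffer();
const docAnswer = await hf.documentQuestionAnswering({
  model: "impira/layoutlm-document-qa",
  inputs: { question: "Invoice number?", image: invoice },
});

// A Blob still works exactly as before.
const vqaAnswer = await hf.visualQuestionAnswering({
  model: "dandelin/vilt-b32-finetuned-vqa",
  inputs: { question: "How many cats are lying down?", image: new Blob([readFileSync("test/cats.png")]) },
});

console.log(docAnswer.answer, vqaAnswer.answer);
```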