@huggingface/tasks 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -276,6 +276,10 @@ declare const PIPELINE_DATA: {
     };
     "image-to-image": {
         name: string;
+        subtasks: {
+            type: string;
+            name: string;
+        }[];
         modality: "cv";
         color: "indigo";
     };
@@ -416,6 +420,16 @@ declare const PIPELINE_DATA: {
         modality: "cv";
         color: "yellow";
     };
+    "text-to-3d": {
+        name: string;
+        modality: "multimodal";
+        color: "yellow";
+    };
+    "image-to-3d": {
+        name: string;
+        modality: "multimodal";
+        color: "green";
+    };
     other: {
         name: string;
         modality: "other";
@@ -425,9 +439,9 @@ declare const PIPELINE_DATA: {
     };
 };
 type PipelineType = keyof typeof PIPELINE_DATA;
-declare const PIPELINE_TYPES: ("other" | "text-classification" | "token-classification" | "table-question-answering" | "question-answering" | "zero-shot-classification" | "translation" | "summarization" | "conversational" | "feature-extraction" | "text-generation" | "text2text-generation" | "fill-mask" | "sentence-similarity" | "text-to-speech" | "text-to-audio" | "automatic-speech-recognition" | "audio-to-audio" | "audio-classification" | "voice-activity-detection" | "depth-estimation" | "image-classification" | "object-detection" | "image-segmentation" | "text-to-image" | "image-to-text" | "image-to-image" | "image-to-video" | "unconditional-image-generation" | "video-classification" | "reinforcement-learning" | "robotics" | "tabular-classification" | "tabular-regression" | "tabular-to-text" | "table-to-text" | "multiple-choice" | "text-retrieval" | "time-series-forecasting" | "text-to-video" | "visual-question-answering" | "document-question-answering" | "zero-shot-image-classification" | "graph-ml" | "mask-generation" | "zero-shot-object-detection")[];
+declare const PIPELINE_TYPES: ("other" | "text-classification" | "token-classification" | "table-question-answering" | "question-answering" | "zero-shot-classification" | "translation" | "summarization" | "conversational" | "feature-extraction" | "text-generation" | "text2text-generation" | "fill-mask" | "sentence-similarity" | "text-to-speech" | "text-to-audio" | "automatic-speech-recognition" | "audio-to-audio" | "audio-classification" | "voice-activity-detection" | "depth-estimation" | "image-classification" | "object-detection" | "image-segmentation" | "text-to-image" | "image-to-text" | "image-to-image" | "image-to-video" | "unconditional-image-generation" | "video-classification" | "reinforcement-learning" | "robotics" | "tabular-classification" | "tabular-regression" | "tabular-to-text" | "table-to-text" | "multiple-choice" | "text-retrieval" | "time-series-forecasting" | "text-to-video" | "visual-question-answering" | "document-question-answering" | "zero-shot-image-classification" | "graph-ml" | "mask-generation" | "zero-shot-object-detection" | "text-to-3d" | "image-to-3d")[];
 declare const SUBTASK_TYPES: string[];
-declare const PIPELINE_TYPES_SET: Set<"other" | "text-classification" | "token-classification" | "table-question-answering" | "question-answering" | "zero-shot-classification" | "translation" | "summarization" | "conversational" | "feature-extraction" | "text-generation" | "text2text-generation" | "fill-mask" | "sentence-similarity" | "text-to-speech" | "text-to-audio" | "automatic-speech-recognition" | "audio-to-audio" | "audio-classification" | "voice-activity-detection" | "depth-estimation" | "image-classification" | "object-detection" | "image-segmentation" | "text-to-image" | "image-to-text" | "image-to-image" | "image-to-video" | "unconditional-image-generation" | "video-classification" | "reinforcement-learning" | "robotics" | "tabular-classification" | "tabular-regression" | "tabular-to-text" | "table-to-text" | "multiple-choice" | "text-retrieval" | "time-series-forecasting" | "text-to-video" | "visual-question-answering" | "document-question-answering" | "zero-shot-image-classification" | "graph-ml" | "mask-generation" | "zero-shot-object-detection">;
+declare const PIPELINE_TYPES_SET: Set<"other" | "text-classification" | "token-classification" | "table-question-answering" | "question-answering" | "zero-shot-classification" | "translation" | "summarization" | "conversational" | "feature-extraction" | "text-generation" | "text2text-generation" | "fill-mask" | "sentence-similarity" | "text-to-speech" | "text-to-audio" | "automatic-speech-recognition" | "audio-to-audio" | "audio-classification" | "voice-activity-detection" | "depth-estimation" | "image-classification" | "object-detection" | "image-segmentation" | "text-to-image" | "image-to-text" | "image-to-image" | "image-to-video" | "unconditional-image-generation" | "video-classification" | "reinforcement-learning" | "robotics" | "tabular-classification" | "tabular-regression" | "tabular-to-text" | "table-to-text" | "multiple-choice" | "text-retrieval" | "time-series-forecasting" | "text-to-video" | "visual-question-answering" | "document-question-answering" | "zero-shot-image-classification" | "graph-ml" | "mask-generation" | "zero-shot-object-detection" | "text-to-3d" | "image-to-3d">;
 
 /**
  * Mapping from library name (excluding Transformers) to its supported tasks.
@@ -758,10 +772,13 @@ declare namespace curl {
 }
 
 declare const snippetZeroShotClassification$1: (model: ModelData) => string;
+declare const snippetZeroShotImageClassification: (model: ModelData) => string;
 declare const snippetBasic$1: (model: ModelData) => string;
 declare const snippetFile$1: (model: ModelData) => string;
 declare const snippetTextToImage$1: (model: ModelData) => string;
+declare const snippetTabular: (model: ModelData) => string;
 declare const snippetTextToAudio$1: (model: ModelData) => string;
+declare const snippetDocumentQuestionAnswering: (model: ModelData) => string;
 declare const pythonSnippets: Partial<Record<PipelineType, (model: ModelData) => string>>;
 declare function getPythonInferenceSnippet(model: ModelData, accessToken: string): string;
 declare function hasPythonInferenceSnippet(model: ModelData): boolean;
@@ -769,16 +786,22 @@ declare function hasPythonInferenceSnippet(model: ModelData): boolean;
 declare const python_getPythonInferenceSnippet: typeof getPythonInferenceSnippet;
 declare const python_hasPythonInferenceSnippet: typeof hasPythonInferenceSnippet;
 declare const python_pythonSnippets: typeof pythonSnippets;
+declare const python_snippetDocumentQuestionAnswering: typeof snippetDocumentQuestionAnswering;
+declare const python_snippetTabular: typeof snippetTabular;
+declare const python_snippetZeroShotImageClassification: typeof snippetZeroShotImageClassification;
 declare namespace python {
     export {
         python_getPythonInferenceSnippet as getPythonInferenceSnippet,
         python_hasPythonInferenceSnippet as hasPythonInferenceSnippet,
         python_pythonSnippets as pythonSnippets,
         snippetBasic$1 as snippetBasic,
+        python_snippetDocumentQuestionAnswering as snippetDocumentQuestionAnswering,
         snippetFile$1 as snippetFile,
+        python_snippetTabular as snippetTabular,
         snippetTextToAudio$1 as snippetTextToAudio,
         snippetTextToImage$1 as snippetTextToImage,
         snippetZeroShotClassification$1 as snippetZeroShotClassification,
+        python_snippetZeroShotImageClassification as snippetZeroShotImageClassification,
     };
 }
 
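The two new task identifiers are now part of the exported type surface. As a rough sketch (assuming `PIPELINE_TYPES_SET` and the `PipelineType` union are re-exported from the package entry point exactly as declared above), a consumer could validate arbitrary tag strings against the updated union like this:

```ts
import { PIPELINE_TYPES_SET, type PipelineType } from "@huggingface/tasks";

// Narrow an arbitrary string (e.g. a model's pipeline_tag) to the PipelineType union,
// which now includes "text-to-3d" and "image-to-3d".
function asPipelineType(value: string): PipelineType | undefined {
	return PIPELINE_TYPES_SET.has(value as PipelineType) ? (value as PipelineType) : undefined;
}

console.log(asPipelineType("text-to-3d")); // "text-to-3d"
console.log(asPipelineType("not-a-task")); // undefined
```
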
package/dist/index.js CHANGED
@@ -1801,6 +1801,20 @@ var PIPELINE_DATA = {
   },
   "image-to-image": {
     name: "Image-to-Image",
+    subtasks: [
+      {
+        type: "image-inpainting",
+        name: "Image Inpainting"
+      },
+      {
+        type: "image-colorization",
+        name: "Image Colorization"
+      },
+      {
+        type: "super-resolution",
+        name: "Super Resolution"
+      }
+    ],
     modality: "cv",
     color: "indigo"
   },
@@ -1987,6 +2001,16 @@ var PIPELINE_DATA = {
     modality: "cv",
     color: "yellow"
   },
+  "text-to-3d": {
+    name: "Text-to-3D",
+    modality: "multimodal",
+    color: "yellow"
+  },
+  "image-to-3d": {
+    name: "Image-to-3D",
+    modality: "multimodal",
+    color: "green"
+  },
   other: {
     name: "Other",
     modality: "other",
@@ -4406,18 +4430,18 @@ var data_default33 = taskData33;
 
 // src/tasks/index.ts
 var TASKS_MODEL_LIBRARIES = {
-  "audio-classification": ["speechbrain", "transformers"],
+  "audio-classification": ["speechbrain", "transformers", "transformers.js"],
   "audio-to-audio": ["asteroid", "speechbrain"],
   "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
   conversational: ["transformers"],
-  "depth-estimation": ["transformers"],
-  "document-question-answering": ["transformers"],
+  "depth-estimation": ["transformers", "transformers.js"],
+  "document-question-answering": ["transformers", "transformers.js"],
   "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
   "fill-mask": ["transformers", "transformers.js"],
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers"],
+  "image-to-image": ["diffusers", "transformers.js"],
   "image-to-text": ["transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
@@ -4439,8 +4463,8 @@ var TASKS_MODEL_LIBRARIES = {
   "text-generation": ["transformers", "transformers.js"],
   "text-retrieval": [],
   "text-to-image": ["diffusers"],
-  "text-to-speech": ["espnet", "tensorflowtts", "transformers"],
-  "text-to-audio": ["transformers"],
+  "text-to-speech": ["espnet", "tensorflowtts", "transformers", "transformers.js"],
+  "text-to-audio": ["transformers", "transformers.js"],
   "text-to-video": ["diffusers"],
   "text2text-generation": ["transformers", "transformers.js"],
   "time-series-forecasting": [],
@@ -4455,11 +4479,13 @@ var TASKS_MODEL_LIBRARIES = {
   ],
   translation: ["transformers", "transformers.js"],
   "unconditional-image-generation": ["diffusers"],
-  "visual-question-answering": ["transformers"],
+  "visual-question-answering": ["transformers", "transformers.js"],
   "voice-activity-detection": [],
   "zero-shot-classification": ["transformers", "transformers.js"],
   "zero-shot-image-classification": ["transformers", "transformers.js"],
-  "zero-shot-object-detection": ["transformers"]
+  "zero-shot-object-detection": ["transformers", "transformers.js"],
+  "text-to-3d": [],
+  "image-to-3d": []
 };
 function getData(type, partialTaskData = data_default14) {
   return {
@@ -4515,7 +4541,9 @@ var TASKS_DATA = {
   "voice-activity-detection": void 0,
   "zero-shot-classification": getData("zero-shot-classification", data_default32),
   "zero-shot-image-classification": getData("zero-shot-image-classification", data_default33),
-  "zero-shot-object-detection": getData("zero-shot-object-detection", data_default14)
+  "zero-shot-object-detection": getData("zero-shot-object-detection", data_default14),
+  "text-to-3d": getData("text-to-3d", data_default14),
+  "image-to-3d": getData("image-to-3d", data_default14)
 };
 
 // src/model-libraries.ts
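A small usage sketch of the expanded mappings (not from the package docs; it assumes `TASKS_MODEL_LIBRARIES` and `TASKS_DATA` are reachable from the package entry point):

```ts
import { TASKS_DATA, TASKS_MODEL_LIBRARIES } from "@huggingface/tasks";

// transformers.js is now listed as a compatible library for several more tasks.
console.log(TASKS_MODEL_LIBRARIES["depth-estimation"]); // ["transformers", "transformers.js"]

// The new 3D tasks are registered with placeholder task data and no libraries yet.
console.log(TASKS_MODEL_LIBRARIES["text-to-3d"]); // []
console.log(TASKS_DATA["image-to-3d"] !== undefined); // true
```
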
@@ -4622,6 +4650,10 @@ var inputsTableQuestionAnswering = () => `{
     ]
   }
 }`;
+var inputsVisualQuestionAnswering = () => `{
+    "image": "cat.png",
+    "question": "What is in this image?"
+}`;
 var inputsQuestionAnswering = () => `{
     "question": "What is my name?",
     "context": "My name is Clara and I live in Berkeley."
@@ -4650,11 +4682,14 @@ var inputsTextToImage = () => `"Astronaut riding a horse"`;
 var inputsTextToSpeech = () => `"The answer to the universe is 42"`;
 var inputsTextToAudio = () => `"liquid drum and bass, atmospheric synths, airy sounds"`;
 var inputsAutomaticSpeechRecognition = () => `"sample1.flac"`;
+var inputsTabularPrediction = () => `'{"Height":[11.52,12.48],"Length1":[23.2,24.0],"Length2":[25.4,26.3],"Species": ["Bream","Bream"]}'`;
+var inputsZeroShotImageClassification = () => `"cats.jpg"`;
 var modelInputSnippets = {
   "audio-to-audio": inputsAudioToAudio,
   "audio-classification": inputsAudioClassification,
   "automatic-speech-recognition": inputsAutomaticSpeechRecognition,
   conversational: inputsConversational,
+  "document-question-answering": inputsVisualQuestionAnswering,
   "feature-extraction": inputsFeatureExtraction,
   "fill-mask": inputsFillMask,
   "image-classification": inputsImageClassification,
@@ -4665,6 +4700,8 @@ var modelInputSnippets = {
   "sentence-similarity": inputsSentenceSimilarity,
   summarization: inputsSummarization,
   "table-question-answering": inputsTableQuestionAnswering,
+  "tabular-regression": inputsTabularPrediction,
+  "tabular-classification": inputsTabularPrediction,
   "text-classification": inputsTextClassification,
   "text-generation": inputsTextGeneration,
   "text-to-image": inputsTextToImage,
@@ -4673,7 +4710,8 @@ var modelInputSnippets = {
   "text2text-generation": inputsText2TextGeneration,
   "token-classification": inputsTokenClassification,
   translation: inputsTranslation,
-  "zero-shot-classification": inputsZeroShotClassification
+  "zero-shot-classification": inputsZeroShotClassification,
+  "zero-shot-image-classification": inputsZeroShotImageClassification
 };
 function getModelInputSnippet(model, noWrap = false, noQuotes = false) {
   if (model.pipeline_tag) {
@@ -4761,10 +4799,13 @@ __export(python_exports, {
   hasPythonInferenceSnippet: () => hasPythonInferenceSnippet,
   pythonSnippets: () => pythonSnippets,
   snippetBasic: () => snippetBasic2,
+  snippetDocumentQuestionAnswering: () => snippetDocumentQuestionAnswering,
   snippetFile: () => snippetFile2,
+  snippetTabular: () => snippetTabular,
   snippetTextToAudio: () => snippetTextToAudio,
   snippetTextToImage: () => snippetTextToImage,
-  snippetZeroShotClassification: () => snippetZeroShotClassification2
+  snippetZeroShotClassification: () => snippetZeroShotClassification2,
+  snippetZeroShotImageClassification: () => snippetZeroShotImageClassification
 });
 var snippetZeroShotClassification2 = (model) => `def query(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
@@ -4774,6 +4815,20 @@ output = query({
     "inputs": ${getModelInputSnippet(model)},
     "parameters": {"candidate_labels": ["refund", "legal", "faq"]},
 })`;
+var snippetZeroShotImageClassification = (model) => `def query(data):
+    with open(data["image_path"], "rb") as f:
+        img = f.read()
+    payload={
+        "parameters": data["parameters"],
+        "inputs": base64.b64encode(img).decode("utf-8")
+    }
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "image_path": ${getModelInputSnippet(model)},
+    "parameters": {"candidate_labels": ["cat", "dog", "llama"]},
+})`;
 var snippetBasic2 = (model) => `def query(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
     return response.json()
@@ -4798,6 +4853,12 @@ image_bytes = query({
 import io
 from PIL import Image
 image = Image.open(io.BytesIO(image_bytes))`;
+var snippetTabular = (model) => `def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.content
+response = query({
+    "inputs": {"data": ${getModelInputSnippet(model)}},
+})`;
 var snippetTextToAudio = (model) => {
   if (model.library_name === "transformers") {
     return `def query(payload):
@@ -4823,8 +4884,18 @@ from IPython.display import Audio
 Audio(audio, rate=sampling_rate)`;
   }
 };
+var snippetDocumentQuestionAnswering = (model) => `def query(payload):
+    with open(payload["image"], "rb") as f:
+        img = f.read()
+    payload["image"] = base64.b64encode(img).decode("utf-8")
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "inputs": ${getModelInputSnippet(model)},
+})`;
 var pythonSnippets = {
-  // Same order as in js/src/lib/interfaces/Types.ts
+  // Same order as in tasks/src/pipelines.ts
   "text-classification": snippetBasic2,
   "token-classification": snippetBasic2,
   "table-question-answering": snippetBasic2,
@@ -4845,9 +4916,13 @@ var pythonSnippets = {
   "audio-to-audio": snippetFile2,
   "audio-classification": snippetFile2,
   "image-classification": snippetFile2,
-  "image-to-text": snippetFile2,
+  "tabular-regression": snippetTabular,
+  "tabular-classification": snippetTabular,
   "object-detection": snippetFile2,
-  "image-segmentation": snippetFile2
+  "image-segmentation": snippetFile2,
+  "document-question-answering": snippetDocumentQuestionAnswering,
+  "image-to-text": snippetFile2,
+  "zero-shot-image-classification": snippetZeroShotImageClassification
 };
 function getPythonInferenceSnippet(model, accessToken) {
   const body = model.pipeline_tag && model.pipeline_tag in pythonSnippets ? pythonSnippets[model.pipeline_tag]?.(model) ?? "" : "";
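To exercise the new Python snippet builders end to end, something along these lines should work. Treat it as a sketch: the import path of the `python` namespace and the minimal `ModelData` shape shown here are assumptions, and the model id is only illustrative.

```ts
import { python, type ModelData } from "@huggingface/tasks";

// Illustrative model descriptor; only the fields the snippet builders read are filled in.
const model: ModelData = {
	id: "impira/layoutlm-document-qa",
	pipeline_tag: "document-question-answering",
};

// document-question-answering is now wired into pythonSnippets, so a snippet is available...
console.log(python.hasPythonInferenceSnippet(model)); // true
// ...and getPythonInferenceSnippet returns the ready-to-copy `requests` code,
// built by the new snippetDocumentQuestionAnswering generator.
console.log(python.getPythonInferenceSnippet(model, "hf_xxx"));
```
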
package/dist/index.mjs CHANGED
@@ -1763,6 +1763,20 @@ var PIPELINE_DATA = {
   },
   "image-to-image": {
     name: "Image-to-Image",
+    subtasks: [
+      {
+        type: "image-inpainting",
+        name: "Image Inpainting"
+      },
+      {
+        type: "image-colorization",
+        name: "Image Colorization"
+      },
+      {
+        type: "super-resolution",
+        name: "Super Resolution"
+      }
+    ],
     modality: "cv",
     color: "indigo"
   },
@@ -1949,6 +1963,16 @@ var PIPELINE_DATA = {
     modality: "cv",
     color: "yellow"
   },
+  "text-to-3d": {
+    name: "Text-to-3D",
+    modality: "multimodal",
+    color: "yellow"
+  },
+  "image-to-3d": {
+    name: "Image-to-3D",
+    modality: "multimodal",
+    color: "green"
+  },
   other: {
     name: "Other",
     modality: "other",
@@ -4368,18 +4392,18 @@ var data_default33 = taskData33;
 
 // src/tasks/index.ts
 var TASKS_MODEL_LIBRARIES = {
-  "audio-classification": ["speechbrain", "transformers"],
+  "audio-classification": ["speechbrain", "transformers", "transformers.js"],
   "audio-to-audio": ["asteroid", "speechbrain"],
   "automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
   conversational: ["transformers"],
-  "depth-estimation": ["transformers"],
-  "document-question-answering": ["transformers"],
+  "depth-estimation": ["transformers", "transformers.js"],
+  "document-question-answering": ["transformers", "transformers.js"],
   "feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
   "fill-mask": ["transformers", "transformers.js"],
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers"],
+  "image-to-image": ["diffusers", "transformers.js"],
   "image-to-text": ["transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
@@ -4401,8 +4425,8 @@ var TASKS_MODEL_LIBRARIES = {
   "text-generation": ["transformers", "transformers.js"],
   "text-retrieval": [],
   "text-to-image": ["diffusers"],
-  "text-to-speech": ["espnet", "tensorflowtts", "transformers"],
-  "text-to-audio": ["transformers"],
+  "text-to-speech": ["espnet", "tensorflowtts", "transformers", "transformers.js"],
+  "text-to-audio": ["transformers", "transformers.js"],
   "text-to-video": ["diffusers"],
   "text2text-generation": ["transformers", "transformers.js"],
   "time-series-forecasting": [],
@@ -4417,11 +4441,13 @@ var TASKS_MODEL_LIBRARIES = {
   ],
   translation: ["transformers", "transformers.js"],
   "unconditional-image-generation": ["diffusers"],
-  "visual-question-answering": ["transformers"],
+  "visual-question-answering": ["transformers", "transformers.js"],
   "voice-activity-detection": [],
   "zero-shot-classification": ["transformers", "transformers.js"],
   "zero-shot-image-classification": ["transformers", "transformers.js"],
-  "zero-shot-object-detection": ["transformers"]
+  "zero-shot-object-detection": ["transformers", "transformers.js"],
+  "text-to-3d": [],
+  "image-to-3d": []
 };
 function getData(type, partialTaskData = data_default14) {
   return {
@@ -4477,7 +4503,9 @@ var TASKS_DATA = {
   "voice-activity-detection": void 0,
   "zero-shot-classification": getData("zero-shot-classification", data_default32),
   "zero-shot-image-classification": getData("zero-shot-image-classification", data_default33),
-  "zero-shot-object-detection": getData("zero-shot-object-detection", data_default14)
+  "zero-shot-object-detection": getData("zero-shot-object-detection", data_default14),
+  "text-to-3d": getData("text-to-3d", data_default14),
+  "image-to-3d": getData("image-to-3d", data_default14)
 };
 
 // src/model-libraries.ts
@@ -4584,6 +4612,10 @@ var inputsTableQuestionAnswering = () => `{
     ]
   }
 }`;
+var inputsVisualQuestionAnswering = () => `{
+    "image": "cat.png",
+    "question": "What is in this image?"
+}`;
 var inputsQuestionAnswering = () => `{
     "question": "What is my name?",
     "context": "My name is Clara and I live in Berkeley."
@@ -4612,11 +4644,14 @@ var inputsTextToImage = () => `"Astronaut riding a horse"`;
 var inputsTextToSpeech = () => `"The answer to the universe is 42"`;
 var inputsTextToAudio = () => `"liquid drum and bass, atmospheric synths, airy sounds"`;
 var inputsAutomaticSpeechRecognition = () => `"sample1.flac"`;
+var inputsTabularPrediction = () => `'{"Height":[11.52,12.48],"Length1":[23.2,24.0],"Length2":[25.4,26.3],"Species": ["Bream","Bream"]}'`;
+var inputsZeroShotImageClassification = () => `"cats.jpg"`;
 var modelInputSnippets = {
   "audio-to-audio": inputsAudioToAudio,
   "audio-classification": inputsAudioClassification,
   "automatic-speech-recognition": inputsAutomaticSpeechRecognition,
   conversational: inputsConversational,
+  "document-question-answering": inputsVisualQuestionAnswering,
   "feature-extraction": inputsFeatureExtraction,
   "fill-mask": inputsFillMask,
   "image-classification": inputsImageClassification,
@@ -4627,6 +4662,8 @@ var modelInputSnippets = {
   "sentence-similarity": inputsSentenceSimilarity,
   summarization: inputsSummarization,
   "table-question-answering": inputsTableQuestionAnswering,
+  "tabular-regression": inputsTabularPrediction,
+  "tabular-classification": inputsTabularPrediction,
   "text-classification": inputsTextClassification,
   "text-generation": inputsTextGeneration,
   "text-to-image": inputsTextToImage,
@@ -4635,7 +4672,8 @@ var modelInputSnippets = {
   "text2text-generation": inputsText2TextGeneration,
   "token-classification": inputsTokenClassification,
   translation: inputsTranslation,
-  "zero-shot-classification": inputsZeroShotClassification
+  "zero-shot-classification": inputsZeroShotClassification,
+  "zero-shot-image-classification": inputsZeroShotImageClassification
 };
 function getModelInputSnippet(model, noWrap = false, noQuotes = false) {
   if (model.pipeline_tag) {
@@ -4723,10 +4761,13 @@ __export(python_exports, {
   hasPythonInferenceSnippet: () => hasPythonInferenceSnippet,
   pythonSnippets: () => pythonSnippets,
   snippetBasic: () => snippetBasic2,
+  snippetDocumentQuestionAnswering: () => snippetDocumentQuestionAnswering,
   snippetFile: () => snippetFile2,
+  snippetTabular: () => snippetTabular,
   snippetTextToAudio: () => snippetTextToAudio,
   snippetTextToImage: () => snippetTextToImage,
-  snippetZeroShotClassification: () => snippetZeroShotClassification2
+  snippetZeroShotClassification: () => snippetZeroShotClassification2,
+  snippetZeroShotImageClassification: () => snippetZeroShotImageClassification
 });
 var snippetZeroShotClassification2 = (model) => `def query(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
@@ -4736,6 +4777,20 @@ output = query({
     "inputs": ${getModelInputSnippet(model)},
     "parameters": {"candidate_labels": ["refund", "legal", "faq"]},
 })`;
+var snippetZeroShotImageClassification = (model) => `def query(data):
+    with open(data["image_path"], "rb") as f:
+        img = f.read()
+    payload={
+        "parameters": data["parameters"],
+        "inputs": base64.b64encode(img).decode("utf-8")
+    }
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "image_path": ${getModelInputSnippet(model)},
+    "parameters": {"candidate_labels": ["cat", "dog", "llama"]},
+})`;
 var snippetBasic2 = (model) => `def query(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
     return response.json()
@@ -4760,6 +4815,12 @@ image_bytes = query({
 import io
 from PIL import Image
 image = Image.open(io.BytesIO(image_bytes))`;
+var snippetTabular = (model) => `def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.content
+response = query({
+    "inputs": {"data": ${getModelInputSnippet(model)}},
+})`;
 var snippetTextToAudio = (model) => {
   if (model.library_name === "transformers") {
     return `def query(payload):
@@ -4785,8 +4846,18 @@ from IPython.display import Audio
 Audio(audio, rate=sampling_rate)`;
   }
 };
+var snippetDocumentQuestionAnswering = (model) => `def query(payload):
+    with open(payload["image"], "rb") as f:
+        img = f.read()
+    payload["image"] = base64.b64encode(img).decode("utf-8")
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "inputs": ${getModelInputSnippet(model)},
+})`;
 var pythonSnippets = {
-  // Same order as in js/src/lib/interfaces/Types.ts
+  // Same order as in tasks/src/pipelines.ts
   "text-classification": snippetBasic2,
   "token-classification": snippetBasic2,
   "table-question-answering": snippetBasic2,
@@ -4807,9 +4878,13 @@ var pythonSnippets = {
   "audio-to-audio": snippetFile2,
   "audio-classification": snippetFile2,
   "image-classification": snippetFile2,
-  "image-to-text": snippetFile2,
+  "tabular-regression": snippetTabular,
+  "tabular-classification": snippetTabular,
   "object-detection": snippetFile2,
-  "image-segmentation": snippetFile2
+  "image-segmentation": snippetFile2,
+  "document-question-answering": snippetDocumentQuestionAnswering,
+  "image-to-text": snippetFile2,
+  "zero-shot-image-classification": snippetZeroShotImageClassification
 };
 function getPythonInferenceSnippet(model, accessToken) {
   const body = model.pipeline_tag && model.pipeline_tag in pythonSnippets ? pythonSnippets[model.pipeline_tag]?.(model) ?? "" : "";
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@huggingface/tasks",
   "packageManager": "pnpm@8.10.5",
-  "version": "0.0.7",
+  "version": "0.0.8",
   "description": "List of ML tasks for huggingface.co/tasks",
   "repository": "https://github.com/huggingface/huggingface.js.git",
   "publishConfig": {
@@ -30,9 +30,7 @@
   ],
   "author": "Hugging Face",
   "license": "MIT",
-  "devDependencies": {
-    "typescript": "^5.0.4"
-  },
+  "devDependencies": {},
   "scripts": {
     "lint": "eslint --quiet --fix --ext .cjs,.ts .",
     "lint:check": "eslint --ext .cjs,.ts .",
package/src/pipelines.ts CHANGED
@@ -435,6 +435,20 @@ export const PIPELINE_DATA = {
 	},
 	"image-to-image": {
 		name: "Image-to-Image",
+		subtasks: [
+			{
+				type: "image-inpainting",
+				name: "Image Inpainting",
+			},
+			{
+				type: "image-colorization",
+				name: "Image Colorization",
+			},
+			{
+				type: "super-resolution",
+				name: "Super Resolution",
+			},
+		],
 		modality: "cv",
 		color: "indigo",
 	},
@@ -621,6 +635,16 @@ export const PIPELINE_DATA = {
 		modality: "cv",
 		color: "yellow",
 	},
+	"text-to-3d": {
+		name: "Text-to-3D",
+		modality: "multimodal",
+		color: "yellow",
+	},
+	"image-to-3d": {
+		name: "Image-to-3D",
+		modality: "multimodal",
+		color: "green",
+	},
 	other: {
 		name: "Other",
 		modality: "other",
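For reference, a minimal sketch of reading the new subtask metadata and 3D entries from PIPELINE_DATA (assuming the constant is exported from the package root as in previous releases):

```ts
import { PIPELINE_DATA } from "@huggingface/tasks";

// "image-to-image" now exposes its subtasks.
for (const subtask of PIPELINE_DATA["image-to-image"].subtasks) {
	console.log(`${subtask.type}: ${subtask.name}`);
}
// image-inpainting: Image Inpainting
// image-colorization: Image Colorization
// super-resolution: Super Resolution

// The two new 3D tasks are plain entries.
console.log(PIPELINE_DATA["text-to-3d"].name); // "Text-to-3D"
console.log(PIPELINE_DATA["image-to-3d"].modality); // "multimodal"
```
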
@@ -31,6 +31,12 @@ const inputsTableQuestionAnswering = () =>
 	}
 }`;
 
+const inputsVisualQuestionAnswering = () =>
+	`{
+		"image": "cat.png",
+		"question": "What is in this image?"
+	}`;
+
 const inputsQuestionAnswering = () =>
 	`{
 		"question": "What is my name?",
@@ -79,6 +85,11 @@ const inputsTextToAudio = () => `"liquid drum and bass, atmospheric synths, airy
 
 const inputsAutomaticSpeechRecognition = () => `"sample1.flac"`;
 
+const inputsTabularPrediction = () =>
+	`'{"Height":[11.52,12.48],"Length1":[23.2,24.0],"Length2":[25.4,26.3],"Species": ["Bream","Bream"]}'`;
+
+const inputsZeroShotImageClassification = () => `"cats.jpg"`;
+
 const modelInputSnippets: {
 	[key in PipelineType]?: (model: ModelData) => string;
 } = {
@@ -86,6 +97,7 @@ const modelInputSnippets: {
 	"audio-classification": inputsAudioClassification,
 	"automatic-speech-recognition": inputsAutomaticSpeechRecognition,
 	conversational: inputsConversational,
+	"document-question-answering": inputsVisualQuestionAnswering,
 	"feature-extraction": inputsFeatureExtraction,
 	"fill-mask": inputsFillMask,
 	"image-classification": inputsImageClassification,
@@ -96,6 +108,8 @@ const modelInputSnippets: {
 	"sentence-similarity": inputsSentenceSimilarity,
 	summarization: inputsSummarization,
 	"table-question-answering": inputsTableQuestionAnswering,
+	"tabular-regression": inputsTabularPrediction,
+	"tabular-classification": inputsTabularPrediction,
 	"text-classification": inputsTextClassification,
 	"text-generation": inputsTextGeneration,
 	"text-to-image": inputsTextToImage,
@@ -105,6 +119,7 @@ const modelInputSnippets: {
 	"token-classification": inputsTokenClassification,
 	translation: inputsTranslation,
 	"zero-shot-classification": inputsZeroShotClassification,
+	"zero-shot-image-classification": inputsZeroShotImageClassification,
 };
 
 // Use noWrap to put the whole snippet on a single line (removing new lines and tabulations)
@@ -12,6 +12,22 @@ output = query({
 	"parameters": {"candidate_labels": ["refund", "legal", "faq"]},
 })`;
 
+export const snippetZeroShotImageClassification = (model: ModelData): string =>
+	`def query(data):
+    with open(data["image_path"], "rb") as f:
+        img = f.read()
+    payload={
+        "parameters": data["parameters"],
+        "inputs": base64.b64encode(img).decode("utf-8")
+    }
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "image_path": ${getModelInputSnippet(model)},
+    "parameters": {"candidate_labels": ["cat", "dog", "llama"]},
+})`;
+
 export const snippetBasic = (model: ModelData): string =>
 	`def query(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
@@ -42,6 +58,14 @@ import io
 from PIL import Image
 image = Image.open(io.BytesIO(image_bytes))`;
 
+export const snippetTabular = (model: ModelData): string =>
+	`def query(payload):
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.content
+response = query({
+    "inputs": {"data": ${getModelInputSnippet(model)}},
+})`;
+
 export const snippetTextToAudio = (model: ModelData): string => {
 	// Transformers TTS pipeline and api-inference-community (AIC) pipeline outputs are diverged
 	// with the latest update to inference-api (IA).
@@ -70,8 +94,21 @@ from IPython.display import Audio
 Audio(audio, rate=sampling_rate)`;
 	}
 };
+
+export const snippetDocumentQuestionAnswering = (model: ModelData): string =>
+	`def query(payload):
+    with open(payload["image"], "rb") as f:
+        img = f.read()
+    payload["image"] = base64.b64encode(img).decode("utf-8")
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+output = query({
+    "inputs": ${getModelInputSnippet(model)},
+})`;
+
 export const pythonSnippets: Partial<Record<PipelineType, (model: ModelData) => string>> = {
-	// Same order as in js/src/lib/interfaces/Types.ts
+	// Same order as in tasks/src/pipelines.ts
 	"text-classification": snippetBasic,
 	"token-classification": snippetBasic,
 	"table-question-answering": snippetBasic,
@@ -92,9 +129,13 @@ export const pythonSnippets: Partial<Record<PipelineType, (model: ModelData) =>
 	"audio-to-audio": snippetFile,
 	"audio-classification": snippetFile,
 	"image-classification": snippetFile,
-	"image-to-text": snippetFile,
+	"tabular-regression": snippetTabular,
+	"tabular-classification": snippetTabular,
 	"object-detection": snippetFile,
 	"image-segmentation": snippetFile,
+	"document-question-answering": snippetDocumentQuestionAnswering,
+	"image-to-text": snippetFile,
+	"zero-shot-image-classification": snippetZeroShotImageClassification,
 };
 
 export function getPythonInferenceSnippet(model: ModelData, accessToken: string): string {
@@ -40,18 +40,18 @@ import type { ModelLibraryKey } from "../model-libraries";
  * Model libraries compatible with each ML task
  */
 export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
-	"audio-classification": ["speechbrain", "transformers"],
+	"audio-classification": ["speechbrain", "transformers", "transformers.js"],
 	"audio-to-audio": ["asteroid", "speechbrain"],
 	"automatic-speech-recognition": ["espnet", "nemo", "speechbrain", "transformers", "transformers.js"],
 	conversational: ["transformers"],
-	"depth-estimation": ["transformers"],
-	"document-question-answering": ["transformers"],
+	"depth-estimation": ["transformers", "transformers.js"],
+	"document-question-answering": ["transformers", "transformers.js"],
 	"feature-extraction": ["sentence-transformers", "transformers", "transformers.js"],
 	"fill-mask": ["transformers", "transformers.js"],
 	"graph-ml": ["transformers"],
 	"image-classification": ["keras", "timm", "transformers", "transformers.js"],
 	"image-segmentation": ["transformers", "transformers.js"],
-	"image-to-image": ["diffusers"],
+	"image-to-image": ["diffusers", "transformers.js"],
 	"image-to-text": ["transformers.js"],
 	"image-to-video": ["diffusers"],
 	"video-classification": ["transformers"],
@@ -73,8 +73,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
 	"text-generation": ["transformers", "transformers.js"],
 	"text-retrieval": [],
 	"text-to-image": ["diffusers"],
-	"text-to-speech": ["espnet", "tensorflowtts", "transformers"],
-	"text-to-audio": ["transformers"],
+	"text-to-speech": ["espnet", "tensorflowtts", "transformers", "transformers.js"],
+	"text-to-audio": ["transformers", "transformers.js"],
 	"text-to-video": ["diffusers"],
 	"text2text-generation": ["transformers", "transformers.js"],
 	"time-series-forecasting": [],
@@ -89,11 +89,13 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
 	],
 	translation: ["transformers", "transformers.js"],
 	"unconditional-image-generation": ["diffusers"],
-	"visual-question-answering": ["transformers"],
+	"visual-question-answering": ["transformers", "transformers.js"],
 	"voice-activity-detection": [],
 	"zero-shot-classification": ["transformers", "transformers.js"],
 	"zero-shot-image-classification": ["transformers", "transformers.js"],
-	"zero-shot-object-detection": ["transformers"],
+	"zero-shot-object-detection": ["transformers", "transformers.js"],
+	"text-to-3d": [],
+	"image-to-3d": [],
 };
 
 /**
@@ -161,6 +163,8 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
 	"zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
 	"zero-shot-image-classification": getData("zero-shot-image-classification", zeroShotImageClassification),
 	"zero-shot-object-detection": getData("zero-shot-object-detection", placeholder),
+	"text-to-3d": getData("text-to-3d", placeholder),
+	"image-to-3d": getData("image-to-3d", placeholder),
 } as const;
 
 export interface ExampleRepo {
@@ -32,6 +32,16 @@ The most popular models for this task are GPT-based models, [Mistral](mistralai/
 
 These models are trained to learn the mapping between a pair of texts (e.g. translation from one language to another). The most popular variants of these models are [NLLB](facebook/nllb-200-distilled-600M), [FLAN-T5](https://huggingface.co/google/flan-t5-xxl), and [BART](https://huggingface.co/docs/transformers/model_doc/bart). Text-to-Text models are trained with multi-tasking capabilities, they can accomplish a wide range of tasks, including summarization, translation, and text classification.
 
+## Language Model Variants
+
+When it comes to text generation, the underlying language model can come in several types:
+
+- **Base models:** refers to plain language models like [Mistral 7B](mistralai/Mistral-7B-v0.1) and [Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b-hf). These models are good for fine-tuning and few-shot prompting.
+
+- **Instruction-trained models:** these models are trained in a multi-task manner to follow a broad range of instructions like "Write me a recipe for chocolate cake". Models like [Flan-T5](https://huggingface.co/google/flan-t5-xl), [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1), and [falcon-40b-instruct](https://huggingface.co/tiiuae/falcon-40b-instruct) are examples of instruction-trained models. In general, instruction-trained models will produce better responses to instructions than base models.
+
+- **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.
+
 ## Inference
 
 You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.