@huggingface/tasks 0.2.1 → 0.3.0

This diff reflects the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (123)
  1. package/README.md +1 -1
  2. package/dist/{index.mjs → index.cjs} +2695 -2497
  3. package/dist/index.d.ts +427 -65
  4. package/dist/index.js +2660 -2532
  5. package/package.json +13 -8
  6. package/src/index.ts +2 -5
  7. package/src/library-to-tasks.ts +1 -1
  8. package/src/model-data.ts +1 -1
  9. package/src/model-libraries-downloads.ts +20 -0
  10. package/src/{library-ui-elements.ts → model-libraries-snippets.ts} +50 -296
  11. package/src/model-libraries.ts +375 -44
  12. package/src/pipelines.ts +1 -1
  13. package/src/tasks/audio-classification/about.md +1 -1
  14. package/src/tasks/audio-classification/inference.ts +51 -0
  15. package/src/tasks/audio-classification/spec/input.json +34 -0
  16. package/src/tasks/audio-classification/spec/output.json +10 -0
  17. package/src/tasks/audio-to-audio/about.md +1 -1
  18. package/src/tasks/automatic-speech-recognition/about.md +4 -2
  19. package/src/tasks/automatic-speech-recognition/inference.ts +159 -0
  20. package/src/tasks/automatic-speech-recognition/spec/input.json +34 -0
  21. package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
  22. package/src/tasks/common-definitions.json +117 -0
  23. package/src/tasks/depth-estimation/data.ts +8 -4
  24. package/src/tasks/depth-estimation/inference.ts +35 -0
  25. package/src/tasks/depth-estimation/spec/input.json +25 -0
  26. package/src/tasks/depth-estimation/spec/output.json +16 -0
  27. package/src/tasks/document-question-answering/inference.ts +110 -0
  28. package/src/tasks/document-question-answering/spec/input.json +85 -0
  29. package/src/tasks/document-question-answering/spec/output.json +36 -0
  30. package/src/tasks/feature-extraction/inference.ts +22 -0
  31. package/src/tasks/feature-extraction/spec/input.json +26 -0
  32. package/src/tasks/feature-extraction/spec/output.json +7 -0
  33. package/src/tasks/fill-mask/inference.ts +62 -0
  34. package/src/tasks/fill-mask/spec/input.json +38 -0
  35. package/src/tasks/fill-mask/spec/output.json +29 -0
  36. package/src/tasks/image-classification/inference.ts +51 -0
  37. package/src/tasks/image-classification/spec/input.json +34 -0
  38. package/src/tasks/image-classification/spec/output.json +10 -0
  39. package/src/tasks/image-segmentation/inference.ts +65 -0
  40. package/src/tasks/image-segmentation/spec/input.json +54 -0
  41. package/src/tasks/image-segmentation/spec/output.json +25 -0
  42. package/src/tasks/image-to-image/inference.ts +67 -0
  43. package/src/tasks/image-to-image/spec/input.json +54 -0
  44. package/src/tasks/image-to-image/spec/output.json +12 -0
  45. package/src/tasks/image-to-text/inference.ts +143 -0
  46. package/src/tasks/image-to-text/spec/input.json +34 -0
  47. package/src/tasks/image-to-text/spec/output.json +14 -0
  48. package/src/tasks/index.ts +5 -2
  49. package/src/tasks/mask-generation/about.md +65 -0
  50. package/src/tasks/mask-generation/data.ts +42 -5
  51. package/src/tasks/object-detection/inference.ts +62 -0
  52. package/src/tasks/object-detection/spec/input.json +30 -0
  53. package/src/tasks/object-detection/spec/output.json +46 -0
  54. package/src/tasks/placeholder/data.ts +3 -0
  55. package/src/tasks/placeholder/spec/input.json +35 -0
  56. package/src/tasks/placeholder/spec/output.json +17 -0
  57. package/src/tasks/question-answering/inference.ts +99 -0
  58. package/src/tasks/question-answering/spec/input.json +67 -0
  59. package/src/tasks/question-answering/spec/output.json +29 -0
  60. package/src/tasks/sentence-similarity/about.md +2 -2
  61. package/src/tasks/sentence-similarity/inference.ts +32 -0
  62. package/src/tasks/sentence-similarity/spec/input.json +40 -0
  63. package/src/tasks/sentence-similarity/spec/output.json +12 -0
  64. package/src/tasks/summarization/data.ts +1 -0
  65. package/src/tasks/summarization/inference.ts +59 -0
  66. package/src/tasks/summarization/spec/input.json +7 -0
  67. package/src/tasks/summarization/spec/output.json +7 -0
  68. package/src/tasks/table-question-answering/inference.ts +61 -0
  69. package/src/tasks/table-question-answering/spec/input.json +44 -0
  70. package/src/tasks/table-question-answering/spec/output.json +40 -0
  71. package/src/tasks/tabular-classification/about.md +1 -1
  72. package/src/tasks/tabular-regression/about.md +1 -1
  73. package/src/tasks/text-classification/about.md +1 -0
  74. package/src/tasks/text-classification/inference.ts +51 -0
  75. package/src/tasks/text-classification/spec/input.json +35 -0
  76. package/src/tasks/text-classification/spec/output.json +10 -0
  77. package/src/tasks/text-generation/about.md +24 -13
  78. package/src/tasks/text-generation/data.ts +22 -38
  79. package/src/tasks/text-generation/inference.ts +194 -0
  80. package/src/tasks/text-generation/spec/input.json +90 -0
  81. package/src/tasks/text-generation/spec/output.json +120 -0
  82. package/src/tasks/text-to-audio/inference.ts +143 -0
  83. package/src/tasks/text-to-audio/spec/input.json +31 -0
  84. package/src/tasks/text-to-audio/spec/output.json +17 -0
  85. package/src/tasks/text-to-image/about.md +11 -2
  86. package/src/tasks/text-to-image/data.ts +6 -2
  87. package/src/tasks/text-to-image/inference.ts +71 -0
  88. package/src/tasks/text-to-image/spec/input.json +59 -0
  89. package/src/tasks/text-to-image/spec/output.json +13 -0
  90. package/src/tasks/text-to-speech/about.md +4 -2
  91. package/src/tasks/text-to-speech/data.ts +1 -0
  92. package/src/tasks/text-to-speech/inference.ts +147 -0
  93. package/src/tasks/text-to-speech/spec/input.json +7 -0
  94. package/src/tasks/text-to-speech/spec/output.json +7 -0
  95. package/src/tasks/text2text-generation/inference.ts +55 -0
  96. package/src/tasks/text2text-generation/spec/input.json +55 -0
  97. package/src/tasks/text2text-generation/spec/output.json +14 -0
  98. package/src/tasks/token-classification/inference.ts +82 -0
  99. package/src/tasks/token-classification/spec/input.json +65 -0
  100. package/src/tasks/token-classification/spec/output.json +33 -0
  101. package/src/tasks/translation/data.ts +1 -0
  102. package/src/tasks/translation/inference.ts +59 -0
  103. package/src/tasks/translation/spec/input.json +7 -0
  104. package/src/tasks/translation/spec/output.json +7 -0
  105. package/src/tasks/video-classification/inference.ts +59 -0
  106. package/src/tasks/video-classification/spec/input.json +42 -0
  107. package/src/tasks/video-classification/spec/output.json +10 -0
  108. package/src/tasks/visual-question-answering/inference.ts +63 -0
  109. package/src/tasks/visual-question-answering/spec/input.json +41 -0
  110. package/src/tasks/visual-question-answering/spec/output.json +21 -0
  111. package/src/tasks/zero-shot-classification/inference.ts +67 -0
  112. package/src/tasks/zero-shot-classification/spec/input.json +50 -0
  113. package/src/tasks/zero-shot-classification/spec/output.json +10 -0
  114. package/src/tasks/zero-shot-image-classification/data.ts +8 -5
  115. package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
  116. package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
  117. package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
  118. package/src/tasks/zero-shot-object-detection/about.md +6 -0
  119. package/src/tasks/zero-shot-object-detection/data.ts +6 -1
  120. package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
  121. package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
  122. package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
  123. package/tsconfig.json +3 -3
package/src/tasks/token-classification/spec/output.json
@@ -0,0 +1,33 @@
+ {
+   "$id": "/inference/schemas/token-classification/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Token Classification task",
+   "title": "TokenClassificationOutput",
+   "type": "array",
+   "items": {
+     "type": "object",
+     "properties": {
+       "entity_group": {
+         "type": "string",
+         "description": "The predicted label for that group of tokens"
+       },
+       "score": {
+         "type": "number",
+         "description": "The associated score / probability"
+       },
+       "word": {
+         "type": "string",
+         "description": "The corresponding text"
+       },
+       "start": {
+         "type": "integer",
+         "description": "The character position in the input where this group begins."
+       },
+       "end": {
+         "type": "integer",
+         "description": "The character position in the input where this group ends."
+       }
+     },
+     "required": ["label", "score"]
+   }
+ }
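For reference, a response matching the schema above looks like the sketch below (values invented; note that the `required` list names `label` while the declared property is `entity_group`, as released):

```ts
// Hypothetical token-classification output shaped like the schema above.
const tokenClassificationOutput = [
  {
    entity_group: "PER", // the predicted label for that group of tokens
    score: 0.997, // the associated score / probability
    word: "Ada Lovelace", // the corresponding text
    start: 0, // character position in the input where this group begins
    end: 12, // character position in the input where this group ends
  },
];
```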
package/src/tasks/translation/data.ts
@@ -1,6 +1,7 @@
  import type { TaskDataCustom } from "..";

  const taskData: TaskDataCustom = {
+   canonicalId: "text2text-generation",
    datasets: [
      {
        description: "A dataset of copyright-free books translated into 16 different languages.",
package/src/tasks/translation/inference.ts
@@ -0,0 +1,59 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+
+ /**
+  * Inputs for Translation inference
+  *
+  * Inputs for Text2text Generation inference
+  */
+ export interface TranslationInput {
+   /**
+    * The input text data
+    */
+   inputs: string;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: Text2TextGenerationParameters;
+   [property: string]: unknown;
+ }
+
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Text2text Generation
+  */
+ export interface Text2TextGenerationParameters {
+   /**
+    * Whether to clean up the potential extra spaces in the text output.
+    */
+   clean_up_tokenization_spaces?: boolean;
+   /**
+    * Additional parametrization of the text generation algorithm
+    */
+   generate_parameters?: { [key: string]: unknown };
+   /**
+    * The truncation strategy to use
+    */
+   truncation?: Text2TextGenerationTruncationStrategy;
+   [property: string]: unknown;
+ }
+
+ export type Text2TextGenerationTruncationStrategy = "do_not_truncate" | "longest_first" | "only_first" | "only_second";
+
+ /**
+  * Outputs for Translation inference
+  *
+  * Outputs of inference for the Text2text Generation task
+  */
+ export interface TranslationOutput {
+   generatedText: unknown;
+   /**
+    * The generated text.
+    */
+   generated_text?: string;
+   [property: string]: unknown;
+ }
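A minimal usage sketch for the generated types above, assuming they are in scope (e.g. imported from this module); all values are illustrative:

```ts
// Sketch only: the request/response values are made up.
const request: TranslationInput = {
  inputs: "Hello, world!", // the input text data
  parameters: {
    clean_up_tokenization_spaces: true,
    truncation: "do_not_truncate", // one of the declared truncation strategies
  },
};

// `generated_text` is optional on the generated output type, so read it defensively.
function readTranslation(output: TranslationOutput): string | undefined {
  return output.generated_text;
}
```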
package/src/tasks/translation/spec/input.json
@@ -0,0 +1,7 @@
+ {
+   "$ref": "/inference/schemas/text2text-generation/input.json",
+   "$id": "/inference/schemas/translation/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "title": "TranslationInput",
+   "description": "Inputs for Translation inference"
+ }
package/src/tasks/translation/spec/output.json
@@ -0,0 +1,7 @@
+ {
+   "$ref": "/inference/schemas/text2text-generation/output.json",
+   "$id": "/inference/schemas/translation/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "title": "TranslationOutput",
+   "description": "Outputs for Translation inference"
+ }
package/src/tasks/video-classification/inference.ts
@@ -0,0 +1,59 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+ /**
+  * Inputs for Video Classification inference
+  */
+ export interface VideoClassificationInput {
+   /**
+    * The input video data
+    */
+   inputs: unknown;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: VideoClassificationParameters;
+   [property: string]: unknown;
+ }
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Video Classification
+  */
+ export interface VideoClassificationParameters {
+   /**
+    * The sampling rate used to select frames from the video.
+    */
+   frame_sampling_rate?: number;
+   function_to_apply?: ClassificationOutputTransform;
+   /**
+    * The number of sampled frames to consider for classification.
+    */
+   num_frames?: number;
+   /**
+    * When specified, limits the output to the top K most probable classes.
+    */
+   top_k?: number;
+   [property: string]: unknown;
+ }
+ /**
+  * The function to apply to the model outputs in order to retrieve the scores.
+  */
+ export type ClassificationOutputTransform = "sigmoid" | "softmax" | "none";
+ export type VideoClassificationOutput = VideoClassificationOutputElement[];
+ /**
+  * Outputs of inference for the Video Classification task
+  */
+ export interface VideoClassificationOutputElement {
+   /**
+    * The predicted class label.
+    */
+   label: string;
+   /**
+    * The corresponding probability.
+    */
+   score: number;
+   [property: string]: unknown;
+ }
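A sketch of conforming input and output values for these generated types (all values hypothetical):

```ts
// Sketch only: `inputs` is typed `unknown`, so raw bytes or a URL both type-check.
const input: VideoClassificationInput = {
  inputs: "https://example.com/clip.mp4", // hypothetical video URL
  parameters: {
    num_frames: 16, // number of sampled frames to consider
    frame_sampling_rate: 4, // keep every 4th frame
    function_to_apply: "softmax",
    top_k: 2, // only the 2 most probable classes
  },
};

const output: VideoClassificationOutput = [
  { label: "playing guitar", score: 0.91 },
  { label: "singing", score: 0.06 },
];
```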
package/src/tasks/video-classification/spec/input.json
@@ -0,0 +1,42 @@
+ {
+   "$id": "/inference/schemas/video-classification/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Video Classification inference",
+   "title": "VideoClassificationInput",
+   "type": "object",
+   "properties": {
+     "inputs": {
+       "description": "The input video data"
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/VideoClassificationParameters"
+     }
+   },
+   "$defs": {
+     "VideoClassificationParameters": {
+       "title": "VideoClassificationParameters",
+       "description": "Additional inference parameters for Video Classification",
+       "type": "object",
+       "properties": {
+         "function_to_apply": {
+           "title": "TextClassificationOutputTransform",
+           "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutputTransform"
+         },
+         "num_frames": {
+           "type": "integer",
+           "description": "The number of sampled frames to consider for classification."
+         },
+         "frame_sampling_rate": {
+           "type": "integer",
+           "description": "The sampling rate used to select frames from the video."
+         },
+         "top_k": {
+           "type": "integer",
+           "description": "When specified, limits the output to the top K most probable classes."
+         }
+       }
+     }
+   },
+   "required": ["inputs"]
+ }
package/src/tasks/video-classification/spec/output.json
@@ -0,0 +1,10 @@
+ {
+   "$id": "/inference/schemas/video-classification/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Video Classification task",
+   "title": "VideoClassificationOutput",
+   "type": "array",
+   "items": {
+     "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
+   }
+ }
package/src/tasks/visual-question-answering/inference.ts
@@ -0,0 +1,63 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+ /**
+  * Inputs for Visual Question Answering inference
+  */
+ export interface VisualQuestionAnsweringInput {
+   /**
+    * One (image, question) pair to answer
+    */
+   inputs: VisualQuestionAnsweringInputData;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: VisualQuestionAnsweringParameters;
+   [property: string]: unknown;
+ }
+ /**
+  * One (image, question) pair to answer
+  */
+ export interface VisualQuestionAnsweringInputData {
+   /**
+    * The image.
+    */
+   image: unknown;
+   /**
+    * The question to answer based on the image.
+    */
+   question: unknown;
+   [property: string]: unknown;
+ }
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Visual Question Answering
+  */
+ export interface VisualQuestionAnsweringParameters {
+   /**
+    * The number of answers to return (will be chosen by order of likelihood). Note that we
+    * return less than topk answers if there are not enough options available within the
+    * context.
+    */
+   top_k?: number;
+   [property: string]: unknown;
+ }
+ export type VisualQuestionAnsweringOutput = VisualQuestionAnsweringOutputElement[];
+ /**
+  * Outputs of inference for the Visual Question Answering task
+  */
+ export interface VisualQuestionAnsweringOutputElement {
+   /**
+    * The answer to the question
+    */
+   answer?: string;
+   label: unknown;
+   /**
+    * The associated score / probability
+    */
+   score: number;
+   [property: string]: unknown;
+ }
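A sketch of values conforming to these generated types (image URL and question invented):

```ts
const input: VisualQuestionAnsweringInput = {
  inputs: {
    image: "https://example.com/invoice.png", // typed `unknown` in the schema
    question: "What is the total amount?",
  },
  parameters: { top_k: 3 },
};

// `answer` is optional and `label` is `unknown` on the element type, so read defensively.
function bestAnswer(output: VisualQuestionAnsweringOutput): string | undefined {
  return output[0]?.answer;
}
```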
package/src/tasks/visual-question-answering/spec/input.json
@@ -0,0 +1,41 @@
+ {
+   "$id": "/inference/schemas/visual-question-answering/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Visual Question Answering inference",
+   "title": "VisualQuestionAnsweringInput",
+   "type": "object",
+   "properties": {
+     "inputs": {
+       "description": "One (image, question) pair to answer",
+       "type": "object",
+       "title": "VisualQuestionAnsweringInputData",
+       "properties": {
+         "image": {
+           "description": "The image."
+         },
+         "question": {
+           "description": "The question to answer based on the image."
+         }
+       },
+       "required": ["question", "image"]
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/VisualQuestionAnsweringParameters"
+     }
+   },
+   "$defs": {
+     "VisualQuestionAnsweringParameters": {
+       "title": "VisualQuestionAnsweringParameters",
+       "description": "Additional inference parameters for Visual Question Answering",
+       "type": "object",
+       "properties": {
+         "top_k": {
+           "type": "integer",
+           "description": "The number of answers to return (will be chosen by order of likelihood). Note that we return less than topk answers if there are not enough options available within the context."
+         }
+       }
+     }
+   },
+   "required": ["inputs"]
+ }
package/src/tasks/visual-question-answering/spec/output.json
@@ -0,0 +1,21 @@
+ {
+   "$id": "/inference/schemas/visual-question-answering/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Visual Question Answering task",
+   "title": "VisualQuestionAnsweringOutput",
+   "type": "array",
+   "items": {
+     "type": "object",
+     "properties": {
+       "answer": {
+         "type": "string",
+         "description": "The answer to the question"
+       },
+       "score": {
+         "type": "number",
+         "description": "The associated score / probability"
+       }
+     },
+     "required": ["label", "score"]
+   }
+ }
package/src/tasks/zero-shot-classification/inference.ts
@@ -0,0 +1,67 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+ /**
+  * Inputs for Zero Shot Classification inference
+  */
+ export interface ZeroShotClassificationInput {
+   /**
+    * The input text data, with candidate labels
+    */
+   inputs: ZeroShotClassificationInputData;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: ZeroShotClassificationParameters;
+   [property: string]: unknown;
+ }
+ /**
+  * The input text data, with candidate labels
+  */
+ export interface ZeroShotClassificationInputData {
+   /**
+    * The set of possible class labels to classify the text into.
+    */
+   candidateLabels: string[];
+   /**
+    * The text to classify
+    */
+   text: string;
+   [property: string]: unknown;
+ }
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Zero Shot Classification
+  */
+ export interface ZeroShotClassificationParameters {
+   /**
+    * The sentence used in conjunction with candidateLabels to attempt the text classification
+    * by replacing the placeholder with the candidate labels.
+    */
+   hypothesis_template?: string;
+   /**
+    * Whether multiple candidate labels can be true. If false, the scores are normalized such
+    * that the sum of the label likelihoods for each sequence is 1. If true, the labels are
+    * considered independent and probabilities are normalized for each candidate.
+    */
+   multi_label?: boolean;
+   [property: string]: unknown;
+ }
+ export type ZeroShotClassificationOutput = ZeroShotClassificationOutputElement[];
+ /**
+  * Outputs of inference for the Zero Shot Classification task
+  */
+ export interface ZeroShotClassificationOutputElement {
+   /**
+    * The predicted class label.
+    */
+   label: string;
+   /**
+    * The corresponding probability.
+    */
+   score: number;
+   [property: string]: unknown;
+ }
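A sketch of a conforming input for these generated types; the `{}` placeholder in `hypothesis_template` is filled with each candidate label in turn (the template text is an illustrative convention, not fixed by the spec):

```ts
const input: ZeroShotClassificationInput = {
  inputs: {
    text: "I really enjoyed this film!",
    candidateLabels: ["positive", "negative", "neutral"],
  },
  parameters: {
    hypothesis_template: "This example is {}.", // placeholder replaced by each label
    multi_label: false, // scores across labels sum to 1
  },
};
```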
package/src/tasks/zero-shot-classification/spec/input.json
@@ -0,0 +1,50 @@
+ {
+   "$id": "/inference/schemas/zero-shot-classification/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Zero Shot Classification inference",
+   "title": "ZeroShotClassificationInput",
+   "type": "object",
+   "properties": {
+     "inputs": {
+       "description": "The input text data, with candidate labels",
+       "type": "object",
+       "title": "ZeroShotClassificationInputData",
+       "properties": {
+         "text": {
+           "type": "string",
+           "description": "The text to classify"
+         },
+         "candidateLabels": {
+           "type": "array",
+           "description": "The set of possible class labels to classify the text into.",
+           "items": {
+             "type": "string"
+           }
+         }
+       },
+       "required": ["text", "candidateLabels"]
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/ZeroShotClassificationParameters"
+     }
+   },
+   "$defs": {
+     "ZeroShotClassificationParameters": {
+       "title": "ZeroShotClassificationParameters",
+       "description": "Additional inference parameters for Zero Shot Classification",
+       "type": "object",
+       "properties": {
+         "hypothesis_template": {
+           "type": "string",
+           "description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
+         },
+         "multi_label": {
+           "type": "boolean",
+           "description": "Whether multiple candidate labels can be true. If false, the scores are normalized such that the sum of the label likelihoods for each sequence is 1. If true, the labels are considered independent and probabilities are normalized for each candidate."
+         }
+       }
+     }
+   },
+   "required": ["inputs"]
+ }
package/src/tasks/zero-shot-classification/spec/output.json
@@ -0,0 +1,10 @@
+ {
+   "$id": "/inference/schemas/zero-shot-classification/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Zero Shot Classification task",
+   "title": "ZeroShotClassificationOutput",
+   "type": "array",
+   "items": {
+     "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
+   }
+ }
package/src/tasks/zero-shot-image-classification/data.ts
@@ -52,9 +52,8 @@ const taskData: TaskDataCustom = {
        id: "openai/clip-vit-base-patch16",
      },
      {
-       description:
-         "Robust image classification model trained on publicly available image-caption data trained on additional high pixel data for better performance.",
-       id: "openai/clip-vit-large-patch14-336",
+       description: "Strong zero-shot image classification model.",
+       id: "google/siglip-base-patch16-224",
      },
      {
        description: "Strong image classification model for biomedical domain.",
@@ -64,12 +63,16 @@ const taskData: TaskDataCustom = {
    spaces: [
      {
        description:
-         "An application that leverages zero shot image classification to find best captions to generate an image. ",
+         "An application that leverages zero-shot image classification to find best captions to generate an image. ",
        id: "pharma/CLIP-Interrogator",
      },
+     {
+       description: "An application to compare different zero-shot image classification models. ",
+       id: "merve/compare_clip_siglip",
+     },
    ],
    summary:
-     "Zero shot image classification is the task of classifying previously unseen classes during training of a model.",
+     "Zero-shot image classification is the task of classifying previously unseen classes during training of a model.",
    widgetModels: ["openai/clip-vit-large-patch14-336"],
    youtubeId: "",
  };
package/src/tasks/zero-shot-image-classification/inference.ts
@@ -0,0 +1,61 @@
+ /**
+  * Inference code generated from the JSON schema spec in ./spec
+  *
+  * Using src/scripts/inference-codegen
+  */
+ /**
+  * Inputs for Zero Shot Image Classification inference
+  */
+ export interface ZeroShotImageClassificationInput {
+   /**
+    * The input image data, with candidate labels
+    */
+   inputs: ZeroShotImageClassificationInputData;
+   /**
+    * Additional inference parameters
+    */
+   parameters?: ZeroShotImageClassificationParameters;
+   [property: string]: unknown;
+ }
+ /**
+  * The input image data, with candidate labels
+  */
+ export interface ZeroShotImageClassificationInputData {
+   /**
+    * The candidate labels for this image
+    */
+   candidateLabels: string[];
+   /**
+    * The image data to classify
+    */
+   image: unknown;
+   [property: string]: unknown;
+ }
+ /**
+  * Additional inference parameters
+  *
+  * Additional inference parameters for Zero Shot Image Classification
+  */
+ export interface ZeroShotImageClassificationParameters {
+   /**
+    * The sentence used in conjunction with candidateLabels to attempt the text classification
+    * by replacing the placeholder with the candidate labels.
+    */
+   hypothesis_template?: string;
+   [property: string]: unknown;
+ }
+ export type ZeroShotImageClassificationOutput = ZeroShotImageClassificationOutputElement[];
+ /**
+  * Outputs of inference for the Zero Shot Image Classification task
+  */
+ export interface ZeroShotImageClassificationOutputElement {
+   /**
+    * The predicted class label.
+    */
+   label: string;
+   /**
+    * The corresponding probability.
+    */
+   score: number;
+   [property: string]: unknown;
+ }
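And a matching sketch for the image variant (image URL and labels invented):

```ts
const input: ZeroShotImageClassificationInput = {
  inputs: {
    image: "https://example.com/pet.png", // typed `unknown`: raw bytes or a URL
    candidateLabels: ["cat", "dog", "bird"],
  },
  parameters: { hypothesis_template: "a photo of a {}" },
};

const output: ZeroShotImageClassificationOutput = [
  { label: "cat", score: 0.88 },
  { label: "dog", score: 0.09 },
  { label: "bird", score: 0.03 },
];
```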
package/src/tasks/zero-shot-image-classification/spec/input.json
@@ -0,0 +1,45 @@
+ {
+   "$id": "/inference/schemas/zero-shot-image-classification/input.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Inputs for Zero Shot Image Classification inference",
+   "title": "ZeroShotImageClassificationInput",
+   "type": "object",
+   "properties": {
+     "inputs": {
+       "description": "The input image data, with candidate labels",
+       "type": "object",
+       "title": "ZeroShotImageClassificationInputData",
+       "properties": {
+         "image": {
+           "description": "The image data to classify"
+         },
+         "candidateLabels": {
+           "description": "The candidate labels for this image",
+           "type": "array",
+           "items": {
+             "type": "string"
+           }
+         }
+       },
+       "required": ["image", "candidateLabels"]
+     },
+     "parameters": {
+       "description": "Additional inference parameters",
+       "$ref": "#/$defs/ZeroShotImageClassificationParameters"
+     }
+   },
+   "$defs": {
+     "ZeroShotImageClassificationParameters": {
+       "title": "ZeroShotImageClassificationParameters",
+       "description": "Additional inference parameters for Zero Shot Image Classification",
+       "type": "object",
+       "properties": {
+         "hypothesis_template": {
+           "type": "string",
+           "description": "The sentence used in conjunction with candidateLabels to attempt the text classification by replacing the placeholder with the candidate labels."
+         }
+       }
+     }
+   },
+   "required": ["inputs"]
+ }
package/src/tasks/zero-shot-image-classification/spec/output.json
@@ -0,0 +1,10 @@
+ {
+   "$id": "/inference/schemas/zero-shot-image-classification/output.json",
+   "$schema": "http://json-schema.org/draft-06/schema#",
+   "description": "Outputs of inference for the Zero Shot Image Classification task",
+   "title": "ZeroShotImageClassificationOutput",
+   "type": "array",
+   "items": {
+     "$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
+   }
+ }
package/src/tasks/zero-shot-object-detection/about.md
@@ -1,5 +1,7 @@
  ## Use Cases

+ Zero-shot object detection models can be used in any object detection application where the detection involves text queries for objects of interest.
+
  ### Object Search

  Zero-shot object detection models can be used in image search. Smartphones, for example, use zero-shot object detection models to detect entities (such as specific places or objects) and allow the user to search for the entity on the internet.
@@ -8,6 +10,10 @@ Zero-shot object detection models can be used in image search. Smartphones, for

  Zero-shot object detection models are used to count instances of objects in a given image. This can include counting the objects in warehouses or stores or the number of visitors in a store. They are also used to manage crowds at events to prevent disasters.

+ ### Object Tracking
+
+ Zero-shot object detectors can track objects in videos.
+
  ## Inference

  You can infer with zero-shot object detection models through the `zero-shot-object-detection` pipeline. When calling the pipeline, you just need to specify a path or HTTP link to an image and the candidate labels.
package/src/tasks/zero-shot-object-detection/data.ts
@@ -47,7 +47,12 @@ const taskData: TaskDataCustom = {
        id: "google/owlv2-base-patch16-ensemble",
      },
    ],
-   spaces: [],
+   spaces: [
+     {
+       description: "A demo to try the state-of-the-art zero-shot object detection model, OWLv2.",
+       id: "merve/owlv2",
+     },
+   ],
    summary:
      "Zero-shot object detection is a computer vision task to detect objects and their classes in images, without any prior training or knowledge of the classes. Zero-shot object detection models receive an image as input, as well as a list of candidate classes, and output the bounding boxes and labels where the objects have been detected.",
    widgetModels: [],