@huggingface/tasks 0.13.1-test → 0.13.1-test2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -2
- package/src/dataset-libraries.ts +89 -0
- package/src/default-widget-inputs.ts +718 -0
- package/src/gguf.ts +40 -0
- package/src/hardware.ts +482 -0
- package/src/index.ts +59 -0
- package/src/library-to-tasks.ts +76 -0
- package/src/local-apps.ts +412 -0
- package/src/model-data.ts +149 -0
- package/src/model-libraries-downloads.ts +18 -0
- package/src/model-libraries-snippets.ts +1128 -0
- package/src/model-libraries.ts +820 -0
- package/src/pipelines.ts +698 -0
- package/src/snippets/common.ts +39 -0
- package/src/snippets/curl.spec.ts +94 -0
- package/src/snippets/curl.ts +120 -0
- package/src/snippets/index.ts +7 -0
- package/src/snippets/inputs.ts +167 -0
- package/src/snippets/js.spec.ts +148 -0
- package/src/snippets/js.ts +305 -0
- package/src/snippets/python.spec.ts +144 -0
- package/src/snippets/python.ts +321 -0
- package/src/snippets/types.ts +16 -0
- package/src/tasks/audio-classification/about.md +86 -0
- package/src/tasks/audio-classification/data.ts +81 -0
- package/src/tasks/audio-classification/inference.ts +52 -0
- package/src/tasks/audio-classification/spec/input.json +35 -0
- package/src/tasks/audio-classification/spec/output.json +11 -0
- package/src/tasks/audio-to-audio/about.md +56 -0
- package/src/tasks/audio-to-audio/data.ts +70 -0
- package/src/tasks/automatic-speech-recognition/about.md +90 -0
- package/src/tasks/automatic-speech-recognition/data.ts +82 -0
- package/src/tasks/automatic-speech-recognition/inference.ts +160 -0
- package/src/tasks/automatic-speech-recognition/spec/input.json +35 -0
- package/src/tasks/automatic-speech-recognition/spec/output.json +38 -0
- package/src/tasks/chat-completion/inference.ts +322 -0
- package/src/tasks/chat-completion/spec/input.json +350 -0
- package/src/tasks/chat-completion/spec/output.json +206 -0
- package/src/tasks/chat-completion/spec/stream_output.json +213 -0
- package/src/tasks/common-definitions.json +100 -0
- package/src/tasks/depth-estimation/about.md +45 -0
- package/src/tasks/depth-estimation/data.ts +70 -0
- package/src/tasks/depth-estimation/inference.ts +35 -0
- package/src/tasks/depth-estimation/spec/input.json +25 -0
- package/src/tasks/depth-estimation/spec/output.json +16 -0
- package/src/tasks/document-question-answering/about.md +53 -0
- package/src/tasks/document-question-answering/data.ts +85 -0
- package/src/tasks/document-question-answering/inference.ts +110 -0
- package/src/tasks/document-question-answering/spec/input.json +85 -0
- package/src/tasks/document-question-answering/spec/output.json +36 -0
- package/src/tasks/feature-extraction/about.md +72 -0
- package/src/tasks/feature-extraction/data.ts +57 -0
- package/src/tasks/feature-extraction/inference.ts +40 -0
- package/src/tasks/feature-extraction/spec/input.json +47 -0
- package/src/tasks/feature-extraction/spec/output.json +15 -0
- package/src/tasks/fill-mask/about.md +51 -0
- package/src/tasks/fill-mask/data.ts +79 -0
- package/src/tasks/fill-mask/inference.ts +62 -0
- package/src/tasks/fill-mask/spec/input.json +38 -0
- package/src/tasks/fill-mask/spec/output.json +29 -0
- package/src/tasks/image-classification/about.md +50 -0
- package/src/tasks/image-classification/data.ts +88 -0
- package/src/tasks/image-classification/inference.ts +52 -0
- package/src/tasks/image-classification/spec/input.json +35 -0
- package/src/tasks/image-classification/spec/output.json +11 -0
- package/src/tasks/image-feature-extraction/about.md +23 -0
- package/src/tasks/image-feature-extraction/data.ts +59 -0
- package/src/tasks/image-segmentation/about.md +63 -0
- package/src/tasks/image-segmentation/data.ts +99 -0
- package/src/tasks/image-segmentation/inference.ts +69 -0
- package/src/tasks/image-segmentation/spec/input.json +45 -0
- package/src/tasks/image-segmentation/spec/output.json +26 -0
- package/src/tasks/image-text-to-text/about.md +76 -0
- package/src/tasks/image-text-to-text/data.ts +102 -0
- package/src/tasks/image-to-3d/about.md +62 -0
- package/src/tasks/image-to-3d/data.ts +75 -0
- package/src/tasks/image-to-image/about.md +129 -0
- package/src/tasks/image-to-image/data.ts +101 -0
- package/src/tasks/image-to-image/inference.ts +68 -0
- package/src/tasks/image-to-image/spec/input.json +55 -0
- package/src/tasks/image-to-image/spec/output.json +12 -0
- package/src/tasks/image-to-text/about.md +61 -0
- package/src/tasks/image-to-text/data.ts +82 -0
- package/src/tasks/image-to-text/inference.ts +143 -0
- package/src/tasks/image-to-text/spec/input.json +34 -0
- package/src/tasks/image-to-text/spec/output.json +14 -0
- package/src/tasks/index.ts +312 -0
- package/src/tasks/keypoint-detection/about.md +57 -0
- package/src/tasks/keypoint-detection/data.ts +50 -0
- package/src/tasks/mask-generation/about.md +65 -0
- package/src/tasks/mask-generation/data.ts +55 -0
- package/src/tasks/object-detection/about.md +37 -0
- package/src/tasks/object-detection/data.ts +86 -0
- package/src/tasks/object-detection/inference.ts +75 -0
- package/src/tasks/object-detection/spec/input.json +31 -0
- package/src/tasks/object-detection/spec/output.json +50 -0
- package/src/tasks/placeholder/about.md +15 -0
- package/src/tasks/placeholder/data.ts +21 -0
- package/src/tasks/placeholder/spec/input.json +35 -0
- package/src/tasks/placeholder/spec/output.json +17 -0
- package/src/tasks/question-answering/about.md +56 -0
- package/src/tasks/question-answering/data.ts +75 -0
- package/src/tasks/question-answering/inference.ts +99 -0
- package/src/tasks/question-answering/spec/input.json +67 -0
- package/src/tasks/question-answering/spec/output.json +29 -0
- package/src/tasks/reinforcement-learning/about.md +167 -0
- package/src/tasks/reinforcement-learning/data.ts +75 -0
- package/src/tasks/sentence-similarity/about.md +97 -0
- package/src/tasks/sentence-similarity/data.ts +101 -0
- package/src/tasks/sentence-similarity/inference.ts +32 -0
- package/src/tasks/sentence-similarity/spec/input.json +40 -0
- package/src/tasks/sentence-similarity/spec/output.json +12 -0
- package/src/tasks/summarization/about.md +58 -0
- package/src/tasks/summarization/data.ts +76 -0
- package/src/tasks/summarization/inference.ts +57 -0
- package/src/tasks/summarization/spec/input.json +42 -0
- package/src/tasks/summarization/spec/output.json +14 -0
- package/src/tasks/table-question-answering/about.md +43 -0
- package/src/tasks/table-question-answering/data.ts +59 -0
- package/src/tasks/table-question-answering/inference.ts +61 -0
- package/src/tasks/table-question-answering/spec/input.json +44 -0
- package/src/tasks/table-question-answering/spec/output.json +40 -0
- package/src/tasks/tabular-classification/about.md +65 -0
- package/src/tasks/tabular-classification/data.ts +68 -0
- package/src/tasks/tabular-regression/about.md +87 -0
- package/src/tasks/tabular-regression/data.ts +57 -0
- package/src/tasks/text-classification/about.md +173 -0
- package/src/tasks/text-classification/data.ts +103 -0
- package/src/tasks/text-classification/inference.ts +51 -0
- package/src/tasks/text-classification/spec/input.json +35 -0
- package/src/tasks/text-classification/spec/output.json +11 -0
- package/src/tasks/text-generation/about.md +154 -0
- package/src/tasks/text-generation/data.ts +114 -0
- package/src/tasks/text-generation/inference.ts +200 -0
- package/src/tasks/text-generation/spec/input.json +219 -0
- package/src/tasks/text-generation/spec/output.json +179 -0
- package/src/tasks/text-generation/spec/stream_output.json +103 -0
- package/src/tasks/text-to-3d/about.md +62 -0
- package/src/tasks/text-to-3d/data.ts +56 -0
- package/src/tasks/text-to-audio/inference.ts +143 -0
- package/src/tasks/text-to-audio/spec/input.json +31 -0
- package/src/tasks/text-to-audio/spec/output.json +17 -0
- package/src/tasks/text-to-image/about.md +96 -0
- package/src/tasks/text-to-image/data.ts +100 -0
- package/src/tasks/text-to-image/inference.ts +75 -0
- package/src/tasks/text-to-image/spec/input.json +63 -0
- package/src/tasks/text-to-image/spec/output.json +13 -0
- package/src/tasks/text-to-speech/about.md +63 -0
- package/src/tasks/text-to-speech/data.ts +79 -0
- package/src/tasks/text-to-speech/inference.ts +145 -0
- package/src/tasks/text-to-speech/spec/input.json +31 -0
- package/src/tasks/text-to-speech/spec/output.json +7 -0
- package/src/tasks/text-to-video/about.md +41 -0
- package/src/tasks/text-to-video/data.ts +102 -0
- package/src/tasks/text2text-generation/inference.ts +55 -0
- package/src/tasks/text2text-generation/spec/input.json +55 -0
- package/src/tasks/text2text-generation/spec/output.json +14 -0
- package/src/tasks/token-classification/about.md +76 -0
- package/src/tasks/token-classification/data.ts +92 -0
- package/src/tasks/token-classification/inference.ts +85 -0
- package/src/tasks/token-classification/spec/input.json +65 -0
- package/src/tasks/token-classification/spec/output.json +37 -0
- package/src/tasks/translation/about.md +65 -0
- package/src/tasks/translation/data.ts +70 -0
- package/src/tasks/translation/inference.ts +67 -0
- package/src/tasks/translation/spec/input.json +50 -0
- package/src/tasks/translation/spec/output.json +14 -0
- package/src/tasks/unconditional-image-generation/about.md +50 -0
- package/src/tasks/unconditional-image-generation/data.ts +72 -0
- package/src/tasks/video-classification/about.md +37 -0
- package/src/tasks/video-classification/data.ts +84 -0
- package/src/tasks/video-classification/inference.ts +59 -0
- package/src/tasks/video-classification/spec/input.json +42 -0
- package/src/tasks/video-classification/spec/output.json +10 -0
- package/src/tasks/video-text-to-text/about.md +98 -0
- package/src/tasks/video-text-to-text/data.ts +66 -0
- package/src/tasks/visual-question-answering/about.md +48 -0
- package/src/tasks/visual-question-answering/data.ts +97 -0
- package/src/tasks/visual-question-answering/inference.ts +62 -0
- package/src/tasks/visual-question-answering/spec/input.json +41 -0
- package/src/tasks/visual-question-answering/spec/output.json +21 -0
- package/src/tasks/zero-shot-classification/about.md +40 -0
- package/src/tasks/zero-shot-classification/data.ts +70 -0
- package/src/tasks/zero-shot-classification/inference.ts +67 -0
- package/src/tasks/zero-shot-classification/spec/input.json +50 -0
- package/src/tasks/zero-shot-classification/spec/output.json +11 -0
- package/src/tasks/zero-shot-image-classification/about.md +75 -0
- package/src/tasks/zero-shot-image-classification/data.ts +84 -0
- package/src/tasks/zero-shot-image-classification/inference.ts +61 -0
- package/src/tasks/zero-shot-image-classification/spec/input.json +45 -0
- package/src/tasks/zero-shot-image-classification/spec/output.json +10 -0
- package/src/tasks/zero-shot-object-detection/about.md +45 -0
- package/src/tasks/zero-shot-object-detection/data.ts +67 -0
- package/src/tasks/zero-shot-object-detection/inference.ts +66 -0
- package/src/tasks/zero-shot-object-detection/spec/input.json +40 -0
- package/src/tasks/zero-shot-object-detection/spec/output.json +47 -0
- package/src/tokenizer-data.ts +32 -0
- package/src/widget-example.ts +125 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/depth-estimation/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Depth Estimation task",
|
|
5
|
+
"title": "DepthEstimationOutput",
|
|
6
|
+
|
|
7
|
+
"type": "object",
|
|
8
|
+
"properties": {
|
|
9
|
+
"predicted_depth": {
|
|
10
|
+
"description": "The predicted depth as a tensor"
|
|
11
|
+
},
|
|
12
|
+
"depth": {
|
|
13
|
+
"description": "The predicted depth as an image"
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
## Use Cases
|
|
2
|
+
|
|
3
|
+
Document Question Answering models can be used to answer natural language questions about documents. Typically, document QA models consider textual, layout and potentially visual information. This is useful when the question requires some understanding of the visual aspects of the document.
|
|
4
|
+
Nevertheless, certain document QA models can work without document images. Hence the task is not limited to visually-rich documents and allows users to ask questions based on spreadsheets, text PDFs, etc!
|
|
5
|
+
|
|
6
|
+
### Document Parsing
|
|
7
|
+
|
|
8
|
+
One of the most popular use cases of document question answering models is the parsing of structured documents. For example, you can extract the name, address, and other information from a form. You can also use the model to extract information from a table, or even a resume.
|
|
9
|
+
|
|
10
|
+
### Invoice Information Extraction
|
|
11
|
+
|
|
12
|
+
Another very popular use case is invoice information extraction. For example, you can extract the invoice number, the invoice date, the total amount, the VAT number, and the invoice recipient.
|
|
13
|
+
|
|
14
|
+
## Inference
|
|
15
|
+
|
|
16
|
+
You can infer with Document QA models with the 🤗 Transformers library using the [`document-question-answering` pipeline](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.DocumentQuestionAnsweringPipeline). If no model checkpoint is given, the pipeline will be initialized with [`impira/layoutlm-document-qa`](https://huggingface.co/impira/layoutlm-document-qa). This pipeline takes question(s) and document(s) as input, and returns the answer.
|
|
17
|
+
👉 Note that the question answering task solved here is extractive: the model extracts the answer from a context (the document).
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from transformers import pipeline
|
|
21
|
+
from PIL import Image
|
|
22
|
+
|
|
23
|
+
pipe = pipeline("document-question-answering", model="naver-clova-ix/donut-base-finetuned-docvqa")
|
|
24
|
+
|
|
25
|
+
question = "What is the purchase amount?"
|
|
26
|
+
image = Image.open("your-document.png")
|
|
27
|
+
|
|
28
|
+
pipe(image=image, question=question)
|
|
29
|
+
|
|
30
|
+
## [{'answer': '20,000$'}]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Useful Resources
|
|
34
|
+
|
|
35
|
+
Would you like to learn more about Document QA? Awesome! Here are some curated resources that you may find helpful!
|
|
36
|
+
|
|
37
|
+
- [Document Visual Question Answering (DocVQA) challenge](https://rrc.cvc.uab.es/?ch=17)
|
|
38
|
+
- [DocVQA: A Dataset for Document Visual Question Answering](https://arxiv.org/abs/2007.00398) (Dataset paper)
|
|
39
|
+
- [ICDAR 2021 Competition on Document Visual Question Answering](https://arxiv.org/abs/2111.05547) (Conference paper)
|
|
40
|
+
- [HuggingFace's Document Question Answering pipeline](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.DocumentQuestionAnsweringPipeline)
|
|
41
|
+
- [Github repo: DocQuery - Document Query Engine Powered by Large Language Models](https://github.com/impira/docquery)
|
|
42
|
+
|
|
43
|
+
### Notebooks
|
|
44
|
+
|
|
45
|
+
- [Fine-tuning Donut on DocVQA dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/0ea77f29d01217587d7e32a848f3691d9c15d6ab/Donut/DocVQA)
|
|
46
|
+
- [Fine-tuning LayoutLMv2 on DocVQA dataset](https://github.com/NielsRogge/Transformers-Tutorials/tree/1b4bad710c41017d07a8f63b46a12523bfd2e835/LayoutLMv2/DocVQA)
|
|
47
|
+
- [Accelerating Document AI](https://huggingface.co/blog/document-ai)
|
|
48
|
+
|
|
49
|
+
### Documentation
|
|
50
|
+
|
|
51
|
+
- [Document question answering task guide](https://huggingface.co/docs/transformers/tasks/document_question_answering)
|
|
52
|
+
|
|
53
|
+
The contents of this page are contributed by [Eliott Zemour](https://huggingface.co/eliolio) and reviewed by [Kwadwo Agyapon-Ntra](https://huggingface.co/KayO) and [Ankur Goyal](https://huggingface.co/ankrgyl).
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
description: "Largest document understanding dataset.",
|
|
7
|
+
id: "HuggingFaceM4/Docmatix",
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
description:
|
|
11
|
+
"Dataset from the 2020 DocVQA challenge. The documents are taken from the UCSF Industry Documents Library.",
|
|
12
|
+
id: "eliolio/docvqa",
|
|
13
|
+
},
|
|
14
|
+
],
|
|
15
|
+
demo: {
|
|
16
|
+
inputs: [
|
|
17
|
+
{
|
|
18
|
+
label: "Question",
|
|
19
|
+
content: "What is the idea behind the consumer relations efficiency team?",
|
|
20
|
+
type: "text",
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
filename: "document-question-answering-input.png",
|
|
24
|
+
type: "img",
|
|
25
|
+
},
|
|
26
|
+
],
|
|
27
|
+
outputs: [
|
|
28
|
+
{
|
|
29
|
+
label: "Answer",
|
|
30
|
+
content: "Balance cost efficiency with quality customer service",
|
|
31
|
+
type: "text",
|
|
32
|
+
},
|
|
33
|
+
],
|
|
34
|
+
},
|
|
35
|
+
metrics: [
|
|
36
|
+
{
|
|
37
|
+
description:
|
|
38
|
+
"The evaluation metric for the DocVQA challenge is the Average Normalized Levenshtein Similarity (ANLS). This metric is flexible to character recognition errors and compares the predicted answer with the ground truth answer.",
|
|
39
|
+
id: "anls",
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
description:
|
|
43
|
+
"Exact Match is a metric based on the strict character match of the predicted answer and the right answer. For answers predicted correctly, the Exact Match will be 1. Even if only one character is different, Exact Match will be 0",
|
|
44
|
+
id: "exact-match",
|
|
45
|
+
},
|
|
46
|
+
],
|
|
47
|
+
models: [
|
|
48
|
+
{
|
|
49
|
+
description: "A robust document question answering model.",
|
|
50
|
+
id: "impira/layoutlm-document-qa",
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
description: "A document question answering model specialized in invoices.",
|
|
54
|
+
id: "impira/layoutlm-invoices",
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
description: "A special model for OCR-free document question answering.",
|
|
58
|
+
id: "microsoft/udop-large",
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
description: "A powerful model for document question answering.",
|
|
62
|
+
id: "google/pix2struct-docvqa-large",
|
|
63
|
+
},
|
|
64
|
+
],
|
|
65
|
+
spaces: [
|
|
66
|
+
{
|
|
67
|
+
description: "A robust document question answering application.",
|
|
68
|
+
id: "impira/docquery",
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
description: "An application that can answer questions from invoices.",
|
|
72
|
+
id: "impira/invoices",
|
|
73
|
+
},
|
|
74
|
+
{
|
|
75
|
+
description: "An application to compare different document question answering models.",
|
|
76
|
+
id: "merve/compare_docvqa_models",
|
|
77
|
+
},
|
|
78
|
+
],
|
|
79
|
+
summary:
|
|
80
|
+
"Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
|
|
81
|
+
widgetModels: ["impira/layoutlm-invoices"],
|
|
82
|
+
youtubeId: "",
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
export default taskData;
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Inference code generated from the JSON schema spec in ./spec
|
|
3
|
+
*
|
|
4
|
+
* Using src/scripts/inference-codegen
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Inputs for Document Question Answering inference
|
|
8
|
+
*/
|
|
9
|
+
export interface DocumentQuestionAnsweringInput {
|
|
10
|
+
/**
|
|
11
|
+
* One (document, question) pair to answer
|
|
12
|
+
*/
|
|
13
|
+
inputs: DocumentQuestionAnsweringInputData;
|
|
14
|
+
/**
|
|
15
|
+
* Additional inference parameters
|
|
16
|
+
*/
|
|
17
|
+
parameters?: DocumentQuestionAnsweringParameters;
|
|
18
|
+
[property: string]: unknown;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* One (document, question) pair to answer
|
|
22
|
+
*/
|
|
23
|
+
export interface DocumentQuestionAnsweringInputData {
|
|
24
|
+
/**
|
|
25
|
+
* The image on which the question is asked
|
|
26
|
+
*/
|
|
27
|
+
image: unknown;
|
|
28
|
+
/**
|
|
29
|
+
* A question to ask of the document
|
|
30
|
+
*/
|
|
31
|
+
question: string;
|
|
32
|
+
[property: string]: unknown;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Additional inference parameters
|
|
36
|
+
*
|
|
37
|
+
* Additional inference parameters for Document Question Answering
|
|
38
|
+
*/
|
|
39
|
+
export interface DocumentQuestionAnsweringParameters {
|
|
40
|
+
/**
|
|
41
|
+
* If the words in the document are too long to fit with the question for the model, it will
|
|
42
|
+
* be split in several chunks with some overlap. This argument controls the size of that
|
|
43
|
+
* overlap.
|
|
44
|
+
*/
|
|
45
|
+
doc_stride?: number;
|
|
46
|
+
/**
|
|
47
|
+
* Whether to accept impossible as an answer
|
|
48
|
+
*/
|
|
49
|
+
handle_impossible_answer?: boolean;
|
|
50
|
+
/**
|
|
51
|
+
* Language to use while running OCR. Defaults to english.
|
|
52
|
+
*/
|
|
53
|
+
lang?: string;
|
|
54
|
+
/**
|
|
55
|
+
* The maximum length of predicted answers (e.g., only answers with a shorter length are
|
|
56
|
+
* considered).
|
|
57
|
+
*/
|
|
58
|
+
max_answer_len?: number;
|
|
59
|
+
/**
|
|
60
|
+
* The maximum length of the question after tokenization. It will be truncated if needed.
|
|
61
|
+
*/
|
|
62
|
+
max_question_len?: number;
|
|
63
|
+
/**
|
|
64
|
+
* The maximum length of the total sentence (context + question) in tokens of each chunk
|
|
65
|
+
* passed to the model. The context will be split in several chunks (using doc_stride as
|
|
66
|
+
* overlap) if needed.
|
|
67
|
+
*/
|
|
68
|
+
max_seq_len?: number;
|
|
69
|
+
/**
|
|
70
|
+
* The number of answers to return (will be chosen by order of likelihood). Can return less
|
|
71
|
+
* than top_k answers if there are not enough options available within the context.
|
|
72
|
+
*/
|
|
73
|
+
top_k?: number;
|
|
74
|
+
/**
|
|
75
|
+
* A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
|
|
76
|
+
* skip the OCR step and use the provided bounding boxes instead.
|
|
77
|
+
*/
|
|
78
|
+
word_boxes?: WordBox[];
|
|
79
|
+
[property: string]: unknown;
|
|
80
|
+
}
|
|
81
|
+
export type WordBox = number[] | string;
|
|
82
|
+
export type DocumentQuestionAnsweringOutput = DocumentQuestionAnsweringOutputElement[];
|
|
83
|
+
/**
|
|
84
|
+
* Outputs of inference for the Document Question Answering task
|
|
85
|
+
*/
|
|
86
|
+
export interface DocumentQuestionAnsweringOutputElement {
|
|
87
|
+
/**
|
|
88
|
+
* The answer to the question.
|
|
89
|
+
*/
|
|
90
|
+
answer: string;
|
|
91
|
+
/**
|
|
92
|
+
* The end word index of the answer (in the OCR’d version of the input or provided word
|
|
93
|
+
* boxes).
|
|
94
|
+
*/
|
|
95
|
+
end: number;
|
|
96
|
+
/**
|
|
97
|
+
* The probability associated to the answer.
|
|
98
|
+
*/
|
|
99
|
+
score: number;
|
|
100
|
+
/**
|
|
101
|
+
* The start word index of the answer (in the OCR’d version of the input or provided word
|
|
102
|
+
* boxes).
|
|
103
|
+
*/
|
|
104
|
+
start: number;
|
|
105
|
+
/**
|
|
106
|
+
* The index of each word/box pair that is in the answer
|
|
107
|
+
*/
|
|
108
|
+
words: number[];
|
|
109
|
+
[property: string]: unknown;
|
|
110
|
+
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/document-question-answering/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Inputs for Document Question Answering inference",
|
|
5
|
+
"title": "DocumentQuestionAnsweringInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"properties": {
|
|
8
|
+
"inputs": {
|
|
9
|
+
"description": "One (document, question) pair to answer",
|
|
10
|
+
"type": "object",
|
|
11
|
+
"title": "DocumentQuestionAnsweringInputData",
|
|
12
|
+
"properties": {
|
|
13
|
+
"image": {
|
|
14
|
+
"description": "The image on which the question is asked"
|
|
15
|
+
},
|
|
16
|
+
"question": {
|
|
17
|
+
"type": "string",
|
|
18
|
+
"description": "A question to ask of the document"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"required": ["image", "question"]
|
|
22
|
+
},
|
|
23
|
+
"parameters": {
|
|
24
|
+
"description": "Additional inference parameters",
|
|
25
|
+
"$ref": "#/$defs/DocumentQuestionAnsweringParameters"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"$defs": {
|
|
29
|
+
"DocumentQuestionAnsweringParameters": {
|
|
30
|
+
"title": "DocumentQuestionAnsweringParameters",
|
|
31
|
+
"description": "Additional inference parameters for Document Question Answering",
|
|
32
|
+
"type": "object",
|
|
33
|
+
"properties": {
|
|
34
|
+
"doc_stride": {
|
|
35
|
+
"type": "integer",
|
|
36
|
+
"description": "If the words in the document are too long to fit with the question for the model, it will be split in several chunks with some overlap. This argument controls the size of that overlap."
|
|
37
|
+
},
|
|
38
|
+
"handle_impossible_answer": {
|
|
39
|
+
"type": "boolean",
|
|
40
|
+
"description": "Whether to accept impossible as an answer"
|
|
41
|
+
},
|
|
42
|
+
"lang": {
|
|
43
|
+
"type": "string",
|
|
44
|
+
"description": "Language to use while running OCR. Defaults to english."
|
|
45
|
+
},
|
|
46
|
+
"max_answer_len": {
|
|
47
|
+
"type": "integer",
|
|
48
|
+
"description": "The maximum length of predicted answers (e.g., only answers with a shorter length are considered)."
|
|
49
|
+
},
|
|
50
|
+
"max_seq_len": {
|
|
51
|
+
"type": "integer",
|
|
52
|
+
"description": "The maximum length of the total sentence (context + question) in tokens of each chunk passed to the model. The context will be split in several chunks (using doc_stride as overlap) if needed."
|
|
53
|
+
},
|
|
54
|
+
"max_question_len": {
|
|
55
|
+
"type": "integer",
|
|
56
|
+
"description": "The maximum length of the question after tokenization. It will be truncated if needed."
|
|
57
|
+
},
|
|
58
|
+
"top_k": {
|
|
59
|
+
"type": "integer",
|
|
60
|
+
"description": "The number of answers to return (will be chosen by order of likelihood). Can return less than top_k answers if there are not enough options available within the context."
|
|
61
|
+
},
|
|
62
|
+
"word_boxes": {
|
|
63
|
+
"type": "array",
|
|
64
|
+
"description": "A list of words and bounding boxes (normalized 0->1000). If provided, the inference will skip the OCR step and use the provided bounding boxes instead.",
|
|
65
|
+
"items": {
|
|
66
|
+
"anyOf": [
|
|
67
|
+
{
|
|
68
|
+
"type": "string"
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
"type": "array",
|
|
72
|
+
"items": {
|
|
73
|
+
"type": "number"
|
|
74
|
+
},
|
|
75
|
+
"maxLength": 4,
|
|
76
|
+
"minLength": 4
|
|
77
|
+
}
|
|
78
|
+
]
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
},
|
|
84
|
+
"required": ["inputs"]
|
|
85
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/document-question-answering/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Outputs of inference for the Document Question Answering task",
|
|
5
|
+
"title": "DocumentQuestionAnsweringOutput",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"items": {
|
|
8
|
+
"type": "object",
|
|
9
|
+
"properties": {
|
|
10
|
+
"answer": {
|
|
11
|
+
"type": "string",
|
|
12
|
+
"description": "The answer to the question."
|
|
13
|
+
},
|
|
14
|
+
"score": {
|
|
15
|
+
"type": "number",
|
|
16
|
+
"description": "The probability associated to the answer."
|
|
17
|
+
},
|
|
18
|
+
"start": {
|
|
19
|
+
"type": "integer",
|
|
20
|
+
"description": "The start word index of the answer (in the OCR\u2019d version of the input or provided word boxes)."
|
|
21
|
+
},
|
|
22
|
+
"end": {
|
|
23
|
+
"type": "integer",
|
|
24
|
+
"description": "The end word index of the answer (in the OCR\u2019d version of the input or provided word boxes)."
|
|
25
|
+
},
|
|
26
|
+
"words": {
|
|
27
|
+
"type": "array",
|
|
28
|
+
"items": {
|
|
29
|
+
"type": "integer"
|
|
30
|
+
},
|
|
31
|
+
"description": "The index of each word/box pair that is in the answer"
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"required": ["answer", "score", "start", "end", "words"]
|
|
35
|
+
}
|
|
36
|
+
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
## Use Cases
|
|
2
|
+
|
|
3
|
+
### Transfer Learning
|
|
4
|
+
|
|
5
|
+
Models trained on a specific dataset can learn features about the data. For instance, a model trained on an English poetry dataset learns English grammar at a very high level. This information can be transferred to a new model that is going to be trained on tweets. This process of extracting features and transferring to another model is called transfer learning. One can pass their dataset through a feature extraction pipeline and feed the result to a classifier.
|
|
6
|
+
|
|
7
|
+
### Retrieval and Reranking
|
|
8
|
+
|
|
9
|
+
Retrieval is the process of obtaining relevant documents or information based on a user's search query. In the context of NLP, retrieval systems aim to find relevant text passages or documents from a large corpus of data that match the user's query. The goal is to return a set of results that are likely to be useful to the user. On the other hand, reranking is a technique used to improve the quality of retrieval results by reordering them based on their relevance to the query.
|
|
10
|
+
|
|
11
|
+
### Retrieval Augmented Generation
|
|
12
|
+
|
|
13
|
+
Retrieval-augmented generation (RAG) is a technique in which user inputs to generative models are first queried through a knowledge base, and the most relevant information from the knowledge base is used to augment the prompt to reduce hallucinations during generation. Feature extraction models (primarily retrieval and reranking models) can be used in RAG to reduce model hallucinations and ground the model.
|
|
14
|
+
|
|
15
|
+
## Inference
|
|
16
|
+
|
|
17
|
+
You can infer feature extraction models using `pipeline` of transformers library.
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from transformers import pipeline
|
|
21
|
+
checkpoint = "facebook/bart-base"
|
|
22
|
+
feature_extractor = pipeline("feature-extraction", framework="pt", model=checkpoint)
|
|
23
|
+
text = "Transformers is an awesome library!"
|
|
24
|
+
|
|
25
|
+
#Reducing along the first dimension to get a 768 dimensional array
|
|
26
|
+
feature_extractor(text,return_tensors = "pt")[0].numpy().mean(axis=0)
|
|
27
|
+
|
|
28
|
+
'''tensor([[[ 2.5834, 2.7571, 0.9024, ..., 1.5036, -0.0435, -0.8603],
|
|
29
|
+
[-1.2850, -1.0094, -2.0826, ..., 1.5993, -0.9017, 0.6426],
|
|
30
|
+
[ 0.9082, 0.3896, -0.6843, ..., 0.7061, 0.6517, 1.0550],
|
|
31
|
+
...,
|
|
32
|
+
[ 0.6919, -1.1946, 0.2438, ..., 1.3646, -1.8661, -0.1642],
|
|
33
|
+
[-0.1701, -2.0019, -0.4223, ..., 0.3680, -1.9704, -0.0068],
|
|
34
|
+
[ 0.2520, -0.6869, -1.0582, ..., 0.5198, -2.2106, 0.4547]]])'''
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
A very popular library for training similarity and search models is called `sentence-transformers`. To get started, install the library.
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install -U sentence-transformers
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
You can infer with `sentence-transformers` models as follows.
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from sentence_transformers import SentenceTransformer
|
|
47
|
+
|
|
48
|
+
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
|
49
|
+
sentences = [
|
|
50
|
+
"The weather is lovely today.",
|
|
51
|
+
"It's so sunny outside!",
|
|
52
|
+
"He drove to the stadium.",
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
embeddings = model.encode(sentences)
|
|
56
|
+
similarities = model.similarity(embeddings, embeddings)
|
|
57
|
+
print(similarities)
|
|
58
|
+
# tensor([[1.0000, 0.6660, 0.1046],
|
|
59
|
+
# [0.6660, 1.0000, 0.1411],
|
|
60
|
+
# [0.1046, 0.1411, 1.0000]])
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Text Embeddings Inference
|
|
64
|
+
|
|
65
|
+
[Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference) is a toolkit to easily serve feature extraction models using few lines of code.
|
|
66
|
+
|
|
67
|
+
## Useful resources
|
|
68
|
+
|
|
69
|
+
- [Documentation for feature extraction task in 🤗Transformers](https://huggingface.co/docs/transformers/main_classes/feature_extractor)
|
|
70
|
+
- [Introduction to MTEB Benchmark](https://huggingface.co/blog/mteb)
|
|
71
|
+
- [Cookbook: Simple RAG for GitHub issues using Hugging Face Zephyr and LangChain](https://huggingface.co/learn/cookbook/rag_zephyr_langchain)
|
|
72
|
+
- [sentence-transformers organization on Hugging Face Hub](https://huggingface.co/sentence-transformers)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import type { TaskDataCustom } from "../index.js";
|
|
2
|
+
|
|
3
|
+
const taskData: TaskDataCustom = {
|
|
4
|
+
datasets: [
|
|
5
|
+
{
|
|
6
|
+
description:
|
|
7
|
+
"Wikipedia dataset containing cleaned articles of all languages. Can be used to train `feature-extraction` models.",
|
|
8
|
+
id: "wikipedia",
|
|
9
|
+
},
|
|
10
|
+
],
|
|
11
|
+
demo: {
|
|
12
|
+
inputs: [
|
|
13
|
+
{
|
|
14
|
+
label: "Input",
|
|
15
|
+
content: "India, officially the Republic of India, is a country in South Asia.",
|
|
16
|
+
type: "text",
|
|
17
|
+
},
|
|
18
|
+
],
|
|
19
|
+
outputs: [
|
|
20
|
+
{
|
|
21
|
+
table: [
|
|
22
|
+
["Dimension 1", "Dimension 2", "Dimension 3"],
|
|
23
|
+
["2.583383083343506", "2.757075071334839", "0.9023529887199402"],
|
|
24
|
+
["8.29393482208252", "1.1071064472198486", "2.03399395942688"],
|
|
25
|
+
["-0.7754912972450256", "-1.647324562072754", "-0.6113331913948059"],
|
|
26
|
+
["0.07087723910808563", "1.5942802429199219", "1.4610432386398315"],
|
|
27
|
+
],
|
|
28
|
+
type: "tabular",
|
|
29
|
+
},
|
|
30
|
+
],
|
|
31
|
+
},
|
|
32
|
+
metrics: [],
|
|
33
|
+
models: [
|
|
34
|
+
{
|
|
35
|
+
description: "A powerful feature extraction model for natural language processing tasks.",
|
|
36
|
+
id: "thenlper/gte-large",
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
description: "A strong feature extraction model for retrieval.",
|
|
40
|
+
id: "Alibaba-NLP/gte-Qwen1.5-7B-instruct",
|
|
41
|
+
},
|
|
42
|
+
],
|
|
43
|
+
spaces: [
|
|
44
|
+
{
|
|
45
|
+
description: "A leaderboard to rank text feature extraction models based on a benchmark.",
|
|
46
|
+
id: "mteb/leaderboard",
|
|
47
|
+
},
|
|
48
|
+
{
|
|
49
|
+
description: "A leaderboard to rank best feature extraction models based on human feedback.",
|
|
50
|
+
id: "mteb/arena",
|
|
51
|
+
},
|
|
52
|
+
],
|
|
53
|
+
summary: "Feature extraction is the task of extracting features learnt in a model.",
|
|
54
|
+
widgetModels: ["facebook/bart-base"],
|
|
55
|
+
};
|
|
56
|
+
|
|
57
|
+
export default taskData;
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
 * Inference code generated from the JSON schema spec in ./spec
 *
 * Using src/scripts/inference-codegen
 */

/**
 * Feature Extraction Output: a nested array of floats, as described by
 * ./spec/output.json.
 */
export type FeatureExtractionOutput = Array<number[]>;

/**
 * Feature Extraction Input.
 *
 * Auto-generated from TEI specs.
 * For more details, check out
 * https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.
 */
export interface FeatureExtractionInput {
	/**
	 * The text to embed.
	 */
	inputs: string;
	/**
	 * Whether to normalize the computed embeddings (the spec defaults this to true).
	 */
	normalize?: boolean;
	/**
	 * The name of the prompt that should be used for encoding. If not set, no prompt
	 * will be applied.
	 *
	 * Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
	 *
	 * For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",
	 * ...},
	 * then the sentence "What is the capital of France?" will be encoded as
	 * "query: What is the capital of France?" because the prompt text will be prepended before
	 * any text to encode.
	 */
	prompt_name?: string;
	/**
	 * Whether to truncate the inputs — presumably to the model's maximum sequence
	 * length; TODO confirm against TEI docs. The spec defaults this to false.
	 */
	truncate?: boolean;
	/**
	 * Which side of the input to drop when truncating (spec default: Right).
	 */
	truncation_direction?: FeatureExtractionInputTruncationDirection;
	// Codegen escape hatch: extra, implementation-specific properties are allowed.
	[property: string]: unknown;
}

export type FeatureExtractionInputTruncationDirection = "Left" | "Right";
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/feature-extraction/input.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Feature Extraction Input.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
|
|
5
|
+
"title": "FeatureExtractionInput",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["inputs"],
|
|
8
|
+
"properties": {
|
|
9
|
+
"inputs": {
|
|
10
|
+
"type": "string",
|
|
11
|
+
"description": "The text to embed."
|
|
12
|
+
},
|
|
13
|
+
"normalize": {
|
|
14
|
+
"type": "boolean",
|
|
15
|
+
"default": "true",
|
|
16
|
+
"example": "true"
|
|
17
|
+
},
|
|
18
|
+
"prompt_name": {
|
|
19
|
+
"type": "string",
|
|
20
|
+
"description": "The name of the prompt that should be used for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.",
|
|
21
|
+
"default": "null",
|
|
22
|
+
"example": "null",
|
|
23
|
+
"nullable": true
|
|
24
|
+
},
|
|
25
|
+
"truncate": {
|
|
26
|
+
"type": "boolean",
|
|
27
|
+
"default": "false",
|
|
28
|
+
"example": "false",
|
|
29
|
+
"nullable": true
|
|
30
|
+
},
|
|
31
|
+
"truncation_direction": {
|
|
32
|
+
"allOf": [
|
|
33
|
+
{
|
|
34
|
+
"$ref": "#/$defs/FeatureExtractionInputTruncationDirection"
|
|
35
|
+
}
|
|
36
|
+
],
|
|
37
|
+
"default": "Right"
|
|
38
|
+
}
|
|
39
|
+
},
|
|
40
|
+
"$defs": {
|
|
41
|
+
"FeatureExtractionInputTruncationDirection": {
|
|
42
|
+
"type": "string",
|
|
43
|
+
"enum": ["Left", "Right"],
|
|
44
|
+
"title": "FeatureExtractionInputTruncationDirection"
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$id": "/inference/schemas/feature-extraction/output.json",
|
|
3
|
+
"$schema": "http://json-schema.org/draft-06/schema#",
|
|
4
|
+
"description": "Feature Extraction Output.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
|
|
5
|
+
"title": "FeatureExtractionOutput",
|
|
6
|
+
"type": "array",
|
|
7
|
+
"$defs": {},
|
|
8
|
+
"items": {
|
|
9
|
+
"type": "array",
|
|
10
|
+
"items": {
|
|
11
|
+
"type": "number",
|
|
12
|
+
"format": "float"
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
}
|