@huggingface/tasks 0.0.9 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +6 -1
- package/dist/index.js +70 -39
- package/dist/index.mjs +70 -39
- package/package.json +1 -1
- package/src/default-widget-inputs.ts +1 -1
- package/src/index.ts +3 -4
- package/src/library-to-tasks.ts +1 -0
- package/src/library-ui-elements.ts +18 -13
- package/src/model-data.ts +2 -116
- package/src/model-libraries.ts +1 -0
- package/src/tasks/depth-estimation/data.ts +5 -3
- package/src/tasks/document-question-answering/data.ts +8 -0
- package/src/tasks/image-to-text/about.md +13 -0
- package/src/tasks/image-to-text/data.ts +10 -14
- package/src/tasks/index.ts +3 -3
- package/src/tasks/object-detection/data.ts +5 -2
- package/src/tasks/table-question-answering/about.md +1 -1
- package/src/tasks/text-generation/about.md +4 -0
- package/src/tasks/text-to-image/data.ts +13 -11
- package/src/tasks/text-to-video/data.ts +3 -3
- package/src/tasks/visual-question-answering/data.ts +4 -0
- package/src/widget-example.ts +118 -0
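For orientation before the file-by-file diff: the consumer-facing changes are a new `setfit` model library (with its own task mapping and code snippet), the widget-example types moving into a dedicated `widget-example` module, and `cardData.base_model` widening from `string` to `string | string[]`. A minimal TypeScript sketch of the first of these, assuming the package is consumed through its root entry point as declared in `dist/index.d.ts` below:

```ts
import { LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS, ModelLibrary } from "@huggingface/tasks";

// New in 0.1.1: SetFit is a first-class model library with a task mapping.
console.log(ModelLibrary.setfit); // "SetFit"
console.log(LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS.setfit); // ["text-classification"]
```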
package/dist/index.d.ts
CHANGED
````diff
@@ -23,6 +23,7 @@ declare enum ModelLibrary {
     "pyannote-audio" = "pyannote.audio",
     "sample-factory" = "Sample Factory",
     "sentence-transformers" = "Sentence Transformers",
+    "setfit" = "SetFit",
     "sklearn" = "Scikit-learn",
     "spacy" = "spaCy",
     "span-marker" = "SpanMarker",
@@ -452,6 +453,9 @@ declare const PIPELINE_TYPES_SET: Set<"other" | "text-classification" | "token-c
  */
 declare const LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS: Partial<Record<ModelLibraryKey, PipelineType[]>>;
 
+/**
+ * See default-widget-inputs.ts for the default widget inputs, this files only contains the types
+ */
 type TableData = Record<string, (string | number)[]>;
 type WidgetExampleOutputLabels = Array<{
     label: string;
@@ -526,6 +530,7 @@ interface WidgetExampleSentenceSimilarityInput<TOutput = WidgetExampleOutput> ex
 type WidgetExample<TOutput = WidgetExampleOutput> = WidgetExampleTextInput<TOutput> | WidgetExampleTextAndContextInput<TOutput> | WidgetExampleTextAndTableInput<TOutput> | WidgetExampleAssetInput<TOutput> | WidgetExampleAssetAndPromptInput<TOutput> | WidgetExampleAssetAndTextInput<TOutput> | WidgetExampleAssetAndZeroShotInput<TOutput> | WidgetExampleStructuredDataInput<TOutput> | WidgetExampleTableDataInput<TOutput> | WidgetExampleZeroShotTextInput<TOutput> | WidgetExampleSentenceSimilarityInput<TOutput>;
 type KeysOfUnion<T> = T extends unknown ? keyof T : never;
 type WidgetExampleAttribute = KeysOfUnion<WidgetExample>;
+
 declare enum InferenceDisplayability {
     /**
      * Yes
@@ -618,7 +623,7 @@ interface ModelData {
         inference?: boolean | {
             parameters?: Record<string, unknown>;
         };
-        base_model?: string;
+        base_model?: string | string[];
     };
     /**
      * Library name
````
package/dist/index.js
CHANGED
````diff
@@ -66,6 +66,7 @@ var LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS = {
   peft: ["text-generation"],
   "pyannote-audio": ["automatic-speech-recognition"],
   "sentence-transformers": ["feature-extraction", "sentence-similarity"],
+  setfit: ["text-classification"],
   sklearn: ["tabular-classification", "tabular-regression", "text-classification"],
   spacy: ["token-classification", "text-classification", "sentence-similarity"],
   "span-marker": ["token-classification"],
@@ -118,7 +119,7 @@ var asteroid = (model) => [
 model = BaseModel.from_pretrained("${model.id}")`
 ];
 function get_base_diffusers_model(model) {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 var bertopic = (model) => [
   `from bertopic import BERTopic
@@ -218,8 +219,8 @@ var paddlenlp = (model) => {
       [
         `from paddlenlp.transformers import AutoTokenizer, ${architecture}`,
         "",
-        `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`,
-        `model = ${architecture}.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`
+        `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+        `model = ${architecture}.from_pretrained("${model.id}", from_hf_hub=True)`
       ].join("\n")
     ];
   } else {
@@ -228,8 +229,8 @@ var paddlenlp = (model) => {
       `# \u26A0\uFE0F Type of model unknown`,
       `from paddlenlp.transformers import AutoTokenizer, AutoModel`,
       "",
-      `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`,
-      `model = AutoModel.from_pretrained("${model.id}"${model.private ? ", use_auth_token=True" : ""}, from_hf_hub=True)`
+      `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+      `model = AutoModel.from_pretrained("${model.id}", from_hf_hub=True)`
     ].join("\n")
   ];
 }
@@ -365,6 +366,11 @@ var sentenceTransformers = (model) => [
 
 model = SentenceTransformer("${model.id}")`
 ];
+var setfit = (model) => [
+  `from setfit import SetFitModel
+
+model = SetFitModel.from_pretrained("${model.id}")`
+];
 var spacy = (model) => [
   `!pip install https://huggingface.co/${model.id}/resolve/main/${nameWithoutNamespace(model.id)}-any-py3-none-any.whl
 
@@ -637,6 +643,13 @@ var MODEL_LIBRARIES_UI_ELEMENTS = {
     docsUrl: "https://huggingface.co/docs/hub/sentence-transformers",
     snippets: sentenceTransformers
   },
+  setfit: {
+    btnLabel: "setfit",
+    repoName: "setfit",
+    repoUrl: "https://github.com/huggingface/setfit",
+    docsUrl: "https://huggingface.co/docs/hub/setfit",
+    snippets: setfit
+  },
   sklearn: {
     btnLabel: "Scikit-learn",
     repoName: "Scikit-learn",
@@ -2343,6 +2356,10 @@ var taskData5 = {
     {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa"
+    },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large"
     }
   ],
   spaces: [
@@ -2353,6 +2370,10 @@ var taskData5 = {
     {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices"
+    },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models"
     }
   ],
   summary: "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
@@ -2696,30 +2717,26 @@ var taskData10 = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/
+      id: "Salesforce/blip2-opt-2.7b"
     },
     {
-      description: "A
-      id: "
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224"
     },
     {
       description: "A strong optical character recognition model.",
-      id: "
-    },
-    {
-      description: "A strong visual question answering model for scientific diagrams.",
-      id: "google/pix2struct-ai2d-base"
+      id: "facebook/nougat-base"
     },
     {
-      description: "A
-      id: "
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base"
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf"
     }
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models"
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning"
@@ -2881,7 +2898,6 @@ var taskData12 = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50"
     },
@@ -2891,9 +2907,13 @@ var taskData12 = {
     }
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard"
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "
+      id: "merve/owlv2"
     },
     {
       description: "An application that contains various object detection models to try from.",
@@ -2939,14 +2959,16 @@ var taskData13 = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large"
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "
+      id: "facebook/dpt-dinov2-large-kitti"
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold"
     }
   ],
   spaces: [
@@ -3500,12 +3522,12 @@ var taskData22 = {
   ],
   models: [
     {
-      description: "
-      id: "
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0"
     },
     {
-      description: "A
-      id: "
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl"
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -3522,19 +3544,23 @@ var taskData22 = {
       id: "stabilityai/stable-diffusion"
     },
     {
-      description: "
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory"
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF"
     },
     {
-      description: "
-      id: "
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl"
     },
     {
-      description: "
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E"
     },
     {
-      description: "
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
      id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI"
     }
   ],
@@ -4026,7 +4052,7 @@ var taskData28 = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "
+      id: "Vchitect/LaVie"
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -4034,7 +4060,7 @@ var taskData28 = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "
+      id: "hotshotco/Hotshot-XL"
     }
   ],
   spaces: [
@@ -4044,7 +4070,7 @@ var taskData28 = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "
+      id: "Vchitect/LaVie"
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
@@ -4274,6 +4300,10 @@ var taskData31 = {
     }
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct"
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa"
@@ -4441,8 +4471,8 @@ var TASKS_MODEL_LIBRARIES = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -4459,7 +4489,7 @@ var TASKS_MODEL_LIBRARIES = {
   "tabular-classification": ["sklearn"],
   "tabular-regression": ["sklearn"],
   "tabular-to-text": ["transformers"],
-  "text-classification": ["adapter-transformers", "spacy", "transformers", "transformers.js"],
+  "text-classification": ["adapter-transformers", "setfit", "spacy", "transformers", "transformers.js"],
   "text-generation": ["transformers", "transformers.js"],
   "text-retrieval": [],
   "text-to-image": ["diffusers"],
@@ -4566,6 +4596,7 @@ var ModelLibrary = /* @__PURE__ */ ((ModelLibrary2) => {
   ModelLibrary2["pyannote-audio"] = "pyannote.audio";
   ModelLibrary2["sample-factory"] = "Sample Factory";
   ModelLibrary2["sentence-transformers"] = "Sentence Transformers";
+  ModelLibrary2["setfit"] = "SetFit";
   ModelLibrary2["sklearn"] = "Scikit-learn";
   ModelLibrary2["spacy"] = "spaCy";
   ModelLibrary2["span-marker"] = "SpanMarker";
````
package/dist/index.mjs
CHANGED
The ESM bundle receives exactly the same changes as `package/dist/index.js` above (the new `setfit` library key, mapping, snippet and UI entry, the `?.toString()` call in `get_base_diffusers_model`, the simplified `paddlenlp` snippets, and the updated task data and `TASKS_MODEL_LIBRARIES` entries); only the hunk offsets differ.
package/package.json
CHANGED
package/src/index.ts
CHANGED
````diff
@@ -16,9 +16,8 @@ export {
 } from "./pipelines";
 export { ModelLibrary, ALL_DISPLAY_MODEL_LIBRARY_KEYS } from "./model-libraries";
 export type { ModelLibraryKey } from "./model-libraries";
-export {
-  ModelData,
-  TransformersInfo,
+export type { ModelData, TransformersInfo } from "./model-data";
+export type {
   WidgetExample,
   WidgetExampleAttribute,
   WidgetExampleAssetAndPromptInput,
@@ -37,7 +36,7 @@ export {
   WidgetExampleOutputLabels,
   WidgetExampleOutputAnswerScore,
   WidgetExampleOutputText,
-} from "./model-data";
+} from "./widget-example";
 export { InferenceDisplayability } from "./model-data";
 
 export { TAG_NFAA_CONTENT, OTHER_TAGS_SUGGESTIONS, TAG_TEXT_GENERATION_INFERENCE, TAG_CUSTOM_CODE } from "./tags";
````
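The export reshuffle above is type-only, so existing imports from the package root keep working. A quick sketch against the re-exported types:

```ts
import type { WidgetExample } from "@huggingface/tasks";

// WidgetExampleTextInput is one branch of the WidgetExample union,
// so a plain text example with a title still type-checks unchanged.
const example: WidgetExample = {
  text: "I love this library!",
  example_title: "Positive review",
};
```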
package/src/library-to-tasks.ts
CHANGED
````diff
@@ -31,6 +31,7 @@ export const LIBRARY_TASK_MAPPING_EXCLUDING_TRANSFORMERS: Partial<Record<ModelLi
   peft: ["text-generation"],
   "pyannote-audio": ["automatic-speech-recognition"],
   "sentence-transformers": ["feature-extraction", "sentence-similarity"],
+  setfit: ["text-classification"],
   sklearn: ["tabular-classification", "tabular-regression", "text-classification"],
   spacy: ["token-classification", "text-classification", "sentence-similarity"],
   "span-marker": ["token-classification"],
````
package/src/library-ui-elements.ts
CHANGED
````diff
@@ -72,7 +72,7 @@ model = BaseModel.from_pretrained("${model.id}")`,
 ];
 
 function get_base_diffusers_model(model: ModelData): string {
-  return model.cardData?.base_model ?? "fill-in-base-model";
+  return model.cardData?.base_model?.toString() ?? "fill-in-base-model";
 }
 
 const bertopic = (model: ModelData) => [
@@ -187,12 +187,8 @@ const paddlenlp = (model: ModelData) => {
       [
         `from paddlenlp.transformers import AutoTokenizer, ${architecture}`,
         "",
-        `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${
-          model.private ? ", use_auth_token=True" : ""
-        }, from_hf_hub=True)`,
-        `model = ${architecture}.from_pretrained("${model.id}"${
-          model.private ? ", use_auth_token=True" : ""
-        }, from_hf_hub=True)`,
+        `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+        `model = ${architecture}.from_pretrained("${model.id}", from_hf_hub=True)`,
       ].join("\n"),
     ];
   } else {
@@ -201,12 +197,8 @@ const paddlenlp = (model: ModelData) => {
       `# ⚠️ Type of model unknown`,
       `from paddlenlp.transformers import AutoTokenizer, AutoModel`,
       "",
-      `tokenizer = AutoTokenizer.from_pretrained("${model.id}"${
-        model.private ? ", use_auth_token=True" : ""
-      }, from_hf_hub=True)`,
-      `model = AutoModel.from_pretrained("${model.id}"${
-        model.private ? ", use_auth_token=True" : ""
-      }, from_hf_hub=True)`,
+      `tokenizer = AutoTokenizer.from_pretrained("${model.id}", from_hf_hub=True)`,
+      `model = AutoModel.from_pretrained("${model.id}", from_hf_hub=True)`,
     ].join("\n"),
   ];
 }
@@ -358,6 +350,12 @@ const sentenceTransformers = (model: ModelData) => [
 model = SentenceTransformer("${model.id}")`,
 ];
 
+const setfit = (model: ModelData) => [
+  `from setfit import SetFitModel
+
+model = SetFitModel.from_pretrained("${model.id}")`,
+];
+
 const spacy = (model: ModelData) => [
   `!pip install https://huggingface.co/${model.id}/resolve/main/${nameWithoutNamespace(model.id)}-any-py3-none-any.whl
 
@@ -661,6 +659,13 @@ export const MODEL_LIBRARIES_UI_ELEMENTS: Partial<Record<ModelLibraryKey, Librar
     docsUrl: "https://huggingface.co/docs/hub/sentence-transformers",
     snippets: sentenceTransformers,
   },
+  setfit: {
+    btnLabel: "setfit",
+    repoName: "setfit",
+    repoUrl: "https://github.com/huggingface/setfit",
+    docsUrl: "https://huggingface.co/docs/hub/setfit",
+    snippets: setfit,
+  },
   sklearn: {
     btnLabel: "Scikit-learn",
     repoName: "Scikit-learn",
````
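The new snippet generator can be exercised directly. A minimal sketch, importing via the source paths purely for illustration and using a placeholder model id; only `id` is read by this particular snippet:

```ts
import type { ModelData } from "./model-data";
import { MODEL_LIBRARIES_UI_ELEMENTS } from "./library-ui-elements";

// A minimal stub: the SetFit snippet only interpolates `model.id`.
const model = { id: "your-username/your-setfit-model" } as ModelData;
const [snippet] = MODEL_LIBRARIES_UI_ELEMENTS.setfit?.snippets(model) ?? [];
console.log(snippet);
// from setfit import SetFitModel
//
// model = SetFitModel.from_pretrained("your-username/your-setfit-model")
```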
package/src/model-data.ts
CHANGED
````diff
@@ -1,119 +1,5 @@
 import type { PipelineType } from "./pipelines";
-
-type TableData = Record<string, (string | number)[]>;
-
-//#region outputs
-export type WidgetExampleOutputLabels = Array<{ label: string; score: number }>;
-export interface WidgetExampleOutputAnswerScore {
-  answer: string;
-  score: number;
-}
-export interface WidgetExampleOutputText {
-  text: string;
-}
-export interface WidgetExampleOutputUrl {
-  url: string;
-}
-
-export type WidgetExampleOutput =
-  | WidgetExampleOutputLabels
-  | WidgetExampleOutputAnswerScore
-  | WidgetExampleOutputText
-  | WidgetExampleOutputUrl;
-//#endregion
-
-export interface WidgetExampleBase<TOutput> {
-  example_title?: string;
-  group?: string;
-  /**
-   * Potential overrides to API parameters for this specific example
-   * (takes precedences over the model card metadata's inference.parameters)
-   */
-  parameters?: {
-    /// token-classification
-    aggregation_strategy?: string;
-    /// text-generation
-    top_k?: number;
-    top_p?: number;
-    temperature?: number;
-    max_new_tokens?: number;
-    do_sample?: boolean;
-    /// text-to-image
-    negative_prompt?: string;
-    guidance_scale?: number;
-    num_inference_steps?: number;
-  };
-  /**
-   * Optional output
-   */
-  output?: TOutput;
-}
-
-export interface WidgetExampleTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
-  text: string;
-}
-
-export interface WidgetExampleTextAndContextInput<TOutput = WidgetExampleOutput>
-  extends WidgetExampleTextInput<TOutput> {
-  context: string;
-}
-
-export interface WidgetExampleTextAndTableInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
-  table: TableData;
-}
-
-export interface WidgetExampleAssetInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
-  src: string;
-}
-export interface WidgetExampleAssetAndPromptInput<TOutput = WidgetExampleOutput>
-  extends WidgetExampleAssetInput<TOutput> {
-  prompt: string;
-}
-
-export type WidgetExampleAssetAndTextInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
-  WidgetExampleTextInput<TOutput>;
-
-export type WidgetExampleAssetAndZeroShotInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
-  WidgetExampleZeroShotTextInput<TOutput>;
-
-export interface WidgetExampleStructuredDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
-  structured_data: TableData;
-}
-
-export interface WidgetExampleTableDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
-  table: TableData;
-}
-
-export interface WidgetExampleZeroShotTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
-  text: string;
-  candidate_labels: string;
-  multi_class: boolean;
-}
-
-export interface WidgetExampleSentenceSimilarityInput<TOutput = WidgetExampleOutput>
-  extends WidgetExampleBase<TOutput> {
-  source_sentence: string;
-  sentences: string[];
-}
-
-//#endregion
-
-export type WidgetExample<TOutput = WidgetExampleOutput> =
-  | WidgetExampleTextInput<TOutput>
-  | WidgetExampleTextAndContextInput<TOutput>
-  | WidgetExampleTextAndTableInput<TOutput>
-  | WidgetExampleAssetInput<TOutput>
-  | WidgetExampleAssetAndPromptInput<TOutput>
-  | WidgetExampleAssetAndTextInput<TOutput>
-  | WidgetExampleAssetAndZeroShotInput<TOutput>
-  | WidgetExampleStructuredDataInput<TOutput>
-  | WidgetExampleTableDataInput<TOutput>
-  | WidgetExampleZeroShotTextInput<TOutput>
-  | WidgetExampleSentenceSimilarityInput<TOutput>;
-
-type KeysOfUnion<T> = T extends unknown ? keyof T : never;
-
-export type WidgetExampleAttribute = KeysOfUnion<WidgetExample>;
+import type { WidgetExample } from "./widget-example";
 
 export enum InferenceDisplayability {
   /**
@@ -207,7 +93,7 @@ export interface ModelData {
     | {
         parameters?: Record<string, unknown>;
       };
-    base_model?: string;
+    base_model?: string | string[];
   };
   /**
   * Library name
````
package/src/model-libraries.ts
CHANGED
````diff
@@ -23,6 +23,7 @@ export enum ModelLibrary {
   "pyannote-audio" = "pyannote.audio",
   "sample-factory" = "Sample Factory",
   "sentence-transformers" = "Sentence Transformers",
+  "setfit" = "SetFit",
   "sklearn" = "Scikit-learn",
   "spacy" = "spaCy",
   "span-marker" = "SpanMarker",
````
package/src/tasks/depth-estimation/data.ts
CHANGED
````diff
@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
   metrics: [],
   models: [
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on 1.4 million images.",
       id: "Intel/dpt-large",
     },
     {
-      // TO DO: write description
       description: "Strong Depth Estimation model trained on the KITTI dataset.",
-      id: "
+      id: "facebook/dpt-dinov2-large-kitti",
+    },
+    {
+      description: "A strong monocular depth estimation model.",
+      id: "Bingxin/Marigold",
     },
   ],
   spaces: [
````
package/src/tasks/document-question-answering/data.ts
CHANGED
````diff
@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
       description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
       id: "naver-clova-ix/donut-base-finetuned-docvqa",
     },
+    {
+      description: "A powerful model for document question answering.",
+      id: "google/pix2struct-docvqa-large",
+    },
   ],
   spaces: [
     {
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
       description: "An application that can answer questions from invoices.",
       id: "impira/invoices",
     },
+    {
+      description: "An application to compare different document question answering models.",
+      id: "merve/compare_docvqa_models",
+    },
   ],
   summary:
     "Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
````
package/src/tasks/image-to-text/about.md
CHANGED
````diff
@@ -27,6 +27,19 @@ captioner("https://huggingface.co/datasets/Narsil/image_dummy/resolve/main/parro
 ## [{'generated_text': 'two birds are standing next to each other '}]
 ```
 
+### Conversation about the Image
+
+Some text generation models also take image inputs. These are called vision language models. You can use `image-to-text` pipeline to use these models like below.
+
+```python
+from transformers import pipeline
+
+mm_pipeline = pipeline("image-to-text",model="llava-hf/llava-1.5-7b-hf")
+mm_pipeline("https://huggingface.co/spaces/llava-hf/llava-4bit/resolve/main/examples/baklava.png", "How to make this pastry?")
+
+## [{'generated_text': 'To create these pastries, you will need a few key ingredients and tools. Firstly, gather the dough by combining flour with water in your mixing bowl until it forms into an elastic ball that can be easily rolled out on top of another surface or table without breaking apart (like pizza).'}]
+```
+
 ### OCR
 
 This code snippet uses Microsoft’s TrOCR, an encoder-decoder model consisting of an image Transformer encoder and a text Transformer decoder for state-of-the-art optical character recognition (OCR) on single-text line images.
````
package/src/tasks/image-to-text/data.ts
CHANGED
````diff
@@ -32,30 +32,26 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A robust image captioning model.",
-      id: "Salesforce/
+      id: "Salesforce/blip2-opt-2.7b",
     },
     {
-      description: "A
-      id: "
+      description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+      id: "microsoft/kosmos-2-patch14-224",
     },
     {
       description: "A strong optical character recognition model.",
-      id: "
+      id: "facebook/nougat-base",
     },
     {
-      description: "A
-      id: "
-    },
-    {
-      description: "A strong captioning model for UI components.",
-      id: "google/pix2struct-widget-captioning-base",
-    },
-    {
-      description: "A captioning model for images that contain text.",
-      id: "google/pix2struct-textcaps-base",
+      description: "A powerful model that lets you have a conversation with the image.",
+      id: "llava-hf/llava-1.5-7b-hf",
     },
   ],
   spaces: [
+    {
+      description: "An application that compares various image captioning models.",
+      id: "nielsr/comparing-captioning-models",
+    },
     {
       description: "A robust image captioning application.",
       id: "flax-community/image-captioning",
````
package/src/tasks/index.ts
CHANGED
````diff
@@ -51,8 +51,8 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
   "graph-ml": ["transformers"],
   "image-classification": ["keras", "timm", "transformers", "transformers.js"],
   "image-segmentation": ["transformers", "transformers.js"],
-  "image-to-image": ["diffusers", "transformers.js"],
-  "image-to-text": ["transformers.js"],
+  "image-to-image": ["diffusers", "transformers", "transformers.js"],
+  "image-to-text": ["transformers", "transformers.js"],
   "image-to-video": ["diffusers"],
   "video-classification": ["transformers"],
   "mask-generation": ["transformers"],
@@ -69,7 +69,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
   "tabular-classification": ["sklearn"],
   "tabular-regression": ["sklearn"],
   "tabular-to-text": ["transformers"],
-  "text-classification": ["adapter-transformers", "spacy", "transformers", "transformers.js"],
+  "text-classification": ["adapter-transformers", "setfit", "spacy", "transformers", "transformers.js"],
   "text-generation": ["transformers", "transformers.js"],
   "text-retrieval": [],
   "text-to-image": ["diffusers"],
````
package/src/tasks/object-detection/data.ts
CHANGED
````diff
@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      // TO DO: write description
       description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
       id: "facebook/detr-resnet-50",
     },
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "Leaderboard to compare various object detection models across several metrics.",
+      id: "hf-vision/object_detection_leaderboard",
+    },
     {
       description: "An object detection application that can detect unseen objects out of the box.",
-      id: "
+      id: "merve/owlv2",
     },
     {
       description: "An application that contains various object detection models to try from.",
````
package/src/tasks/text-generation/about.md
CHANGED
````diff
@@ -42,6 +42,10 @@ When it comes to text generation, the underlying language model can come in seve
 
 - **Human feedback models:** these models extend base and instruction-trained models by incorporating human feedback that rates the quality of the generated text according to criteria like [helpfulness, honesty, and harmlessness](https://arxiv.org/abs/2112.00861). The human feedback is then combined with an optimization technique like reinforcement learning to align the original model to be closer with human preferences. The overall methodology is often called [Reinforcement Learning from Human Feedback](https://huggingface.co/blog/rlhf), or RLHF for short. [Llama2-Chat](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) is an open-source model aligned through human feedback.
 
+## Text Generation from Image and Text
+
+There are language models that can input both text and image and output text, called vision language models. [LLaVA](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) are good examples. Although they work just like other language models by means of input parameters for generation, since they also take input images, you can use them with `image-to-text` pipeline. You can find information about the pipeline in [image-to-text](https://huggingface.co/tasks/image-to-text) task page.
+
 ## Inference
 
 You can use the 🤗 Transformers library `text-generation` pipeline to do inference with Text Generation models. It takes an incomplete text and returns multiple outputs with which the text can be completed.
````
package/src/tasks/text-to-image/data.ts
CHANGED
````diff
@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
   ],
   models: [
     {
-      description:
-
-      id: "CompVis/stable-diffusion-v1-4",
+      description: "One of the most powerful image generation models that can generate realistic outputs.",
+      id: "stabilityai/stable-diffusion-xl-base-1.0",
     },
     {
-      description:
-
-      id: "dalle-mini/dalle-mega",
+      description: "A powerful yet fast image generation model.",
+      id: "latent-consistency/lcm-lora-sdxl",
     },
     {
       description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
       id: "stabilityai/stable-diffusion",
     },
     {
-      description: "
+      description: "A text-to-image application to generate comics.",
+      id: "jbilcke-hf/ai-comic-factory",
+    },
+    {
+      description: "A text-to-image application that can generate coherent text inside the image.",
       id: "DeepFloyd/IF",
     },
     {
-      description: "
-      id: "
+      description: "A powerful yet very fast image generation application.",
+      id: "latent-consistency/lcm-lora-for-sdxl",
     },
     {
-      description: "
+      description: "A powerful text-to-image application that can generate 3D representations.",
       id: "hysts/Shap-E",
     },
     {
-      description: "
+      description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
       id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
     },
   ],
````
package/src/tasks/text-to-video/data.ts
CHANGED
````diff
@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
   models: [
     {
       description: "A strong model for video generation.",
-      id: "
+      id: "Vchitect/LaVie",
     },
     {
       description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "A text-to-video generation model with high quality and smooth outputs.",
-      id: "
+      id: "hotshotco/Hotshot-XL",
     },
   ],
   spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
     },
     {
       description: "An application that generates video from image and text.",
-      id: "
+      id: "Vchitect/LaVie",
     },
     {
       description: "An application that generates videos from text and provides multi-model support.",
````
package/src/tasks/visual-question-answering/data.ts
CHANGED
````diff
@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
     },
   ],
   spaces: [
+    {
+      description: "An application that compares visual question answering models across different tasks.",
+      id: "merve/pix2struct",
+    },
     {
       description: "An application that can answer questions based on images.",
       id: "nielsr/vilt-vqa",
````
package/src/widget-example.ts
ADDED
````diff
@@ -0,0 +1,118 @@
+/**
+ * See default-widget-inputs.ts for the default widget inputs, this files only contains the types
+ */
+
+type TableData = Record<string, (string | number)[]>;
+
+//#region outputs
+export type WidgetExampleOutputLabels = Array<{ label: string; score: number }>;
+export interface WidgetExampleOutputAnswerScore {
+  answer: string;
+  score: number;
+}
+export interface WidgetExampleOutputText {
+  text: string;
+}
+export interface WidgetExampleOutputUrl {
+  url: string;
+}
+
+export type WidgetExampleOutput =
+  | WidgetExampleOutputLabels
+  | WidgetExampleOutputAnswerScore
+  | WidgetExampleOutputText
+  | WidgetExampleOutputUrl;
+//#endregion
+
+export interface WidgetExampleBase<TOutput> {
+  example_title?: string;
+  group?: string;
+  /**
+   * Potential overrides to API parameters for this specific example
+   * (takes precedences over the model card metadata's inference.parameters)
+   */
+  parameters?: {
+    /// token-classification
+    aggregation_strategy?: string;
+    /// text-generation
+    top_k?: number;
+    top_p?: number;
+    temperature?: number;
+    max_new_tokens?: number;
+    do_sample?: boolean;
+    /// text-to-image
+    negative_prompt?: string;
+    guidance_scale?: number;
+    num_inference_steps?: number;
+  };
+  /**
+   * Optional output
+   */
+  output?: TOutput;
+}
+
+export interface WidgetExampleTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+  text: string;
+}
+
+export interface WidgetExampleTextAndContextInput<TOutput = WidgetExampleOutput>
+  extends WidgetExampleTextInput<TOutput> {
+  context: string;
+}
+
+export interface WidgetExampleTextAndTableInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
+  table: TableData;
+}
+
+export interface WidgetExampleAssetInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+  src: string;
+}
+export interface WidgetExampleAssetAndPromptInput<TOutput = WidgetExampleOutput>
+  extends WidgetExampleAssetInput<TOutput> {
+  prompt: string;
+}
+
+export type WidgetExampleAssetAndTextInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
+  WidgetExampleTextInput<TOutput>;
+
+export type WidgetExampleAssetAndZeroShotInput<TOutput = WidgetExampleOutput> = WidgetExampleAssetInput<TOutput> &
+  WidgetExampleZeroShotTextInput<TOutput>;
+
+export interface WidgetExampleStructuredDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+  structured_data: TableData;
+}
+
+export interface WidgetExampleTableDataInput<TOutput = WidgetExampleOutput> extends WidgetExampleBase<TOutput> {
+  table: TableData;
+}
+
+export interface WidgetExampleZeroShotTextInput<TOutput = WidgetExampleOutput> extends WidgetExampleTextInput<TOutput> {
+  text: string;
+  candidate_labels: string;
+  multi_class: boolean;
+}
+
+export interface WidgetExampleSentenceSimilarityInput<TOutput = WidgetExampleOutput>
+  extends WidgetExampleBase<TOutput> {
+  source_sentence: string;
+  sentences: string[];
+}
+
+//#endregion
+
+export type WidgetExample<TOutput = WidgetExampleOutput> =
+  | WidgetExampleTextInput<TOutput>
+  | WidgetExampleTextAndContextInput<TOutput>
+  | WidgetExampleTextAndTableInput<TOutput>
+  | WidgetExampleAssetInput<TOutput>
+  | WidgetExampleAssetAndPromptInput<TOutput>
+  | WidgetExampleAssetAndTextInput<TOutput>
+  | WidgetExampleAssetAndZeroShotInput<TOutput>
+  | WidgetExampleStructuredDataInput<TOutput>
+  | WidgetExampleTableDataInput<TOutput>
+  | WidgetExampleZeroShotTextInput<TOutput>
+  | WidgetExampleSentenceSimilarityInput<TOutput>;
+
+type KeysOfUnion<T> = T extends unknown ? keyof T : never;
+
+export type WidgetExampleAttribute = KeysOfUnion<WidgetExample>;
````