@huggingface/tasks 0.11.7 → 0.11.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/local-apps.ts CHANGED
@@ -99,6 +99,29 @@ const snippetLlamacpp = (model: ModelData, filepath?: string): LocalAppSnippet[]
99
99
  ];
100
100
  };
101
101
 
102
/**
 * Builds the LocalAI usage snippets for a model.
 *
 * Two snippets are returned, in this order: one for the binary install and
 * one for the CPU Docker image. Both run the same `huggingface://` model URI.
 *
 * @param model - Model whose `id` is embedded in the `huggingface://` URI.
 * @param filepath - File inside the repo to load. When it is `undefined` (or
 *   `null`), the literal placeholder `{{GGUF_FILE}}` is emitted instead.
 * @returns The "Install from binary" and "Use Docker images" snippets.
 */
const snippetLocalAI = (model: ModelData, filepath?: string): LocalAppSnippet[] => {
  // `??` only falls back on null/undefined, so an explicit empty string is kept as-is.
  const modelUri = `huggingface://${model.id}/${filepath ?? "{{GGUF_FILE}}"}`;

  // Prefixes the launcher command with the shared explanatory comment line.
  const runWith = (launcher: string): string => `# Load and run the model:\n${launcher} ${modelUri}`;

  const binarySnippet: LocalAppSnippet = {
    title: "Install from binary",
    setup: "curl https://localai.io/install.sh | sh",
    content: runWith("local-ai run"),
  };

  const dockerSnippet: LocalAppSnippet = {
    title: "Use Docker images",
    setup: "# Pull the image:\ndocker pull localai/localai:latest-cpu",
    content: runWith(
      "docker run -p 8080:8080 --name localai -v $PWD/models:/build/models localai/localai:latest-cpu"
    ),
  };

  return [binarySnippet, dockerSnippet];
};
124
+
102
125
  /**
103
126
  * Add your new local app here.
104
127
  *
@@ -126,6 +149,13 @@ export const LOCAL_APPS = {
126
149
  deeplink: (model, filepath) =>
127
150
  new URL(`lmstudio://open_from_hf?model=${model.id}${filepath ? `&file=${filepath}` : ""}`),
128
151
  },
152
+ localai: {
153
+ prettyLabel: "LocalAI",
154
+ docsUrl: "https://github.com/mudler/LocalAI",
155
+ mainTask: "text-generation",
156
+ displayOnModelPage: isGgufModel,
157
+ snippet: snippetLocalAI,
158
+ },
129
159
  jan: {
130
160
  prettyLabel: "Jan",
131
161
  docsUrl: "https://jan.ai",
@@ -201,10 +231,24 @@ export const LOCAL_APPS = {
201
231
  docsUrl: "https://diffusionbee.com",
202
232
  mainTask: "text-to-image",
203
233
  macOSOnly: true,
204
- comingSoon: true,
205
234
  displayOnModelPage: (model) => model.library_name === "diffusers" && model.pipeline_tag === "text-to-image",
206
235
  deeplink: (model) => new URL(`diffusionbee://open_from_hf?model=${model.id}`),
207
236
  },
237
+ joyfusion: {
238
+ prettyLabel: "JoyFusion",
239
+ docsUrl: "https://joyfusion.app",
240
+ mainTask: "text-to-image",
241
+ macOSOnly: true,
242
+ displayOnModelPage: (model) => model.tags.includes("coreml") && model.pipeline_tag === "text-to-image",
243
+ deeplink: (model) => new URL(`https://joyfusion.app/import_from_hf?repo_id=${model.id}`),
244
+ },
245
+ invoke: {
246
+ prettyLabel: "Invoke",
247
+ docsUrl: "https://github.com/invoke-ai/InvokeAI",
248
+ mainTask: "text-to-image",
249
+ displayOnModelPage: (model) => model.library_name === "diffusers" && model.pipeline_tag === "text-to-image",
250
+ deeplink: (model) => new URL(`https://models.invoke.ai/huggingface/${model.id}`),
251
+ },
208
252
  } satisfies Record<string, LocalApp>;
209
253
 
210
254
  export type LocalAppKey = keyof typeof LOCAL_APPS;
@@ -261,6 +261,24 @@ backbone = keras_nlp.models.Backbone.from_preset("hf://${model.id}")
261
261
  `,
262
262
  ];
263
263
 
264
/**
 * Returns the llama-cpp-python usage snippet for a model.
 *
 * The snippet loads the model with `Llama.from_pretrained`, using the model id
 * as `repo_id` and the literal placeholder `{{GGUF_FILE}}` as `filename`, then
 * issues one chat completion request.
 *
 * @param model - Model whose `id` is interpolated as the `repo_id` argument.
 * @returns A single-element array containing the Python snippet.
 */
export const llama_cpp_python = (model: ModelData): string[] => {
  // NOTE(review): these lines carry no leading indentation because the diff
  // rendering this block was recovered from shows none. Confirm against the
  // package source before relying on the exact whitespace.
  const snippetLines = [
    "from llama_cpp import Llama",
    "",
    "llm = Llama.from_pretrained(",
    `repo_id="${model.id}",`,
    'filename="{{GGUF_FILE}}",',
    ")",
    "",
    "llm.create_chat_completion(",
    "messages = [",
    "{",
    '"role": "user",',
    '"content": "What is the capital of France?"',
    "}",
    "]",
    ")",
  ];

  return [snippetLines.join("\n")];
};
281
+
264
282
  export const tf_keras = (model: ModelData): string[] => [
265
283
  `# Note: 'keras<3.x' or 'tf_keras' must be installed (legacy)
266
284
  # See https://github.com/keras-team/tf-keras for more details.
@@ -507,6 +525,35 @@ export const fastai = (model: ModelData): string[] => [
507
525
  learn = from_pretrained_fastai("${model.id}")`,
508
526
  ];
509
527
 
528
/**
 * Returns the SAM2 usage snippets for a model: one for images, one for videos.
 *
 * The model id is emitted as a quoted Python string literal in both
 * `from_pretrained(...)` calls. The `<your_image>`, `<input_prompts>`,
 * `<your_video>` and `<your_prompts>` tokens are placeholders left for the
 * reader to fill in.
 *
 * @param model - Model whose `id` is passed to `from_pretrained`.
 * @returns `[imageSnippet, videoSnippet]`, in that order.
 */
export const sam2 = (model: ModelData): string[] => {
  // The id must be quoted: unquoted, Python would not read it as a string.
  const imagePredictor = `# Use SAM2 with images
import torch
from sam2.sam2_image_predictor import SAM2ImagePredictor

predictor = SAM2ImagePredictor.from_pretrained("${model.id}")

with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
    predictor.set_image(<your_image>)
    masks, _, _ = predictor.predict(<input_prompts>)`;

  // The assignment from add_new_points is a plain statement, so it takes no
  // trailing colon. The `with` and `for` bodies are indented so the snippet parses.
  const videoPredictor = `# Use SAM2 with videos
import torch
from sam2.sam2_video_predictor import SAM2VideoPredictor

predictor = SAM2VideoPredictor.from_pretrained("${model.id}")

with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
    state = predictor.init_state(<your_video>)

    # add new prompts and instantly get the output on the same frame
    frame_idx, object_ids, masks = predictor.add_new_points(state, <your_prompts>)

    # propagate the prompts to get masklets throughout the video
    for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
        ...`;

  return [imagePredictor, videoPredictor];
};
556
+
510
557
  export const sampleFactory = (model: ModelData): string[] => [
511
558
  `python -m sample_factory.huggingface.load_from_hub -r ${model.id} -d ./train_dir`,
512
559
  ];
@@ -314,6 +314,12 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
314
314
  filter: false,
315
315
  countDownloads: `path:"liveportrait/landmark.onnx"`,
316
316
  },
317
+ "llama-cpp-python": {
318
+ prettyLabel: "llama-cpp-python",
319
+ repoName: "llama-cpp-python",
320
+ repoUrl: "https://github.com/abetlen/llama-cpp-python",
321
+ snippets: snippets.llama_cpp_python,
322
+ },
317
323
  mindspore: {
318
324
  prettyLabel: "MindSpore",
319
325
  repoName: "mindspore",
@@ -429,6 +435,14 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
429
435
  filter: false,
430
436
  countDownloads: `path:"tokenizer.model"`,
431
437
  },
438
+ refiners: {
439
+ prettyLabel: "Refiners",
440
+ repoName: "Refiners",
441
+ repoUrl: "https://github.com/finegrain-ai/refiners",
442
+ docsUrl: "https://refine.rs/",
443
+ filter: false,
444
+ countDownloads: `path:"model.safetensors"`,
445
+ },
432
446
  saelens: {
433
447
  prettyLabel: "SAELens",
434
448
  repoName: "SAELens",
@@ -436,6 +450,14 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
436
450
  snippets: snippets.saelens,
437
451
  filter: false,
438
452
  },
453
+ sam2: {
454
+ prettyLabel: "sam2",
455
+ repoName: "sam2",
456
+ repoUrl: "https://github.com/facebookresearch/segment-anything-2",
457
+ filter: false,
458
+ snippets: snippets.sam2,
459
+ countDownloads: `path_extension:"pt"`,
460
+ },
439
461
  "sample-factory": {
440
462
  prettyLabel: "sample-factory",
441
463
  repoName: "sample-factory",
package/src/pipelines.ts CHANGED
@@ -652,6 +652,12 @@ export const PIPELINE_DATA = {
652
652
  modality: "cv",
653
653
  color: "indigo",
654
654
  },
655
+ "video-text-to-text": {
656
+ name: "Video-Text-to-Text",
657
+ modality: "multimodal",
658
+ color: "blue",
659
+ hideInDatasets: true,
660
+ },
655
661
  other: {
656
662
  name: "Other",
657
663
  modality: "other",
@@ -42,9 +42,13 @@ const taskData: TaskDataCustom = {
42
42
  ],
43
43
  spaces: [
44
44
  {
45
- description: "A leaderboard to rank best feature extraction models..",
45
+ description: "A leaderboard to rank text feature extraction models based on a benchmark.",
46
46
  id: "mteb/leaderboard",
47
47
  },
48
+ {
49
+ description: "A leaderboard to rank best feature extraction models based on human feedback.",
50
+ id: "mteb/arena",
51
+ },
48
52
  ],
49
53
  summary: "Feature extraction is the task of extracting features learnt in a model.",
50
54
  widgetModels: ["facebook/bart-base"],
@@ -36,16 +36,20 @@ const taskData: TaskDataCustom = {
36
36
  },
37
37
  {
38
38
  description: "A strong image feature extraction model.",
39
- id: "google/vit-base-patch16-224-in21k",
39
+ id: "nvidia/MambaVision-T-1K",
40
40
  },
41
41
  {
42
- description: "A robust image feature extraction models.",
42
+ description: "A robust image feature extraction model.",
43
43
  id: "facebook/dino-vitb16",
44
44
  },
45
45
  {
46
- description: "Strong image-text-to-text model made for information retrieval from documents.",
46
+ description: "Strong image feature extraction model made for information retrieval from documents.",
47
47
  id: "vidore/colpali",
48
48
  },
49
+ {
50
+ description: "Strong image feature extraction model that can be used on images and documents.",
51
+ id: "OpenGVLab/InternViT-6B-448px-V1-2",
52
+ },
49
53
  ],
50
54
  spaces: [],
51
55
  summary: "Image feature extraction is the task of extracting features learnt in a computer vision model.",
@@ -48,16 +48,16 @@ const taskData: TaskDataCustom = {
48
48
  id: "facebook/detr-resnet-50-panoptic",
49
49
  },
50
50
  {
51
- description: "Semantic segmentation model trained on ADE20k benchmark dataset.",
52
- id: "microsoft/beit-large-finetuned-ade-640-640",
51
+ description: "Background removal model.",
52
+ id: "briaai/RMBG-1.4",
53
53
  },
54
54
  {
55
55
  description: "Semantic segmentation model trained on ADE20k benchmark dataset with 512x512 resolution.",
56
56
  id: "nvidia/segformer-b0-finetuned-ade-512-512",
57
57
  },
58
58
  {
59
- description: "Semantic segmentation model trained Cityscapes dataset.",
60
- id: "facebook/mask2former-swin-large-cityscapes-semantic",
59
+ description: "A multipurpose image segmentation model for high resolution images.",
60
+ id: "ZhengPeng7/BiRefNet",
61
61
  },
62
62
  {
63
63
  description: "Panoptic segmentation model trained COCO (common objects) dataset.",
@@ -72,3 +72,5 @@ print(processor.decode(output[0], skip_special_tokens=True))
72
72
  - [Vision Language Models Explained](https://huggingface.co/blog/vlms)
73
73
  - [Open-source Multimodality and How to Achieve it using Hugging Face](https://www.youtube.com/watch?v=IoGaGfU1CIg&t=601s)
74
74
  - [Introducing Idefics2: A Powerful 8B Vision-Language Model for the community](https://huggingface.co/blog/idefics2)
75
+ - [Image-text-to-text task guide](https://huggingface.co/tasks/image-text-to-text)
76
+ - [Preference Optimization for Vision Language Models with TRL](https://huggingface.co/blog/dpo_vlm)
@@ -88,7 +88,7 @@ const taskData: TaskDataCustom = {
88
88
  summary:
89
89
  "Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
90
90
  widgetModels: ["microsoft/kosmos-2-patch14-224"],
91
- youtubeId: "",
91
+ youtubeId: "IoGaGfU1CIg",
92
92
  };
93
93
 
94
94
  export default taskData;
@@ -45,8 +45,8 @@ const taskData: TaskDataCustom = {
45
45
  ],
46
46
  models: [
47
47
  {
48
- description: "A model that enhances images captured in low light conditions.",
49
- id: "keras-io/low-light-image-enhancement",
48
+ description: "An image-to-image model to improve image resolution.",
49
+ id: "fal/AuraSR-v2",
50
50
  },
51
51
  {
52
52
  description: "A model that increases the resolution of an image.",
@@ -166,6 +166,7 @@ export const TASKS_MODEL_LIBRARIES: Record<PipelineType, ModelLibraryKey[]> = {
166
166
  ],
167
167
  translation: ["transformers", "transformers.js"],
168
168
  "unconditional-image-generation": ["diffusers"],
169
+ "video-text-to-text": ["transformers"],
169
170
  "visual-question-answering": ["transformers", "transformers.js"],
170
171
  "voice-activity-detection": [],
171
172
  "zero-shot-classification": ["transformers", "transformers.js"],
@@ -236,6 +237,7 @@ export const TASKS_DATA: Record<PipelineType, TaskData | undefined> = {
236
237
  "token-classification": getData("token-classification", tokenClassification),
237
238
  translation: getData("translation", translation),
238
239
  "unconditional-image-generation": getData("unconditional-image-generation", unconditionalImageGeneration),
240
+ "video-text-to-text": getData("video-text-to-text", placeholder),
239
241
  "visual-question-answering": getData("visual-question-answering", visualQuestionAnswering),
240
242
  "voice-activity-detection": undefined,
241
243
  "zero-shot-classification": getData("zero-shot-classification", zeroShotClassification),
@@ -24,14 +24,14 @@ const taskData: TaskDataCustom = {
24
24
  },
25
25
  {
26
26
  description: "Very strong mask generation model.",
27
- id: "facebook/sam-vit-huge",
27
+ id: "facebook/sam2-hiera-large",
28
28
  },
29
29
  ],
30
30
  spaces: [
31
31
  {
32
32
  description:
33
- "An application that combines a mask generation model with an image embedding model for open-vocabulary image segmentation.",
34
- id: "SkalskiP/SAM_and_MetaCLIP",
33
+ "An application that combines a mask generation model with a zero-shot object detection model for text-guided image segmentation.",
34
+ id: "merve/OWLSAM2",
35
35
  },
36
36
  {
37
37
  description: "An application that compares the performance of a large and a small mask generation model.",
@@ -39,7 +39,7 @@ const taskData: TaskDataCustom = {
39
39
  },
40
40
  {
41
41
  description: "An application based on an improved mask generation model.",
42
- id: "linfanluntan/Grounded-SAM",
42
+ id: "SkalskiP/segment-anything-model-2",
43
43
  },
44
44
  {
45
45
  description: "An application to remove objects from videos using mask generation models.",
@@ -19,6 +19,10 @@ const taskData: TaskDataCustom = {
19
19
  description: "An instruction dataset with preference ratings on responses.",
20
20
  id: "openbmb/UltraFeedback",
21
21
  },
22
+ {
23
+ description: "A large synthetic dataset for alignment of text generation models.",
24
+ id: "argilla/magpie-ultra-v0.1",
25
+ },
22
26
  ],
23
27
  demo: {
24
28
  inputs: [
@@ -51,32 +55,32 @@ const taskData: TaskDataCustom = {
51
55
  ],
52
56
  models: [
53
57
  {
54
- description: "A large language model trained for text generation.",
55
- id: "bigscience/bloom-560m",
58
+ description: "A text-generation model trained to follow instructions.",
59
+ id: "google/gemma-2-2b-it",
56
60
  },
57
61
  {
58
- description: "A large code generation model that can generate code in 80+ languages.",
62
+ description: "A code generation model that can generate code in 80+ languages.",
59
63
  id: "bigcode/starcoder",
60
64
  },
61
65
  {
62
- description: "A very powerful text generation model.",
63
- id: "mistralai/Mixtral-8x7B-Instruct-v0.1",
66
+ description: "Very powerful text generation model trained to follow instructions.",
67
+ id: "meta-llama/Meta-Llama-3.1-8B-Instruct",
64
68
  },
65
69
  {
66
70
  description: "Small yet powerful text generation model.",
67
- id: "microsoft/phi-2",
71
+ id: "microsoft/Phi-3-mini-4k-instruct",
68
72
  },
69
73
  {
70
- description: "A very powerful model that can chat, do mathematical reasoning and write code.",
71
- id: "openchat/openchat-3.5-0106",
74
+ description: "A very powerful model that can solve mathematical problems.",
75
+ id: "AI-MO/NuminaMath-7B-TIR",
72
76
  },
73
77
  {
74
- description: "Very strong yet small assistant model.",
75
- id: "HuggingFaceH4/zephyr-7b-beta",
78
+ description: "Strong coding assistant model.",
79
+ id: "HuggingFaceH4/starchat2-15b-v0.1",
76
80
  },
77
81
  {
78
82
  description: "Very strong open-source large language model.",
79
- id: "meta-llama/Llama-2-70b-hf",
83
+ id: "mistralai/Mistral-Nemo-Instruct-2407",
80
84
  },
81
85
  ],
82
86
  spaces: [
@@ -104,7 +108,7 @@ const taskData: TaskDataCustom = {
104
108
  summary:
105
109
  "Generating text is the task of generating new text given another text. These models can, for example, fill in incomplete text or paraphrase.",
106
110
  widgetModels: ["HuggingFaceH4/zephyr-7b-beta"],
107
- youtubeId: "Vpjb1lu0MDk",
111
+ youtubeId: "e9gNEAlsOvU",
108
112
  };
109
113
 
110
114
  export default taskData;
@@ -46,15 +46,15 @@ const taskData: TaskDataCustom = {
46
46
  models: [
47
47
  {
48
48
  description: "One of the most powerful image generation models that can generate realistic outputs.",
49
- id: "stabilityai/stable-diffusion-xl-base-1.0",
49
+ id: "black-forest-labs/FLUX.1-dev",
50
50
  },
51
51
  {
52
52
  description: "A powerful yet fast image generation model.",
53
53
  id: "latent-consistency/lcm-lora-sdxl",
54
54
  },
55
55
  {
56
- description: "A very fast text-to-image model.",
57
- id: "ByteDance/SDXL-Lightning",
56
+ description: "Text-to-image model for photorealistic generation.",
57
+ id: "Kwai-Kolors/Kolors",
58
58
  },
59
59
  {
60
60
  description: "A powerful text-to-image model.",