@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>hf-jobs - Run Workloads on Hugging Face Jobs</title>
|
|
7
|
+
<style>
|
|
8
|
+
* {
|
|
9
|
+
margin: 0;
|
|
10
|
+
padding: 0;
|
|
11
|
+
box-sizing: border-box;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
body {
|
|
15
|
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
|
16
|
+
line-height: 1.6;
|
|
17
|
+
color: #333;
|
|
18
|
+
background: #f5f5f5;
|
|
19
|
+
padding: 20px;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
.container {
|
|
23
|
+
max-width: 1200px;
|
|
24
|
+
margin: 0 auto;
|
|
25
|
+
background: white;
|
|
26
|
+
padding: 40px;
|
|
27
|
+
border-radius: 8px;
|
|
28
|
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
h1 {
|
|
32
|
+
color: #ffd21e;
|
|
33
|
+
background: #000;
|
|
34
|
+
padding: 20px;
|
|
35
|
+
margin: -40px -40px 30px -40px;
|
|
36
|
+
border-radius: 8px 8px 0 0;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
h2 {
|
|
40
|
+
color: #1e1e1e;
|
|
41
|
+
margin-top: 30px;
|
|
42
|
+
margin-bottom: 15px;
|
|
43
|
+
padding-bottom: 10px;
|
|
44
|
+
border-bottom: 2px solid #ffd21e;
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
h3 {
|
|
48
|
+
color: #555;
|
|
49
|
+
margin-top: 20px;
|
|
50
|
+
margin-bottom: 10px;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
.description {
|
|
54
|
+
background: #f9f9f9;
|
|
55
|
+
padding: 20px;
|
|
56
|
+
border-left: 4px solid #ffd21e;
|
|
57
|
+
margin-bottom: 30px;
|
|
58
|
+
border-radius: 4px;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
.file-list {
|
|
62
|
+
list-style: none;
|
|
63
|
+
padding: 0;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
.file-list li {
|
|
67
|
+
padding: 12px;
|
|
68
|
+
margin: 8px 0;
|
|
69
|
+
background: #f9f9f9;
|
|
70
|
+
border-radius: 4px;
|
|
71
|
+
border-left: 3px solid #ffd21e;
|
|
72
|
+
transition: background 0.2s;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
.file-list li:hover {
|
|
76
|
+
background: #f0f0f0;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
.file-list a {
|
|
80
|
+
color: #0066cc;
|
|
81
|
+
text-decoration: none;
|
|
82
|
+
font-weight: 500;
|
|
83
|
+
display: block;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
.file-list a:hover {
|
|
87
|
+
text-decoration: underline;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
.file-path {
|
|
91
|
+
color: #666;
|
|
92
|
+
font-size: 0.9em;
|
|
93
|
+
font-family: 'Monaco', 'Courier New', monospace;
|
|
94
|
+
margin-top: 4px;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
.file-description {
|
|
98
|
+
color: #777;
|
|
99
|
+
font-size: 0.9em;
|
|
100
|
+
margin-top: 4px;
|
|
101
|
+
font-style: italic;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
.metadata {
|
|
105
|
+
background: #f0f0f0;
|
|
106
|
+
padding: 15px;
|
|
107
|
+
border-radius: 4px;
|
|
108
|
+
margin-bottom: 30px;
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
.metadata p {
|
|
112
|
+
margin: 5px 0;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
.metadata strong {
|
|
116
|
+
color: #333;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
.section {
|
|
120
|
+
margin-bottom: 40px;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
code {
|
|
124
|
+
background: #f4f4f4;
|
|
125
|
+
padding: 2px 6px;
|
|
126
|
+
border-radius: 3px;
|
|
127
|
+
font-family: 'Monaco', 'Courier New', monospace;
|
|
128
|
+
font-size: 0.9em;
|
|
129
|
+
}
|
|
130
|
+
</style>
|
|
131
|
+
</head>
|
|
132
|
+
<body>
|
|
133
|
+
<div class="container">
|
|
134
|
+
<h1>Agent Skill: hf-jobs</h1>
|
|
135
|
+
|
|
136
|
+
<div class="description">
|
|
137
|
+
<p><strong>Run any workload on Hugging Face Jobs.</strong></p>
|
|
138
|
+
<p>Use this skill when you want to run GPU/CPU workloads (batch inference, synthetic data generation, dataset stats, experiments) on Hugging Face Jobs, with correct token handling and result persistence back to the Hub.</p>
|
|
139
|
+
</div>
|
|
140
|
+
|
|
141
|
+
<div class="metadata">
|
|
142
|
+
<p><strong>Skill Name:</strong> hf-jobs</p>
|
|
143
|
+
<p><strong>Main Documentation:</strong> <a href="hf-jobs/SKILL.md">hf-jobs/SKILL.md</a></p>
|
|
144
|
+
<p><strong>Scripts Directory:</strong> <code>hf-jobs/scripts/</code></p>
|
|
145
|
+
<p><strong>References Directory:</strong> <code>hf-jobs/references/</code></p>
|
|
146
|
+
</div>
|
|
147
|
+
|
|
148
|
+
<div class="section">
|
|
149
|
+
<h2>Overview</h2>
|
|
150
|
+
<p>This skill focuses on running real workloads via Hugging Face Jobs. It includes ready-to-run UV scripts and guides for authentication (HF tokens), secrets vs env vars, timeouts, hardware selection, and pushing results to the Hub.</p>
|
|
151
|
+
</div>
|
|
152
|
+
|
|
153
|
+
<div class="section">
|
|
154
|
+
<h2>Core Documentation</h2>
|
|
155
|
+
<ul class="file-list">
|
|
156
|
+
<li>
|
|
157
|
+
<a href="hf-jobs/SKILL.md">SKILL.md</a>
|
|
158
|
+
<div class="file-path">hf-jobs/SKILL.md</div>
|
|
159
|
+
<div class="file-description">Complete skill documentation (how to submit jobs, tokens/secrets, timeouts, persistence, and how to use the bundled scripts)</div>
|
|
160
|
+
</li>
|
|
161
|
+
</ul>
|
|
162
|
+
</div>
|
|
163
|
+
|
|
164
|
+
<div class="section">
|
|
165
|
+
<h2>References</h2>
|
|
166
|
+
<ul class="file-list">
|
|
167
|
+
<li>
|
|
168
|
+
<a href="hf-jobs/references/token_usage.md">token_usage.md</a>
|
|
169
|
+
<div class="file-path">hf-jobs/references/token_usage.md</div>
|
|
170
|
+
<div class="file-description">Token best practices: secrets vs env, permissions, common errors (401/403), and secure patterns</div>
|
|
171
|
+
</li>
|
|
172
|
+
<li>
|
|
173
|
+
<a href="hf-jobs/references/hub_saving.md">hub_saving.md</a>
|
|
174
|
+
<div class="file-path">hf-jobs/references/hub_saving.md</div>
|
|
175
|
+
<div class="file-description">How to persist results: push datasets/models/files to the Hub (ephemeral job filesystem)</div>
|
|
176
|
+
</li>
|
|
177
|
+
<li>
|
|
178
|
+
<a href="hf-jobs/references/hardware_guide.md">hardware_guide.md</a>
|
|
179
|
+
<div class="file-path">hf-jobs/references/hardware_guide.md</div>
|
|
180
|
+
<div class="file-description">Flavor selection guidance for CPU/GPU/TPU workloads</div>
|
|
181
|
+
</li>
|
|
182
|
+
<li>
|
|
183
|
+
<a href="hf-jobs/references/troubleshooting.md">troubleshooting.md</a>
|
|
184
|
+
<div class="file-path">hf-jobs/references/troubleshooting.md</div>
|
|
185
|
+
<div class="file-description">Common failure modes (timeouts, missing deps, OOM, auth) and fixes</div>
|
|
186
|
+
</li>
|
|
187
|
+
</ul>
|
|
188
|
+
</div>
|
|
189
|
+
|
|
190
|
+
<div class="section">
|
|
191
|
+
<h2>Scripts</h2>
|
|
192
|
+
<ul class="file-list">
|
|
193
|
+
<li>
|
|
194
|
+
<a href="hf-jobs/scripts/generate-responses.py">generate-responses.py</a>
|
|
195
|
+
<div class="file-path">hf-jobs/scripts/generate-responses.py</div>
|
|
196
|
+
<div class="file-description">vLLM batch generation: load prompts/messages from a dataset, generate responses, push dataset + card to Hub</div>
|
|
197
|
+
</li>
|
|
198
|
+
<li>
|
|
199
|
+
<a href="hf-jobs/scripts/cot-self-instruct.py">cot-self-instruct.py</a>
|
|
200
|
+
<div class="file-path">hf-jobs/scripts/cot-self-instruct.py</div>
|
|
201
|
+
<div class="file-description">CoT Self-Instruct synthetic data generation (reasoning/instruction) + optional filtering, pushes dataset + card</div>
|
|
202
|
+
</li>
|
|
203
|
+
<li>
|
|
204
|
+
<a href="hf-jobs/scripts/finepdfs-stats.py">finepdfs-stats.py</a>
|
|
205
|
+
<div class="file-path">hf-jobs/scripts/finepdfs-stats.py</div>
|
|
206
|
+
<div class="file-description">Polars streaming stats over Hub parquet (finepdfs-edu); optional upload of computed stats to a dataset repo</div>
|
|
207
|
+
</li>
|
|
208
|
+
</ul>
|
|
209
|
+
</div>
|
|
210
|
+
</div>
|
|
211
|
+
</body>
|
|
212
|
+
</html>
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
# Hardware Selection Guide
|
|
2
|
+
|
|
3
|
+
Choosing the right hardware (flavor) is critical for cost-effective workloads.
|
|
4
|
+
|
|
5
|
+
> **Reference:** [HF Jobs Hardware Documentation](https://huggingface.co/docs/hub/en/spaces-config-reference) (updated 07/2025)
|
|
6
|
+
|
|
7
|
+
## Available Hardware
|
|
8
|
+
|
|
9
|
+
### CPU Flavors
|
|
10
|
+
| Flavor | Description | Use Case |
|
|
11
|
+
|--------|-------------|----------|
|
|
12
|
+
| `cpu-basic` | Basic CPU instance | Testing, lightweight scripts |
|
|
13
|
+
| `cpu-upgrade` | Enhanced CPU instance | Data processing, parallel workloads |
|
|
14
|
+
|
|
15
|
+
**Use cases:** Data processing, testing scripts, lightweight workloads
|
|
16
|
+
**Not recommended for:** Model training, GPU-accelerated workloads
|
|
17
|
+
|
|
18
|
+
### GPU Flavors
|
|
19
|
+
|
|
20
|
+
| Flavor | GPU | VRAM | Use Case |
|
|
21
|
+
|--------|-----|------|----------|
|
|
22
|
+
| `t4-small` | NVIDIA T4 | 16GB | <1B models, demos, quick tests |
|
|
23
|
+
| `t4-medium` | NVIDIA T4 | 16GB | 1-3B models, development |
|
|
24
|
+
| `l4x1` | NVIDIA L4 | 24GB | 3-7B models, efficient workloads |
|
|
25
|
+
| `l4x4` | 4x NVIDIA L4 | 96GB | Multi-GPU, parallel workloads |
|
|
26
|
+
| `a10g-small` | NVIDIA A10G | 24GB | 3-7B models, production |
|
|
27
|
+
| `a10g-large` | NVIDIA A10G | 24GB | 7-13B models, batch inference |
|
|
28
|
+
| `a10g-largex2` | 2x NVIDIA A10G | 48GB | Multi-GPU, large models |
|
|
29
|
+
| `a10g-largex4` | 4x NVIDIA A10G | 96GB | Multi-GPU, very large models |
|
|
30
|
+
| `a100-large` | NVIDIA A100 | 40GB | 13B+ models, fastest GPU option |
|
|
31
|
+
|
|
32
|
+
### TPU Flavors
|
|
33
|
+
|
|
34
|
+
| Flavor | Configuration | Use Case |
|
|
35
|
+
|--------|---------------|----------|
|
|
36
|
+
| `v5e-1x1` | TPU v5e (1x1) | Small TPU workloads |
|
|
37
|
+
| `v5e-2x2` | TPU v5e (2x2) | Medium TPU workloads |
|
|
38
|
+
| `v5e-2x4` | TPU v5e (2x4) | Large TPU workloads |
|
|
39
|
+
|
|
40
|
+
**TPU Use Cases:**
|
|
41
|
+
- JAX/Flax model training
|
|
42
|
+
- Large-scale inference
|
|
43
|
+
- TPU-optimized workloads
|
|
44
|
+
|
|
45
|
+
## Selection Guidelines
|
|
46
|
+
|
|
47
|
+
### By Workload Type
|
|
48
|
+
|
|
49
|
+
**Data Processing**
|
|
50
|
+
- **Recommended:** `cpu-upgrade` or `l4x1`
|
|
51
|
+
- **Use case:** Transform, filter, analyze datasets
|
|
52
|
+
- **Batch size:** Depends on data size
|
|
53
|
+
- **Time:** Varies by dataset size
|
|
54
|
+
|
|
55
|
+
**Batch Inference**
|
|
56
|
+
- **Recommended:** `a10g-large` or `a100-large`
|
|
57
|
+
- **Use case:** Run inference on thousands of samples
|
|
58
|
+
- **Batch size:** 8-32 depending on model
|
|
59
|
+
- **Time:** Depends on number of samples
|
|
60
|
+
|
|
61
|
+
**Experiments & Benchmarks**
|
|
62
|
+
- **Recommended:** `a10g-small` or `a10g-large`
|
|
63
|
+
- **Use case:** Reproducible ML experiments
|
|
64
|
+
- **Batch size:** Varies
|
|
65
|
+
- **Time:** Depends on experiment complexity
|
|
66
|
+
|
|
67
|
+
**Model Training** (see `model-trainer` skill for details)
|
|
68
|
+
- **Recommended:** See model-trainer skill
|
|
69
|
+
- **Use case:** Fine-tuning models
|
|
70
|
+
- **Batch size:** Depends on model size
|
|
71
|
+
- **Time:** Hours to days
|
|
72
|
+
|
|
73
|
+
**Synthetic Data Generation**
|
|
74
|
+
- **Recommended:** `a10g-large` or `a100-large`
|
|
75
|
+
- **Use case:** Generate datasets using LLMs
|
|
76
|
+
- **Batch size:** Depends on generation method
|
|
77
|
+
- **Time:** Hours for large datasets
|
|
78
|
+
|
|
79
|
+
### By Budget
|
|
80
|
+
|
|
81
|
+
**Minimal Budget (<$5 total)**
|
|
82
|
+
- Use `cpu-basic` or `t4-small`
|
|
83
|
+
- Process small datasets
|
|
84
|
+
- Quick tests and demos
|
|
85
|
+
|
|
86
|
+
**Small Budget ($5-20)**
|
|
87
|
+
- Use `t4-medium` or `a10g-small`
|
|
88
|
+
- Process medium datasets
|
|
89
|
+
- Run experiments
|
|
90
|
+
|
|
91
|
+
**Medium Budget ($20-50)**
|
|
92
|
+
- Use `a10g-small` or `a10g-large`
|
|
93
|
+
- Process large datasets
|
|
94
|
+
- Production workloads
|
|
95
|
+
|
|
96
|
+
**Large Budget ($50-200)**
|
|
97
|
+
- Use `a10g-large` or `a100-large`
|
|
98
|
+
- Large-scale processing
|
|
99
|
+
- Multiple experiments
|
|
100
|
+
|
|
101
|
+
### By Model Size (for inference/processing)
|
|
102
|
+
|
|
103
|
+
**Tiny Models (<1B parameters)**
|
|
104
|
+
- **Recommended:** `t4-small`
|
|
105
|
+
- **Example:** Qwen2.5-0.5B, TinyLlama
|
|
106
|
+
- **Batch size:** 8-16
|
|
107
|
+
|
|
108
|
+
**Small Models (1-3B parameters)**
|
|
109
|
+
- **Recommended:** `t4-medium` or `a10g-small`
|
|
110
|
+
- **Example:** Qwen2.5-1.5B, Phi-2
|
|
111
|
+
- **Batch size:** 4-8
|
|
112
|
+
|
|
113
|
+
**Medium Models (3-7B parameters)**
|
|
114
|
+
- **Recommended:** `a10g-small` or `a10g-large`
|
|
115
|
+
- **Example:** Qwen2.5-7B, Mistral-7B
|
|
116
|
+
- **Batch size:** 2-4
|
|
117
|
+
|
|
118
|
+
**Large Models (7-13B parameters)**
|
|
119
|
+
- **Recommended:** `a10g-large` or `a100-large`
|
|
120
|
+
- **Example:** Llama-3-8B
|
|
121
|
+
- **Batch size:** 1-2
|
|
122
|
+
|
|
123
|
+
**Very Large Models (13B+ parameters)**
|
|
124
|
+
- **Recommended:** `a100-large`
|
|
125
|
+
- **Example:** Llama-2-13B, Llama-3-70B (70B needs quantization or a multi-GPU flavor)
|
|
126
|
+
- **Batch size:** 1
|
|
127
|
+
|
|
128
|
+
## Memory Considerations
|
|
129
|
+
|
|
130
|
+
### Estimating Memory Requirements
|
|
131
|
+
|
|
132
|
+
**For inference:**
|
|
133
|
+
```
|
|
134
|
+
Memory (GB) ≈ (Model params in billions) × 2-4
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
**For training:**
|
|
138
|
+
```
|
|
139
|
+
Memory (GB) ≈ (Model params in billions) × 20 (full) or × 4 (LoRA)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Examples:**
|
|
143
|
+
- Qwen2.5-0.5B inference: ~1-2GB ✅ fits t4-small
|
|
144
|
+
- Qwen2.5-7B inference: ~14-28GB ✅ fits a10g-large (24GB) in half precision (~14GB); full precision (~28GB) does not fit
|
|
145
|
+
- Qwen2.5-7B training: ~140GB ❌ not feasible without LoRA
|
|
146
|
+
|
|
147
|
+
### Memory Optimization
|
|
148
|
+
|
|
149
|
+
If hitting memory limits:
|
|
150
|
+
|
|
151
|
+
1. **Reduce batch size**
|
|
152
|
+
```python
|
|
153
|
+
batch_size = 1
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
2. **Process in chunks**
|
|
157
|
+
```python
|
|
158
|
+
for chunk in chunks:
|
|
159
|
+
process(chunk)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
3. **Use smaller models**
|
|
163
|
+
- Use quantized models
|
|
164
|
+
- Use LoRA adapters
|
|
165
|
+
|
|
166
|
+
4. **Upgrade hardware**
|
|
167
|
+
- cpu → t4 → a10g → a100
|
|
168
|
+
|
|
169
|
+
## Cost Estimation
|
|
170
|
+
|
|
171
|
+
### Formula
|
|
172
|
+
|
|
173
|
+
```
|
|
174
|
+
Total Cost = (Hours of runtime) × (Cost per hour)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Example Calculations
|
|
178
|
+
|
|
179
|
+
**Data processing** (hourly rates in these examples are illustrative; check current Hugging Face pricing):
|
|
180
|
+
- Hardware: cpu-upgrade ($0.50/hour)
|
|
181
|
+
- Time: 1 hour
|
|
182
|
+
- Cost: $0.50
|
|
183
|
+
|
|
184
|
+
**Batch inference:**
|
|
185
|
+
- Hardware: a10g-large ($5/hour)
|
|
186
|
+
- Time: 2 hours
|
|
187
|
+
- Cost: $10.00
|
|
188
|
+
|
|
189
|
+
**Experiments:**
|
|
190
|
+
- Hardware: a10g-small ($3.50/hour)
|
|
191
|
+
- Time: 4 hours
|
|
192
|
+
- Cost: $14.00
|
|
193
|
+
|
|
194
|
+
### Cost Optimization Tips
|
|
195
|
+
|
|
196
|
+
1. **Start small:** Test on cpu-basic or t4-small
|
|
197
|
+
2. **Monitor runtime:** Set appropriate timeouts
|
|
198
|
+
3. **Optimize code:** Reduce unnecessary compute
|
|
199
|
+
4. **Choose right hardware:** Don't over-provision
|
|
200
|
+
5. **Use checkpoints:** Resume if job fails
|
|
201
|
+
6. **Monitor costs:** Check running jobs regularly
|
|
202
|
+
|
|
203
|
+
## Multi-GPU Workloads
|
|
204
|
+
|
|
205
|
+
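Multi-GPU flavors provide several GPUs on a single machine; your script (or its framework) must distribute the work across them, e.g. via tensor or data parallelism: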
|
|
206
|
+
|
|
207
|
+
**Multi-GPU flavors:**
|
|
208
|
+
- `l4x4` - 4x L4 GPUs (96GB total VRAM)
|
|
209
|
+
- `a10g-largex2` - 2x A10G GPUs (48GB total VRAM)
|
|
210
|
+
- `a10g-largex4` - 4x A10G GPUs (96GB total VRAM)
|
|
211
|
+
|
|
212
|
+
**When to use:**
|
|
213
|
+
- Large models (>13B parameters)
|
|
214
|
+
- Need faster processing (near-linear speedup at best, for workloads that parallelize well)
|
|
215
|
+
- Large datasets (>100K samples)
|
|
216
|
+
- Parallel workloads
|
|
217
|
+
- Tensor parallelism for inference
|
|
218
|
+
|
|
219
|
+
**MCP Tool Example:**
|
|
220
|
+
```python
|
|
221
|
+
hf_jobs("uv", {
|
|
222
|
+
"script": "process.py",
|
|
223
|
+
"flavor": "a10g-largex2", # 2 GPUs
|
|
224
|
+
"timeout": "4h",
|
|
225
|
+
"secrets": {"HF_TOKEN": "$HF_TOKEN"}
|
|
226
|
+
})
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
**CLI Equivalent:**
|
|
230
|
+
```bash
|
|
231
|
+
hf jobs uv run --flavor a10g-largex2 --timeout 4h --secrets HF_TOKEN process.py
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## Choosing Between Options
|
|
235
|
+
|
|
236
|
+
### CPU vs GPU
|
|
237
|
+
|
|
238
|
+
**Choose CPU when:**
|
|
239
|
+
- No GPU acceleration needed
|
|
240
|
+
- Data processing only
|
|
241
|
+
- Budget constrained
|
|
242
|
+
- Simple workloads
|
|
243
|
+
|
|
244
|
+
**Choose GPU when:**
|
|
245
|
+
- Model inference/training
|
|
246
|
+
- GPU-accelerated libraries
|
|
247
|
+
- Need faster processing
|
|
248
|
+
- Large models
|
|
249
|
+
|
|
250
|
+
### a10g vs a100
|
|
251
|
+
|
|
252
|
+
**Choose a10g when:**
|
|
253
|
+
- Model <13B parameters
|
|
254
|
+
- Budget conscious
|
|
255
|
+
- Processing time not critical
|
|
256
|
+
|
|
257
|
+
**Choose a100 when:**
|
|
258
|
+
- Model 13B+ parameters
|
|
259
|
+
- Need fastest processing
|
|
260
|
+
- Memory requirements high
|
|
261
|
+
- Budget allows
|
|
262
|
+
|
|
263
|
+
### Single vs Multi-GPU
|
|
264
|
+
|
|
265
|
+
**Choose single GPU when:**
|
|
266
|
+
- Model <7B parameters
|
|
267
|
+
- Budget constrained
|
|
268
|
+
- Simpler debugging
|
|
269
|
+
|
|
270
|
+
**Choose multi-GPU when:**
|
|
271
|
+
- Model >13B parameters
|
|
272
|
+
- Need faster processing
|
|
273
|
+
- Large batch sizes required
|
|
274
|
+
- Cost-effective for large jobs
|
|
275
|
+
|
|
276
|
+
## Quick Reference
|
|
277
|
+
|
|
278
|
+
### All Available Flavors
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
# Official flavor list (updated 07/2025)
|
|
282
|
+
FLAVORS = {
|
|
283
|
+
# CPU
|
|
284
|
+
"cpu-basic", # Testing, lightweight
|
|
285
|
+
"cpu-upgrade", # Data processing
|
|
286
|
+
|
|
287
|
+
# GPU - Single
|
|
288
|
+
"t4-small", # 16GB - <1B models
|
|
289
|
+
"t4-medium", # 16GB - 1-3B models
|
|
290
|
+
"l4x1", # 24GB - 3-7B models
|
|
291
|
+
"a10g-small", # 24GB - 3-7B production
|
|
292
|
+
"a10g-large", # 24GB - 7-13B models
|
|
293
|
+
"a100-large", # 40GB - 13B+ models
|
|
294
|
+
|
|
295
|
+
# GPU - Multi
|
|
296
|
+
"l4x4", # 4x L4 (96GB total)
|
|
297
|
+
"a10g-largex2", # 2x A10G (48GB total)
|
|
298
|
+
"a10g-largex4", # 4x A10G (96GB total)
|
|
299
|
+
|
|
300
|
+
# TPU
|
|
301
|
+
"v5e-1x1", # TPU v5e 1x1
|
|
302
|
+
"v5e-2x2", # TPU v5e 2x2
|
|
303
|
+
"v5e-2x4", # TPU v5e 2x4
|
|
304
|
+
}
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
### Workload → Hardware Mapping
|
|
308
|
+
|
|
309
|
+
```python
|
|
310
|
+
HARDWARE_MAP = {
|
|
311
|
+
"data_processing": "cpu-upgrade",
|
|
312
|
+
"batch_inference_small": "t4-small",
|
|
313
|
+
"batch_inference_medium": "a10g-large",
|
|
314
|
+
"batch_inference_large": "a100-large",
|
|
315
|
+
"experiments": "a10g-small",
|
|
316
|
+
"tpu_workloads": "v5e-1x1",
|
|
317
|
+
"training": "see model-trainer skill"
|
|
318
|
+
}
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
### CLI Examples
|
|
322
|
+
|
|
323
|
+
```bash
|
|
324
|
+
# CPU job
|
|
325
|
+
hf jobs run python:3.12 python script.py
|
|
326
|
+
|
|
327
|
+
# GPU job
|
|
328
|
+
hf jobs run --flavor a10g-large pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel python script.py
|
|
329
|
+
|
|
330
|
+
# TPU job
|
|
331
|
+
hf jobs run --flavor v5e-1x1 your-tpu-image python script.py
|
|
332
|
+
|
|
333
|
+
# UV script with GPU
|
|
334
|
+
hf jobs uv run --flavor a10g-small my_script.py
|
|
335
|
+
```
|
|
336
|
+
|