@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,526 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: moe-training
|
|
3
|
+
description: Train Mixture of Experts (MoE) models using DeepSpeed or HuggingFace. Use when training large-scale models with limited compute (5× cost reduction vs dense models), implementing sparse architectures like Mixtral 8x7B or DeepSeek-V3, or scaling model capacity without proportional compute increase. Covers MoE architectures, routing mechanisms, load balancing, expert parallelism, and inference optimization.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Emerging Techniques, MoE, Mixture Of Experts, Sparse Models, DeepSpeed, Expert Parallelism, Mixtral, DeepSeek, Routing, Load Balancing, Efficient Training]
|
|
8
|
+
dependencies: [deepspeed, transformers, torch, accelerate]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# MoE Training: Mixture of Experts
|
|
12
|
+
|
|
13
|
+
## When to Use This Skill
|
|
14
|
+
|
|
15
|
+
Use MoE Training when you need to:
|
|
16
|
+
- **Train larger models** with limited compute (5× cost reduction vs dense models)
|
|
17
|
+
- **Scale model capacity** without proportional compute increase
|
|
18
|
+
- **Achieve better performance** per compute budget than dense models
|
|
19
|
+
- **Specialize experts** for different domains/tasks/languages
|
|
20
|
+
- **Reduce inference latency** with sparse activation (only ~13B of Mixtral 8x7B's ~47B parameters are active per token)
|
|
21
|
+
- **Implement SOTA models** like Mixtral 8x7B, DeepSeek-V3, Switch Transformers
|
|
22
|
+
|
|
23
|
+
**Notable MoE Models**: Mixtral 8x7B (Mistral AI), DeepSeek-V3, Switch Transformers (Google), GLaM (Google), NLLB-MoE (Meta)
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
# DeepSpeed with MoE support
|
|
29
|
+
pip install "deepspeed>=0.6.0"
|
|
30
|
+
|
|
31
|
+
# Megatron-DeepSpeed for large-scale training
|
|
32
|
+
git clone https://github.com/microsoft/Megatron-DeepSpeed
|
|
33
|
+
cd Megatron-DeepSpeed
|
|
34
|
+
pip install -r requirements.txt
|
|
35
|
+
|
|
36
|
+
# Alternative: HuggingFace Transformers
|
|
37
|
+
pip install transformers accelerate
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Quick Start
|
|
41
|
+
|
|
42
|
+
### Basic MoE Architecture
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
import torch
|
|
46
|
+
import torch.nn as nn
|
|
47
|
+
|
|
48
|
+
class MoELayer(nn.Module):
|
|
49
|
+
"""Sparse Mixture of Experts layer."""
|
|
50
|
+
|
|
51
|
+
def __init__(self, hidden_size, num_experts=8, top_k=2):
|
|
52
|
+
super().__init__()
|
|
53
|
+
self.num_experts = num_experts
|
|
54
|
+
self.top_k = top_k
|
|
55
|
+
|
|
56
|
+
# Expert networks (FFN)
|
|
57
|
+
self.experts = nn.ModuleList([
|
|
58
|
+
nn.Sequential(
|
|
59
|
+
nn.Linear(hidden_size, 4 * hidden_size),
|
|
60
|
+
nn.GELU(),
|
|
61
|
+
nn.Linear(4 * hidden_size, hidden_size)
|
|
62
|
+
)
|
|
63
|
+
for _ in range(num_experts)
|
|
64
|
+
])
|
|
65
|
+
|
|
66
|
+
# Gating network (router)
|
|
67
|
+
self.gate = nn.Linear(hidden_size, num_experts)
|
|
68
|
+
|
|
69
|
+
def forward(self, x):
|
|
70
|
+
# x shape: (batch_size, seq_len, hidden_size)
|
|
71
|
+
batch_size, seq_len, hidden_size = x.shape
|
|
72
|
+
|
|
73
|
+
# Flatten for routing
|
|
74
|
+
x_flat = x.view(-1, hidden_size) # (batch_size * seq_len, hidden_size)
|
|
75
|
+
|
|
76
|
+
# Compute gate scores
|
|
77
|
+
gate_logits = self.gate(x_flat) # (batch_size * seq_len, num_experts)
|
|
78
|
+
|
|
79
|
+
# Top-k routing
|
|
80
|
+
gate_scores = torch.softmax(gate_logits, dim=-1)
|
|
81
|
+
topk_scores, topk_indices = torch.topk(gate_scores, self.top_k, dim=-1)
|
|
82
|
+
|
|
83
|
+
# Normalize top-k scores
|
|
84
|
+
topk_scores = topk_scores / topk_scores.sum(dim=-1, keepdim=True)
|
|
85
|
+
|
|
86
|
+
# Dispatch and combine expert outputs
|
|
87
|
+
output = torch.zeros_like(x_flat)
|
|
88
|
+
|
|
89
|
+
for i in range(self.top_k):
|
|
90
|
+
expert_idx = topk_indices[:, i]
|
|
91
|
+
expert_scores = topk_scores[:, i].unsqueeze(-1)
|
|
92
|
+
|
|
93
|
+
# Route tokens to experts
|
|
94
|
+
for expert_id in range(self.num_experts):
|
|
95
|
+
mask = (expert_idx == expert_id)
|
|
96
|
+
if mask.any():
|
|
97
|
+
expert_input = x_flat[mask]
|
|
98
|
+
expert_output = self.experts[expert_id](expert_input)
|
|
99
|
+
output[mask] += expert_scores[mask] * expert_output
|
|
100
|
+
|
|
101
|
+
# Reshape back
|
|
102
|
+
return output.view(batch_size, seq_len, hidden_size)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### DeepSpeed MoE Training
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
# Training script with MoE
|
|
109
|
+
deepspeed pretrain_gpt_moe.py \
|
|
110
|
+
--num-layers 24 \
|
|
111
|
+
--hidden-size 1024 \
|
|
112
|
+
--num-attention-heads 16 \
|
|
113
|
+
--seq-length 2048 \
|
|
114
|
+
--max-position-embeddings 2048 \
|
|
115
|
+
--micro-batch-size 4 \
|
|
116
|
+
--global-batch-size 256 \
|
|
117
|
+
--train-iters 500000 \
|
|
118
|
+
--lr 0.0001 \
|
|
119
|
+
--min-lr 0.00001 \
|
|
120
|
+
--lr-decay-style cosine \
|
|
121
|
+
--num-experts 128 \
|
|
122
|
+
--moe-expert-parallel-size 4 \
|
|
123
|
+
--moe-loss-coeff 0.01 \
|
|
124
|
+
--moe-train-capacity-factor 1.25 \
|
|
125
|
+
--moe-eval-capacity-factor 2.0 \
|
|
126
|
+
--fp16 \
|
|
127
|
+
--deepspeed_config ds_config.json
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Core Concepts
|
|
131
|
+
|
|
132
|
+
### 1. MoE Architecture
|
|
133
|
+
|
|
134
|
+
**Key Components:**
|
|
135
|
+
- **Experts**: Multiple specialized FFN networks (typically 8-128)
|
|
136
|
+
- **Router/Gate**: Learned network that selects which experts to use
|
|
137
|
+
- **Top-k Routing**: Activate only k experts per token (commonly k=1 or k=2)
|
|
138
|
+
- **Load Balancing**: Ensure even expert utilization
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
Input Token
|
|
142
|
+
↓
|
|
143
|
+
Router (Gate Network)
|
|
144
|
+
↓
|
|
145
|
+
Top-k Expert Selection (e.g., 2 out of 8)
|
|
146
|
+
↓
|
|
147
|
+
Expert 1 (weight: 0.6) + Expert 5 (weight: 0.4)
|
|
148
|
+
↓
|
|
149
|
+
Weighted Combination
|
|
150
|
+
↓
|
|
151
|
+
Output
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### 2. Routing Mechanisms
|
|
155
|
+
|
|
156
|
+
**Top-1 Routing (Switch Transformer):**
|
|
157
|
+
```python
|
|
158
|
+
# Simplest routing: one expert per token
|
|
159
|
+
gate_logits = router(x) # (batch, seq_len, num_experts)
|
|
160
|
+
expert_idx = torch.argmax(gate_logits, dim=-1) # Hard routing
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
**Top-2 Routing (Mixtral):**
|
|
164
|
+
```python
|
|
165
|
+
# Top-2: two experts per token
|
|
166
|
+
gate_scores = torch.softmax(router(x), dim=-1)
|
|
167
|
+
top2_scores, top2_indices = torch.topk(gate_scores, k=2, dim=-1)
|
|
168
|
+
|
|
169
|
+
# Normalize scores
|
|
170
|
+
top2_scores = top2_scores / top2_scores.sum(dim=-1, keepdim=True)
|
|
171
|
+
|
|
172
|
+
# Combine expert outputs
|
|
173
|
+
output = (top2_scores[:, :, 0:1] * expert_outputs[top2_indices[:, :, 0]] +
|
|
174
|
+
top2_scores[:, :, 1:2] * expert_outputs[top2_indices[:, :, 1]])
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
**Expert Choice Routing:**
|
|
178
|
+
```python
|
|
179
|
+
# Experts choose top-k tokens (instead of tokens choosing experts)
|
|
180
|
+
# Guarantees perfect load balancing
|
|
181
|
+
expert_scores = router(x).transpose(-1, -2) # (batch, num_experts, seq_len)
|
|
182
|
+
topk_tokens = torch.topk(expert_scores, k=capacity_per_expert, dim=-1)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### 3. Load Balancing
|
|
186
|
+
|
|
187
|
+
**Auxiliary Loss:**
|
|
188
|
+
```python
|
|
189
|
+
def load_balancing_loss(gate_logits, expert_indices, num_experts):
    """Auxiliary loss that encourages uniform expert usage.

    Computes ``num_experts * sum_e(f_e * p_e)`` where ``f_e`` is the fraction
    of routing assignments that went to expert ``e`` and ``p_e`` is the mean
    router probability of expert ``e`` over all tokens. The value is 1.0 when
    both are uniform and grows as routing collapses onto few experts.

    Args:
        gate_logits: Router logits with experts on the last axis, e.g.
            ``(tokens, num_experts)`` or ``(batch, seq_len, num_experts)``.
        expert_indices: Integer tensor of selected expert ids (any shape).
        num_experts: Total number of experts.

    Returns:
        Scalar tensor holding the auxiliary loss (0 for empty input).
    """
    # Nothing was routed: avoid 0/0 and return a neutral loss.
    if expert_indices.numel() == 0:
        return gate_logits.new_zeros(())

    # Fraction of tokens routed to each expert
    expert_counts = torch.bincount(expert_indices.reshape(-1), minlength=num_experts)
    expert_fraction = expert_counts.float() / expert_indices.numel()

    # Gate probability for each expert, averaged over *all* tokens. Flatten the
    # leading axes first so batched (batch, seq_len, num_experts) logits reduce
    # to a (num_experts,) vector instead of broadcasting against the fractions.
    gate_probs = torch.softmax(gate_logits, dim=-1)
    gate_probs = gate_probs.reshape(-1, gate_probs.shape[-1]).mean(dim=0)

    # Auxiliary loss: encourage alignment
    aux_loss = num_experts * (expert_fraction * gate_probs).sum()

    return aux_loss
|
|
202
|
+
|
|
203
|
+
# Add to main loss
|
|
204
|
+
total_loss = language_model_loss + 0.01 * load_balancing_loss(...)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
**Router Z-Loss (Stability):**
|
|
208
|
+
```python
|
|
209
|
+
def router_z_loss(logits):
    """Router z-loss: penalize large router logits for numerical stability.

    Returns the mean over tokens of ``logsumexp(logits, dim=-1) ** 2``. The
    log-partition term grows with the magnitude of the logits, so minimizing
    it keeps the router logits small; it is not an entropy penalty.

    Args:
        logits: Router logits with experts on the last axis.

    Returns:
        Scalar float32 tensor.
    """
    # Upcast first: squaring the log-sum-exp overflows float16 for large logits.
    log_z = torch.logsumexp(logits.float(), dim=-1)
    z_loss = log_z.pow(2).mean()
    return z_loss
|
|
213
|
+
|
|
214
|
+
total_loss = lm_loss + 0.01 * aux_loss + 0.001 * router_z_loss(gate_logits)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### 4. Expert Parallelism
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
# DeepSpeed configuration
|
|
221
|
+
{
|
|
222
|
+
"train_batch_size": 256,
|
|
223
|
+
"fp16": {"enabled": true},
|
|
224
|
+
"moe": {
|
|
225
|
+
"enabled": true,
|
|
226
|
+
"num_experts": 128,
|
|
227
|
+
"expert_parallel_size": 8, # Distribute 128 experts across 8 GPUs
|
|
228
|
+
"capacity_factor": 1.25, # Expert capacity = tokens_per_batch * capacity_factor / num_experts
|
|
229
|
+
"drop_tokens": true, # Drop tokens exceeding capacity
|
|
230
|
+
"use_residual": false
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Training Configuration
|
|
236
|
+
|
|
237
|
+
### DeepSpeed MoE Config
|
|
238
|
+
|
|
239
|
+
```json
|
|
240
|
+
{
|
|
241
|
+
"train_batch_size": 256,
|
|
242
|
+
"gradient_accumulation_steps": 1,
|
|
243
|
+
"optimizer": {
|
|
244
|
+
"type": "Adam",
|
|
245
|
+
"params": {
|
|
246
|
+
"lr": 0.0001,
|
|
247
|
+
"betas": [0.9, 0.999],
|
|
248
|
+
"eps": 1e-8
|
|
249
|
+
}
|
|
250
|
+
},
|
|
251
|
+
"fp16": {
|
|
252
|
+
"enabled": true,
|
|
253
|
+
"loss_scale": 0,
|
|
254
|
+
"initial_scale_power": 16
|
|
255
|
+
},
|
|
256
|
+
"moe": {
|
|
257
|
+
"enabled": true,
|
|
258
|
+
"num_experts": 128,
|
|
259
|
+
"expert_parallel_size": 8,
|
|
260
|
+
"moe_loss_coeff": 0.01,
|
|
261
|
+
"train_capacity_factor": 1.25,
|
|
262
|
+
"eval_capacity_factor": 2.0,
|
|
263
|
+
"min_capacity": 4,
|
|
264
|
+
"drop_tokens": true,
|
|
265
|
+
"use_residual": false,
|
|
266
|
+
"use_tutel": false
|
|
267
|
+
},
|
|
268
|
+
"zero_optimization": {
|
|
269
|
+
"stage": 1
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### Training Script
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
#!/bin/bash
|
|
278
|
+
|
|
279
|
+
# Mixtral-style MoE training
|
|
280
|
+
deepspeed --num_gpus 8 pretrain_moe.py \
|
|
281
|
+
--model-parallel-size 1 \
|
|
282
|
+
--num-layers 32 \
|
|
283
|
+
--hidden-size 4096 \
|
|
284
|
+
--num-attention-heads 32 \
|
|
285
|
+
--seq-length 2048 \
|
|
286
|
+
--max-position-embeddings 4096 \
|
|
287
|
+
--micro-batch-size 2 \
|
|
288
|
+
--global-batch-size 256 \
|
|
289
|
+
--train-iters 500000 \
|
|
290
|
+
--save-interval 5000 \
|
|
291
|
+
--eval-interval 1000 \
|
|
292
|
+
--eval-iters 100 \
|
|
293
|
+
--lr 0.0001 \
|
|
294
|
+
--min-lr 0.00001 \
|
|
295
|
+
--lr-decay-style cosine \
|
|
296
|
+
--lr-warmup-iters 2000 \
|
|
297
|
+
--clip-grad 1.0 \
|
|
298
|
+
--weight-decay 0.1 \
|
|
299
|
+
--num-experts 8 \
|
|
300
|
+
--moe-expert-parallel-size 4 \
|
|
301
|
+
--moe-loss-coeff 0.01 \
|
|
302
|
+
--moe-train-capacity-factor 1.25 \
|
|
303
|
+
--moe-eval-capacity-factor 2.0 \
|
|
304
|
+
--disable-moe-token-dropping \
|
|
305
|
+
--fp16 \
|
|
306
|
+
--deepspeed \
|
|
307
|
+
--deepspeed_config ds_config_moe.json \
|
|
308
|
+
--data-path /path/to/data \
|
|
309
|
+
--vocab-file /path/to/vocab.json \
|
|
310
|
+
--merge-file /path/to/merges.txt
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
## Advanced Patterns
|
|
314
|
+
|
|
315
|
+
### Mixtral 8x7B Architecture
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
class MixtralMoEBlock(nn.Module):
    """Mixtral-style MoE block with 8 experts, top-2 routing.

    Every token is scored by a bias-free linear router, dispatched to its
    ``top_k`` highest-probability experts, and the expert outputs are summed
    using the renormalized router probabilities as weights. Each expert here
    is a small ``Linear -> SiLU -> Linear`` feed-forward network.

    Args:
        config: Object exposing ``hidden_size``, ``intermediate_size``,
            ``num_local_experts`` and ``num_experts_per_tok``.
    """

    def __init__(self, config):
        super().__init__()
        self.hidden_dim = config.hidden_size
        self.ffn_dim = config.intermediate_size
        self.num_experts = config.num_local_experts  # 8
        self.top_k = config.num_experts_per_tok  # 2

        # One independent FFN per expert
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(self.hidden_dim, self.ffn_dim, bias=False),
                nn.SiLU(),
                nn.Linear(self.ffn_dim, self.hidden_dim, bias=False),
            )
            for _ in range(self.num_experts)
        ])

        # Router: one logit per expert for every token
        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)

    def forward(self, hidden_states):
        """Route each token to its top-k experts and mix their outputs.

        Args:
            hidden_states: Tensor of shape ``(batch, seq_len, hidden_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, hidden_dim)``.
        """
        batch_size, sequence_length, hidden_dim = hidden_states.shape

        # Flatten tokens. reshape (not view) so non-contiguous inputs,
        # e.g. transposed activations, are accepted.
        hidden_states = hidden_states.reshape(-1, hidden_dim)

        # Router logits
        router_logits = self.gate(hidden_states)  # (batch * seq_len, num_experts)

        # Softmax over experts, then keep the top-k per token
        routing_weights = torch.softmax(router_logits, dim=-1)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)

        # Renormalize so the kept weights sum to 1 for every token
        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

        final_hidden_states = torch.zeros_like(hidden_states)

        for expert_idx, expert_layer in enumerate(self.experts):
            # Rows are token positions, columns are the top-k slot in which
            # this expert was selected for that token.
            token_idx, slot_idx = torch.where(selected_experts == expert_idx)

            if token_idx.numel() == 0:
                continue  # no token picked this expert

            # Run the expert only on its tokens and scale by the routing weight
            expert_out = expert_layer(hidden_states[token_idx])
            expert_out = expert_out * routing_weights[token_idx, slot_idx, None]

            # Accumulate into the per-token output
            final_hidden_states.index_add_(0, token_idx, expert_out)

        return final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
### PR-MoE (Pyramid-Residual-MoE)
|
|
385
|
+
|
|
386
|
+
```bash
|
|
387
|
+
# DeepSpeed PR-MoE: 3x better parameter efficiency
|
|
388
|
+
deepspeed pretrain_gpt_moe.py \
|
|
389
|
+
--num-layers 24 \
|
|
390
|
+
--hidden-size 1024 \
|
|
391
|
+
--num-attention-heads 16 \
|
|
392
|
+
--num-experts "[128, 64, 32, 16]" \
|
|
393
|
+
--mlp-type residual \
|
|
394
|
+
--moe-expert-parallel-size 4 \
|
|
395
|
+
--moe-loss-coeff 0.01 \
|
|
396
|
+
--fp16
|
|
397
|
+
```
|
|
398
|
+
|
|
399
|
+
## Best Practices
|
|
400
|
+
|
|
401
|
+
### 1. Expert Count Selection
|
|
402
|
+
|
|
403
|
+
```python
|
|
404
|
+
# Rule of thumb: More experts = more capacity, but diminishing returns
|
|
405
|
+
# Typical configurations:
|
|
406
|
+
# - Small models (1B-7B): 8-16 experts
|
|
407
|
+
# - Medium models (7B-30B): 8-64 experts
|
|
408
|
+
# - Large models (30B+): 64-256 experts
|
|
409
|
+
|
|
410
|
+
# Example: Mixtral 8x7B
|
|
411
|
+
# Total params: ~47B (less than 8 × 7B = 56B: only the expert FFNs are replicated; attention and embeddings are shared)
|
|
412
|
+
# Active params: ~13B per token (shared layers + 2 of the 8 expert FFNs, top-2 routing)
|
|
413
|
+
# Efficiency: 47B capacity with 13B compute
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
### 2. Capacity Factor Tuning
|
|
417
|
+
|
|
418
|
+
```python
|
|
419
|
+
# Capacity = (tokens_per_batch / num_experts) * capacity_factor
|
|
420
|
+
|
|
421
|
+
# Training: Lower capacity (faster, drops some tokens)
|
|
422
|
+
train_capacity_factor = 1.25 # 25% buffer
|
|
423
|
+
|
|
424
|
+
# Evaluation: Higher capacity (far fewer dropped tokens)
|
|
425
|
+
eval_capacity_factor = 2.0 # 100% buffer
|
|
426
|
+
|
|
427
|
+
# Formula:
|
|
428
|
+
expert_capacity = int((seq_len * batch_size / num_experts) * capacity_factor)
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
### 3. Learning Rate Guidelines
|
|
432
|
+
|
|
433
|
+
```python
|
|
434
|
+
# MoE models need lower LR than dense models
|
|
435
|
+
# - Dense model: lr = 6e-4
|
|
436
|
+
# - MoE model: lr = 1e-4 (3-6× lower)
|
|
437
|
+
|
|
438
|
+
# Also extend decay schedule
|
|
439
|
+
dense_lr_decay_iters = 300000
|
|
440
|
+
moe_lr_decay_iters = 500000 # 1.5-2× longer
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
### 4. Loss Coefficient Tuning
|
|
444
|
+
|
|
445
|
+
```python
|
|
446
|
+
# Start with standard values
|
|
447
|
+
moe_loss_coeff = 0.01 # Auxiliary loss (load balancing)
|
|
448
|
+
router_z_loss_coeff = 0.001  # Router z-loss (penalizes large router logits for stability)
|
|
449
|
+
|
|
450
|
+
# If load imbalance persists, increase aux loss
|
|
451
|
+
if max_expert_usage / min_expert_usage > 2.0:
|
|
452
|
+
moe_loss_coeff = 0.1 # Stronger load balancing
|
|
453
|
+
|
|
454
|
+
# If training unstable, increase z-loss
|
|
455
|
+
if grad_norm > 10.0:
|
|
456
|
+
router_z_loss_coeff = 0.01
|
|
457
|
+
```
|
|
458
|
+
|
|
459
|
+
### 5. Avoid Common Pitfalls
|
|
460
|
+
|
|
461
|
+
```python
|
|
462
|
+
# ❌ Bad: Using same LR as dense model
|
|
463
|
+
optimizer = Adam(model.parameters(), lr=6e-4)
|
|
464
|
+
|
|
465
|
+
# ✅ Good: Lower LR for MoE
|
|
466
|
+
optimizer = Adam([
|
|
467
|
+
{'params': model.non_moe_params, 'lr': 6e-4},
|
|
468
|
+
{'params': model.moe_params, 'lr': 1e-4}
|
|
469
|
+
])
|
|
470
|
+
|
|
471
|
+
# ❌ Bad: No load balancing
|
|
472
|
+
loss = lm_loss
|
|
473
|
+
|
|
474
|
+
# ✅ Good: Add auxiliary loss
|
|
475
|
+
loss = lm_loss + 0.01 * aux_loss + 0.001 * z_loss
|
|
476
|
+
|
|
477
|
+
# ❌ Bad: Too many experts for small dataset
|
|
478
|
+
num_experts = 128 # Overfitting risk
|
|
479
|
+
|
|
480
|
+
# ✅ Good: Match experts to data diversity
|
|
481
|
+
num_experts = 8 # Better for small datasets
|
|
482
|
+
```
|
|
483
|
+
|
|
484
|
+
## Inference Optimization
|
|
485
|
+
|
|
486
|
+
### Sparse Inference
|
|
487
|
+
|
|
488
|
+
```python
|
|
489
|
+
# Only activate top-k experts (huge memory savings)
|
|
490
|
+
@torch.no_grad()
def moe_inference(x, model, top_k=2, renormalize=False):
    """Sparse MoE inference: run only the experts that tokens were routed to.

    Each token is scored by ``model.gate`` and sent to its ``top_k`` experts.
    Tokens are grouped by expert so every selected expert is loaded once and
    applied only to its own tokens; experts nobody selected are never loaded.

    Args:
        x: Input of shape ``(..., hidden_dim)``; leading axes are treated as
            independent tokens.
        model: Object providing ``gate(x)`` (router logits with experts on the
            last axis) and ``load_expert(expert_id)``, which returns a callable
            mapping ``(n, hidden_dim)`` to ``(n, hidden_dim)``. ``expert_id``
            is passed as a Python ``int``.
        top_k: Number of experts per token.
        renormalize: If True, rescale the kept top-k probabilities to sum to 1
            per token (Mixtral-style). Defaults to False, which weights expert
            outputs by the raw softmax probabilities.

    Returns:
        Tensor with the same shape as ``x``.
    """
    # Router
    gate_logits = model.gate(x)
    topk_scores, topk_indices = torch.topk(
        torch.softmax(gate_logits, dim=-1),
        k=top_k,
        dim=-1,
    )
    if renormalize:
        topk_scores = topk_scores / topk_scores.sum(dim=-1, keepdim=True)

    # Work on a flat (tokens, ...) view so any number of leading axes works
    flat_x = x.reshape(-1, x.shape[-1])
    flat_scores = topk_scores.reshape(-1, top_k)
    flat_indices = topk_indices.reshape(-1, top_k)

    flat_out = torch.zeros_like(flat_x)
    # Different tokens choose different experts, so dispatch per expert id
    # rather than per top-k slot.
    for expert_id in torch.unique(flat_indices).tolist():
        token_rows, slots = torch.where(flat_indices == expert_id)
        # Load expert from disk/offload if needed
        expert = model.load_expert(expert_id)
        weights = flat_scores[token_rows, slots].unsqueeze(-1)
        contribution = (weights * expert(flat_x[token_rows])).to(flat_out.dtype)
        flat_out.index_add_(0, token_rows, contribution)

    return flat_out.reshape(x.shape)
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
## Resources
|
|
513
|
+
|
|
514
|
+
- **DeepSpeed MoE Tutorial**: https://www.deepspeed.ai/tutorials/mixture-of-experts-nlg/
|
|
515
|
+
- **Mixtral Paper**: https://arxiv.org/abs/2401.04088
|
|
516
|
+
- **Switch Transformers**: https://arxiv.org/abs/2101.03961
|
|
517
|
+
- **HuggingFace MoE Guide**: https://huggingface.co/blog/moe
|
|
518
|
+
- **NVIDIA MoE Blog**: https://developer.nvidia.com/blog/applying-mixture-of-experts-in-llm-architectures/
|
|
519
|
+
|
|
520
|
+
## See Also
|
|
521
|
+
|
|
522
|
+
- `references/architectures.md` - MoE model architectures (Mixtral, Switch, DeepSeek-V3)
|
|
523
|
+
- `references/training.md` - Advanced training techniques and optimization
|
|
524
|
+
- `references/inference.md` - Production deployment and serving patterns
|
|
525
|
+
|
|
526
|
+
|