@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: distributed-llm-pretraining-torchtitan
|
|
3
|
+
description: Provides PyTorch-native distributed LLM pretraining using torchtitan with 4D parallelism (FSDP2, TP, PP, CP). Use when pretraining Llama 3.1, DeepSeek V3, or custom models at scale from 8 to 512+ GPUs with Float8, torch.compile, and distributed checkpointing.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Model Architecture, Distributed Training, TorchTitan, FSDP2, Tensor Parallel, Pipeline Parallel, Context Parallel, Float8, Llama, Pretraining]
|
|
8
|
+
dependencies: [torch>=2.6.0, torchtitan>=0.2.0, torchao>=0.5.0]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# TorchTitan - PyTorch Native Distributed LLM Pretraining
|
|
12
|
+
|
|
13
|
+
## Quick start
|
|
14
|
+
|
|
15
|
+
TorchTitan is PyTorch's official platform for large-scale LLM pretraining with composable 4D parallelism (FSDP2, TP, PP, CP), achieving 65%+ speedups over baselines on H100 GPUs.
|
|
16
|
+
|
|
17
|
+
**Installation**:
|
|
18
|
+
```bash
|
|
19
|
+
# From PyPI (stable)
|
|
20
|
+
pip install torchtitan
|
|
21
|
+
|
|
22
|
+
# From source (latest features, requires PyTorch nightly)
|
|
23
|
+
git clone https://github.com/pytorch/torchtitan
|
|
24
|
+
cd torchtitan
|
|
25
|
+
pip install -r requirements.txt
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Download tokenizer**:
|
|
29
|
+
```bash
|
|
30
|
+
# Get HF token from https://huggingface.co/settings/tokens
|
|
31
|
+
python scripts/download_hf_assets.py --repo_id meta-llama/Llama-3.1-8B --assets tokenizer --hf_token=...
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Start training on 8 GPUs**:
|
|
35
|
+
```bash
|
|
36
|
+
CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Common workflows
|
|
40
|
+
|
|
41
|
+
### Workflow 1: Pretrain Llama 3.1 8B on single node
|
|
42
|
+
|
|
43
|
+
Copy this checklist:
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
Single Node Pretraining:
|
|
47
|
+
- [ ] Step 1: Download tokenizer
|
|
48
|
+
- [ ] Step 2: Configure training
|
|
49
|
+
- [ ] Step 3: Launch training
|
|
50
|
+
- [ ] Step 4: Monitor and checkpoint
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
**Step 1: Download tokenizer**
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
python scripts/download_hf_assets.py \
|
|
57
|
+
--repo_id meta-llama/Llama-3.1-8B \
|
|
58
|
+
--assets tokenizer \
|
|
59
|
+
--hf_token=YOUR_HF_TOKEN
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
**Step 2: Configure training**
|
|
63
|
+
|
|
64
|
+
Edit or create a TOML config file:
|
|
65
|
+
|
|
66
|
+
```toml
|
|
67
|
+
# llama3_8b_custom.toml
|
|
68
|
+
[job]
|
|
69
|
+
dump_folder = "./outputs"
|
|
70
|
+
description = "Llama 3.1 8B training"
|
|
71
|
+
|
|
72
|
+
[model]
|
|
73
|
+
name = "llama3"
|
|
74
|
+
flavor = "8B"
|
|
75
|
+
hf_assets_path = "./assets/hf/Llama-3.1-8B"
|
|
76
|
+
|
|
77
|
+
[optimizer]
|
|
78
|
+
name = "AdamW"
|
|
79
|
+
lr = 3e-4
|
|
80
|
+
|
|
81
|
+
[lr_scheduler]
|
|
82
|
+
warmup_steps = 200
|
|
83
|
+
|
|
84
|
+
[training]
|
|
85
|
+
local_batch_size = 2
|
|
86
|
+
seq_len = 8192
|
|
87
|
+
max_norm = 1.0
|
|
88
|
+
steps = 1000
|
|
89
|
+
dataset = "c4"
|
|
90
|
+
|
|
91
|
+
[parallelism]
|
|
92
|
+
data_parallel_shard_degree = -1 # Use all GPUs for FSDP
|
|
93
|
+
|
|
94
|
+
[activation_checkpoint]
|
|
95
|
+
mode = "selective"
|
|
96
|
+
selective_ac_option = "op"
|
|
97
|
+
|
|
98
|
+
[checkpoint]
|
|
99
|
+
enable = true
|
|
100
|
+
folder = "checkpoint"
|
|
101
|
+
interval = 500
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Step 3: Launch training**
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
# 8 GPUs on single node
|
|
108
|
+
CONFIG_FILE="./llama3_8b_custom.toml" ./run_train.sh
|
|
109
|
+
|
|
110
|
+
# Or explicitly with torchrun
|
|
111
|
+
torchrun --nproc_per_node=8 \
|
|
112
|
+
-m torchtitan.train \
|
|
113
|
+
--job.config_file ./llama3_8b_custom.toml
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
**Step 4: Monitor and checkpoint**
|
|
117
|
+
|
|
118
|
+
TensorBoard logs are saved to `./outputs/tb/`:
|
|
119
|
+
```bash
|
|
120
|
+
tensorboard --logdir ./outputs/tb
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Workflow 2: Multi-node training with SLURM
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
Multi-Node Training:
|
|
127
|
+
- [ ] Step 1: Configure parallelism for scale
|
|
128
|
+
- [ ] Step 2: Set up SLURM script
|
|
129
|
+
- [ ] Step 3: Submit job
|
|
130
|
+
- [ ] Step 4: Resume from checkpoint
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Step 1: Configure parallelism for scale**
|
|
134
|
+
|
|
135
|
+
For 70B model on 256 GPUs (32 nodes):
|
|
136
|
+
```toml
|
|
137
|
+
[parallelism]
|
|
138
|
+
data_parallel_shard_degree = 32 # FSDP across 32 ranks
|
|
139
|
+
tensor_parallel_degree = 8 # TP within node
|
|
140
|
+
pipeline_parallel_degree = 1 # No PP for 70B
|
|
141
|
+
context_parallel_degree = 1 # Increase for long sequences
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
**Step 2: Set up SLURM script**
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
#!/bin/bash
|
|
148
|
+
#SBATCH --job-name=llama70b
|
|
149
|
+
#SBATCH --nodes=32
|
|
150
|
+
#SBATCH --ntasks-per-node=8
|
|
151
|
+
#SBATCH --gpus-per-node=8
|
|
152
|
+
|
|
153
|
+
srun torchrun \
|
|
154
|
+
--nnodes=32 \
|
|
155
|
+
--nproc_per_node=8 \
|
|
156
|
+
--rdzv_backend=c10d \
|
|
157
|
+
--rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
|
|
158
|
+
-m torchtitan.train \
|
|
159
|
+
--job.config_file ./llama3_70b.toml
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
**Step 3: Submit job**
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
sbatch multinode_trainer.slurm
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Step 4: Resume from checkpoint**
|
|
169
|
+
|
|
170
|
+
Training auto-resumes if checkpoint exists in configured folder.
|
|
171
|
+
|
|
172
|
+
### Workflow 3: Enable Float8 training for H100s
|
|
173
|
+
|
|
174
|
+
Float8 provides 30-50% speedup on H100 GPUs.
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
Float8 Training:
|
|
178
|
+
- [ ] Step 1: Install torchao
|
|
179
|
+
- [ ] Step 2: Configure Float8
|
|
180
|
+
- [ ] Step 3: Launch with compile
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**Step 1: Install torchao**
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
USE_CPP=0 pip install git+https://github.com/pytorch/ao.git
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
**Step 2: Configure Float8**
|
|
190
|
+
|
|
191
|
+
Add to your TOML config:
|
|
192
|
+
```toml
|
|
193
|
+
[model]
|
|
194
|
+
converters = ["quantize.linear.float8"]
|
|
195
|
+
|
|
196
|
+
[quantize.linear.float8]
|
|
197
|
+
enable_fsdp_float8_all_gather = true
|
|
198
|
+
precompute_float8_dynamic_scale_for_fsdp = true
|
|
199
|
+
filter_fqns = ["output"] # Exclude output layer
|
|
200
|
+
|
|
201
|
+
[compile]
|
|
202
|
+
enable = true
|
|
203
|
+
components = ["model", "loss"]
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
**Step 3: Launch with compile**
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
CONFIG_FILE="./llama3_8b.toml" ./run_train.sh \
|
|
210
|
+
--model.converters="quantize.linear.float8" \
|
|
211
|
+
--quantize.linear.float8.enable_fsdp_float8_all_gather \
|
|
212
|
+
--compile.enable
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Workflow 4: 4D parallelism for 405B models
|
|
216
|
+
|
|
217
|
+
```
|
|
218
|
+
4D Parallelism (FSDP + TP + PP + CP):
|
|
219
|
+
- [ ] Step 1: Create seed checkpoint
|
|
220
|
+
- [ ] Step 2: Configure 4D parallelism
|
|
221
|
+
- [ ] Step 3: Launch on 512 GPUs
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
**Step 1: Create seed checkpoint**
|
|
225
|
+
|
|
226
|
+
Required for consistent initialization across PP stages:
|
|
227
|
+
```bash
|
|
228
|
+
NGPU=1 CONFIG_FILE=./llama3_405b.toml ./run_train.sh \
|
|
229
|
+
--checkpoint.enable \
|
|
230
|
+
--checkpoint.create_seed_checkpoint \
|
|
231
|
+
--parallelism.data_parallel_shard_degree 1 \
|
|
232
|
+
--parallelism.tensor_parallel_degree 1 \
|
|
233
|
+
--parallelism.pipeline_parallel_degree 1
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
**Step 2: Configure 4D parallelism**
|
|
237
|
+
|
|
238
|
+
```toml
|
|
239
|
+
[parallelism]
|
|
240
|
+
data_parallel_shard_degree = 8 # FSDP
|
|
241
|
+
tensor_parallel_degree = 8 # TP within node
|
|
242
|
+
pipeline_parallel_degree = 8 # PP across nodes
|
|
243
|
+
context_parallel_degree = 1 # CP for long sequences
|
|
244
|
+
|
|
245
|
+
[training]
|
|
246
|
+
local_batch_size = 32
|
|
247
|
+
seq_len = 8192
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
**Step 3: Launch on 512 GPUs**
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
# 64 nodes x 8 GPUs = 512 GPUs
|
|
254
|
+
srun torchrun --nnodes=64 --nproc_per_node=8 \
|
|
255
|
+
-m torchtitan.train \
|
|
256
|
+
--job.config_file ./llama3_405b.toml
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
## When to use vs alternatives
|
|
260
|
+
|
|
261
|
+
**Use TorchTitan when:**
|
|
262
|
+
- Pretraining LLMs from scratch (8B to 405B+)
|
|
263
|
+
- Need PyTorch-native solution without third-party dependencies
|
|
264
|
+
- Require composable 4D parallelism (FSDP2, TP, PP, CP)
|
|
265
|
+
- Training on H100s with Float8 support
|
|
266
|
+
- Want interoperable checkpoints with torchtune/HuggingFace
|
|
267
|
+
|
|
268
|
+
**Use alternatives instead:**
|
|
269
|
+
- **Megatron-LM**: Maximum performance for NVIDIA-only deployments
|
|
270
|
+
- **DeepSpeed**: Broader ZeRO optimization ecosystem, inference support
|
|
271
|
+
- **Axolotl/TRL**: Fine-tuning rather than pretraining
|
|
272
|
+
- **LitGPT**: Educational, smaller-scale training
|
|
273
|
+
|
|
274
|
+
## Common issues
|
|
275
|
+
|
|
276
|
+
**Issue: Out of memory on large models**
|
|
277
|
+
|
|
278
|
+
Enable activation checkpointing and reduce batch size:
|
|
279
|
+
```toml
|
|
280
|
+
[activation_checkpoint]
|
|
281
|
+
mode = "full" # Instead of "selective"
|
|
282
|
+
|
|
283
|
+
[training]
|
|
284
|
+
local_batch_size = 1
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
Or use gradient accumulation:
|
|
288
|
+
```toml
|
|
289
|
+
[training]
|
|
290
|
+
local_batch_size = 1
|
|
291
|
+
global_batch_size = 32 # Accumulates gradients
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
**Issue: TP causes high memory with async collectives**
|
|
295
|
+
|
|
296
|
+
Set environment variable:
|
|
297
|
+
```bash
|
|
298
|
+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
**Issue: Float8 training not faster**
|
|
302
|
+
|
|
303
|
+
Float8 only benefits large GEMMs. Filter small layers:
|
|
304
|
+
```toml
|
|
305
|
+
[quantize.linear.float8]
|
|
306
|
+
filter_fqns = ["attention.wk", "attention.wv", "output", "auto_filter_small_kn"]
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
**Issue: Checkpoint loading fails after parallelism change**
|
|
310
|
+
|
|
311
|
+
Use DCP's resharding capability:
|
|
312
|
+
```bash
|
|
313
|
+
# Convert sharded checkpoint to single file
|
|
314
|
+
python -m torch.distributed.checkpoint.format_utils \
|
|
315
|
+
dcp_to_torch checkpoint/step-1000 checkpoint.pt
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
**Issue: Pipeline parallelism initialization**
|
|
319
|
+
|
|
320
|
+
Create a seed checkpoint first (see Workflow 4, Step 1).
|
|
321
|
+
|
|
322
|
+
## Supported models
|
|
323
|
+
|
|
324
|
+
| Model | Sizes | Status |
|
|
325
|
+
|-------|-------|--------|
|
|
326
|
+
| Llama 3.1 | 8B, 70B, 405B | Production |
|
|
327
|
+
| Llama 4 | Various | Experimental |
|
|
328
|
+
| DeepSeek V3 | 16B, 236B, 671B (MoE) | Experimental |
|
|
329
|
+
| GPT-OSS | 20B, 120B (MoE) | Experimental |
|
|
330
|
+
| Qwen 3 | Various | Experimental |
|
|
331
|
+
| Flux | Diffusion | Experimental |
|
|
332
|
+
|
|
333
|
+
## Performance benchmarks (H100)
|
|
334
|
+
|
|
335
|
+
| Model | GPUs | Parallelism | TPS/GPU | Techniques |
|
|
336
|
+
|-------|------|-------------|---------|------------|
|
|
337
|
+
| Llama 8B | 8 | FSDP | 5,762 | Baseline |
|
|
338
|
+
| Llama 8B | 8 | FSDP+compile+FP8 | 8,532 | +48% |
|
|
339
|
+
| Llama 70B | 256 | FSDP+TP+AsyncTP | 876 | 2D parallel |
|
|
340
|
+
| Llama 405B | 512 | FSDP+TP+PP | 128 | 3D parallel |
|
|
341
|
+
|
|
342
|
+
## Advanced topics
|
|
343
|
+
|
|
344
|
+
**FSDP2 configuration**: See [references/fsdp.md](references/fsdp.md) for detailed FSDP2 vs FSDP1 comparison and ZeRO equivalents.
|
|
345
|
+
|
|
346
|
+
**Float8 training**: See [references/float8.md](references/float8.md) for tensorwise vs rowwise scaling recipes.
|
|
347
|
+
|
|
348
|
+
**Checkpointing**: See [references/checkpoint.md](references/checkpoint.md) for HuggingFace conversion and async checkpointing.
|
|
349
|
+
|
|
350
|
+
**Adding custom models**: See [references/custom-models.md](references/custom-models.md) for TrainSpec protocol.
|
|
351
|
+
|
|
352
|
+
## Resources
|
|
353
|
+
|
|
354
|
+
- GitHub: https://github.com/pytorch/torchtitan
|
|
355
|
+
- Paper: https://arxiv.org/abs/2410.06511
|
|
356
|
+
- ICLR 2025: https://iclr.cc/virtual/2025/poster/29620
|
|
357
|
+
- PyTorch Forum: https://discuss.pytorch.org/c/distributed/torchtitan/44
|
|
358
|
+
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# Checkpointing in TorchTitan
|
|
2
|
+
|
|
3
|
+
TorchTitan uses PyTorch Distributed Checkpoint (DCP) for fault-tolerant, interoperable checkpointing.
|
|
4
|
+
|
|
5
|
+
## Basic Configuration
|
|
6
|
+
|
|
7
|
+
```toml
|
|
8
|
+
[checkpoint]
|
|
9
|
+
enable = true
|
|
10
|
+
folder = "checkpoint"
|
|
11
|
+
interval = 500
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Save Model Only (Smaller Checkpoints)
|
|
15
|
+
|
|
16
|
+
Exclude optimizer state and training metadata:
|
|
17
|
+
|
|
18
|
+
```toml
|
|
19
|
+
[checkpoint]
|
|
20
|
+
enable = true
|
|
21
|
+
last_save_model_only = true
|
|
22
|
+
export_dtype = "bfloat16" # Optional: export in lower precision
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Excluding Keys from Loading
|
|
26
|
+
|
|
27
|
+
Partial checkpoint loading for modified settings:
|
|
28
|
+
|
|
29
|
+
```toml
|
|
30
|
+
[checkpoint]
|
|
31
|
+
enable = true
|
|
32
|
+
exclude_from_loading = ["data_loader", "lr_scheduler"]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
CLI equivalent:
|
|
36
|
+
```bash
|
|
37
|
+
--checkpoint.exclude_from_loading data_loader,lr_scheduler
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Creating Seed Checkpoints
|
|
41
|
+
|
|
42
|
+
Required for Pipeline Parallelism to ensure consistent initialization:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
NGPU=1 CONFIG_FILE=<path_to_config> ./run_train.sh \
|
|
46
|
+
--checkpoint.enable \
|
|
47
|
+
--checkpoint.create_seed_checkpoint \
|
|
48
|
+
--parallelism.data_parallel_replicate_degree 1 \
|
|
49
|
+
--parallelism.data_parallel_shard_degree 1 \
|
|
50
|
+
--parallelism.tensor_parallel_degree 1 \
|
|
51
|
+
--parallelism.pipeline_parallel_degree 1 \
|
|
52
|
+
--parallelism.context_parallel_degree 1 \
|
|
53
|
+
--parallelism.expert_parallel_degree 1
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
This initializes the model on a single CPU, giving reproducible initialization regardless of the GPU count used for training.
|
|
57
|
+
|
|
58
|
+
## Async Checkpointing
|
|
59
|
+
|
|
60
|
+
Reduce checkpoint overhead with async writes:
|
|
61
|
+
|
|
62
|
+
```toml
|
|
63
|
+
[checkpoint]
|
|
64
|
+
enable = true
|
|
65
|
+
async_mode = "async" # Options: "disabled", "async", "async_with_pinned_mem"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## HuggingFace Conversion
|
|
69
|
+
|
|
70
|
+
### During Training
|
|
71
|
+
|
|
72
|
+
Save directly in HuggingFace format:
|
|
73
|
+
|
|
74
|
+
```toml
|
|
75
|
+
[checkpoint]
|
|
76
|
+
last_save_in_hf = true
|
|
77
|
+
last_save_model_only = true
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Load from HuggingFace:
|
|
81
|
+
|
|
82
|
+
```toml
|
|
83
|
+
[checkpoint]
|
|
84
|
+
initial_load_in_hf = true
|
|
85
|
+
|
|
86
|
+
[model]
|
|
87
|
+
hf_assets_path = "./path/to/hf/checkpoint"
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Offline Conversion
|
|
91
|
+
|
|
92
|
+
Convert without running training:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# HuggingFace -> TorchTitan
|
|
96
|
+
python ./scripts/checkpoint_conversion/convert_from_hf.py \
|
|
97
|
+
<input_dir> <output_dir> \
|
|
98
|
+
--model_name llama3 \
|
|
99
|
+
--model_flavor 8B
|
|
100
|
+
|
|
101
|
+
# TorchTitan -> HuggingFace
|
|
102
|
+
python ./scripts/checkpoint_conversion/convert_to_hf.py \
|
|
103
|
+
<input_dir> <output_dir> \
|
|
104
|
+
--hf_assets_path ./assets/hf/Llama3.1-8B \
|
|
105
|
+
--model_name llama3 \
|
|
106
|
+
--model_flavor 8B
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Example
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
python ./scripts/checkpoint_conversion/convert_from_hf.py \
|
|
113
|
+
~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920/ \
|
|
114
|
+
./initial_load_path/ \
|
|
115
|
+
--model_name llama3 \
|
|
116
|
+
--model_flavor 8B
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Converting to Single .pt File
|
|
120
|
+
|
|
121
|
+
Convert DCP sharded checkpoint to single PyTorch file:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
python -m torch.distributed.checkpoint.format_utils \
|
|
125
|
+
dcp_to_torch \
|
|
126
|
+
torchtitan/outputs/checkpoint/step-1000 \
|
|
127
|
+
checkpoint.pt
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Checkpoint Structure
|
|
131
|
+
|
|
132
|
+
DCP saves sharded checkpoints that can be resharded for different parallelism configurations:
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
checkpoint/
|
|
136
|
+
├── step-500/
|
|
137
|
+
│ ├── .metadata
|
|
138
|
+
│ ├── __0_0.distcp
|
|
139
|
+
│ ├── __0_1.distcp
|
|
140
|
+
│ └── ...
|
|
141
|
+
└── step-1000/
|
|
142
|
+
└── ...
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Resume Training
|
|
146
|
+
|
|
147
|
+
Training auto-resumes from the latest checkpoint in the configured folder. To resume from a specific step:
|
|
148
|
+
|
|
149
|
+
```toml
|
|
150
|
+
[checkpoint]
|
|
151
|
+
load_step = 500 # Resume from step 500
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Interoperability with TorchTune
|
|
155
|
+
|
|
156
|
+
Checkpoints saved with `last_save_model_only = true` can be loaded directly into [torchtune](https://github.com/pytorch/torchtune) for fine-tuning.
|
|
157
|
+
|
|
158
|
+
## Full Configuration Example
|
|
159
|
+
|
|
160
|
+
```toml
|
|
161
|
+
[checkpoint]
|
|
162
|
+
enable = true
|
|
163
|
+
folder = "checkpoint"
|
|
164
|
+
interval = 500
|
|
165
|
+
load_step = -1 # -1 = latest, or specify step number
|
|
166
|
+
last_save_model_only = true
|
|
167
|
+
export_dtype = "bfloat16"
|
|
168
|
+
async_mode = "async"
|
|
169
|
+
exclude_from_loading = []
|
|
170
|
+
last_save_in_hf = false
|
|
171
|
+
initial_load_in_hf = false
|
|
172
|
+
create_seed_checkpoint = false
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Best Practices
|
|
176
|
+
|
|
177
|
+
1. **Large models**: Use `async_mode = "async"` to overlap checkpoint saves with training
|
|
178
|
+
2. **Fine-tuning export**: Enable `last_save_model_only` and `export_dtype = "bfloat16"` for smaller files
|
|
179
|
+
3. **Pipeline parallelism**: Always create seed checkpoint first
|
|
180
|
+
4. **Debugging**: Save frequent checkpoints during development, reduce for production
|
|
181
|
+
5. **HF interop**: Use conversion scripts for offline conversion, direct save/load for training workflows
|