@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
# Lambda Labs Advanced Usage Guide
|
|
2
|
+
|
|
3
|
+
## Multi-Node Distributed Training
|
|
4
|
+
|
|
5
|
+
### PyTorch DDP across nodes
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
# train_multi_node.py
|
|
9
|
+
import os
|
|
10
|
+
import torch
|
|
11
|
+
import torch.distributed as dist
|
|
12
|
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
|
13
|
+
|
|
14
|
+
def setup_distributed():
    """Initialise the NCCL process group from launcher-provided env vars.

    Returns (rank, world_size, local_rank) as read from the environment
    set by torchrun / the launcher.
    """
    rank, world_size, local_rank = (
        int(os.environ[key]) for key in ("RANK", "WORLD_SIZE", "LOCAL_RANK")
    )

    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

    # Pin this process to its local GPU before any CUDA work happens.
    torch.cuda.set_device(local_rank)
    return rank, world_size, local_rank
|
|
28
|
+
|
|
29
|
+
def main():
    """DDP training entry point: wrap the model, train, checkpoint on rank 0."""
    rank, _world_size, local_rank = setup_distributed()

    ddp_model = DDP(MyModel().cuda(local_rank), device_ids=[local_rank])

    # Gradients are synchronized across ranks by DDP during backward.
    for epoch in range(num_epochs):
        train_one_epoch(ddp_model, dataloader)

        if rank == 0:
            # Unwrap DDP (`.module`) so the checkpoint loads without the wrapper.
            torch.save(ddp_model.module.state_dict(), f"checkpoint_{epoch}.pt")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Launch on multiple instances
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# On Node 0 (master)
|
|
53
|
+
export MASTER_ADDR=<NODE0_PRIVATE_IP>
|
|
54
|
+
export MASTER_PORT=29500
|
|
55
|
+
|
|
56
|
+
torchrun \
|
|
57
|
+
--nnodes=2 \
|
|
58
|
+
--nproc_per_node=8 \
|
|
59
|
+
--node_rank=0 \
|
|
60
|
+
--master_addr=$MASTER_ADDR \
|
|
61
|
+
--master_port=$MASTER_PORT \
|
|
62
|
+
train_multi_node.py
|
|
63
|
+
|
|
64
|
+
# On Node 1
|
|
65
|
+
export MASTER_ADDR=<NODE0_PRIVATE_IP>
|
|
66
|
+
export MASTER_PORT=29500
|
|
67
|
+
|
|
68
|
+
torchrun \
|
|
69
|
+
--nnodes=2 \
|
|
70
|
+
--nproc_per_node=8 \
|
|
71
|
+
--node_rank=1 \
|
|
72
|
+
--master_addr=$MASTER_ADDR \
|
|
73
|
+
--master_port=$MASTER_PORT \
|
|
74
|
+
train_multi_node.py
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### FSDP for large models
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
# The original snippet used functools, MixedPrecision and torch without
# importing them; the missing imports are added so the example runs as-is.
import functools

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# Shard at the decoder-layer boundary so each transformer block becomes its
# own FSDP unit (parameters gathered only while that layer runs).
auto_wrap_policy = functools.partial(
    transformer_auto_wrap_policy,
    transformer_layer_cls={LlamaDecoderLayer},
)

model = FSDP(
    model,
    auto_wrap_policy=auto_wrap_policy,
    # bf16 for params, gradient reduction, and buffers.
    mixed_precision=MixedPrecision(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
        buffer_dtype=torch.bfloat16,
    ),
    device_id=local_rank,  # assumes local_rank was set up by the launcher — TODO confirm
)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### DeepSpeed ZeRO
|
|
103
|
+
|
|
104
|
+
```json
|
|
105
|
+
# ds_config.json
|
|
106
|
+
{
|
|
107
|
+
"train_batch_size": 64,
|
|
108
|
+
"gradient_accumulation_steps": 4,
|
|
109
|
+
"fp16": {"enabled": true},
|
|
110
|
+
"zero_optimization": {
|
|
111
|
+
"stage": 3,
|
|
112
|
+
"offload_optimizer": {"device": "cpu"},
|
|
113
|
+
"offload_param": {"device": "cpu"}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Launch with DeepSpeed across the hosts listed in hostfile.txt (2 nodes x 8 GPUs).
deepspeed --hostfile=hostfile.txt --num_nodes=2 --num_gpus=8 \
    train.py --deepspeed ds_config.json
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
### Hostfile for multi-node
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# hostfile.txt
|
|
130
|
+
node0_ip slots=8
|
|
131
|
+
node1_ip slots=8
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## API Automation
|
|
135
|
+
|
|
136
|
+
### Auto-launch training jobs
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
import os
|
|
140
|
+
import time
|
|
141
|
+
import lambda_cloud_client
|
|
142
|
+
from lambda_cloud_client.models import LaunchInstanceRequest
|
|
143
|
+
|
|
144
|
+
class LambdaJobManager:
    """Thin convenience wrapper around the Lambda Cloud REST client."""

    def __init__(self, api_key: str):
        self.config = lambda_cloud_client.Configuration(
            host="https://cloud.lambdalabs.com/api/v1",
            access_token=api_key,
        )

    def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):
        """Find first available GPU type across regions.

        Returns the first (gpu_type, region_name) pair with capacity, walking
        `gpu_types` in preference order; (None, None) when nothing is free.
        """
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            catalog = api.instance_types()

            for wanted in gpu_types:
                if wanted not in catalog.data:
                    continue
                for region in catalog.data[wanted].regions_with_capacity_available:
                    if regions is None or region.name in regions:
                        return wanted, region.name

        return None, None

    def launch_and_wait(self, instance_type: str, region: str,
                        ssh_key: str, filesystem: str = None,
                        timeout: int = 900) -> dict:
        """Launch instance and wait for it to be ready.

        Polls every 30 s; raises TimeoutError if the instance has not reached
        "active" within `timeout` seconds.
        """
        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)

            launch_req = LaunchInstanceRequest(
                region_name=region,
                instance_type_name=instance_type,
                ssh_key_names=[ssh_key],
                file_system_names=[filesystem] if filesystem else [],
            )
            instance_id = api.launch_instance(launch_req).data.instance_ids[0]

            # Poll until the instance reports active or we run out of time.
            deadline = time.time() + timeout
            while time.time() < deadline:
                status = api.get_instance(instance_id)
                if status.data.status == "active":
                    return {
                        "id": instance_id,
                        "ip": status.data.ip,
                        "status": "active",
                    }
                time.sleep(30)

        raise TimeoutError(f"Instance {instance_id} not ready after {timeout}s")

    def terminate(self, instance_ids: list[str]):
        """Terminate instances."""
        from lambda_cloud_client.models import TerminateInstanceRequest

        with lambda_cloud_client.ApiClient(self.config) as client:
            api = lambda_cloud_client.DefaultApi(client)
            api.terminate_instance(TerminateInstanceRequest(instance_ids=instance_ids))
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
# Usage
manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])

# Prefer an 8x H100 node, falling back to 8x A100, within two US regions.
preferred_types = ["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"]
gpu_type, region = manager.find_available_gpu(
    preferred_types,
    regions=["us-west-1", "us-east-1"],
)

if gpu_type:
    instance = manager.launch_and_wait(
        gpu_type,
        region,
        ssh_key="my-key",
        filesystem="training-data",
    )
    print(f"Ready: ssh ubuntu@{instance['ip']}")
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Batch job submission
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
import subprocess
|
|
229
|
+
import paramiko
|
|
230
|
+
|
|
231
|
+
def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):
|
|
232
|
+
"""Execute commands on remote instance."""
|
|
233
|
+
client = paramiko.SSHClient()
|
|
234
|
+
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
|
235
|
+
client.connect(ip, username="ubuntu", key_filename=ssh_key_path)
|
|
236
|
+
|
|
237
|
+
for cmd in commands:
|
|
238
|
+
stdin, stdout, stderr = client.exec_command(cmd)
|
|
239
|
+
print(stdout.read().decode())
|
|
240
|
+
if err := stderr.read():
|
|
241
|
+
print(f"Error: {err.decode()}")
|
|
242
|
+
|
|
243
|
+
client.close()
|
|
244
|
+
|
|
245
|
+
# Submit training job
|
|
246
|
+
commands = [
|
|
247
|
+
"cd /lambda/nfs/storage/project",
|
|
248
|
+
"git pull",
|
|
249
|
+
"pip install -r requirements.txt",
|
|
250
|
+
"nohup torchrun --nproc_per_node=8 train.py > train.log 2>&1 &"
|
|
251
|
+
]
|
|
252
|
+
|
|
253
|
+
run_remote_job(instance["ip"], os.path.expanduser("~/.ssh/lambda_key"), commands)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Monitor training progress
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
def monitor_job(ip: str, ssh_key_path: str, log_file: str = "/lambda/nfs/storage/project/train.log"):
|
|
260
|
+
"""Stream training logs from remote instance."""
|
|
261
|
+
import time
|
|
262
|
+
|
|
263
|
+
client = paramiko.SSHClient()
|
|
264
|
+
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
|
265
|
+
client.connect(ip, username="ubuntu", key_filename=ssh_key_path)
|
|
266
|
+
|
|
267
|
+
# Tail log file
|
|
268
|
+
stdin, stdout, stderr = client.exec_command(f"tail -f {log_file}")
|
|
269
|
+
|
|
270
|
+
try:
|
|
271
|
+
for line in stdout:
|
|
272
|
+
print(line.strip())
|
|
273
|
+
except KeyboardInterrupt:
|
|
274
|
+
pass
|
|
275
|
+
finally:
|
|
276
|
+
client.close()
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## 1-Click Cluster Workflows
|
|
280
|
+
|
|
281
|
+
### Slurm job submission
|
|
282
|
+
|
|
283
|
+
```bash
|
|
284
|
+
#!/bin/bash
|
|
285
|
+
#SBATCH --job-name=llm-training
|
|
286
|
+
#SBATCH --nodes=4
|
|
287
|
+
#SBATCH --ntasks-per-node=1   # one launcher task per node; torchrun spawns the 8 GPU workers
|
|
288
|
+
#SBATCH --gpus-per-node=8
|
|
289
|
+
#SBATCH --time=24:00:00
|
|
290
|
+
#SBATCH --output=logs/%j.out
|
|
291
|
+
#SBATCH --error=logs/%j.err
|
|
292
|
+
|
|
293
|
+
# Set up distributed environment
|
|
294
|
+
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
|
|
295
|
+
export MASTER_PORT=29500
|
|
296
|
+
|
|
297
|
+
# Launch training
|
|
298
|
+
srun torchrun \
|
|
299
|
+
--nnodes=$SLURM_NNODES \
|
|
300
|
+
--nproc_per_node=$SLURM_GPUS_PER_NODE \
|
|
301
|
+
--rdzv_backend=c10d \
|
|
302
|
+
--rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
|
|
303
|
+
train.py \
|
|
304
|
+
--config config.yaml
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
### Interactive cluster session
|
|
308
|
+
|
|
309
|
+
```bash
|
|
310
|
+
# Request interactive session
|
|
311
|
+
srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash
|
|
312
|
+
|
|
313
|
+
# Now on compute node with 8 GPUs
|
|
314
|
+
nvidia-smi
|
|
315
|
+
python train.py
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### Monitoring cluster jobs
|
|
319
|
+
|
|
320
|
+
```bash
|
|
321
|
+
# View job queue
|
|
322
|
+
squeue
|
|
323
|
+
|
|
324
|
+
# View job details
|
|
325
|
+
scontrol show job <JOB_ID>
|
|
326
|
+
|
|
327
|
+
# Cancel job
|
|
328
|
+
scancel <JOB_ID>
|
|
329
|
+
|
|
330
|
+
# View node status
|
|
331
|
+
sinfo
|
|
332
|
+
|
|
333
|
+
# View GPU usage across cluster
|
|
334
|
+
srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## Advanced Filesystem Usage
|
|
338
|
+
|
|
339
|
+
### Data staging workflow
|
|
340
|
+
|
|
341
|
+
```bash
|
|
342
|
+
# Stage data from S3 to filesystem (one-time)
|
|
343
|
+
aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/
|
|
344
|
+
|
|
345
|
+
# Or use rclone
|
|
346
|
+
rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Shared filesystem across instances
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
# Instance 1: Write checkpoints
|
|
353
|
+
checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"
|
|
354
|
+
torch.save(model.state_dict(), checkpoint_path)
|
|
355
|
+
|
|
356
|
+
# Instance 2: Read checkpoints
|
|
357
|
+
model.load_state_dict(torch.load(checkpoint_path))
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### Filesystem best practices
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
# Organize for ML workflows
|
|
364
|
+
/lambda/nfs/storage/
|
|
365
|
+
├── datasets/
|
|
366
|
+
│ ├── raw/ # Original data
|
|
367
|
+
│ └── processed/ # Preprocessed data
|
|
368
|
+
├── models/
|
|
369
|
+
│ ├── pretrained/ # Base models
|
|
370
|
+
│ └── fine-tuned/ # Your trained models
|
|
371
|
+
├── checkpoints/
|
|
372
|
+
│ └── experiment_1/ # Per-experiment checkpoints
|
|
373
|
+
├── logs/
|
|
374
|
+
│ └── tensorboard/ # Training logs
|
|
375
|
+
└── outputs/
|
|
376
|
+
└── inference/ # Inference results
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
## Environment Management
|
|
380
|
+
|
|
381
|
+
### Custom Python environments
|
|
382
|
+
|
|
383
|
+
```bash
|
|
384
|
+
# Don't modify system Python, create venv
|
|
385
|
+
python -m venv ~/myenv
|
|
386
|
+
source ~/myenv/bin/activate
|
|
387
|
+
|
|
388
|
+
# Install packages
|
|
389
|
+
pip install torch transformers accelerate
|
|
390
|
+
|
|
391
|
+
# Save to filesystem for reuse
|
|
392
|
+
cp -r ~/myenv /lambda/nfs/storage/envs/myenv
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
### Conda environments
|
|
396
|
+
|
|
397
|
+
```bash
|
|
398
|
+
# Install miniconda (if not present)
|
|
399
|
+
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
|
|
400
|
+
bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3
|
|
401
|
+
|
|
402
|
+
# Create environment
|
|
403
|
+
~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y
|
|
404
|
+
|
|
405
|
+
# Activate
|
|
406
|
+
source ~/miniconda3/bin/activate ml
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### Docker containers
|
|
410
|
+
|
|
411
|
+
```bash
|
|
412
|
+
# Pull and run NVIDIA container
|
|
413
|
+
docker run --gpus all -it --rm \
|
|
414
|
+
-v /lambda/nfs/storage:/data \
|
|
415
|
+
nvcr.io/nvidia/pytorch:24.01-py3
|
|
416
|
+
|
|
417
|
+
# Run training in container
|
|
418
|
+
docker run --gpus all -d \
|
|
419
|
+
-v /lambda/nfs/storage:/data \
|
|
420
|
+
-v $(pwd):/workspace \
|
|
421
|
+
nvcr.io/nvidia/pytorch:24.01-py3 \
|
|
422
|
+
python /workspace/train.py
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
## Monitoring and Observability
|
|
426
|
+
|
|
427
|
+
### GPU monitoring
|
|
428
|
+
|
|
429
|
+
```bash
|
|
430
|
+
# Real-time GPU stats
|
|
431
|
+
watch -n 1 nvidia-smi
|
|
432
|
+
|
|
433
|
+
# GPU utilization over time
|
|
434
|
+
nvidia-smi dmon -s u -d 1
|
|
435
|
+
|
|
436
|
+
# Detailed GPU info
|
|
437
|
+
nvidia-smi -q
|
|
438
|
+
```
|
|
439
|
+
|
|
440
|
+
### System monitoring
|
|
441
|
+
|
|
442
|
+
```bash
|
|
443
|
+
# CPU and memory
|
|
444
|
+
htop
|
|
445
|
+
|
|
446
|
+
# Disk I/O
|
|
447
|
+
iostat -x 1
|
|
448
|
+
|
|
449
|
+
# Network
|
|
450
|
+
iftop
|
|
451
|
+
|
|
452
|
+
# All resources
|
|
453
|
+
glances
|
|
454
|
+
```
|
|
455
|
+
|
|
456
|
+
### TensorBoard integration
|
|
457
|
+
|
|
458
|
+
```bash
|
|
459
|
+
# Start TensorBoard
|
|
460
|
+
tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all
|
|
461
|
+
|
|
462
|
+
# SSH tunnel from local machine
|
|
463
|
+
ssh -L 6006:localhost:6006 ubuntu@<IP>
|
|
464
|
+
|
|
465
|
+
# Access at http://localhost:6006
|
|
466
|
+
```
|
|
467
|
+
|
|
468
|
+
### Weights & Biases integration
|
|
469
|
+
|
|
470
|
+
```python
|
|
471
|
+
import wandb
|
|
472
|
+
|
|
473
|
+
# Initialize with API key
|
|
474
|
+
wandb.login(key=os.environ["WANDB_API_KEY"])
|
|
475
|
+
|
|
476
|
+
# Start run
|
|
477
|
+
wandb.init(
|
|
478
|
+
project="lambda-training",
|
|
479
|
+
config={"learning_rate": 1e-4, "epochs": 100}
|
|
480
|
+
)
|
|
481
|
+
|
|
482
|
+
# Log metrics
|
|
483
|
+
wandb.log({"loss": loss, "accuracy": acc})
|
|
484
|
+
|
|
485
|
+
# Save artifacts to filesystem + W&B
|
|
486
|
+
wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
## Cost Optimization Strategies
|
|
490
|
+
|
|
491
|
+
### Checkpointing for interruption recovery
|
|
492
|
+
|
|
493
|
+
```python
|
|
494
|
+
import os
|
|
495
|
+
|
|
496
|
+
def save_checkpoint(model, optimizer, epoch, loss, path):
|
|
497
|
+
torch.save({
|
|
498
|
+
'epoch': epoch,
|
|
499
|
+
'model_state_dict': model.state_dict(),
|
|
500
|
+
'optimizer_state_dict': optimizer.state_dict(),
|
|
501
|
+
'loss': loss,
|
|
502
|
+
}, path)
|
|
503
|
+
|
|
504
|
+
def load_checkpoint(path, model, optimizer):
|
|
505
|
+
if os.path.exists(path):
|
|
506
|
+
checkpoint = torch.load(path)
|
|
507
|
+
model.load_state_dict(checkpoint['model_state_dict'])
|
|
508
|
+
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
|
|
509
|
+
return checkpoint['epoch'], checkpoint['loss']
|
|
510
|
+
return 0, float('inf')
|
|
511
|
+
|
|
512
|
+
# Save every N steps to filesystem
|
|
513
|
+
checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"
|
|
514
|
+
if step % 1000 == 0:
|
|
515
|
+
save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)
|
|
516
|
+
```
|
|
517
|
+
|
|
518
|
+
### Instance selection by workload
|
|
519
|
+
|
|
520
|
+
```python
|
|
521
|
+
def recommend_instance(model_params: int, batch_size: int, task: str) -> str:
|
|
522
|
+
"""Recommend Lambda instance based on workload."""
|
|
523
|
+
|
|
524
|
+
if task == "inference":
|
|
525
|
+
if model_params < 7e9:
|
|
526
|
+
return "gpu_1x_a10" # $0.75/hr
|
|
527
|
+
elif model_params < 13e9:
|
|
528
|
+
return "gpu_1x_a6000" # $0.80/hr
|
|
529
|
+
else:
|
|
530
|
+
return "gpu_1x_h100_pcie" # $2.49/hr
|
|
531
|
+
|
|
532
|
+
elif task == "fine-tuning":
|
|
533
|
+
if model_params < 7e9:
|
|
534
|
+
return "gpu_1x_a100" # $1.29/hr
|
|
535
|
+
elif model_params < 13e9:
|
|
536
|
+
return "gpu_4x_a100" # $5.16/hr
|
|
537
|
+
else:
|
|
538
|
+
return "gpu_8x_h100_sxm5" # $23.92/hr
|
|
539
|
+
|
|
540
|
+
elif task == "pretraining":
|
|
541
|
+
return "gpu_8x_h100_sxm5" # Maximum performance
|
|
542
|
+
|
|
543
|
+
return "gpu_1x_a100" # Default
|
|
544
|
+
```
|
|
545
|
+
|
|
546
|
+
### Auto-terminate idle instances
|
|
547
|
+
|
|
548
|
+
```python
|
|
549
|
+
import time
|
|
550
|
+
from datetime import datetime, timedelta
|
|
551
|
+
|
|
552
|
+
def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):
|
|
553
|
+
"""Terminate instances idle for too long."""
|
|
554
|
+
manager = LambdaJobManager(api_key)
|
|
555
|
+
|
|
556
|
+
with lambda_cloud_client.ApiClient(manager.config) as client:
|
|
557
|
+
api = lambda_cloud_client.DefaultApi(client)
|
|
558
|
+
instances = api.list_instances()
|
|
559
|
+
|
|
560
|
+
for instance in instances.data:
|
|
561
|
+
# Check if instance has been running without activity
|
|
562
|
+
# (You'd need to track this separately)
|
|
563
|
+
launch_time = instance.launched_at
|
|
564
|
+
if datetime.now() - launch_time > timedelta(hours=idle_threshold_hours):
|
|
565
|
+
print(f"Terminating idle instance: {instance.id}")
|
|
566
|
+
manager.terminate([instance.id])
|
|
567
|
+
```
|
|
568
|
+
|
|
569
|
+
## Security Best Practices
|
|
570
|
+
|
|
571
|
+
### SSH key rotation
|
|
572
|
+
|
|
573
|
+
```bash
|
|
574
|
+
# Generate new key pair
|
|
575
|
+
ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"
|
|
576
|
+
|
|
577
|
+
# Add new key via Lambda console or API
|
|
578
|
+
# Update authorized_keys on running instances
|
|
579
|
+
ssh ubuntu@<IP> "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"
|
|
580
|
+
|
|
581
|
+
# Test new key
|
|
582
|
+
ssh -i ~/.ssh/lambda_key_new ubuntu@<IP>
|
|
583
|
+
|
|
584
|
+
# Remove old key from Lambda console
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
### Firewall configuration
|
|
588
|
+
|
|
589
|
+
```bash
|
|
590
|
+
# Lambda console: Only open necessary ports
|
|
591
|
+
# Recommended:
|
|
592
|
+
# - 22 (SSH) - Always needed
|
|
593
|
+
# - 6006 (TensorBoard) - If using
|
|
594
|
+
# - 8888 (Jupyter) - If using
|
|
595
|
+
# - 29500 (PyTorch distributed) - For multi-node only
|
|
596
|
+
```
|
|
597
|
+
|
|
598
|
+
### Secrets management
|
|
599
|
+
|
|
600
|
+
```bash
|
|
601
|
+
# Don't hardcode API keys in code
|
|
602
|
+
# Use environment variables
|
|
603
|
+
export HF_TOKEN="hf_..."
|
|
604
|
+
export WANDB_API_KEY="..."
|
|
605
|
+
|
|
606
|
+
# Or use .env file (add to .gitignore)
|
|
607
|
+
source .env
|
|
608
|
+
|
|
609
|
+
# On instance, store in ~/.bashrc
|
|
610
|
+
echo 'export HF_TOKEN="..."' >> ~/.bashrc
|
|
611
|
+
```
|