@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
# PyTorch Lightning Callbacks
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Callbacks add functionality to training without modifying the LightningModule. They capture **non-essential logic** like checkpointing, early stopping, and logging.
|
|
6
|
+
|
|
7
|
+
## Built-In Callbacks
|
|
8
|
+
|
|
9
|
+
### 1. ModelCheckpoint
|
|
10
|
+
|
|
11
|
+
**Saves best models during training**:
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from lightning.pytorch.callbacks import ModelCheckpoint
|
|
15
|
+
|
|
16
|
+
# Save top 3 models based on validation loss
|
|
17
|
+
checkpoint = ModelCheckpoint(
|
|
18
|
+
dirpath='checkpoints/',
|
|
19
|
+
filename='model-{epoch:02d}-{val_loss:.2f}',
|
|
20
|
+
monitor='val_loss',
|
|
21
|
+
mode='min',
|
|
22
|
+
save_top_k=3,
|
|
23
|
+
save_last=True, # Also save last epoch
|
|
24
|
+
verbose=True
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
trainer = L.Trainer(callbacks=[checkpoint])
|
|
28
|
+
trainer.fit(model, train_loader, val_loader)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
**Configuration options**:
|
|
32
|
+
```python
|
|
33
|
+
checkpoint = ModelCheckpoint(
|
|
34
|
+
monitor='val_acc', # Metric to monitor
|
|
35
|
+
mode='max', # 'max' for accuracy, 'min' for loss
|
|
36
|
+
save_top_k=5, # Keep best 5 models
|
|
37
|
+
save_last=True, # Save last epoch separately
|
|
38
|
+
every_n_epochs=1, # Save every N epochs
|
|
39
|
+
save_on_train_epoch_end=False, # Save on validation end instead
|
|
40
|
+
filename='best-{epoch}-{val_acc:.3f}', # Naming pattern
|
|
41
|
+
auto_insert_metric_name=False # Don't auto-add metric to filename
|
|
42
|
+
)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
**Load checkpoint**:
|
|
46
|
+
```python
|
|
47
|
+
# Load best model
|
|
48
|
+
best_model_path = checkpoint.best_model_path
|
|
49
|
+
model = LitModel.load_from_checkpoint(best_model_path)
|
|
50
|
+
|
|
51
|
+
# Resume training
|
|
52
|
+
trainer = L.Trainer(callbacks=[checkpoint])
|
|
53
|
+
trainer.fit(model, train_loader, val_loader, ckpt_path='checkpoints/last.ckpt')
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 2. EarlyStopping
|
|
57
|
+
|
|
58
|
+
**Stops training when metric stops improving**:
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from lightning.pytorch.callbacks import EarlyStopping
|
|
62
|
+
|
|
63
|
+
early_stop = EarlyStopping(
|
|
64
|
+
monitor='val_loss',
|
|
65
|
+
patience=5, # Wait 5 validation checks with no improvement
|
|
66
|
+
mode='min',
|
|
67
|
+
min_delta=0.001, # Minimum change to qualify as improvement
|
|
68
|
+
verbose=True,
|
|
69
|
+
strict=True, # Crash if monitored metric not found
|
|
70
|
+
check_on_train_epoch_end=False # Check on validation end
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
trainer = L.Trainer(callbacks=[early_stop])
|
|
74
|
+
trainer.fit(model, train_loader, val_loader)
|
|
75
|
+
# Stops automatically after 5 consecutive validation checks without improvement (5 epochs if validating once per epoch)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
**Advanced usage**:
|
|
79
|
+
```python
|
|
80
|
+
early_stop = EarlyStopping(
|
|
81
|
+
monitor='val_loss',
|
|
82
|
+
patience=10,
|
|
83
|
+
min_delta=0.0,
|
|
84
|
+
verbose=True,
|
|
85
|
+
mode='min',
|
|
86
|
+
stopping_threshold=0.1, # Stop if val_loss < 0.1
|
|
87
|
+
divergence_threshold=5.0, # Stop if val_loss > 5.0
|
|
88
|
+
check_finite=True # Stop on NaN/Inf
|
|
89
|
+
)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### 3. LearningRateMonitor
|
|
93
|
+
|
|
94
|
+
**Logs learning rate**:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from lightning.pytorch.callbacks import LearningRateMonitor
|
|
98
|
+
|
|
99
|
+
lr_monitor = LearningRateMonitor(
|
|
100
|
+
logging_interval='epoch', # Or 'step'
|
|
101
|
+
log_momentum=True # Also log momentum
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
trainer = L.Trainer(callbacks=[lr_monitor])
|
|
105
|
+
# Learning rate automatically logged to TensorBoard/WandB
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 4. TQDMProgressBar
|
|
109
|
+
|
|
110
|
+
**Customizes progress bar**:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from lightning.pytorch.callbacks import TQDMProgressBar
|
|
114
|
+
|
|
115
|
+
progress_bar = TQDMProgressBar(
|
|
116
|
+
refresh_rate=10, # Update every 10 batches
|
|
117
|
+
process_position=0
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
trainer = L.Trainer(callbacks=[progress_bar])
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 5. GradientAccumulationScheduler
|
|
124
|
+
|
|
125
|
+
**Dynamic gradient accumulation**:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from lightning.pytorch.callbacks import GradientAccumulationScheduler
|
|
129
|
+
|
|
130
|
+
# Accumulate fewer batches as training progresses (8 -> 4 -> 2)
|
|
131
|
+
accumulator = GradientAccumulationScheduler(
|
|
132
|
+
scheduling={
|
|
133
|
+
0: 8, # Epochs 0-4: accumulate 8 batches
|
|
134
|
+
5: 4, # Epochs 5-9: accumulate 4 batches
|
|
135
|
+
10: 2 # Epochs 10+: accumulate 2 batches
|
|
136
|
+
}
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
trainer = L.Trainer(callbacks=[accumulator])
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### 6. StochasticWeightAveraging (SWA)
|
|
143
|
+
|
|
144
|
+
**Averages weights for better generalization**:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from lightning.pytorch.callbacks import StochasticWeightAveraging
|
|
148
|
+
|
|
149
|
+
swa = StochasticWeightAveraging(
|
|
150
|
+
swa_lrs=1e-2, # SWA learning rate
|
|
151
|
+
swa_epoch_start=0.8, # Start at 80% of training
|
|
152
|
+
annealing_epochs=10, # Annealing period
|
|
153
|
+
annealing_strategy='cos' # 'cos' or 'linear'
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
trainer = L.Trainer(callbacks=[swa])
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Custom Callbacks
|
|
160
|
+
|
|
161
|
+
### Basic Custom Callback
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from lightning.pytorch.callbacks import Callback
|
|
165
|
+
|
|
166
|
+
class PrintingCallback(Callback):
|
|
167
|
+
def on_train_start(self, trainer, pl_module):
|
|
168
|
+
print("Training is starting!")
|
|
169
|
+
|
|
170
|
+
def on_train_end(self, trainer, pl_module):
|
|
171
|
+
print("Training is done!")
|
|
172
|
+
|
|
173
|
+
def on_train_epoch_end(self, trainer, pl_module):
|
|
174
|
+
print(f"Epoch {trainer.current_epoch} ended")
|
|
175
|
+
|
|
176
|
+
# Use it
|
|
177
|
+
trainer = L.Trainer(callbacks=[PrintingCallback()])
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Advanced Custom Callback
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
class MetricsCallback(Callback):
|
|
184
|
+
"""Logs custom metrics every N batches."""
|
|
185
|
+
|
|
186
|
+
def __init__(self, log_every_n_batches=100):
|
|
187
|
+
self.log_every_n_batches = log_every_n_batches
|
|
188
|
+
self.metrics = []
|
|
189
|
+
|
|
190
|
+
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
191
|
+
if batch_idx % self.log_every_n_batches == 0:
|
|
192
|
+
# Compute custom metric
|
|
193
|
+
metric = self.compute_metric(outputs)
|
|
194
|
+
self.metrics.append(metric)
|
|
195
|
+
|
|
196
|
+
# Log to Lightning
|
|
197
|
+
pl_module.log('custom_metric', metric)
|
|
198
|
+
|
|
199
|
+
def compute_metric(self, outputs):
|
|
200
|
+
# Your custom logic
|
|
201
|
+
return outputs['loss'].item()
|
|
202
|
+
|
|
203
|
+
def state_dict(self):
|
|
204
|
+
"""Save callback state in checkpoint."""
|
|
205
|
+
return {'metrics': self.metrics}
|
|
206
|
+
|
|
207
|
+
def load_state_dict(self, state_dict):
|
|
208
|
+
"""Restore callback state from checkpoint."""
|
|
209
|
+
self.metrics = state_dict['metrics']
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Gradient Monitoring Callback
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
class GradientMonitorCallback(Callback):
|
|
216
|
+
"""Monitor gradient norms."""
|
|
217
|
+
|
|
218
|
+
def on_after_backward(self, trainer, pl_module):
|
|
219
|
+
# Compute gradient norm
|
|
220
|
+
total_norm = 0.0
|
|
221
|
+
for p in pl_module.parameters():
|
|
222
|
+
if p.grad is not None:
|
|
223
|
+
param_norm = p.grad.data.norm(2)
|
|
224
|
+
total_norm += param_norm.item() ** 2
|
|
225
|
+
total_norm = total_norm ** 0.5
|
|
226
|
+
|
|
227
|
+
# Log
|
|
228
|
+
pl_module.log('grad_norm', total_norm)
|
|
229
|
+
|
|
230
|
+
# Warn if exploding
|
|
231
|
+
if total_norm > 100:
|
|
232
|
+
print(f"Warning: Large gradient norm: {total_norm:.2f}")
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Model Inspection Callback
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
class ModelInspectionCallback(Callback):
|
|
239
|
+
"""Inspect model activations during training."""
|
|
240
|
+
|
|
241
|
+
def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
|
|
242
|
+
if batch_idx == 0: # First batch of epoch
|
|
243
|
+
# Register hooks
|
|
244
|
+
self.activations = {}
|
|
245
|
+
|
|
246
|
+
def get_activation(name):
|
|
247
|
+
def hook(model, input, output):
|
|
248
|
+
self.activations[name] = output.detach()
|
|
249
|
+
return hook
|
|
250
|
+
|
|
251
|
+
# Attach to specific layers
|
|
252
|
+
pl_module.model.layer1.register_forward_hook(get_activation('layer1'))
|
|
253
|
+
pl_module.model.layer2.register_forward_hook(get_activation('layer2'))
|
|
254
|
+
|
|
255
|
+
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
256
|
+
if batch_idx == 0:
|
|
257
|
+
# Log activation statistics
|
|
258
|
+
for name, activation in self.activations.items():
|
|
259
|
+
mean = activation.mean().item()
|
|
260
|
+
std = activation.std().item()
|
|
261
|
+
pl_module.log(f'{name}_mean', mean)
|
|
262
|
+
pl_module.log(f'{name}_std', std)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## Callback Hooks
|
|
266
|
+
|
|
267
|
+
**Commonly used hooks** (see the Callback API reference for the complete list):
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
class MyCallback(Callback):
|
|
271
|
+
# Setup/Teardown
|
|
272
|
+
def setup(self, trainer, pl_module, stage):
|
|
273
|
+
"""Called at beginning of fit/test/predict."""
|
|
274
|
+
pass
|
|
275
|
+
|
|
276
|
+
def teardown(self, trainer, pl_module, stage):
|
|
277
|
+
"""Called at end of fit/test/predict."""
|
|
278
|
+
pass
|
|
279
|
+
|
|
280
|
+
# Training
|
|
281
|
+
def on_train_start(self, trainer, pl_module):
|
|
282
|
+
pass
|
|
283
|
+
|
|
284
|
+
def on_train_epoch_start(self, trainer, pl_module):
|
|
285
|
+
pass
|
|
286
|
+
|
|
287
|
+
def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
|
|
288
|
+
pass
|
|
289
|
+
|
|
290
|
+
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
291
|
+
pass
|
|
292
|
+
|
|
293
|
+
def on_train_epoch_end(self, trainer, pl_module):
|
|
294
|
+
pass
|
|
295
|
+
|
|
296
|
+
def on_train_end(self, trainer, pl_module):
|
|
297
|
+
pass
|
|
298
|
+
|
|
299
|
+
# Validation
|
|
300
|
+
def on_validation_start(self, trainer, pl_module):
|
|
301
|
+
pass
|
|
302
|
+
|
|
303
|
+
def on_validation_epoch_start(self, trainer, pl_module):
|
|
304
|
+
pass
|
|
305
|
+
|
|
306
|
+
def on_validation_batch_start(self, trainer, pl_module, batch, batch_idx, dataloader_idx=0):
|
|
307
|
+
pass
|
|
308
|
+
|
|
309
|
+
def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx=0):
|
|
310
|
+
pass
|
|
311
|
+
|
|
312
|
+
def on_validation_epoch_end(self, trainer, pl_module):
|
|
313
|
+
pass
|
|
314
|
+
|
|
315
|
+
def on_validation_end(self, trainer, pl_module):
|
|
316
|
+
pass
|
|
317
|
+
|
|
318
|
+
# Test (same structure as validation)
|
|
319
|
+
def on_test_start(self, trainer, pl_module):
|
|
320
|
+
pass
|
|
321
|
+
# ... (on_test_epoch_start, on_test_batch_start, etc.)
|
|
322
|
+
|
|
323
|
+
# Predict
|
|
324
|
+
def on_predict_start(self, trainer, pl_module):
|
|
325
|
+
pass
|
|
326
|
+
# ... (on_predict_epoch_start, on_predict_batch_start, etc.)
|
|
327
|
+
|
|
328
|
+
# Backward
|
|
329
|
+
def on_before_backward(self, trainer, pl_module, loss):
|
|
330
|
+
pass
|
|
331
|
+
|
|
332
|
+
def on_after_backward(self, trainer, pl_module):
|
|
333
|
+
pass
|
|
334
|
+
|
|
335
|
+
# Optimizer
|
|
336
|
+
def on_before_optimizer_step(self, trainer, pl_module, optimizer):
|
|
337
|
+
pass
|
|
338
|
+
|
|
339
|
+
# Checkpointing
|
|
340
|
+
def on_save_checkpoint(self, trainer, pl_module, checkpoint):
|
|
341
|
+
"""Add data to checkpoint."""
|
|
342
|
+
pass
|
|
343
|
+
|
|
344
|
+
def on_load_checkpoint(self, trainer, pl_module, checkpoint):
|
|
345
|
+
"""Restore data from checkpoint."""
|
|
346
|
+
pass
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
## Combining Multiple Callbacks
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
|
|
353
|
+
|
|
354
|
+
# Create all callbacks
|
|
355
|
+
checkpoint = ModelCheckpoint(monitor='val_loss', mode='min', save_top_k=3)
|
|
356
|
+
early_stop = EarlyStopping(monitor='val_loss', patience=5)
|
|
357
|
+
lr_monitor = LearningRateMonitor(logging_interval='epoch')
|
|
358
|
+
custom_callback = MyCustomCallback()
|
|
359
|
+
|
|
360
|
+
# Add all to Trainer
|
|
361
|
+
trainer = L.Trainer(
|
|
362
|
+
callbacks=[checkpoint, early_stop, lr_monitor, custom_callback]
|
|
363
|
+
)
|
|
364
|
+
|
|
365
|
+
trainer.fit(model, train_loader, val_loader)
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
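**Execution order**: Callbacks execute in the order they're added, except `ModelCheckpoint` callbacks, which Lightning always moves to the end of the list so they run last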
|
|
369
|
+
|
|
370
|
+
## Best Practices
|
|
371
|
+
|
|
372
|
+
### 1. Keep Callbacks Independent
|
|
373
|
+
|
|
374
|
+
**Bad** (dependent on other callback):
|
|
375
|
+
```python
|
|
376
|
+
class BadCallback(Callback):
|
|
377
|
+
def on_train_end(self, trainer, pl_module):
|
|
378
|
+
# Assumes ModelCheckpoint is present
|
|
379
|
+
best_path = trainer.checkpoint_callback.best_model_path # Fragile!
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
**Good** (self-contained):
|
|
383
|
+
```python
|
|
384
|
+
class GoodCallback(Callback):
|
|
385
|
+
def on_train_end(self, trainer, pl_module):
|
|
386
|
+
# Find checkpoint callback if present
|
|
387
|
+
for callback in trainer.callbacks:
|
|
388
|
+
if isinstance(callback, ModelCheckpoint):
|
|
389
|
+
best_path = callback.best_model_path
|
|
390
|
+
break
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
### 2. Use State Dict for Persistence
|
|
394
|
+
|
|
395
|
+
```python
|
|
396
|
+
class StatefulCallback(Callback):
|
|
397
|
+
def __init__(self):
|
|
398
|
+
self.counter = 0
|
|
399
|
+
self.history = []
|
|
400
|
+
|
|
401
|
+
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
402
|
+
self.counter += 1
|
|
403
|
+
self.history.append(outputs['loss'].item())
|
|
404
|
+
|
|
405
|
+
def state_dict(self):
|
|
406
|
+
"""Save state."""
|
|
407
|
+
return {
|
|
408
|
+
'counter': self.counter,
|
|
409
|
+
'history': self.history
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
def load_state_dict(self, state_dict):
|
|
413
|
+
"""Restore state."""
|
|
414
|
+
self.counter = state_dict['counter']
|
|
415
|
+
self.history = state_dict['history']
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
### 3. Handle Distributed Training
|
|
419
|
+
|
|
420
|
+
```python
|
|
421
|
+
class DistributedCallback(Callback):
|
|
422
|
+
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
|
|
423
|
+
# Only run on main process
|
|
424
|
+
if trainer.is_global_zero:
|
|
425
|
+
print("This only prints once in distributed training")
|
|
426
|
+
|
|
427
|
+
# Run on all processes
|
|
428
|
+
loss = outputs['loss']
|
|
429
|
+
# ... do something with loss on each GPU
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
## Resources
|
|
433
|
+
|
|
434
|
+
- Callback API: https://lightning.ai/docs/pytorch/stable/extensions/callbacks.html
|
|
435
|
+
- Built-in callbacks: https://lightning.ai/docs/pytorch/stable/api_references.html#callbacks
|
|
436
|
+
- Examples: https://github.com/Lightning-AI/pytorch-lightning/tree/master/examples/callbacks
|