@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,568 @@
|
|
|
1
|
+
# Custom Models
|
|
2
|
+
|
|
3
|
+
Guide to implementing custom model architectures in LitGPT.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
LitGPT's clean, single-file implementations make it easy to create custom architectures. You can extend the base `GPT` class or create entirely new models.
|
|
8
|
+
|
|
9
|
+
**Use cases**:
|
|
10
|
+
- Implementing new research architectures
|
|
11
|
+
- Adapting models for specific domains
|
|
12
|
+
- Experimenting with attention mechanisms
|
|
13
|
+
- Adding custom layers or components
|
|
14
|
+
|
|
15
|
+
## Key Files and Classes
|
|
16
|
+
|
|
17
|
+
### Core Architecture (`litgpt/model.py`)
|
|
18
|
+
|
|
19
|
+
**Main classes**:
|
|
20
|
+
- `GPT`: Top-level model class
|
|
21
|
+
- `Block`: Transformer block (attention + MLP)
|
|
22
|
+
- `CausalSelfAttention`: Attention mechanism
|
|
23
|
+
- `MLP`: Feed-forward network
|
|
24
|
+
- `RMSNorm` / `LayerNorm`: Normalization layers
|
|
25
|
+
|
|
26
|
+
**Configuration** (`litgpt/config.py`):
|
|
27
|
+
- `Config`: Base configuration dataclass
|
|
28
|
+
- Model-specific configs: `LlamaConfig`, `MistralConfig`, `PhiConfig`, etc.
|
|
29
|
+
|
|
30
|
+
## Custom Architecture Workflow
|
|
31
|
+
|
|
32
|
+
### Step 1: Define Configuration
|
|
33
|
+
|
|
34
|
+
Create a `Config` dataclass with your model's hyperparameters:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from dataclasses import dataclass
|
|
38
|
+
from litgpt.config import Config
|
|
39
|
+
|
|
40
|
+
@dataclass
|
|
41
|
+
class MyModelConfig(Config):
|
|
42
|
+
"""Configuration for my custom model."""
|
|
43
|
+
# Standard parameters
|
|
44
|
+
name: str = "my-model-7b"
|
|
45
|
+
block_size: int = 4096
|
|
46
|
+
vocab_size: int = 32000
|
|
47
|
+
n_layer: int = 32
|
|
48
|
+
n_head: int = 32
|
|
49
|
+
n_embd: int = 4096
|
|
50
|
+
|
|
51
|
+
# Custom parameters
|
|
52
|
+
custom_param: float = 0.1
|
|
53
|
+
use_custom_attention: bool = True
|
|
54
|
+
|
|
55
|
+
# Optional: override defaults
|
|
56
|
+
rope_base: int = 10000
|
|
57
|
+
intermediate_size: int = 11008
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Step 2: Implement Custom Components
|
|
61
|
+
|
|
62
|
+
#### Option A: Custom Attention
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from litgpt.model import CausalSelfAttention
|
|
66
|
+
import torch
|
|
67
|
+
import torch.nn as nn
|
|
68
|
+
|
|
69
|
+
class CustomAttention(CausalSelfAttention):
|
|
70
|
+
"""Custom attention mechanism."""
|
|
71
|
+
|
|
72
|
+
def __init__(self, config):
|
|
73
|
+
super().__init__(config)
|
|
74
|
+
# Add custom components
|
|
75
|
+
self.custom_proj = nn.Linear(config.n_embd, config.n_embd)
|
|
76
|
+
self.custom_param = config.custom_param
|
|
77
|
+
|
|
78
|
+
def forward(self, x, mask=None, input_pos=None):
|
|
79
|
+
B, T, C = x.size()
|
|
80
|
+
|
|
81
|
+
# Standard Q, K, V projections
|
|
82
|
+
q = self.attn(x)
|
|
83
|
+
k = self.attn(x)
|
|
84
|
+
v = self.attn(x)
|
|
85
|
+
|
|
86
|
+
# Custom modification
|
|
87
|
+
q = q + self.custom_proj(x) * self.custom_param
|
|
88
|
+
|
|
89
|
+
# Rest of attention computation
|
|
90
|
+
q = q.view(B, T, self.n_head, self.head_size)
|
|
91
|
+
k = k.view(B, T, self.n_query_groups, self.head_size)
|
|
92
|
+
v = v.view(B, T, self.n_query_groups, self.head_size)
|
|
93
|
+
|
|
94
|
+
# Scaled dot-product attention
|
|
95
|
+
y = self.scaled_dot_product_attention(q, k, v, mask=mask)
|
|
96
|
+
|
|
97
|
+
y = y.reshape(B, T, C)
|
|
98
|
+
return self.proj(y)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
#### Option B: Custom MLP
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from litgpt.model import MLP
|
|
105
|
+
|
|
106
|
+
class CustomMLP(MLP):
|
|
107
|
+
"""Custom feed-forward network."""
|
|
108
|
+
|
|
109
|
+
def __init__(self, config):
|
|
110
|
+
super().__init__(config)
|
|
111
|
+
# Add custom layers
|
|
112
|
+
self.custom_layer = nn.Linear(config.intermediate_size, config.intermediate_size)
|
|
113
|
+
|
|
114
|
+
def forward(self, x):
|
|
115
|
+
x = self.fc_1(x)
|
|
116
|
+
x = self.act(x)
|
|
117
|
+
x = self.custom_layer(x) # Custom modification
|
|
118
|
+
x = self.fc_2(x)
|
|
119
|
+
return x
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
#### Option C: Custom Block
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from litgpt.model import Block
|
|
126
|
+
|
|
127
|
+
class CustomBlock(Block):
|
|
128
|
+
"""Custom transformer block."""
|
|
129
|
+
|
|
130
|
+
def __init__(self, config):
|
|
131
|
+
super().__init__(config)
|
|
132
|
+
# Replace attention or MLP
|
|
133
|
+
self.attn = CustomAttention(config)
|
|
134
|
+
# Or: self.mlp = CustomMLP(config)
|
|
135
|
+
|
|
136
|
+
# Add custom components
|
|
137
|
+
self.custom_norm = nn.LayerNorm(config.n_embd)
|
|
138
|
+
|
|
139
|
+
def forward(self, x, input_pos=None, mask=None):
|
|
140
|
+
# Custom forward pass
|
|
141
|
+
h = self.norm_1(x)
|
|
142
|
+
h = self.attn(h, mask=mask, input_pos=input_pos)
|
|
143
|
+
x = x + h
|
|
144
|
+
|
|
145
|
+
# Custom normalization
|
|
146
|
+
x = x + self.custom_norm(x)
|
|
147
|
+
|
|
148
|
+
x = x + self.mlp(self.norm_2(x))
|
|
149
|
+
return x
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### Step 3: Create Custom GPT Model
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
from litgpt.model import GPT
|
|
156
|
+
import torch.nn as nn
|
|
157
|
+
|
|
158
|
+
class CustomGPT(GPT):
|
|
159
|
+
"""Custom GPT model."""
|
|
160
|
+
|
|
161
|
+
def __init__(self, config: MyModelConfig):
|
|
162
|
+
# Don't call super().__init__() - we reimplement
|
|
163
|
+
nn.Module.__init__(self)
|
|
164
|
+
self.config = config
|
|
165
|
+
|
|
166
|
+
# Standard components
|
|
167
|
+
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
|
168
|
+
self.transformer = nn.ModuleDict(
|
|
169
|
+
dict(
|
|
170
|
+
wte=nn.Embedding(config.vocab_size, config.n_embd),
|
|
171
|
+
h=nn.ModuleList(CustomBlock(config) for _ in range(config.n_layer)),
|
|
172
|
+
ln_f=nn.LayerNorm(config.n_embd),
|
|
173
|
+
)
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# Custom components
|
|
177
|
+
if config.use_custom_attention:
|
|
178
|
+
self.custom_embedding = nn.Linear(config.n_embd, config.n_embd)
|
|
179
|
+
|
|
180
|
+
# Initialize weights
|
|
181
|
+
self.apply(self._init_weights)
|
|
182
|
+
|
|
183
|
+
def _init_weights(self, module):
|
|
184
|
+
"""Initialize weights (required)."""
|
|
185
|
+
if isinstance(module, nn.Linear):
|
|
186
|
+
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
|
187
|
+
if module.bias is not None:
|
|
188
|
+
torch.nn.init.zeros_(module.bias)
|
|
189
|
+
elif isinstance(module, nn.Embedding):
|
|
190
|
+
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
|
191
|
+
|
|
192
|
+
def forward(self, idx, input_pos=None):
|
|
193
|
+
"""Forward pass (must match base signature)."""
|
|
194
|
+
B, T = idx.size()
|
|
195
|
+
|
|
196
|
+
# Token embeddings
|
|
197
|
+
x = self.transformer.wte(idx)
|
|
198
|
+
|
|
199
|
+
# Custom embedding modification
|
|
200
|
+
if self.config.use_custom_attention:
|
|
201
|
+
x = x + self.custom_embedding(x)
|
|
202
|
+
|
|
203
|
+
# Transformer blocks
|
|
204
|
+
for block in self.transformer.h:
|
|
205
|
+
x = block(x, input_pos=input_pos)
|
|
206
|
+
|
|
207
|
+
# Final norm + LM head
|
|
208
|
+
x = self.transformer.ln_f(x)
|
|
209
|
+
return self.lm_head(x)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Step 4: Register Configuration
|
|
213
|
+
|
|
214
|
+
Add your config to `litgpt/config.py`:
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
# In litgpt/config.py
|
|
218
|
+
configs = [
|
|
219
|
+
# ... existing configs ...
|
|
220
|
+
|
|
221
|
+
# My custom model
|
|
222
|
+
dict(
|
|
223
|
+
name="my-model-7b",
|
|
224
|
+
hf_config=dict(org="myorg", name="my-model-7b"),
|
|
225
|
+
block_size=4096,
|
|
226
|
+
vocab_size=32000,
|
|
227
|
+
n_layer=32,
|
|
228
|
+
n_head=32,
|
|
229
|
+
n_embd=4096,
|
|
230
|
+
custom_param=0.1,
|
|
231
|
+
),
|
|
232
|
+
]
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### Step 5: Use Your Custom Model
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
from litgpt.api import LLM
|
|
239
|
+
from my_model import CustomGPT, MyModelConfig
|
|
240
|
+
|
|
241
|
+
# Initialize
|
|
242
|
+
config = MyModelConfig()
|
|
243
|
+
model = CustomGPT(config)
|
|
244
|
+
|
|
245
|
+
# Wrap with LLM API
|
|
246
|
+
llm = LLM(model=model, tokenizer_dir="path/to/tokenizer")
|
|
247
|
+
|
|
248
|
+
# Generate
|
|
249
|
+
result = llm.generate("Once upon a time", max_new_tokens=100)
|
|
250
|
+
print(result)
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## Real Example: Adapter Fine-tuning
|
|
254
|
+
|
|
255
|
+
LitGPT's `Adapter` implementation shows a complete custom architecture:
|
|
256
|
+
|
|
257
|
+
### Adapter Configuration
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
@dataclass
|
|
261
|
+
class Config(BaseConfig):
|
|
262
|
+
"""Adds adapter-specific parameters."""
|
|
263
|
+
adapter_prompt_length: int = 10
|
|
264
|
+
adapter_start_layer: int = 2
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
### Adapter GPT Model
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
class GPT(BaseModel):
|
|
271
|
+
"""GPT model with adapter layers."""
|
|
272
|
+
|
|
273
|
+
def __init__(self, config: Config):
|
|
274
|
+
nn.Module.__init__(self)
|
|
275
|
+
self.config = config
|
|
276
|
+
|
|
277
|
+
# Standard components
|
|
278
|
+
self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=False)
|
|
279
|
+
self.transformer = nn.ModuleDict(
|
|
280
|
+
dict(
|
|
281
|
+
wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
|
|
282
|
+
h=nn.ModuleList(Block(config, i) for i in range(config.n_layer)),
|
|
283
|
+
ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
|
|
284
|
+
)
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
# Adapter-specific: learnable gating factor, initialized to zero so the
# adapter path contributes nothing at the start of training
|
|
288
|
+
self.gating_factor = torch.nn.Parameter(torch.zeros(1))
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
### Adapter Block
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
class Block(BaseBlock):
|
|
295
|
+
"""Transformer block with adapter."""
|
|
296
|
+
|
|
297
|
+
def __init__(self, config: Config, block_idx: int):
|
|
298
|
+
super().__init__()
|
|
299
|
+
self.norm_1 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
|
300
|
+
self.attn = CausalSelfAttention(config, block_idx)
|
|
301
|
+
self.norm_2 = config.norm_class(config.n_embd, eps=config.norm_eps)
|
|
302
|
+
self.mlp = config.mlp_class(config)
|
|
303
|
+
|
|
304
|
+
# Adapter: add prefix for certain layers
|
|
305
|
+
self.adapter_wte = (
|
|
306
|
+
nn.Embedding(config.adapter_prompt_length, config.n_embd)
|
|
307
|
+
if block_idx >= config.adapter_start_layer
|
|
308
|
+
else None
|
|
309
|
+
)
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
### Adapter Attention
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
class CausalSelfAttention(BaseCausalSelfAttention):
|
|
316
|
+
"""Attention with adapter prompts."""
|
|
317
|
+
|
|
318
|
+
def forward(self, x: torch.Tensor, ...) -> torch.Tensor:
|
|
319
|
+
B, T, C = x.size()
|
|
320
|
+
|
|
321
|
+
# Add adapter prefix if enabled
|
|
322
|
+
if self.adapter_wte is not None:
|
|
323
|
+
adapter_prompts = self.adapter_wte(
|
|
324
|
+
torch.arange(self.adapter_prompt_length, device=x.device)
|
|
325
|
+
)
|
|
326
|
+
adapter_prompts = adapter_prompts.unsqueeze(0).expand(B, -1, -1)
|
|
327
|
+
x = torch.cat([adapter_prompts, x], dim=1)
|
|
328
|
+
|
|
329
|
+
# Standard attention with gating
|
|
330
|
+
q, k, v = self.attn(x).split(self.n_embd, dim=2)
|
|
331
|
+
y = self.scaled_dot_product_attention(q, k, v, mask=mask)
|
|
332
|
+
|
|
333
|
+
# Scale the adapter-influenced output by the learnable gating factor
# (NOTE: this condensed example defines the factor on the GPT model above;
# the full implementation stores it per-attention-module)
|
|
334
|
+
y = y * self.gating_factor
|
|
335
|
+
|
|
336
|
+
return self.proj(y)
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
See full implementation: `litgpt/finetune/adapter.py`
|
|
340
|
+
|
|
341
|
+
## Real Example: AdapterV2
|
|
342
|
+
|
|
343
|
+
AdapterV2 demonstrates how to replace standard linear layers with custom ones that add a trainable low-rank path:
|
|
344
|
+
|
|
345
|
+
### AdapterV2Linear
|
|
346
|
+
|
|
347
|
+
```python
|
|
348
|
+
class AdapterV2Linear(torch.nn.Module):
|
|
349
|
+
"""Linear layer with low-rank adapter."""
|
|
350
|
+
|
|
351
|
+
def __init__(self, in_features, out_features, adapter_rank=8, **kwargs):
|
|
352
|
+
super().__init__()
|
|
353
|
+
self.linear = torch.nn.Linear(in_features, out_features, **kwargs)
|
|
354
|
+
|
|
355
|
+
# Adapter: low-rank bottleneck
|
|
356
|
+
self.adapter_down = torch.nn.Linear(in_features, adapter_rank, bias=False)
|
|
357
|
+
self.adapter_up = torch.nn.Linear(adapter_rank, out_features, bias=False)
|
|
358
|
+
|
|
359
|
+
# Zero-init the up-projection so the adapter's added contribution starts
# at zero — the layer initially behaves exactly like the plain linear layer
|
|
360
|
+
torch.nn.init.zeros_(self.adapter_up.weight)
|
|
361
|
+
|
|
362
|
+
def forward(self, x):
|
|
363
|
+
# Original linear transformation
|
|
364
|
+
out = self.linear(x)
|
|
365
|
+
|
|
366
|
+
# Add adapter contribution
|
|
367
|
+
adapter_out = self.adapter_up(self.adapter_down(x))
|
|
368
|
+
return out + adapter_out
|
|
369
|
+
```
|
|
370
|
+
|
|
371
|
+
See full implementation: `litgpt/finetune/adapter_v2.py`
|
|
372
|
+
|
|
373
|
+
## Custom Model Checklist
|
|
374
|
+
|
|
375
|
+
- [ ] Define `Config` dataclass with all hyperparameters
|
|
376
|
+
- [ ] Implement custom components (Attention, MLP, Block)
|
|
377
|
+
- [ ] Create custom `GPT` class
|
|
378
|
+
- [ ] Implement `_init_weights()` for proper initialization
|
|
379
|
+
- [ ] Implement `forward()` matching base signature
|
|
380
|
+
- [ ] Register configuration in `litgpt/config.py`
|
|
381
|
+
- [ ] Test with small model (100M params) first
|
|
382
|
+
- [ ] Verify training convergence
|
|
383
|
+
- [ ] Profile memory usage
|
|
384
|
+
|
|
385
|
+
## Testing Your Custom Model
|
|
386
|
+
|
|
387
|
+
### Unit Test
|
|
388
|
+
|
|
389
|
+
```python
|
|
390
|
+
import torch
|
|
391
|
+
from my_model import CustomGPT, MyModelConfig
|
|
392
|
+
|
|
393
|
+
def test_custom_model():
|
|
394
|
+
"""Test custom model forward pass."""
|
|
395
|
+
config = MyModelConfig(
|
|
396
|
+
n_layer=2,
|
|
397
|
+
n_head=4,
|
|
398
|
+
n_embd=128,
|
|
399
|
+
vocab_size=1000,
|
|
400
|
+
block_size=256,
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
model = CustomGPT(config)
|
|
404
|
+
model.eval()
|
|
405
|
+
|
|
406
|
+
# Test forward pass
|
|
407
|
+
batch_size = 2
|
|
408
|
+
seq_length = 16
|
|
409
|
+
idx = torch.randint(0, config.vocab_size, (batch_size, seq_length))
|
|
410
|
+
|
|
411
|
+
with torch.no_grad():
|
|
412
|
+
logits = model(idx)
|
|
413
|
+
|
|
414
|
+
assert logits.shape == (batch_size, seq_length, config.vocab_size)
|
|
415
|
+
print("✓ Forward pass works")
|
|
416
|
+
|
|
417
|
+
if __name__ == "__main__":
|
|
418
|
+
test_custom_model()
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
### Training Test
|
|
422
|
+
|
|
423
|
+
```python
|
|
424
|
+
from litgpt.api import LLM
|
|
425
|
+
|
|
426
|
+
def test_training():
|
|
427
|
+
"""Test custom model training."""
|
|
428
|
+
config = MyModelConfig(n_layer=2, n_head=4, n_embd=128)
|
|
429
|
+
model = CustomGPT(config)
|
|
430
|
+
|
|
431
|
+
# Small dataset for testing
|
|
432
|
+
data = [
|
|
433
|
+
{"instruction": "Test", "input": "", "output": "OK"}
|
|
434
|
+
]
|
|
435
|
+
|
|
436
|
+
# Should run without errors
|
|
437
|
+
llm = LLM(model=model)
|
|
438
|
+
# ... training code ...
|
|
439
|
+
print("✓ Training works")
|
|
440
|
+
```
|
|
441
|
+
|
|
442
|
+
## Common Patterns
|
|
443
|
+
|
|
444
|
+
### Adding New Attention Mechanism
|
|
445
|
+
|
|
446
|
+
```python
|
|
447
|
+
class MyAttention(nn.Module):
|
|
448
|
+
"""Template for custom attention."""
|
|
449
|
+
|
|
450
|
+
def __init__(self, config):
|
|
451
|
+
super().__init__()
|
|
452
|
+
self.n_head = config.n_head
|
|
453
|
+
self.n_embd = config.n_embd
|
|
454
|
+
self.head_size = self.n_embd // self.n_head
|
|
455
|
+
|
|
456
|
+
# Q, K, V projections
|
|
457
|
+
self.q_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
|
|
458
|
+
self.k_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
|
|
459
|
+
self.v_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
|
|
460
|
+
|
|
461
|
+
# Output projection
|
|
462
|
+
self.out_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)
|
|
463
|
+
|
|
464
|
+
def forward(self, x, mask=None):
|
|
465
|
+
B, T, C = x.size()
|
|
466
|
+
|
|
467
|
+
# Project Q, K, V
|
|
468
|
+
q = self.q_proj(x).view(B, T, self.n_head, self.head_size)
|
|
469
|
+
k = self.k_proj(x).view(B, T, self.n_head, self.head_size)
|
|
470
|
+
v = self.v_proj(x).view(B, T, self.n_head, self.head_size)
|
|
471
|
+
|
|
472
|
+
# Custom attention computation here
|
|
473
|
+
# attn = custom_attention_function(q, k, v, mask)
# NOTE: your implementation must assign `attn` here —
# the output projection below depends on it
|
|
474
|
+
|
|
475
|
+
# Output projection
|
|
476
|
+
out = self.out_proj(attn.reshape(B, T, C))
|
|
477
|
+
return out
|
|
478
|
+
```
|
|
479
|
+
|
|
480
|
+
### Adding Mixture of Experts
|
|
481
|
+
|
|
482
|
+
```python
|
|
483
|
+
class MoELayer(nn.Module):
|
|
484
|
+
"""Mixture of Experts layer."""
|
|
485
|
+
|
|
486
|
+
def __init__(self, config):
|
|
487
|
+
super().__init__()
|
|
488
|
+
self.num_experts = config.num_experts
|
|
489
|
+
self.top_k = config.moe_top_k
|
|
490
|
+
|
|
491
|
+
# Router
|
|
492
|
+
self.router = nn.Linear(config.n_embd, self.num_experts)
|
|
493
|
+
|
|
494
|
+
# Experts
|
|
495
|
+
self.experts = nn.ModuleList([
|
|
496
|
+
MLP(config) for _ in range(self.num_experts)
|
|
497
|
+
])
|
|
498
|
+
|
|
499
|
+
def forward(self, x):
|
|
500
|
+
B, T, C = x.size()
|
|
501
|
+
|
|
502
|
+
# Route tokens to experts
|
|
503
|
+
router_logits = self.router(x) # (B, T, num_experts)
|
|
504
|
+
router_probs = torch.softmax(router_logits, dim=-1)
|
|
505
|
+
|
|
506
|
+
# Select top-k experts
|
|
507
|
+
top_k_probs, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
|
|
508
|
+
|
|
509
|
+
# Process through selected experts
|
|
510
|
+
output = torch.zeros_like(x)
|
|
511
|
+
for i in range(self.top_k):
|
|
512
|
+
expert_idx = top_k_indices[:, :, i]
|
|
513
|
+
expert_prob = top_k_probs[:, :, i:i+1]
|
|
514
|
+
|
|
515
|
+
# Route to expert
|
|
516
|
+
for expert_id in range(self.num_experts):
|
|
517
|
+
mask = (expert_idx == expert_id)
|
|
518
|
+
if mask.any():
|
|
519
|
+
expert_out = self.experts[expert_id](x[mask])
|
|
520
|
+
output[mask] += expert_out * expert_prob[mask]
|
|
521
|
+
|
|
522
|
+
return output
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
### Adding Positional Encoding
|
|
526
|
+
|
|
527
|
+
```python
|
|
528
|
+
class CustomPositionalEncoding(nn.Module):
|
|
529
|
+
"""Custom positional encoding."""
|
|
530
|
+
|
|
531
|
+
def __init__(self, config):
|
|
532
|
+
super().__init__()
|
|
533
|
+
self.n_embd = config.n_embd
|
|
534
|
+
self.register_buffer(
|
|
535
|
+
"pos_encoding",
|
|
536
|
+
self._create_encoding(config.block_size, config.n_embd)
|
|
537
|
+
)
|
|
538
|
+
|
|
539
|
+
def _create_encoding(self, max_len, d_model):
|
|
540
|
+
"""Create positional encoding matrix."""
|
|
541
|
+
pos = torch.arange(max_len).unsqueeze(1)
|
|
542
|
+
div = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model))
|
|
543
|
+
|
|
544
|
+
encoding = torch.zeros(max_len, d_model)
|
|
545
|
+
encoding[:, 0::2] = torch.sin(pos * div)
|
|
546
|
+
encoding[:, 1::2] = torch.cos(pos * div)
|
|
547
|
+
return encoding
|
|
548
|
+
|
|
549
|
+
def forward(self, x):
|
|
550
|
+
"""Add positional encoding."""
|
|
551
|
+
return x + self.pos_encoding[:x.size(1), :]
|
|
552
|
+
```
|
|
553
|
+
|
|
554
|
+
## Debugging Tips
|
|
555
|
+
|
|
556
|
+
1. **Start small**: Test with 2 layers, 128 hidden size
|
|
557
|
+
2. **Check shapes**: Print tensor shapes at each step
|
|
558
|
+
3. **Verify gradients**: Ensure all parameters have gradients
|
|
559
|
+
4. **Compare to base**: Run same config with base `GPT` model
|
|
560
|
+
5. **Profile memory**: Use `torch.cuda.memory_summary()`
|
|
561
|
+
|
|
562
|
+
## References
|
|
563
|
+
|
|
564
|
+
- Base model: `litgpt/model.py`
|
|
565
|
+
- Configuration: `litgpt/config.py`
|
|
566
|
+
- Adapter example: `litgpt/finetune/adapter.py`
|
|
567
|
+
- AdapterV2 example: `litgpt/finetune/adapter_v2.py`
|
|
568
|
+
- LoRA example: `litgpt/finetune/lora.py`
|