@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: knowledge-distillation
|
|
3
|
+
description: Compress large language models using knowledge distillation from teacher to student models. Use when deploying smaller models with retained performance, transferring GPT-4 capabilities to open-source models, or reducing inference costs. Covers temperature scaling, soft targets, reverse KLD, logit distillation, and MiniLLM training strategies.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Emerging Techniques, Knowledge Distillation, Model Compression, Teacher-Student, MiniLLM, Reverse KLD, Soft Targets, Temperature Scaling, Logit Distillation, Model Transfer]
|
|
8
|
+
dependencies: [transformers, torch, datasets]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Knowledge Distillation: Compressing LLMs
|
|
12
|
+
|
|
13
|
+
## When to Use This Skill
|
|
14
|
+
|
|
15
|
+
Use Knowledge Distillation when you need to:
|
|
16
|
+
- **Compress models** from 70B → 7B while retaining 90%+ performance
|
|
17
|
+
- **Transfer capabilities** from proprietary models (GPT-4) to open-source (LLaMA, Mistral)
|
|
18
|
+
- **Reduce inference costs** by deploying smaller student models
|
|
19
|
+
- **Create specialized models** by distilling domain-specific knowledge
|
|
20
|
+
- **Improve small models** using synthetic data from large teachers
|
|
21
|
+
|
|
22
|
+
**Key Techniques**: Temperature scaling, soft targets, reverse KLD (MiniLLM), logit distillation, response distillation
|
|
23
|
+
|
|
24
|
+
**Papers**: Hinton et al. 2015 (arXiv 1503.02531), MiniLLM (arXiv 2306.08543), KD Survey (arXiv 2402.13116)
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Standard transformers
|
|
30
|
+
pip install transformers datasets accelerate
|
|
31
|
+
|
|
32
|
+
# For training
|
|
33
|
+
pip install torch deepspeed wandb
|
|
34
|
+
|
|
35
|
+
# Optional: MiniLLM implementation
|
|
36
|
+
git clone https://github.com/microsoft/LMOps
|
|
37
|
+
cd LMOps/minillm
|
|
38
|
+
pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
### Basic Knowledge Distillation
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import torch
|
|
47
|
+
import torch.nn.functional as F
|
|
48
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
|
49
|
+
|
|
50
|
+
# 1. Load teacher (large) and student (small) models
|
|
51
|
+
teacher = AutoModelForCausalLM.from_pretrained(
|
|
52
|
+
"meta-llama/Llama-2-70b-hf", # Large teacher
|
|
53
|
+
torch_dtype=torch.float16,
|
|
54
|
+
device_map="auto"
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
student = AutoModelForCausalLM.from_pretrained(
|
|
58
|
+
"meta-llama/Llama-2-7b-hf", # Small student
|
|
59
|
+
torch_dtype=torch.float16,
|
|
60
|
+
device_map="cuda:0"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-hf")
|
|
64
|
+
|
|
65
|
+
# 2. Define distillation loss
|
|
66
|
+
def distillation_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha=0.5):
|
|
67
|
+
"""
|
|
68
|
+
Combine hard loss (cross-entropy) with soft loss (KL divergence).
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
temperature: Softens probability distributions (higher = softer)
|
|
72
|
+
alpha: Weight for distillation loss (1-alpha for hard loss)
|
|
73
|
+
"""
|
|
74
|
+
# Hard loss: Standard cross-entropy with true labels
|
|
75
|
+
hard_loss = F.cross_entropy(student_logits.view(-1, student_logits.size(-1)), labels.view(-1))
|
|
76
|
+
|
|
77
|
+
# Soft loss: KL divergence between student and teacher
|
|
78
|
+
soft_targets = F.softmax(teacher_logits / temperature, dim=-1)
|
|
79
|
+
soft_student = F.log_softmax(student_logits / temperature, dim=-1)
|
|
80
|
+
soft_loss = F.kl_div(soft_student, soft_targets, reduction='batchmean') * (temperature ** 2)
|
|
81
|
+
|
|
82
|
+
# Combined loss
|
|
83
|
+
return alpha * soft_loss + (1 - alpha) * hard_loss
|
|
84
|
+
|
|
85
|
+
# 3. Training loop
|
|
86
|
+
for batch in dataloader:
|
|
87
|
+
# Teacher forward (no grad)
|
|
88
|
+
with torch.no_grad():
|
|
89
|
+
teacher_outputs = teacher(**batch)
|
|
90
|
+
teacher_logits = teacher_outputs.logits
|
|
91
|
+
|
|
92
|
+
# Student forward
|
|
93
|
+
student_outputs = student(**batch)
|
|
94
|
+
student_logits = student_outputs.logits
|
|
95
|
+
|
|
96
|
+
# Compute distillation loss
|
|
97
|
+
loss = distillation_loss(
|
|
98
|
+
student_logits,
|
|
99
|
+
teacher_logits,
|
|
100
|
+
batch['labels'],
|
|
101
|
+
temperature=2.0,
|
|
102
|
+
alpha=0.7 # 70% soft, 30% hard
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
# Backward and optimize
|
|
106
|
+
loss.backward()
|
|
107
|
+
optimizer.step()
|
|
108
|
+
optimizer.zero_grad()
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### MiniLLM (Reverse KLD)
|
|
112
|
+
|
|
113
|
+
**Source**: arXiv 2306.08543 (2024)
|
|
114
|
+
|
|
115
|
+
**Innovation**: Use reverse KLD instead of forward KLD for better generative model distillation.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
def reverse_kl_loss(student_logits, teacher_logits, temperature=1.0):
    """
    Reverse KL divergence: KL(Student || Teacher).
    Mode-seeking, which makes it better suited to generative models
    than forward KL (the MiniLLM objective).
    """
    # Teacher distribution (target)
    log_p_teacher = F.log_softmax(teacher_logits / temperature, dim=-1)

    # Student distribution (model)
    log_p_student = F.log_softmax(student_logits / temperature, dim=-1)
    p_student = log_p_student.exp()

    # Reverse KL: expectation under the student -- the student concentrates
    # on the teacher's major modes instead of spreading mass over all of them
    reverse_kl = (p_student * (log_p_student - log_p_teacher)).sum(dim=-1).mean()

    return reverse_kl * (temperature ** 2)
|
|
133
|
+
|
|
134
|
+
# Training with MiniLLM
|
|
135
|
+
for batch in dataloader:
|
|
136
|
+
with torch.no_grad():
|
|
137
|
+
teacher_logits = teacher(**batch).logits
|
|
138
|
+
|
|
139
|
+
student_logits = student(**batch).logits
|
|
140
|
+
|
|
141
|
+
# Reverse KLD (better for generation)
|
|
142
|
+
loss = reverse_kl_loss(student_logits, teacher_logits, temperature=1.0)
|
|
143
|
+
|
|
144
|
+
loss.backward()
|
|
145
|
+
optimizer.step()
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
**Why reverse KL?**
|
|
149
|
+
- **Forward KL** (standard): mode-covering — student spreads mass to *cover* all of the teacher's modes, overestimating the teacher's low-probability regions
- **Reverse KL** (MiniLLM): mode-seeking — student focuses on the teacher's *major* modes
- Yields more precise, less degenerate text generation from small students
|
|
152
|
+
|
|
153
|
+
### Response Distillation
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
# Generate synthetic data from teacher, train student to imitate
|
|
157
|
+
|
|
158
|
+
# 1. Generate synthetic responses from teacher
|
|
159
|
+
prompts = ["Explain AI:", "What is ML?", "Define NLP:"]
|
|
160
|
+
|
|
161
|
+
teacher_responses = []
|
|
162
|
+
for prompt in prompts:
|
|
163
|
+
inputs = tokenizer(prompt, return_tensors='pt').to(teacher.device)
|
|
164
|
+
outputs = teacher.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7)
|
|
165
|
+
# Decode only the newly generated tokens -- outputs[0] also contains the prompt
response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
|
|
166
|
+
teacher_responses.append(response)
|
|
167
|
+
|
|
168
|
+
# 2. Train student on teacher's responses (standard fine-tuning)
|
|
169
|
+
train_dataset = [
|
|
170
|
+
{"text": f"{prompt}\n{response}"}
|
|
171
|
+
for prompt, response in zip(prompts, teacher_responses)
|
|
172
|
+
]
|
|
173
|
+
|
|
174
|
+
# 3. Fine-tune student
|
|
175
|
+
trainer = Trainer(
|
|
176
|
+
model=student,
|
|
177
|
+
args=TrainingArguments(output_dir="./student", num_train_epochs=3, learning_rate=2e-5),
|
|
178
|
+
train_dataset=train_dataset,
|
|
179
|
+
)
|
|
180
|
+
trainer.train()
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Core Concepts
|
|
184
|
+
|
|
185
|
+
### 1. Temperature Scaling
|
|
186
|
+
|
|
187
|
+
**Purpose**: Soften probability distributions to expose teacher's uncertainty.
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
# Low temperature (T=1): Sharp distribution
|
|
191
|
+
logits = [3.0, 2.0, 1.0]
|
|
192
|
+
probs_T1 = softmax(logits / 1.0) # [0.67, 0.24, 0.09]
|
|
193
|
+
|
|
194
|
+
# High temperature (T=4): Soft distribution
|
|
195
|
+
probs_T4 = softmax(logits / 4.0) # [0.42, 0.34, 0.24]
|
|
196
|
+
|
|
197
|
+
# Higher T reveals more information about relative rankings
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
**Rule**: Use T=2-5 for distillation (2 is common default).
|
|
201
|
+
|
|
202
|
+
### 2. Loss Function Components
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
# Total loss = alpha * soft_loss + (1 - alpha) * hard_loss
|
|
206
|
+
|
|
207
|
+
# Soft loss: Learn from teacher's knowledge
|
|
208
|
+
soft_loss = KL(teacher || student)  # direction computed by F.kl_div(log_student, teacher_probs)
|
|
209
|
+
|
|
210
|
+
# Hard loss: Learn from ground truth labels
|
|
211
|
+
hard_loss = CrossEntropy(student_output, true_labels)
|
|
212
|
+
|
|
213
|
+
# Typical values:
|
|
214
|
+
alpha = 0.5 # Balanced
|
|
215
|
+
alpha = 0.7 # More emphasis on teacher
|
|
216
|
+
alpha = 0.3 # More emphasis on labels
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
### 3. Forward vs Reverse KLD
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
# Forward KL: KL(Teacher || Student)
# - Mode-covering: student spreads probability to cover all teacher modes
# - Overestimates the teacher's low-probability regions
# - Standard choice for classification distillation

# Reverse KL: KL(Student || Teacher)
# - Mode-seeking: student concentrates on the teacher's highest-probability modes
# - Good for generation (MiniLLM)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Training Strategies
|
|
234
|
+
|
|
235
|
+
### Strategy 1: Logit Distillation
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
# Train student to match teacher's logits directly
|
|
239
|
+
|
|
240
|
+
def logit_distillation_trainer(student, teacher, dataloader, temperature=2.0):
|
|
241
|
+
optimizer = torch.optim.AdamW(student.parameters(), lr=2e-5)
|
|
242
|
+
|
|
243
|
+
for epoch in range(3):
|
|
244
|
+
for batch in dataloader:
|
|
245
|
+
# Get logits
|
|
246
|
+
with torch.no_grad():
|
|
247
|
+
teacher_logits = teacher(**batch).logits
|
|
248
|
+
|
|
249
|
+
student_logits = student(**batch).logits
|
|
250
|
+
|
|
251
|
+
# MSE on logits (alternative to KLD)
|
|
252
|
+
loss = F.mse_loss(student_logits, teacher_logits)
|
|
253
|
+
|
|
254
|
+
# Or use KLD
|
|
255
|
+
# loss = F.kl_div(
|
|
256
|
+
# F.log_softmax(student_logits/temperature, dim=-1),
|
|
257
|
+
# F.softmax(teacher_logits/temperature, dim=-1),
|
|
258
|
+
# reduction='batchmean'
|
|
259
|
+
# ) * (temperature ** 2)
|
|
260
|
+
|
|
261
|
+
loss.backward()
|
|
262
|
+
optimizer.step()
|
|
263
|
+
optimizer.zero_grad()
|
|
264
|
+
|
|
265
|
+
return student
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Strategy 2: Two-Stage Distillation
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
# Stage 1: Distill from teacher
|
|
272
|
+
student = distill(teacher, student, epochs=5)
|
|
273
|
+
|
|
274
|
+
# Stage 2: Fine-tune on task-specific data
|
|
275
|
+
student = fine_tune(student, task_data, epochs=3)
|
|
276
|
+
|
|
277
|
+
# Results in better task performance than single-stage
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### Strategy 3: Multi-Teacher Distillation
|
|
281
|
+
|
|
282
|
+
```python
|
|
283
|
+
# Learn from multiple expert teachers
|
|
284
|
+
|
|
285
|
+
def multi_teacher_distillation(student, teachers, batch):
|
|
286
|
+
"""Distill from ensemble of teachers."""
|
|
287
|
+
teacher_logits_list = []
|
|
288
|
+
|
|
289
|
+
# Get logits from all teachers
|
|
290
|
+
with torch.no_grad():
|
|
291
|
+
for teacher in teachers:
|
|
292
|
+
logits = teacher(**batch).logits
|
|
293
|
+
teacher_logits_list.append(logits)
|
|
294
|
+
|
|
295
|
+
# Average teacher predictions
|
|
296
|
+
avg_teacher_logits = torch.stack(teacher_logits_list).mean(dim=0)
|
|
297
|
+
|
|
298
|
+
# Student learns from ensemble
|
|
299
|
+
student_logits = student(**batch).logits
|
|
300
|
+
loss = F.kl_div(
|
|
301
|
+
F.log_softmax(student_logits, dim=-1),
|
|
302
|
+
F.softmax(avg_teacher_logits, dim=-1),
|
|
303
|
+
reduction='batchmean'
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
return loss
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
## Production Deployment
|
|
310
|
+
|
|
311
|
+
### Complete Training Script
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
|
|
315
|
+
|
|
316
|
+
def train_distilled_model(
|
|
317
|
+
teacher_name="meta-llama/Llama-2-70b-hf",
|
|
318
|
+
student_name="meta-llama/Llama-2-7b-hf",
|
|
319
|
+
output_dir="./distilled-llama-7b",
|
|
320
|
+
temperature=2.0,
|
|
321
|
+
alpha=0.7,
|
|
322
|
+
):
|
|
323
|
+
# Load models
|
|
324
|
+
teacher = AutoModelForCausalLM.from_pretrained(teacher_name, torch_dtype=torch.float16, device_map="auto")
|
|
325
|
+
student = AutoModelForCausalLM.from_pretrained(student_name, torch_dtype=torch.float16)
|
|
326
|
+
tokenizer = AutoTokenizer.from_pretrained(teacher_name)
|
|
327
|
+
|
|
328
|
+
# Custom trainer with distillation
|
|
329
|
+
class DistillationTrainer(Trainer):
|
|
330
|
+
def compute_loss(self, model, inputs, return_outputs=False):
|
|
331
|
+
# Student forward
|
|
332
|
+
outputs_student = model(**inputs)
|
|
333
|
+
student_logits = outputs_student.logits
|
|
334
|
+
|
|
335
|
+
# Teacher forward (no grad)
|
|
336
|
+
with torch.no_grad():
|
|
337
|
+
outputs_teacher = teacher(**inputs)
|
|
338
|
+
teacher_logits = outputs_teacher.logits
|
|
339
|
+
|
|
340
|
+
# Distillation loss
|
|
341
|
+
soft_targets = F.softmax(teacher_logits / temperature, dim=-1)
|
|
342
|
+
soft_student = F.log_softmax(student_logits / temperature, dim=-1)
|
|
343
|
+
soft_loss = F.kl_div(soft_student, soft_targets, reduction='batchmean') * (temperature ** 2)
|
|
344
|
+
|
|
345
|
+
# Hard loss
|
|
346
|
+
hard_loss = outputs_student.loss
|
|
347
|
+
|
|
348
|
+
# Combined
|
|
349
|
+
loss = alpha * soft_loss + (1 - alpha) * hard_loss
|
|
350
|
+
|
|
351
|
+
return (loss, outputs_student) if return_outputs else loss
|
|
352
|
+
|
|
353
|
+
# Training arguments
|
|
354
|
+
training_args = TrainingArguments(
|
|
355
|
+
output_dir=output_dir,
|
|
356
|
+
num_train_epochs=3,
|
|
357
|
+
per_device_train_batch_size=4,
|
|
358
|
+
gradient_accumulation_steps=8,
|
|
359
|
+
learning_rate=2e-5,
|
|
360
|
+
warmup_steps=500,
|
|
361
|
+
logging_steps=100,
|
|
362
|
+
save_steps=1000,
|
|
363
|
+
bf16=True,
|
|
364
|
+
gradient_checkpointing=True,
|
|
365
|
+
)
|
|
366
|
+
|
|
367
|
+
# Train
|
|
368
|
+
trainer = DistillationTrainer(
|
|
369
|
+
model=student,
|
|
370
|
+
args=training_args,
|
|
371
|
+
train_dataset=train_dataset,
|
|
372
|
+
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
trainer.train()
|
|
376
|
+
student.save_pretrained(output_dir)
|
|
377
|
+
tokenizer.save_pretrained(output_dir)
|
|
378
|
+
|
|
379
|
+
# Usage
|
|
380
|
+
train_distilled_model(
|
|
381
|
+
teacher_name="meta-llama/Llama-2-70b-hf",
|
|
382
|
+
student_name="meta-llama/Llama-2-7b-hf",
|
|
383
|
+
temperature=2.0,
|
|
384
|
+
alpha=0.7
|
|
385
|
+
)
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
## Best Practices
|
|
389
|
+
|
|
390
|
+
### 1. Hyperparameter Selection
|
|
391
|
+
|
|
392
|
+
```python
|
|
393
|
+
# Temperature
|
|
394
|
+
T = 1.0 # Sharp (less knowledge transfer)
|
|
395
|
+
T = 2.0 # Standard (good balance)
|
|
396
|
+
T = 5.0 # Soft (more knowledge transfer)
|
|
397
|
+
|
|
398
|
+
# Alpha (weight)
|
|
399
|
+
alpha = 0.5 # Balanced
|
|
400
|
+
alpha = 0.7 # Emphasize teacher knowledge
|
|
401
|
+
alpha = 0.9 # Strong distillation
|
|
402
|
+
|
|
403
|
+
# Rule: Higher T + higher alpha = stronger distillation
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
### 2. Model Size Ratio
|
|
407
|
+
|
|
408
|
+
```python
|
|
409
|
+
# Good ratios (teacher/student)
|
|
410
|
+
70B / 7B = 10× # Excellent
|
|
411
|
+
13B / 1B = 13× # Good
|
|
412
|
+
7B / 1B = 7× # Acceptable
|
|
413
|
+
|
|
414
|
+
# Avoid too large gap
|
|
415
|
+
70B / 1B = 70× # Too large, ineffective
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
### 3. Data Quality
|
|
419
|
+
|
|
420
|
+
```python
|
|
421
|
+
# Best: Use teacher-generated data + real data
|
|
422
|
+
train_data = {
    "teacher_generated": 0.7,  # Diverse, high-quality
    "real_data": 0.3,          # Ground truth
}
|
|
426
|
+
|
|
427
|
+
# Avoid: Only real data (doesn't utilize teacher fully)
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
## Evaluation
|
|
431
|
+
|
|
432
|
+
```python
|
|
433
|
+
from transformers import pipeline
|
|
434
|
+
|
|
435
|
+
# Compare student vs teacher
|
|
436
|
+
teacher_pipe = pipeline("text-generation", model=teacher)
|
|
437
|
+
student_pipe = pipeline("text-generation", model=student)
|
|
438
|
+
|
|
439
|
+
prompts = ["Explain quantum computing:", "What is AI?"]
|
|
440
|
+
|
|
441
|
+
for prompt in prompts:
|
|
442
|
+
teacher_out = teacher_pipe(prompt, max_new_tokens=100)
|
|
443
|
+
student_out = student_pipe(prompt, max_new_tokens=100)
|
|
444
|
+
|
|
445
|
+
print(f"Prompt: {prompt}")
|
|
446
|
+
print(f"Teacher: {teacher_out[0]['generated_text']}")
|
|
447
|
+
print(f"Student: {student_out[0]['generated_text']}")
|
|
448
|
+
print(f"Match quality: {calculate_similarity(teacher_out, student_out):.2f}")
|
|
449
|
+
```
|
|
450
|
+
|
|
451
|
+
## Resources
|
|
452
|
+
|
|
453
|
+
- **Hinton et al. 2015 (Foundational)**: https://arxiv.org/abs/1503.02531
|
|
454
|
+
- **MiniLLM (Reverse KLD)**: https://arxiv.org/abs/2306.08543
|
|
455
|
+
- **KD Survey for LLMs (2024)**: https://arxiv.org/abs/2402.13116
|
|
456
|
+
- **MiniLLM GitHub**: https://github.com/microsoft/LMOps/tree/main/minillm
|
|
457
|
+
|
|
458
|
+
|