@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0

package/bin/skills/speculative-decoding/SKILL.md
@@ -0,0 +1,467 @@
---
name: speculative-decoding
description: Accelerate LLM inference using speculative decoding, Medusa multiple heads, and lookahead decoding techniques. Use when optimizing inference speed (1.5-3.6× speedup), reducing latency for real-time applications, or deploying models with limited compute. Covers draft models, tree-based attention, Jacobi iteration, parallel token generation, and production deployment strategies.
version: 1.0.0
author: Synthetic Sciences
license: MIT
tags: [Emerging Techniques, Speculative Decoding, Medusa, Lookahead Decoding, Fast Inference, Draft Models, Tree Attention, Parallel Generation, Latency Reduction, Inference Optimization]
dependencies: [transformers, torch]
---

# Speculative Decoding: Accelerating LLM Inference

## When to Use This Skill

Use Speculative Decoding when you need to:
- **Speed up inference** by 1.5-3.6× without quality loss
- **Reduce latency** for real-time applications (chatbots, code generation)
- **Optimize throughput** for high-volume serving
- **Deploy efficiently** on limited hardware
- **Generate faster** without changing model architecture

**Key Techniques**: Draft model speculative decoding, Medusa (multiple heads), Lookahead Decoding (Jacobi iteration)

**Papers**: Medusa (arXiv 2401.10774), Lookahead Decoding (ICML 2024), Speculative Decoding Survey (ACL 2024)

## Installation

```bash
# Standard speculative decoding (transformers)
pip install transformers accelerate

# Medusa (multiple decoding heads)
git clone https://github.com/FasterDecoding/Medusa
cd Medusa
pip install -e .

# Lookahead Decoding
git clone https://github.com/hao-ai-lab/LookaheadDecoding
cd LookaheadDecoding
pip install -e .

# Optional: vLLM with speculative decoding
pip install vllm
```

## Quick Start

### Basic Speculative Decoding (Draft Model)

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load target model (large, slow)
target_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    device_map="auto",
    torch_dtype=torch.float16
)

# Load draft model (small, fast)
draft_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    device_map="auto",
    torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-hf")

# Generate with speculative decoding
prompt = "Explain quantum computing in simple terms:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Transformers 4.36+ supports assisted generation
outputs = target_model.generate(
    **inputs,
    assistant_model=draft_model,  # Enable speculative decoding
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)
```

### Medusa (Multiple Decoding Heads)

```python
import torch
from transformers import AutoTokenizer
from medusa.model.medusa_model import MedusaModel

# Load Medusa-enhanced model
model = MedusaModel.from_pretrained(
    "FasterDecoding/medusa-vicuna-7b-v1.3",  # Pre-trained with Medusa heads
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained("FasterDecoding/medusa-vicuna-7b-v1.3")

# Generate with Medusa (2-3× speedup)
prompt = "Write a Python function to calculate fibonacci numbers:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = model.medusa_generate(
    **inputs,
    max_new_tokens=256,
    temperature=0.7,
    posterior_threshold=0.09,  # Acceptance threshold
    posterior_alpha=0.3,       # Tree construction parameter
)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
```

### Lookahead Decoding (Jacobi Iteration)

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from lookahead.lookahead_decoding import LookaheadDecoding

# Load model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Initialize lookahead decoding
lookahead = LookaheadDecoding(
    model=model,
    tokenizer=tokenizer,
    window_size=15,  # Lookahead window (W)
    ngram_size=5,    # N-gram size (N)
    guess_size=5     # Number of parallel guesses
)

# Generate (1.5-2.3× speedup)
prompt = "Implement quicksort in Python:"
output = lookahead.generate(prompt, max_new_tokens=256)
print(output)
```

## Core Concepts

### 1. Speculative Decoding (Draft Model)

**Idea**: Use a small draft model to propose candidate tokens, then let the large target model verify them in parallel.

**Algorithm**:
1. Draft model generates K tokens speculatively
2. Target model evaluates all K tokens in parallel (single forward pass)
3. Accept tokens where draft and target agree
4. Reject the first disagreement and continue from there

```python
import random

import torch.nn.functional as F

def speculative_decode(target_model, draft_model, prompt, K=4):
    """Speculative decoding algorithm (schematic sketch, not runnable as-is)."""
    # 1. Generate K draft tokens
    draft_tokens = draft_model.generate(prompt, max_new_tokens=K)

    # 2. Target model evaluates all K tokens in one forward pass
    target_logits = target_model(draft_tokens)  # Parallel!

    # 3. Accept/reject based on probability match
    accepted = []
    for i in range(K):
        p_draft = F.softmax(draft_model.logits[i], dim=-1)
        p_target = F.softmax(target_logits[i], dim=-1)

        # Acceptance probability
        if random.random() < min(1, p_target[draft_tokens[i]] / p_draft[draft_tokens[i]]):
            accepted.append(draft_tokens[i])
        else:
            # Reject: resample one token from the residual distribution
            # max(0, p_target - p_draft), renormalized, then stop speculating
            break

    return accepted
```

**Performance**:
- Speedup: 1.5-2× with good draft model
- Zero quality loss (mathematically equivalent to target model)
- Best when draft model is 5-10× smaller than target
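
These numbers follow from the standard acceptance-rate analysis: if the target accepts each draft token with probability α and the draft proposes K tokens, each target forward pass yields (1 - α^(K+1)) / (1 - α) tokens in expectation. A quick back-of-the-envelope sketch (the acceptance rates below are illustrative, not measured):

```python
def expected_tokens_per_target_pass(alpha: float, K: int) -> float:
    """Expected tokens gained per target forward pass (includes the bonus token)."""
    return (1 - alpha ** (K + 1)) / (1 - alpha)

for alpha in (0.6, 0.8, 0.9):  # illustrative per-token acceptance rates
    print(alpha, round(expected_tokens_per_target_pass(alpha, K=4), 2))
# 0.6 → 2.31, 0.8 → 3.36, 0.9 → 4.1
# Upper bound on speedup, before subtracting the cost of running the draft model.
```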

### 2. Medusa (Multiple Decoding Heads)

**Source**: arXiv 2401.10774 (2024)

**Innovation**: Add multiple prediction heads to an existing model so it predicts several future tokens at once, without a separate draft model.

**Architecture**:
```
Input → Base LLM (frozen) → Hidden State
                              ├→ Head 1 (predicts token t+1)
                              ├→ Head 2 (predicts token t+2)
                              ├→ Head 3 (predicts token t+3)
                              └→ Head 4 (predicts token t+4)
```

**Training**:
- **Medusa-1**: Freeze base LLM, train only heads
  - 2.2× speedup, lossless
- **Medusa-2**: Fine-tune base LLM + heads together
  - 2.3-3.6× speedup, better quality

**Tree-based Attention**:
```python
# Medusa constructs tree of candidates
# Example: Predict 2 steps ahead with top-2 per step

#        Root
#       /    \
#     T1a    T1b       (Step 1: 2 candidates)
#    /  \    /  \
#  T2a  T2b T2c T2d    (Step 2: 4 candidates total)

# Single forward pass evaluates entire tree!
```
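
A single pass can score the whole tree because each candidate gets a position id equal to its depth and an attention mask that exposes only its ancestors. A minimal, illustrative mask builder for the small tree above (this helper is not part of the Medusa package):

```python
import torch

def build_tree_attention_mask(parents):
    """parents[i] = index of node i's parent, or -1 if it hangs off the last
    committed token. Node i may attend to itself and to all of its ancestors."""
    n = len(parents)
    mask = torch.zeros(n, n, dtype=torch.bool)
    for i in range(n):
        j = i
        while j != -1:
            mask[i, j] = True
            j = parents[j]
    return mask

# Nodes: [T1a, T1b, T2a, T2b, T2c, T2d], with T2a/T2b under T1a and T2c/T2d under T1b
print(build_tree_attention_mask([-1, -1, 0, 0, 1, 1]).int())
```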

**Advantages**:
- No separate draft model needed
- Minimal training (only heads)
- Compatible with any LLM

### 3. Lookahead Decoding (Jacobi Iteration)

**Source**: ICML 2024

**Core idea**: Reformulate autoregressive decoding as solving a system of nonlinear equations, then solve it in parallel with Jacobi (fixed-point) iteration.

**Mathematical formulation**:
```
Traditional: y_t = f(x, y_1, ..., y_{t-1})                      (sequential)
Jacobi:      y_t^{(k+1)} = f(x, y_1^{(k)}, ..., y_{t-1}^{(k)})  (parallel)
```
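
In its simplest greedy form, the Jacobi update can be written directly against a Hugging Face causal LM: start from an arbitrary guess for the next n tokens and repeatedly replace every position with the model's prediction given the previous iterate, until the guess stops changing. This sketch shows only the fixed-point update, not the n-gram pool and verification branch that full lookahead decoding adds on top:

```python
import torch

@torch.no_grad()
def jacobi_greedy_decode(model, input_ids, n_tokens=8, max_iters=16):
    """Greedy Jacobi iteration: converges to the same tokens as greedy
    autoregressive decoding, but updates all n_tokens positions in parallel."""
    # Initialize the guess, e.g. by repeating the last prompt token
    guess = input_ids[:, -1:].repeat(1, n_tokens)
    for _ in range(max_iters):
        seq = torch.cat([input_ids, guess], dim=-1)
        logits = model(seq).logits
        # The token at guess position j is predicted by logits at position (prompt_len + j - 1)
        start = input_ids.shape[1] - 1
        new_guess = logits[:, start:start + n_tokens, :].argmax(dim=-1)
        if torch.equal(new_guess, guess):  # fixed point reached
            break
        guess = new_guess
    return torch.cat([input_ids, guess], dim=-1)
```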

**Two branches**:

1. **Lookahead Branch**: Generate n-grams in parallel
   - Window size W: How many steps to look ahead
   - N-gram size N: How many past tokens to use

2. **Verification Branch**: Verify promising n-grams
   - Match n-grams with generated tokens
   - Accept if first token matches

```python
class LookaheadDecoding:
    """Simplified sketch of the lookahead/verification loop (not the full implementation)."""

    def __init__(self, model, window_size=15, ngram_size=5):
        self.model = model
        self.W = window_size  # Lookahead window
        self.N = ngram_size   # N-gram size

    def generate_step(self, tokens):
        # Lookahead branch: Generate W × N candidates
        candidates = {}
        for w in range(1, self.W + 1):
            for n in range(1, self.N + 1):
                # Generate n-gram starting at position w
                ngram = self.generate_ngram(tokens, start=w, length=n)
                candidates[(w, n)] = ngram

        # Verification branch: Find matching n-grams
        verified = []
        for ngram in candidates.values():
            if ngram[0] == tokens[-1]:  # First token matches last input
                if self.verify(tokens, ngram):
                    verified.append(ngram)

        # Accept longest verified n-gram
        return max(verified, key=len) if verified else [self.model.generate_next(tokens)]
```

**Performance**:
- Speedup: 1.5-2.3× (up to 3.6× for code generation)
- No draft model or training needed
- Works out-of-the-box with any model

## Method Comparison

| Method | Speedup | Training Needed | Draft Model | Quality Loss |
|--------|---------|-----------------|-------------|--------------|
| **Draft Model Speculative** | 1.5-2× | No | Yes (external) | None |
| **Medusa** | 2-3.6× | Minimal (heads only) | No (built-in heads) | None |
| **Lookahead** | 1.5-2.3× | None | No | None |
| **Naive Batching** | 1.2-1.5× | No | No | None |
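
Measured speedups depend heavily on hardware, batch size, and workload, so it is worth benchmarking on your own setup. A simple sketch that times plain generation against transformers assisted generation (the models, tokenizer, and prompt are the placeholders from Quick Start):

```python
import time
import torch

def tokens_per_second(model, tokenizer, prompt, assistant_model=None, max_new_tokens=128):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    out = model.generate(
        **inputs,
        assistant_model=assistant_model,  # None → plain autoregressive decoding
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    return (out.shape[1] - inputs["input_ids"].shape[1]) / elapsed

# baseline = tokens_per_second(target_model, tokenizer, "Explain quantum computing:")
# assisted = tokens_per_second(target_model, tokenizer, "Explain quantum computing:",
#                              assistant_model=draft_model)
# print(f"speedup: {assisted / baseline:.2f}×")
```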

## Advanced Patterns

### Training Medusa Heads

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModelForCausalLM
from medusa.model.medusa_model import MedusaModel
from medusa.model.kv_cache import initialize_past_key_values

# 1. Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    "lmsys/vicuna-7b-v1.3",
    torch_dtype=torch.float16
)

# 2. Add Medusa heads
num_heads = 4
medusa_heads = nn.ModuleList([
    nn.Linear(base_model.config.hidden_size, base_model.config.vocab_size, bias=False)
    for _ in range(num_heads)
])

# 3. Training loop (freeze base model for Medusa-1)
for param in base_model.parameters():
    param.requires_grad = False  # Freeze base

optimizer = torch.optim.Adam(medusa_heads.parameters(), lr=1e-3)

for batch in dataloader:  # dataloader of tokenized batches (assumed to be defined)
    # Forward pass
    hidden_states = base_model(**batch, output_hidden_states=True).hidden_states[-1]

    # Predict future tokens with each head
    loss = 0
    for i, head in enumerate(medusa_heads):
        logits = head(hidden_states)
        # Target: tokens shifted by (i+1) positions
        target = batch['input_ids'][:, i+1:]
        # cross_entropy expects (batch, vocab, seq), hence the transpose
        loss += F.cross_entropy(logits[:, :-i-1].transpose(1, 2), target)

    # Backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```

### Hybrid: Speculative + Medusa

```python
# Use Medusa as draft model for speculative decoding
draft_medusa = MedusaModel.from_pretrained("medusa-vicuna-7b")
target_model = AutoModelForCausalLM.from_pretrained("vicuna-33b")

# Draft generates multiple candidates with Medusa
draft_tokens = draft_medusa.medusa_generate(prompt, max_new_tokens=5)

# Target verifies in single forward pass
outputs = target_model.generate(
    prompt,
    assistant_model=draft_medusa,  # Use Medusa as draft
    max_new_tokens=256
)

# Combines benefits: Medusa speed + large model quality
```

### Optimal Draft Model Selection

```python
def select_draft_model(target_model_size):
    """Select an appropriately sized draft model for speculative decoding."""
    # Rule: Draft should be 5-10× smaller
    if target_model_size == "70B":
        return "7B"   # 10× smaller
    elif target_model_size == "33B":
        return "7B"   # 5× smaller
    elif target_model_size == "13B":
        return "1B"   # 13× smaller
    else:
        return None   # Target too small, use Medusa/Lookahead instead

# Example
draft = select_draft_model("70B")
# Returns "7B" → Use Llama-2-7b as draft for Llama-2-70b
```

## Best Practices

### 1. Choose the Right Method

```python
# New deployment → Medusa (best overall speedup, no draft model)
if deploying_new_model:
    use_method = "Medusa"

# Existing deployment with small model available → Draft speculative
elif have_small_version_of_model:
    use_method = "Draft Model Speculative"

# Want zero training/setup → Lookahead
elif want_plug_and_play:
    use_method = "Lookahead Decoding"
```

### 2. Hyperparameter Tuning

**Draft Model Speculative**:
```python
# K = number of speculative tokens
K = 4  # Good default
K = 2  # Conservative (higher acceptance)
K = 8  # Aggressive (lower acceptance, but more tokens gained when accepted)

# Rule: Larger K → more speedup IF the draft model is good
```
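
With the transformers assisted-generation path, the equivalent of K lives on the draft model's generation config. Attribute names have shifted between transformers releases, so treat this as an assumption to check against your installed version:

```python
# Assumption: recent transformers releases expose these GenerationConfig fields.
draft_model.generation_config.num_assistant_tokens = 4                    # K
draft_model.generation_config.num_assistant_tokens_schedule = "constant"  # keep K fixed

outputs = target_model.generate(**inputs, assistant_model=draft_model, max_new_tokens=256)
```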

**Medusa**:
```python
# Posterior threshold (acceptance confidence)
posterior_threshold = 0.09  # Standard (from paper)
posterior_threshold = 0.05  # More conservative (slower, higher quality)
posterior_threshold = 0.15  # More aggressive (faster, may degrade quality)

# Tree depth (how many steps ahead)
medusa_choices = [[0], [0, 0], [0, 1], [0, 0, 0]]  # Depth 3 (standard)
```
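
Each entry in `medusa_choices` is one path through the heads (e.g. `[0, 1]` means the top-1 candidate from head 1 followed by the top-2 candidate from head 2), so the list jointly defines the tree's depth and how many candidate nodes a verification pass scores. A hypothetical helper to summarize a configuration:

```python
def describe_medusa_choices(medusa_choices):
    """Illustrative helper (not part of the Medusa package)."""
    depth = max(len(path) for path in medusa_choices)
    print(f"Tree depth: {depth} (verifies up to {depth} extra tokens per step)")
    print(f"Candidate nodes scored per step: {len(medusa_choices)}")

describe_medusa_choices([[0], [0, 0], [0, 1], [0, 0, 0]])
# Tree depth: 3 (verifies up to 3 extra tokens per step)
# Candidate nodes scored per step: 4
```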

**Lookahead**:
```python
# Window size W (lookahead distance)
# N-gram size N (context for generation)

# 7B model (more resources)
W, N = 15, 5

# 13B model (moderate)
W, N = 10, 5

# 33B+ model (limited resources)
W, N = 7, 5
```

### 3. Production Deployment

```python
# vLLM with speculative decoding
from vllm import LLM, SamplingParams

# Initialize with draft model
llm = LLM(
    model="meta-llama/Llama-2-70b-hf",
    speculative_model="meta-llama/Llama-2-7b-hf",  # Draft model
    num_speculative_tokens=5,
    use_v2_block_manager=True,
)

# Generate
prompts = ["Tell me about AI:", "Explain quantum physics:"]
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```

## Resources

- **Medusa Paper**: https://arxiv.org/abs/2401.10774
- **Medusa GitHub**: https://github.com/FasterDecoding/Medusa
- **Lookahead Decoding (ICML 2024)**: https://lmsys.org/blog/2023-11-21-lookahead-decoding/
- **Lookahead GitHub**: https://github.com/hao-ai-lab/LookaheadDecoding
- **Speculative Decoding Survey (ACL 2024)**: https://aclanthology.org/2024.findings-acl.456.pdf
- **Comprehensive Survey**: https://arxiv.org/abs/2401.07851

## See Also

- `references/draft_model.md` - Draft model selection and training
- `references/medusa.md` - Medusa architecture and training
- `references/lookahead.md` - Lookahead decoding implementation details