@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0

package/bin/skills/tensorrt-llm/SKILL.md
@@ -0,0 +1,187 @@
---
name: tensorrt-llm
description: Optimizes LLM inference with NVIDIA TensorRT for maximum throughput and lowest latency. Use for production deployment on NVIDIA GPUs (A100/H100), when you need 10-100x faster inference than PyTorch, or for serving models with quantization (FP8/INT4), in-flight batching, and multi-GPU scaling.
version: 1.0.0
author: Synthetic Sciences
license: MIT
tags: [Inference Serving, TensorRT-LLM, NVIDIA, Inference Optimization, High Throughput, Low Latency, Production, FP8, INT4, In-Flight Batching, Multi-GPU]
dependencies: [tensorrt-llm, torch]
---

# TensorRT-LLM

NVIDIA's open-source library for optimizing LLM inference, with state-of-the-art performance on NVIDIA GPUs.

## When to use TensorRT-LLM

**Use TensorRT-LLM when:**
- Deploying on NVIDIA GPUs (A100, H100, GB200)
- You need maximum throughput (24,000+ tokens/sec on Llama 3)
- You require low latency for real-time applications
- Working with quantized models (FP8, INT4, FP4)
- Scaling across multiple GPUs or nodes

**Use vLLM instead when:**
- You need a simpler setup and a Python-first API
- You want PagedAttention without TensorRT compilation
- Working with AMD GPUs or other non-NVIDIA hardware

**Use llama.cpp instead when:**
- Deploying on CPU or Apple Silicon
- You need edge deployment without NVIDIA GPUs
- You want the simpler GGUF quantization format

## Quick start

### Installation

```bash
# Docker (recommended)
docker pull nvidia/tensorrt_llm:latest

# pip install
pip install tensorrt_llm==1.2.0rc3

# Requires CUDA 13.0.0, TensorRT 10.13.2, Python 3.10-3.12
```

### Basic inference

```python
from tensorrt_llm import LLM, SamplingParams

# Initialize model
llm = LLM(model="meta-llama/Meta-Llama-3-8B")

# Configure sampling
sampling_params = SamplingParams(
    max_tokens=100,
    temperature=0.7,
    top_p=0.9
)

# Generate
prompts = ["Explain quantum computing"]
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(output.outputs[0].text)
```

### Serving with trtllm-serve

```bash
# Start server (automatic model download and compilation);
# --tp_size 4 enables tensor parallelism across 4 GPUs
trtllm-serve meta-llama/Meta-Llama-3-8B \
    --tp_size 4 \
    --max_batch_size 256 \
    --max_num_tokens 4096

# Client request
curl -X POST http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "meta-llama/Meta-Llama-3-8B",
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,
        "max_tokens": 100
    }'
```
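
Because `trtllm-serve` exposes an OpenAI-compatible `/v1/chat/completions` endpoint (as the curl request above shows), you can also call it from Python. A minimal sketch, assuming the server above is running on localhost:8000, the `openai` package is installed, and no API key is enforced:

```python
# Minimal sketch: call trtllm-serve's OpenAI-compatible endpoint from Python.
# Assumes the server from the previous block is up on localhost:8000 and that
# no API key is required (the placeholder value is ignored).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B",
    messages=[{"role": "user", "content": "Hello!"}],
    temperature=0.7,
    max_tokens=100,
)
print(response.choices[0].message.content)
```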

## Key features

### Performance optimizations
- **In-flight batching**: Dynamic batching during generation
- **Paged KV cache**: Efficient memory management
- **Flash Attention**: Optimized attention kernels
- **Quantization**: FP8, INT4, FP4 for 2-4× faster inference
- **CUDA graphs**: Reduced kernel launch overhead

### Parallelism
- **Tensor parallelism (TP)**: Split the model across GPUs
- **Pipeline parallelism (PP)**: Layer-wise distribution
- **Expert parallelism**: For Mixture-of-Experts models
- **Multi-node**: Scale beyond a single machine

### Advanced features
- **Speculative decoding**: Faster generation with draft models
- **LoRA serving**: Efficient multi-adapter deployment
- **Disaggregated serving**: Separate prefill and generation

## Common patterns

### Quantized model (FP8)

```python
from tensorrt_llm import LLM

# Load FP8 quantized model (2× faster, 50% of the memory)
llm = LLM(
    model="meta-llama/Meta-Llama-3-70B",
    dtype="fp8",
    max_num_tokens=8192
)

# Inference works the same as before
outputs = llm.generate(["Summarize this article..."])
```

### Multi-GPU deployment

```python
# Tensor parallelism across 8 GPUs
llm = LLM(
    model="meta-llama/Meta-Llama-3-405B",
    tensor_parallel_size=8,
    dtype="fp8"
)
```

### Batch inference

```python
# Process 100 prompts efficiently
prompts = [f"Question {i}: ..." for i in range(100)]

outputs = llm.generate(
    prompts,
    sampling_params=SamplingParams(max_tokens=200)
)

# In-flight batching is applied automatically for maximum throughput
```

## Performance benchmarks

**Meta Llama 3-8B** (H100 GPU):
- Throughput: 24,000 tokens/sec
- Latency: ~10ms per token
- vs PyTorch: **100× faster**

**Llama 3-70B** (8× A100 80GB):
- FP8 quantization: 2× faster than FP16
- Memory: 50% reduction with FP8

## Supported models

- **LLaMA family**: Llama 2, Llama 3, CodeLlama
- **GPT family**: GPT-2, GPT-J, GPT-NeoX
- **Qwen**: Qwen, Qwen2, QwQ
- **DeepSeek**: DeepSeek-V2, DeepSeek-V3
- **Mixtral**: Mixtral-8x7B, Mixtral-8x22B
- **Vision**: LLaVA, Phi-3-vision
- **100+ models** on Hugging Face

## References

- **[Optimization Guide](references/optimization.md)** - Quantization, batching, KV cache tuning
- **[Multi-GPU Setup](references/multi-gpu.md)** - Tensor/pipeline parallelism, multi-node
- **[Serving Guide](references/serving.md)** - Production deployment, monitoring, autoscaling

## Resources

- **Docs**: https://nvidia.github.io/TensorRT-LLM/
- **GitHub**: https://github.com/NVIDIA/TensorRT-LLM
- **Models**: https://huggingface.co/models?library=tensorrt_llm
package/bin/skills/tensorrt-llm/references/multi-gpu.md
@@ -0,0 +1,298 @@
# Multi-GPU Deployment Guide

Comprehensive guide to scaling TensorRT-LLM across multiple GPUs and nodes.

## Parallelism Strategies

### Tensor Parallelism (TP)

**What it does**: Splits each model layer horizontally across GPUs.

**Use case**:
- Model fits in total GPU memory but not on a single GPU
- You need low latency (single forward pass)
- GPUs are on the same node (NVLink required for best performance)

**Example** (Llama 3-70B on 4× A100):
```python
from tensorrt_llm import LLM

llm = LLM(
    model="meta-llama/Meta-Llama-3-70B",
    tensor_parallel_size=4,  # Split across 4 GPUs
    dtype="fp16"
)

# The model is automatically sharded across GPUs:
# a single forward pass, low latency
```

**Performance**:
- Latency: roughly the same as a single GPU
- Throughput: 4× higher (4 GPUs)
- Communication: high (activations synced at every layer)

### Pipeline Parallelism (PP)

**What it does**: Splits the model vertically across GPUs (layer-wise).

**Use case**:
- Very large models (175B+)
- You can tolerate higher latency
- GPUs spread across multiple nodes

**Example** (Llama 3-405B on 8× H100):
```python
llm = LLM(
    model="meta-llama/Meta-Llama-3-405B",
    tensor_parallel_size=4,    # TP=4 within nodes
    pipeline_parallel_size=2,  # PP=2 across nodes
    dtype="fp8"
)

# Total: 8 GPUs (4×2)
# First half of the layers: node 1 (4 GPUs with TP)
# Second half of the layers: node 2 (4 GPUs with TP)
```

**Performance**:
- Latency: higher (requests pass sequentially through the pipeline)
- Throughput: high with micro-batching
- Communication: lower than TP

### Expert Parallelism (EP)

**What it does**: Distributes MoE experts across GPUs.

**Use case**: Mixture-of-Experts models (Mixtral, DeepSeek-V2)

**Example** (Mixtral-8x22B on 8× A100):
```python
llm = LLM(
    model="mistralai/Mixtral-8x22B",
    tensor_parallel_size=4,
    expert_parallel_size=2,  # Distribute 8 experts across 2 groups
    dtype="fp8"
)
```

## Configuration Examples

### Small model (7-13B) - Single GPU

```python
# Llama 3-8B on 1× A100 80GB
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B",
    dtype="fp16"  # or fp8 on H100
)
```

**Resources**:
- GPU: 1× A100 80GB
- Memory: ~16GB model + 30GB KV cache
- Throughput: 3,000-5,000 tokens/sec

### Medium model (70B) - Multi-GPU, same node

```python
# Llama 3-70B on 4× A100 80GB (NVLink)
llm = LLM(
    model="meta-llama/Meta-Llama-3-70B",
    tensor_parallel_size=4,
    dtype="fp8"  # 70GB → 35GB per GPU
)
```

**Resources**:
- GPU: 4× A100 80GB with NVLink
- Memory: ~35GB per GPU (FP8)
- Throughput: 10,000-15,000 tokens/sec
- Latency: 15-20ms per token

### Large model (405B) - Multi-node

```python
# Llama 3-405B on 2 nodes × 8 H100 = 16 GPUs
llm = LLM(
    model="meta-llama/Meta-Llama-3-405B",
    tensor_parallel_size=8,    # TP within each node
    pipeline_parallel_size=2,  # PP across 2 nodes
    dtype="fp8"
)
```

**Resources**:
- GPU: 2 nodes × 8 H100 80GB
- Memory: ~25GB per GPU (FP8)
- Throughput: 20,000-30,000 tokens/sec
- Network: InfiniBand recommended

## Server Deployment

### Single-node multi-GPU

```bash
# Llama 3-70B on 4 GPUs (automatic TP)
trtllm-serve meta-llama/Meta-Llama-3-70B \
    --tp_size 4 \
    --max_batch_size 256 \
    --dtype fp8

# Listens on http://localhost:8000
```

### Multi-node with Ray

```bash
# Node 1 (head node)
ray start --head --port=6379

# Node 2 (worker)
ray start --address='node1:6379'

# Deploy across the cluster; --num_workers 2 uses both nodes
trtllm-serve meta-llama/Meta-Llama-3-405B \
    --tp_size 8 \
    --pp_size 2 \
    --num_workers 2 \
    --dtype fp8
```

### Kubernetes deployment

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tensorrt-llm-llama3-70b
spec:
  replicas: 1
  template:
    spec:
      containers:
      - name: trtllm
        image: nvidia/tensorrt_llm:latest
        command:
        - trtllm-serve
        - meta-llama/Meta-Llama-3-70B
        - --tp_size=4
        - --max_batch_size=256
        resources:
          limits:
            nvidia.com/gpu: 4  # Request 4 GPUs
```

## Parallelism Decision Tree

```
Model size < 20GB?
├─ YES: Single GPU (no parallelism)
└─ NO: Model size < 80GB?
    ├─ YES: TP=2 or TP=4 (same node)
    └─ NO: Model size < 320GB?
        ├─ YES: TP=4 or TP=8 (same node, NVLink required)
        └─ NO: TP=8 + PP=2 (multi-node)
```
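
The same logic is easy to encode when scripting deployments. A small sketch (the helper is illustrative, not part of TensorRT-LLM; the thresholds mirror the diagram above):

```python
# Illustrative helper encoding the decision tree above; not a TensorRT-LLM API.
def choose_parallelism(model_size_gb: float) -> dict:
    """Suggest a (TP, PP) layout for a model of the given weight size in GB."""
    if model_size_gb < 20:
        return {"tensor_parallel_size": 1, "pipeline_parallel_size": 1}
    if model_size_gb < 80:
        return {"tensor_parallel_size": 4, "pipeline_parallel_size": 1}  # or TP=2
    if model_size_gb < 320:
        return {"tensor_parallel_size": 8, "pipeline_parallel_size": 1}  # NVLink required
    return {"tensor_parallel_size": 8, "pipeline_parallel_size": 2}      # multi-node

# Llama 3-70B in FP8 is roughly 70GB of weights:
print(choose_parallelism(70))
# {'tensor_parallel_size': 4, 'pipeline_parallel_size': 1}
```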

## Communication Optimization

### NVLink vs PCIe

**NVLink** (DGX A100, HGX H100):
- Bandwidth: 600 GB/s (A100), 900 GB/s (H100)
- Ideal for TP (high communication)
- **Recommended for all multi-GPU setups**

**PCIe**:
- Bandwidth: 64 GB/s (PCIe 4.0 x16)
- ~10× slower than NVLink
- Avoid TP; use PP instead

### InfiniBand for multi-node

**HDR InfiniBand** (200 Gb/s):
- Required for multi-node TP or PP
- Latency: <1μs
- **Essential for 405B+ models**

## Monitoring Multi-GPU

```bash
# Monitor GPU utilization
nvidia-smi dmon -s u

# Monitor memory
nvidia-smi dmon -s m

# Monitor NVLink status
nvidia-smi nvlink --status

# TensorRT-LLM built-in metrics
curl http://localhost:8000/metrics
```

**Key metrics**:
- GPU utilization: target 80-95%
- Memory usage: should be balanced across GPUs
- NVLink traffic: high for TP, low for PP
- Throughput: tokens/sec across all GPUs
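
For continuous monitoring, the `/metrics` endpoint can also be scraped from a script. A rough sketch, assuming the server runs locally and the endpoint returns Prometheus-style `name value` text lines (the exact metric names vary by version, so filter accordingly):

```python
# Rough sketch: periodically scrape the server's /metrics endpoint.
# Assumes Prometheus-style text output; adjust parsing to your version.
import time
import urllib.request

def scrape_metrics(url: str = "http://localhost:8000/metrics") -> dict:
    text = urllib.request.urlopen(url, timeout=5).read().decode()
    metrics = {}
    for line in text.splitlines():
        if line.startswith("#") or not line.strip():
            continue  # skip comments and blanks
        name, _, value = line.rpartition(" ")
        try:
            metrics[name] = float(value)
        except ValueError:
            pass  # ignore non-numeric samples
    return metrics

while True:
    snapshot = scrape_metrics()
    print({k: v for k, v in snapshot.items() if "token" in k})
    time.sleep(10)
```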

## Common Issues

### Imbalanced GPU memory

**Symptom**: GPU 0 is at 90% memory while GPU 3 is at 40%

**Solutions**:
- Verify the TP/PP configuration
- Check model sharding (shards should be equal)
- Restart the server to reset state

### Low NVLink utilization

**Symptom**: NVLink bandwidth <100 GB/s with TP=4

**Solutions**:
- Verify the NVLink topology: `nvidia-smi topo -m`
- Check for PCIe fallback
- Ensure GPUs are on the same NVSwitch

### OOM with multi-GPU

**Solutions**:
- Increase the TP size (more GPUs)
- Reduce the batch size
- Enable FP8 quantization
- Use pipeline parallelism

## Performance Scaling

### TP Scaling (Llama 3-70B, FP8)

| GPUs | TP Size | Throughput | Latency | Efficiency |
|------|---------|------------|---------|------------|
| 1 | 1 | OOM | - | - |
| 2 | 2 | 6,000 tok/s | 18ms | 85% |
| 4 | 4 | 11,000 tok/s | 16ms | 78% |
| 8 | 8 | 18,000 tok/s | 15ms | 64% |

**Note**: Efficiency drops as GPUs are added because communication overhead grows.

### PP Scaling (Llama 3-405B, FP8)

| Nodes | TP | PP | Total GPUs | Throughput |
|-------|----|----|------------|------------|
| 1 | 8 | 1 | 8 | OOM |
| 2 | 8 | 2 | 16 | 25,000 tok/s |
| 4 | 8 | 4 | 32 | 45,000 tok/s |

## Best Practices

1. **Prefer TP over PP** when possible (lower latency)
2. **Use NVLink** for all TP deployments
3. **Use InfiniBand** for multi-node deployments
4. **Start with the smallest TP** that fits the model in memory
5. **Monitor GPU balance** - all GPUs should have similar utilization
6. **Benchmark** before going to production
7. **Use FP8** on H100 for a 2× speedup
package/bin/skills/tensorrt-llm/references/optimization.md
@@ -0,0 +1,242 @@
# TensorRT-LLM Optimization Guide

Comprehensive guide to optimizing LLM inference with TensorRT-LLM.

## Quantization

### FP8 Quantization (Recommended for H100)

**Benefits**:
- 2× faster inference
- 50% memory reduction
- Minimal accuracy loss (<1% perplexity degradation)

**Usage**:
```python
from tensorrt_llm import LLM

# Automatic FP8 quantization
llm = LLM(
    model="meta-llama/Meta-Llama-3-70B",
    dtype="fp8",
    quantization="fp8"
)
```

**Performance** (Llama 3-70B on 8× H100):
- FP16: 5,000 tokens/sec
- FP8: **10,000 tokens/sec** (2× speedup)
- Memory: 140GB → 70GB

### INT4 Quantization (Maximum compression)

**Benefits**:
- 4× memory reduction
- 3-4× faster inference
- Fits larger models on the same hardware

**Usage**:
```python
# INT4 with AWQ calibration
llm = LLM(
    model="meta-llama/Meta-Llama-3-405B",
    dtype="int4_awq",
    quantization="awq"
)

# INT4 with GPTQ calibration
llm = LLM(
    model="meta-llama/Meta-Llama-3-405B",
    dtype="int4_gptq",
    quantization="gptq"
)
```

**Trade-offs**:
- Accuracy: 1-3% perplexity increase
- Speed: 3-4× faster than FP16
- Use case: when memory is critical

## In-Flight Batching

**What it does**: Dynamically batches requests during generation instead of waiting for all sequences in a batch to finish.

**Configuration**:
```bash
# --max_batch_size: maximum concurrent sequences
# --max_num_tokens: total tokens in a batch
# --enable_chunked_context: split long prompts
trtllm-serve meta-llama/Meta-Llama-3-8B \
    --max_batch_size 256 \
    --max_num_tokens 4096 \
    --enable_chunked_context \
    --scheduler_policy max_utilization
```

**Performance**:
- Throughput: **4-8× higher** vs static batching
- Latency: lower P50/P99 for mixed workloads
- GPU utilization: 80-95% vs 40-60%
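
The gains show up when many requests of different lengths arrive concurrently. A minimal client-side sketch, assuming the server above exposes its OpenAI-compatible API on localhost:8000 and the `openai` package is installed:

```python
# Minimal sketch: issue many variable-length requests concurrently so the
# server can batch them in flight. Endpoint and model name assume the
# trtllm-serve command above; no API key is enforced here.
import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

async def ask(prompt: str, max_tokens: int) -> str:
    resp = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content

async def main():
    # Mixed output lengths: in-flight batching frees batch slots as short
    # requests finish instead of waiting for the longest sequence.
    tasks = [ask(f"Question {i}: ...", max_tokens=50 + 50 * (i % 4))
             for i in range(32)]
    answers = await asyncio.gather(*tasks)
    print(f"{len(answers)} responses received")

asyncio.run(main())
```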

## Paged KV Cache

**What it does**: Manages KV cache memory the way an OS manages virtual memory (paging).

**Benefits**:
- 40-60% higher throughput
- No memory fragmentation
- Supports longer sequences

**Configuration**:
```python
# Paged KV cache is enabled automatically (default)
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B",
    kv_cache_free_gpu_mem_fraction=0.9,  # Use 90% of free GPU mem for cache
    enable_prefix_caching=True           # Cache common prefixes
)
```
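
To reason about how many tokens that memory fraction buys, the usual sizing formula is `2 (K and V) × layers × kv_heads × head_dim × bytes_per_value` per token. A back-of-the-envelope sketch using Llama 3-8B's published shape (32 layers, 8 KV heads via GQA, head dim 128) and FP16 cache entries:

```python
# Back-of-the-envelope KV cache sizing; plain arithmetic, not a TensorRT-LLM API.
layers, kv_heads, head_dim = 32, 8, 128  # Llama 3-8B (GQA)
bytes_per_value = 2                      # FP16 cache entries

# K and V are both cached, hence the factor of 2
bytes_per_token = 2 * layers * kv_heads * head_dim * bytes_per_value
print(bytes_per_token / 1024, "KB per token")  # 128.0 KB

cache_budget_gb = 30  # the ~30GB cache figure from the multi-GPU guide
max_cached_tokens = cache_budget_gb * 1024**3 // bytes_per_token
print(f"~{max_cached_tokens:,} tokens fit in {cache_budget_gb}GB")  # ~245,760
```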

## Speculative Decoding

**What it does**: Uses a small draft model to propose several tokens at once, which the target model then verifies in parallel.

**Speedup**: 2-3× faster for long generations

**Usage**:
```python
from tensorrt_llm import LLM

# Target model (Llama 3-70B) paired with a small draft model
llm = LLM(
    model="meta-llama/Meta-Llama-3-70B",
    speculative_model="meta-llama/Meta-Llama-3-8B",  # Draft model
    num_speculative_tokens=5  # Tokens to predict ahead
)

# Same API, 2-3× faster
outputs = llm.generate(prompts)
```

**Good draft-model pairings**:
- Target: Llama 3-70B → Draft: Llama 3-8B
- Target: Qwen2-72B → Draft: Qwen2-7B
- Same model family, 8-10× smaller
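
Where the 2-3× comes from: with acceptance rate α per draft token and γ draft tokens per step, the standard speculative-sampling analysis gives an expected (1 − α^(γ+1)) / (1 − α) tokens per target-model step instead of 1. A quick sketch with illustrative acceptance rates (not measured TensorRT-LLM numbers):

```python
# Expected tokens per target-model verification step under speculative
# decoding (standard speculative-sampling analysis); alphas are illustrative.
def expected_tokens_per_step(alpha: float, gamma: int) -> float:
    return (1 - alpha ** (gamma + 1)) / (1 - alpha)

for alpha in (0.6, 0.8):
    print(alpha, round(expected_tokens_per_step(alpha, gamma=5), 2))
# alpha=0.6 -> ~2.38, alpha=0.8 -> ~3.69 tokens per step, which lands in the
# 2-3x end-to-end range once draft-model cost is subtracted.
```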

## CUDA Graphs

**What it does**: Reduces kernel launch overhead by recording GPU operations and replaying them.

**Benefits**:
- 10-20% lower latency
- More stable P99 latency
- Biggest win at small batch sizes

**Configuration** (automatic by default):
```python
llm = LLM(
    model="meta-llama/Meta-Llama-3-8B",
    enable_cuda_graph=True,  # Default: True
    cuda_graph_cache_size=2  # Cache 2 graph variants
)
```

## Chunked Context

**What it does**: Splits long prompts into chunks to reduce memory spikes during prefill.

**Use case**: prompts >8K tokens with limited GPU memory

**Configuration**:
```bash
trtllm-serve meta-llama/Meta-Llama-3-8B \
    --max_num_tokens 4096 \
    --enable_chunked_context \
    --max_chunked_prefill_length 2048  # Process 2K tokens at a time
```

## Overlap Scheduling

**What it does**: Overlaps compute and memory operations.

**Benefits**:
- 15-25% higher throughput
- Better GPU utilization
- Default in v1.2.0+

**No configuration needed** - enabled automatically.

## Quantization Comparison Table

| Method | Memory | Speed | Accuracy | Use Case |
|--------|--------|-------|----------|----------|
| FP16 | 1× (baseline) | 1× | Best | High accuracy needed |
| FP8 | 0.5× | 2× | -0.5% ppl | **H100 default** |
| INT4 AWQ | 0.25× | 3-4× | -1.5% ppl | Memory critical |
| INT4 GPTQ | 0.25× | 3-4× | -2% ppl | Maximum speed |
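
Translating the memory column into gigabytes is simple arithmetic on bytes per parameter (weights only; KV cache and activations come on top). A short sketch using the multipliers from the table:

```python
# Weights-only memory estimate from the table's multipliers; ignores KV
# cache, activations, and runtime overhead.
BYTES_PER_PARAM = {"fp16": 2.0, "fp8": 1.0, "int4_awq": 0.5, "int4_gptq": 0.5}

def weight_memory_gb(params_billion: float, method: str) -> float:
    return params_billion * BYTES_PER_PARAM[method]

for method in BYTES_PER_PARAM:
    print(f"Llama 3-70B, {method}: ~{weight_memory_gb(70, method):.0f} GB")
# fp16 -> ~140 GB and fp8 -> ~70 GB, matching the FP8 section above
```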

## Tuning Workflow

1. **Start with defaults**:
```python
llm = LLM(model="meta-llama/Meta-Llama-3-70B")
```

2. **Enable FP8** (if on H100):
```python
llm = LLM(model="...", dtype="fp8")
```

3. **Tune the batch size**:
```bash
# Increase until OOM, then back off ~20%
trtllm-serve ... --max_batch_size 256
```

4. **Enable chunked context** (if prompts are long):
```bash
--enable_chunked_context --max_chunked_prefill_length 2048
```

5. **Try speculative decoding** (if latency is critical):
```python
llm = LLM(model="...", speculative_model="...")
```

## Benchmarking

```bash
# Install the benchmark extras
pip install tensorrt_llm[benchmark]

# Run the benchmark
python benchmarks/python/benchmark.py \
    --model meta-llama/Meta-Llama-3-8B \
    --batch_size 64 \
    --input_len 128 \
    --output_len 256 \
    --dtype fp8
```

**Metrics to track**:
- Throughput (tokens/sec)
- Latency P50/P90/P99 (ms)
- GPU memory usage (GB)
- GPU utilization (%)

## Common Issues

**OOM errors**:
- Reduce `max_batch_size`
- Reduce `max_num_tokens`
- Enable INT4 quantization
- Increase `tensor_parallel_size`

**Low throughput**:
- Increase `max_batch_size`
- Enable in-flight batching
- Verify CUDA graphs are enabled
- Check GPU utilization

**High latency**:
- Try speculative decoding
- Reduce `max_batch_size` (less queueing)
- Use FP8 instead of FP16