@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
# AWQ Advanced Usage Guide
|
|
2
|
+
|
|
3
|
+
## Quantization Algorithm Details
|
|
4
|
+
|
|
5
|
+
### How AWQ Works
|
|
6
|
+
|
|
7
|
+
AWQ (Activation-aware Weight Quantization) is based on the key insight that not all weights in an LLM are equally important. The algorithm:
|
|
8
|
+
|
|
9
|
+
1. **Identifies salient weights** (~1%) by examining activation distributions
|
|
10
|
+
2. **Applies mathematical scaling** to protect critical channels
|
|
11
|
+
3. **Quantizes remaining weights** to 4-bit with minimal error
|
|
12
|
+
|
|
13
|
+
**Core formula**: `L(s) = ||Q(W * s)(s^-1 * X) - W * X||`
|
|
14
|
+
|
|
15
|
+
Where:
|
|
16
|
+
- `Q` is the quantization function
|
|
17
|
+
- `W` is the weight matrix
|
|
18
|
+
- `s` is the scaling factor
|
|
19
|
+
- `X` is the input activation
|
|
20
|
+
|
|
21
|
+
### Why AWQ Outperforms GPTQ
|
|
22
|
+
|
|
23
|
+
| Aspect | AWQ | GPTQ |
|
|
24
|
+
|--------|-----|------|
|
|
25
|
+
| Calibration approach | Activation-aware scaling | Hessian-based reconstruction |
|
|
26
|
+
| Overfitting risk | Low (no backprop) | Higher (reconstruction-based) |
|
|
27
|
+
| Calibration data | 128-1024 tokens | Larger datasets needed |
|
|
28
|
+
| Generalization | Better across domains | Can overfit to calibration |
|
|
29
|
+
|
|
30
|
+
## WQLinear Kernel Variants
|
|
31
|
+
|
|
32
|
+
AutoAWQ provides multiple kernel implementations for different use cases:
|
|
33
|
+
|
|
34
|
+
### WQLinear_GEMM
|
|
35
|
+
- **Use case**: Batch inference, training
|
|
36
|
+
- **Best for**: Batch sizes > 1, throughput optimization
|
|
37
|
+
- **Implementation**: General matrix multiplication
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
quant_config = {"version": "GEMM"}
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### WQLinear_GEMV
|
|
44
|
+
- **Use case**: Single-token generation
|
|
45
|
+
- **Best for**: Streaming, chat applications
|
|
46
|
+
- **Speedup**: ~20% faster than GEMM for batch_size=1
|
|
47
|
+
- **Limitation**: Only works with batch_size=1
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
quant_config = {"version": "GEMV"}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### WQLinear_GEMVFast
|
|
54
|
+
- **Use case**: Optimized single-token generation
|
|
55
|
+
- **Requirements**: awq_v2_ext kernels installed
|
|
56
|
+
- **Best for**: Maximum single-token speed
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
# Requires autoawq[kernels] installation
|
|
60
|
+
quant_config = {"version": "gemv_fast"}
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### WQLinear_Marlin
|
|
64
|
+
- **Use case**: High-throughput inference
|
|
65
|
+
- **Requirements**: Ampere+ GPUs (Compute Capability 8.0+)
|
|
66
|
+
- **Speedup**: 2x faster on A100/H100
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from transformers import AwqConfig
|
|
70
|
+
|
|
71
|
+
config = AwqConfig(bits=4, version="marlin")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### WQLinear_Exllama / ExllamaV2
|
|
75
|
+
- **Use case**: AMD GPU compatibility, faster prefill
|
|
76
|
+
- **Benefits**: Works with ROCm
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
config = AwqConfig(bits=4, version="exllama")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### WQLinear_IPEX
|
|
83
|
+
- **Use case**: Intel CPU/XPU acceleration
|
|
84
|
+
- **Requirements**: Intel Extension for PyTorch, torch 2.4+
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
pip install autoawq[cpu]
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Group Size Configuration
|
|
91
|
+
|
|
92
|
+
Group size determines how weights are grouped for quantization:
|
|
93
|
+
|
|
94
|
+
| Group Size | Model Size | Accuracy | Speed | Use Case |
|
|
95
|
+
|------------|------------|----------|-------|----------|
|
|
96
|
+
| 32 | Larger | Best | Slower | Maximum accuracy |
|
|
97
|
+
| **128** | Medium | Good | Fast | **Recommended default** |
|
|
98
|
+
| 256 | Smaller | Lower | Faster | Speed-critical |
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
quant_config = {
|
|
102
|
+
"q_group_size": 128, # Recommended
|
|
103
|
+
"w_bit": 4,
|
|
104
|
+
"zero_point": True
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Zero-Point Quantization
|
|
109
|
+
|
|
110
|
+
Zero-point quantization adds an offset to handle asymmetric weight distributions:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
# With zero-point (recommended for most models)
|
|
114
|
+
quant_config = {"zero_point": True, "w_bit": 4, "q_group_size": 128}
|
|
115
|
+
|
|
116
|
+
# Without zero-point (symmetric quantization)
|
|
117
|
+
quant_config = {"zero_point": False, "w_bit": 4, "q_group_size": 128}
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
**When to disable zero-point**:
|
|
121
|
+
- Models with symmetric weight distributions
|
|
122
|
+
- When using specific kernels that don't support it
|
|
123
|
+
|
|
124
|
+
## Custom Calibration Strategies
|
|
125
|
+
|
|
126
|
+
### Domain-Specific Calibration
|
|
127
|
+
|
|
128
|
+
For domain-specific models, use relevant calibration data:
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
# Medical domain
|
|
132
|
+
medical_samples = [
|
|
133
|
+
"Patient presents with acute respiratory symptoms...",
|
|
134
|
+
"Differential diagnosis includes pneumonia, bronchitis...",
|
|
135
|
+
# More domain-specific examples
|
|
136
|
+
]
|
|
137
|
+
|
|
138
|
+
model.quantize(
|
|
139
|
+
tokenizer,
|
|
140
|
+
quant_config=quant_config,
|
|
141
|
+
calib_data=medical_samples,
|
|
142
|
+
max_calib_samples=256
|
|
143
|
+
)
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### Instruction-Tuned Model Calibration
|
|
147
|
+
|
|
148
|
+
For chat/instruction models, include conversational data:
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
chat_samples = [
|
|
152
|
+
"Human: What is machine learning?\nAssistant: Machine learning is...",
|
|
153
|
+
"Human: Explain neural networks.\nAssistant: Neural networks are...",
|
|
154
|
+
]
|
|
155
|
+
|
|
156
|
+
model.quantize(tokenizer, quant_config=quant_config, calib_data=chat_samples)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Calibration Parameters
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
model.quantize(
|
|
163
|
+
tokenizer,
|
|
164
|
+
quant_config=quant_config,
|
|
165
|
+
calib_data="pileval", # Dataset name or list
|
|
166
|
+
max_calib_samples=128, # Number of samples (more = slower but better)
|
|
167
|
+
max_calib_seq_len=512, # Sequence length
|
|
168
|
+
duo_scaling=True, # Scale weights and activations
|
|
169
|
+
apply_clip=True # Apply weight clipping
|
|
170
|
+
)
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Layer Fusion
|
|
174
|
+
|
|
175
|
+
Layer fusion combines multiple operations for better performance:
|
|
176
|
+
|
|
177
|
+
### Automatic Fusion
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
model = AutoAWQForCausalLM.from_quantized(
|
|
181
|
+
model_name,
|
|
182
|
+
fuse_layers=True # Enables automatic fusion
|
|
183
|
+
)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### What Gets Fused
|
|
187
|
+
|
|
188
|
+
- **Attention**: Q, K, V projections combined
|
|
189
|
+
- **MLP**: Gate and Up projections fused
|
|
190
|
+
- **Normalization**: Replaced with FasterTransformerRMSNorm
|
|
191
|
+
|
|
192
|
+
### Manual Fusion Configuration
|
|
193
|
+
|
|
194
|
+
```python
|
|
195
|
+
from transformers import AwqConfig
|
|
196
|
+
|
|
197
|
+
config = AwqConfig(
|
|
198
|
+
bits=4,
|
|
199
|
+
fuse_max_seq_len=2048, # Max context for fused attention
|
|
200
|
+
do_fuse=True,
|
|
201
|
+
modules_to_fuse={
|
|
202
|
+
"attention": ["q_proj", "k_proj", "v_proj"],
|
|
203
|
+
"mlp": ["gate_proj", "up_proj"],
|
|
204
|
+
"layernorm": ["input_layernorm", "post_attention_layernorm"],
|
|
205
|
+
}
|
|
206
|
+
)
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Memory Optimization
|
|
210
|
+
|
|
211
|
+
### Chunked Processing
|
|
212
|
+
|
|
213
|
+
For large models, AWQ processes in chunks to avoid OOM:
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from awq import AutoAWQForCausalLM
|
|
217
|
+
|
|
218
|
+
# Reduce memory during quantization
|
|
219
|
+
model = AutoAWQForCausalLM.from_pretrained(
|
|
220
|
+
model_path,
|
|
221
|
+
low_cpu_mem_usage=True
|
|
222
|
+
)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
### Multi-GPU Quantization
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
model = AutoAWQForCausalLM.from_pretrained(
|
|
229
|
+
"meta-llama/Llama-2-70b-hf",
|
|
230
|
+
device_map="auto"
|
|
231
|
+
)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### CPU Offloading
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
model = AutoAWQForCausalLM.from_quantized(
|
|
238
|
+
model_name,
|
|
239
|
+
device_map="auto",
|
|
240
|
+
max_memory={
|
|
241
|
+
0: "24GB",
|
|
242
|
+
"cpu": "100GB"
|
|
243
|
+
}
|
|
244
|
+
)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
## Modules to Not Convert
|
|
248
|
+
|
|
249
|
+
Some modules should remain in full precision:
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
# Visual encoder in multimodal models
|
|
253
|
+
class LlavaAWQForCausalLM(BaseAWQForCausalLM):
|
|
254
|
+
modules_to_not_convert = ["visual"]
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
Common exclusions:
|
|
258
|
+
- `visual` - Vision encoders in VLMs
|
|
259
|
+
- `lm_head` - Output projection
|
|
260
|
+
- `embed_tokens` - Embedding layers
|
|
261
|
+
|
|
262
|
+
## Saving and Loading
|
|
263
|
+
|
|
264
|
+
### Save Quantized Model
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
# Save locally
|
|
268
|
+
model.save_quantized("./my-awq-model")
|
|
269
|
+
tokenizer.save_pretrained("./my-awq-model")
|
|
270
|
+
|
|
271
|
+
# Save with safetensors (recommended)
|
|
272
|
+
model.save_quantized("./my-awq-model", safetensors=True)
|
|
273
|
+
|
|
274
|
+
# Save sharded (for large models)
|
|
275
|
+
model.save_quantized("./my-awq-model", shard_size="5GB")
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### Push to HuggingFace
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
model.push_to_hub("username/my-awq-model")
|
|
282
|
+
tokenizer.push_to_hub("username/my-awq-model")
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### Load with Specific Backend
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
from awq import AutoAWQForCausalLM
|
|
289
|
+
|
|
290
|
+
# Load with a specific kernel — the backend flags below are mutually exclusive; enable only ONE
|
|
291
|
+
model = AutoAWQForCausalLM.from_quantized(
|
|
292
|
+
model_name,
|
|
293
|
+
use_exllama=True, # ExLlama backend
|
|
294
|
+
use_exllama_v2=True, # ExLlamaV2 (faster)
|
|
295
|
+
use_marlin=True, # Marlin kernels
|
|
296
|
+
use_ipex=True, # Intel CPU
|
|
297
|
+
fuse_layers=True # Enable fusion
|
|
298
|
+
)
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
## Benchmarking Your Model
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
from awq.utils.utils import get_best_device
|
|
305
|
+
import time
|
|
306
|
+
|
|
307
|
+
model = AutoAWQForCausalLM.from_quantized(model_name, fuse_layers=True)
|
|
308
|
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
309
|
+
|
|
310
|
+
# Warmup
|
|
311
|
+
inputs = tokenizer("Hello", return_tensors="pt").to(get_best_device())
|
|
312
|
+
model.generate(**inputs, max_new_tokens=10)
|
|
313
|
+
|
|
314
|
+
# Benchmark
|
|
315
|
+
prompt = "Write a detailed essay about"
|
|
316
|
+
inputs = tokenizer(prompt, return_tensors="pt").to(get_best_device())
|
|
317
|
+
|
|
318
|
+
start = time.time()
|
|
319
|
+
outputs = model.generate(**inputs, max_new_tokens=200)
|
|
320
|
+
end = time.time()
|
|
321
|
+
|
|
322
|
+
tokens_generated = outputs.shape[1] - inputs.input_ids.shape[1]
|
|
323
|
+
print(f"Tokens/sec: {tokens_generated / (end - start):.2f}")
|
|
324
|
+
```
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
# AWQ Troubleshooting Guide
|
|
2
|
+
|
|
3
|
+
## Installation Issues
|
|
4
|
+
|
|
5
|
+
### CUDA Version Mismatch
|
|
6
|
+
|
|
7
|
+
**Error**: `RuntimeError: CUDA error: no kernel image is available for execution`
|
|
8
|
+
|
|
9
|
+
**Fix**: Install matching CUDA version:
|
|
10
|
+
```bash
|
|
11
|
+
# Check your CUDA version
|
|
12
|
+
nvcc --version
|
|
13
|
+
|
|
14
|
+
# Install matching autoawq
|
|
15
|
+
pip install autoawq --extra-index-url https://download.pytorch.org/whl/cu118 # For CUDA 11.8
|
|
16
|
+
pip install autoawq --extra-index-url https://download.pytorch.org/whl/cu121 # For CUDA 12.1
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Compute Capability Too Low
|
|
20
|
+
|
|
21
|
+
**Error**: `AssertionError: Compute capability must be >= 7.5`
|
|
22
|
+
|
|
23
|
+
**Fix**: AWQ requires NVIDIA GPUs with compute capability 7.5+ (Turing or newer):
|
|
24
|
+
- RTX 20xx series: 7.5 (supported)
|
|
25
|
+
- RTX 30xx series: 8.6 (supported)
|
|
26
|
+
- RTX 40xx series: 8.9 (supported)
|
|
27
|
+
- A100/H100: 8.0/9.0 (supported)
|
|
28
|
+
|
|
29
|
+
Older GPUs (GTX 10xx, V100) are not supported.
|
|
30
|
+
|
|
31
|
+
### Transformers Version Conflict
|
|
32
|
+
|
|
33
|
+
**Error**: `ImportError: cannot import name 'AwqConfig'`
|
|
34
|
+
|
|
35
|
+
**Fix**: AutoAWQ may downgrade transformers. Reinstall correct version:
|
|
36
|
+
```bash
|
|
37
|
+
pip install autoawq
|
|
38
|
+
pip install "transformers>=4.45.0" --upgrade  # quote the specifier so the shell does not treat >= as redirection
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Triton Not Found (Linux)
|
|
42
|
+
|
|
43
|
+
**Error**: `ModuleNotFoundError: No module named 'triton'`
|
|
44
|
+
|
|
45
|
+
**Fix**:
|
|
46
|
+
```bash
|
|
47
|
+
pip install triton
|
|
48
|
+
# Or install with kernels
|
|
49
|
+
pip install autoawq[kernels]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quantization Issues
|
|
53
|
+
|
|
54
|
+
### CUDA Out of Memory During Quantization
|
|
55
|
+
|
|
56
|
+
**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
|
|
57
|
+
|
|
58
|
+
**Solutions**:
|
|
59
|
+
|
|
60
|
+
1. **Reduce calibration samples**:
|
|
61
|
+
```python
|
|
62
|
+
model.quantize(
|
|
63
|
+
tokenizer,
|
|
64
|
+
quant_config=quant_config,
|
|
65
|
+
max_calib_samples=64 # Reduce from 128
|
|
66
|
+
)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
2. **Use CPU offloading**:
|
|
70
|
+
```python
|
|
71
|
+
model = AutoAWQForCausalLM.from_pretrained(
|
|
72
|
+
model_path,
|
|
73
|
+
low_cpu_mem_usage=True
|
|
74
|
+
)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
3. **Multi-GPU quantization**:
|
|
78
|
+
```python
|
|
79
|
+
model = AutoAWQForCausalLM.from_pretrained(
|
|
80
|
+
model_path,
|
|
81
|
+
device_map="auto"
|
|
82
|
+
)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### NaN in Weights After Quantization
|
|
86
|
+
|
|
87
|
+
**Error**: `AssertionError: NaN detected in weights`
|
|
88
|
+
|
|
89
|
+
**Cause**: Calibration data issues or numerical instability.
|
|
90
|
+
|
|
91
|
+
**Fix**:
|
|
92
|
+
```python
|
|
93
|
+
# Use more calibration samples
|
|
94
|
+
model.quantize(
|
|
95
|
+
tokenizer,
|
|
96
|
+
quant_config=quant_config,
|
|
97
|
+
max_calib_samples=256,
|
|
98
|
+
max_calib_seq_len=1024
|
|
99
|
+
)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Empty Calibration Samples
|
|
103
|
+
|
|
104
|
+
**Error**: `ValueError: Calibration samples are empty`
|
|
105
|
+
|
|
106
|
+
**Fix**: Ensure tokenizer produces valid output:
|
|
107
|
+
```python
|
|
108
|
+
# Check tokenizer
|
|
109
|
+
test = tokenizer("test", return_tensors="pt")
|
|
110
|
+
print(f"Token count: {test.input_ids.shape[1]}")
|
|
111
|
+
|
|
112
|
+
# Use explicit calibration data
|
|
113
|
+
calib_data = ["Your sample text here..."] * 128
|
|
114
|
+
model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Unsupported Model Architecture
|
|
118
|
+
|
|
119
|
+
**Error**: `TypeError: 'model_type' is not supported`
|
|
120
|
+
|
|
121
|
+
**Cause**: Model architecture not in AWQ registry.
|
|
122
|
+
|
|
123
|
+
**Check supported models**:
|
|
124
|
+
```python
|
|
125
|
+
from awq.models import AWQ_CAUSAL_LM_MODEL_MAP
|
|
126
|
+
print(list(AWQ_CAUSAL_LM_MODEL_MAP.keys()))
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
**Supported**: llama, mistral, qwen2, falcon, mpt, phi, gemma, etc.
|
|
130
|
+
|
|
131
|
+
## Inference Issues
|
|
132
|
+
|
|
133
|
+
### Slow Inference Speed
|
|
134
|
+
|
|
135
|
+
**Problem**: Inference slower than expected.
|
|
136
|
+
|
|
137
|
+
**Solutions**:
|
|
138
|
+
|
|
139
|
+
1. **Enable layer fusion**:
|
|
140
|
+
```python
|
|
141
|
+
model = AutoAWQForCausalLM.from_quantized(
|
|
142
|
+
model_name,
|
|
143
|
+
fuse_layers=True
|
|
144
|
+
)
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
2. **Use correct kernel for batch size**:
|
|
148
|
+
```python
|
|
149
|
+
# For batch_size=1
|
|
150
|
+
quant_config = {"version": "GEMV"}
|
|
151
|
+
|
|
152
|
+
# For batch_size>1
|
|
153
|
+
quant_config = {"version": "GEMM"}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
3. **Use Marlin on Ampere+ GPUs**:
|
|
157
|
+
```python
|
|
158
|
+
from transformers import AwqConfig
|
|
159
|
+
config = AwqConfig(bits=4, version="marlin")
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Wrong Output / Garbage Text
|
|
163
|
+
|
|
164
|
+
**Problem**: Model produces nonsensical output after quantization.
|
|
165
|
+
|
|
166
|
+
**Causes and fixes**:
|
|
167
|
+
|
|
168
|
+
1. **Poor calibration data**: Use domain-relevant data
|
|
169
|
+
```python
|
|
170
|
+
calib_data = [
|
|
171
|
+
"Relevant examples from your use case...",
|
|
172
|
+
]
|
|
173
|
+
model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
2. **Tokenizer mismatch**: Ensure same tokenizer
|
|
177
|
+
```python
|
|
178
|
+
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
3. **Check generation config**:
|
|
182
|
+
```python
|
|
183
|
+
outputs = model.generate(
|
|
184
|
+
**inputs,
|
|
185
|
+
max_new_tokens=200,
|
|
186
|
+
do_sample=True,
|
|
187
|
+
temperature=0.7,
|
|
188
|
+
pad_token_id=tokenizer.eos_token_id
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### FlashAttention2 Incompatibility
|
|
193
|
+
|
|
194
|
+
**Error**: `ValueError: Cannot use FlashAttention2 with fused modules`
|
|
195
|
+
|
|
196
|
+
**Fix**: Disable one or the other:
|
|
197
|
+
```python
|
|
198
|
+
# Option 1: Use fused modules (recommended for AWQ)
|
|
199
|
+
model = AutoAWQForCausalLM.from_quantized(model_name, fuse_layers=True)
|
|
200
|
+
|
|
201
|
+
# Option 2: Use FlashAttention2 without fusion
|
|
202
|
+
from transformers import AutoModelForCausalLM
|
|
203
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
204
|
+
model_name,
|
|
205
|
+
attn_implementation="flash_attention_2",
|
|
206
|
+
device_map="auto"
|
|
207
|
+
)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### AMD GPU Issues
|
|
211
|
+
|
|
212
|
+
**Error**: `RuntimeError: ROCm/HIP not found`
|
|
213
|
+
|
|
214
|
+
**Fix**: Use ExLlama backend for AMD:
|
|
215
|
+
```python
|
|
216
|
+
from transformers import AwqConfig
|
|
217
|
+
|
|
218
|
+
config = AwqConfig(bits=4, version="exllama")
|
|
219
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
220
|
+
model_name,
|
|
221
|
+
quantization_config=config
|
|
222
|
+
)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Loading Issues
|
|
226
|
+
|
|
227
|
+
### Model Not Found
|
|
228
|
+
|
|
229
|
+
**Error**: `OSError: model_name is not a valid model identifier`
|
|
230
|
+
|
|
231
|
+
**Fix**: Check HuggingFace model exists:
|
|
232
|
+
```bash
|
|
233
|
+
# Search AWQ models
|
|
234
|
+
https://huggingface.co/models?library=awq
|
|
235
|
+
|
|
236
|
+
# Common AWQ model providers
|
|
237
|
+
TheBloke, teknium, Qwen, NousResearch
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Safetensors Error
|
|
241
|
+
|
|
242
|
+
**Error**: `safetensors_rust.SafetensorError: Error while deserializing`
|
|
243
|
+
|
|
244
|
+
**Fix**: Try loading without safetensors:
|
|
245
|
+
```python
|
|
246
|
+
model = AutoAWQForCausalLM.from_quantized(
|
|
247
|
+
model_name,
|
|
248
|
+
safetensors=False
|
|
249
|
+
)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
### Device Map Conflicts
|
|
253
|
+
|
|
254
|
+
**Error**: `ValueError: You cannot use device_map with max_memory`
|
|
255
|
+
|
|
256
|
+
**Fix**: Use one or the other:
|
|
257
|
+
```python
|
|
258
|
+
# Auto device map
|
|
259
|
+
model = AutoAWQForCausalLM.from_quantized(model_name, device_map="auto")
|
|
260
|
+
|
|
261
|
+
# OR manual memory limits
|
|
262
|
+
model = AutoAWQForCausalLM.from_quantized(
|
|
263
|
+
model_name,
|
|
264
|
+
max_memory={0: "20GB", 1: "20GB"}
|
|
265
|
+
)
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
## vLLM Integration Issues
|
|
269
|
+
|
|
270
|
+
### Quantization Not Detected
|
|
271
|
+
|
|
272
|
+
**Error**: vLLM loads model in FP16 instead of quantized.
|
|
273
|
+
|
|
274
|
+
**Fix**: Explicitly specify quantization:
|
|
275
|
+
```python
|
|
276
|
+
from vllm import LLM
|
|
277
|
+
|
|
278
|
+
llm = LLM(
|
|
279
|
+
model="TheBloke/Llama-2-7B-AWQ",
|
|
280
|
+
quantization="awq", # Explicitly set
|
|
281
|
+
dtype="half"
|
|
282
|
+
)
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### Marlin Kernel Error in vLLM
|
|
286
|
+
|
|
287
|
+
**Error**: `RuntimeError: Marlin kernel not supported`
|
|
288
|
+
|
|
289
|
+
**Fix**: Check GPU compatibility:
|
|
290
|
+
```python
|
|
291
|
+
import torch
|
|
292
|
+
print(torch.cuda.get_device_capability()) # Must be >= (8, 0)
|
|
293
|
+
|
|
294
|
+
# If not supported, use GEMM
|
|
295
|
+
llm = LLM(model="...", quantization="awq") # Uses GEMM by default
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
## Performance Debugging
|
|
299
|
+
|
|
300
|
+
### Memory Usage Check
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
import torch
|
|
304
|
+
|
|
305
|
+
def print_gpu_memory():
|
|
306
|
+
for i in range(torch.cuda.device_count()):
|
|
307
|
+
allocated = torch.cuda.memory_allocated(i) / 1e9
|
|
308
|
+
reserved = torch.cuda.memory_reserved(i) / 1e9
|
|
309
|
+
print(f"GPU {i}: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
|
|
310
|
+
|
|
311
|
+
print_gpu_memory()
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Profiling Inference
|
|
315
|
+
|
|
316
|
+
```python
|
|
317
|
+
import time
|
|
318
|
+
|
|
319
|
+
def benchmark_model(model, tokenizer, prompt, n_runs=5):
|
|
320
|
+
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
|
|
321
|
+
|
|
322
|
+
# Warmup
|
|
323
|
+
model.generate(**inputs, max_new_tokens=10)
|
|
324
|
+
torch.cuda.synchronize()
|
|
325
|
+
|
|
326
|
+
# Benchmark
|
|
327
|
+
times = []
|
|
328
|
+
for _ in range(n_runs):
|
|
329
|
+
start = time.perf_counter()
|
|
330
|
+
outputs = model.generate(**inputs, max_new_tokens=100)
|
|
331
|
+
torch.cuda.synchronize()
|
|
332
|
+
times.append(time.perf_counter() - start)
|
|
333
|
+
|
|
334
|
+
tokens = outputs.shape[1] - inputs.input_ids.shape[1]
|
|
335
|
+
avg_time = sum(times) / len(times)
|
|
336
|
+
print(f"Average: {tokens/avg_time:.2f} tokens/sec")
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
## Getting Help
|
|
340
|
+
|
|
341
|
+
1. **Check deprecation notice**: AutoAWQ is deprecated, use llm-compressor for new projects
|
|
342
|
+
2. **GitHub Issues**: https://github.com/casper-hansen/AutoAWQ/issues
|
|
343
|
+
3. **HuggingFace Forums**: https://discuss.huggingface.co/
|
|
344
|
+
4. **vLLM Discord**: For vLLM integration issues
|