@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
# Model Merging Methods: Deep Dive
|
|
2
|
+
|
|
3
|
+
Complete technical guide to model merging algorithms based on research papers.
|
|
4
|
+
|
|
5
|
+
## Table of Contents
|
|
6
|
+
- TIES-Merging Algorithm
|
|
7
|
+
- DARE (Drop And REscale)
|
|
8
|
+
- Linear Merging
|
|
9
|
+
- SLERP
|
|
10
|
+
- Task Arithmetic
|
|
11
|
+
- Comparison
|
|
12
|
+
|
|
13
|
+
## TIES-Merging: Resolving Interference
|
|
14
|
+
|
|
15
|
+
**Paper**: "TIES-Merging: Resolving Interference When Merging Models" (NeurIPS 2023)
|
|
16
|
+
**Authors**: Prateek Yadav et al.
|
|
17
|
+
**Code**: https://github.com/prateeky2806/ties-merging
|
|
18
|
+
|
|
19
|
+
### Algorithm Overview
|
|
20
|
+
|
|
21
|
+
TIES-Merging addresses two major sources of interference:
|
|
22
|
+
1. Redundant parameter values
|
|
23
|
+
2. Sign disagreement across models
|
|
24
|
+
|
|
25
|
+
**Three-Step Process**: TRIM, ELECT, MERGE
|
|
26
|
+
|
|
27
|
+
### Step 1: TRIM (Reset Small Changes)
|
|
28
|
+
|
|
29
|
+
Reset to zero the task-vector entries that changed minimally during fine-tuning, keeping only the largest-magnitude ones.
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
def trim(task_vector, density=0.2):
|
|
33
|
+
"""Keep top-k% parameters by magnitude, reset rest to 0."""
|
|
34
|
+
# Calculate magnitude
|
|
35
|
+
magnitudes = torch.abs(task_vector)
|
|
36
|
+
|
|
37
|
+
# Get threshold for top-k%
|
|
38
|
+
k = int(density * task_vector.numel())
|
|
39
|
+
threshold = torch.topk(magnitudes.flatten(), k).values.min()
|
|
40
|
+
|
|
41
|
+
# Create mask: keep parameters above threshold
|
|
42
|
+
mask = magnitudes >= threshold
|
|
43
|
+
|
|
44
|
+
# Apply mask
|
|
45
|
+
trimmed_vector = task_vector * mask
|
|
46
|
+
|
|
47
|
+
return trimmed_vector
|
|
48
|
+
|
|
49
|
+
# Example
|
|
50
|
+
task_vector_1 = finetuned_model_1 - base_model
|
|
51
|
+
task_vector_2 = finetuned_model_2 - base_model
|
|
52
|
+
|
|
53
|
+
trimmed_1 = trim(task_vector_1, density=0.2) # Keep top 20%
|
|
54
|
+
trimmed_2 = trim(task_vector_2, density=0.2)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Step 2: ELECT SIGN (Resolve Conflicts)
|
|
58
|
+
|
|
59
|
+
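When task vectors disagree on the sign of a parameter, elect a single dominant sign for it. The TIES paper elects the sign with the largest total magnitude across models; the simplified snippet below uses a majority vote over sign counts instead.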
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
def elect_sign(task_vectors):
|
|
63
|
+
"""Resolve sign conflicts across multiple task vectors."""
|
|
64
|
+
# Stack all task vectors
|
|
65
|
+
stacked = torch.stack(task_vectors) # (num_models, num_params)
|
|
66
|
+
|
|
67
|
+
# Count positive vs negative for each parameter
|
|
68
|
+
positive_count = (stacked > 0).sum(dim=0)
|
|
69
|
+
negative_count = (stacked < 0).sum(dim=0)
|
|
70
|
+
|
|
71
|
+
# Elect majority sign
|
|
72
|
+
final_sign = torch.where(
|
|
73
|
+
positive_count > negative_count,
|
|
74
|
+
torch.ones_like(stacked[0]),
|
|
75
|
+
-torch.ones_like(stacked[0])
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Where tie, keep sign from first model
|
|
79
|
+
tie_mask = (positive_count == negative_count)
|
|
80
|
+
final_sign[tie_mask] = torch.sign(stacked[0][tie_mask])
|
|
81
|
+
|
|
82
|
+
return final_sign
|
|
83
|
+
|
|
84
|
+
# Example
|
|
85
|
+
task_vectors = [trimmed_1, trimmed_2, trimmed_3]
|
|
86
|
+
elected_sign = elect_sign(task_vectors)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Step 3: MERGE (Disjoint Merging)
|
|
90
|
+
|
|
91
|
+
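For each parameter, merge only the values whose sign agrees with the elected sign. The TIES paper takes a disjoint mean (dividing by the number of agreeing models); the simplified snippet below divides by the total number of models.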
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
def ties_merge(base_model, task_vectors, density=0.2, lambda_param=1.0):
|
|
95
|
+
"""Complete TIES-Merging algorithm."""
|
|
96
|
+
# Step 1: Trim each task vector
|
|
97
|
+
trimmed_vectors = [trim(tv, density) for tv in task_vectors]
|
|
98
|
+
|
|
99
|
+
# Step 2: Elect sign
|
|
100
|
+
elected_sign = elect_sign(trimmed_vectors)
|
|
101
|
+
|
|
102
|
+
# Step 3: Merge aligned parameters
|
|
103
|
+
merged_task_vector = torch.zeros_like(task_vectors[0])
|
|
104
|
+
|
|
105
|
+
for tv in trimmed_vectors:
|
|
106
|
+
# Keep only parameters aligned with elected sign
|
|
107
|
+
aligned_mask = (torch.sign(tv) == elected_sign) | (tv == 0)
|
|
108
|
+
aligned_params = tv * aligned_mask
|
|
109
|
+
|
|
110
|
+
# Accumulate
|
|
111
|
+
merged_task_vector += aligned_params
|
|
112
|
+
|
|
113
|
+
# Average
|
|
114
|
+
num_models = len(task_vectors)
|
|
115
|
+
merged_task_vector /= num_models
|
|
116
|
+
|
|
117
|
+
# Add back to base model
|
|
118
|
+
final_model = base_model + lambda_param * merged_task_vector
|
|
119
|
+
|
|
120
|
+
return final_model
|
|
121
|
+
|
|
122
|
+
# Usage
|
|
123
|
+
base = load_model("mistralai/Mistral-7B-v0.1")
|
|
124
|
+
model_1 = load_model("WizardLM/WizardMath-7B-V1.1")
|
|
125
|
+
model_2 = load_model("teknium/OpenHermes-2.5-Mistral-7B")
|
|
126
|
+
model_3 = load_model("NousResearch/Nous-Hermes-2-Mistral-7B-DPO")
|
|
127
|
+
|
|
128
|
+
task_vectors = [
|
|
129
|
+
model_1 - base,
|
|
130
|
+
model_2 - base,
|
|
131
|
+
model_3 - base
|
|
132
|
+
]
|
|
133
|
+
|
|
134
|
+
merged = ties_merge(base, task_vectors, density=0.5, lambda_param=1.0)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Hyperparameters
|
|
138
|
+
|
|
139
|
+
**density** (ρ): Fraction of parameters to keep (default: 0.2)
|
|
140
|
+
- Lower (0.1-0.3): More aggressive pruning, higher sparsity
|
|
141
|
+
- Higher (0.5-0.8): Conservative pruning, denser result
|
|
142
|
+
|
|
143
|
+
**lambda** (λ): Scaling factor for merged task vector (default: 1.0)
|
|
144
|
+
- Lower (<1.0): Less influence from fine-tuned models
|
|
145
|
+
- Higher (>1.0): More influence from fine-tuned models
|
|
146
|
+
|
|
147
|
+
## DARE: Drop And REscale
|
|
148
|
+
|
|
149
|
+
**Paper**: "Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch" (arXiv 2311.03099, 2023)
|
|
150
|
+
**Authors**: Le Yu, Bowen Yu, Haiyang Yu, Fei Huang, Yongbin Li
|
|
151
|
+
|
|
152
|
+
### Algorithm
|
|
153
|
+
|
|
154
|
+
DARE randomly drops delta parameters and rescales remaining ones.
|
|
155
|
+
|
|
156
|
+
### Mathematical Formulation
|
|
157
|
+
|
|
158
|
+
Given:
|
|
159
|
+
- Base model parameters: θ₀
|
|
160
|
+
- Fine-tuned model parameters: θₜ
|
|
161
|
+
- Delta parameters: δₜ = θₜ - θ₀
|
|
162
|
+
|
|
163
|
+
**Step 1: Random Drop**
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
m_t ~ Bernoulli(p) # Drop mask
|
|
167
|
+
δ̃_t = (1 - m_t) ⊙ δ_t # Element-wise product
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Step 2: Rescale**
|
|
171
|
+
|
|
172
|
+
```
|
|
173
|
+
δ̂_t = δ̃_t / (1 - p) # Rescale to preserve expectation
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
**Final Model**
|
|
177
|
+
|
|
178
|
+
```
|
|
179
|
+
θ̂_t = θ₀ + δ̂_t
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Implementation
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
def dare(base_model, finetuned_model, drop_rate=0.9):
|
|
186
|
+
"""DARE: Drop And REscale delta parameters."""
|
|
187
|
+
# Compute delta
|
|
188
|
+
delta = finetuned_model - base_model
|
|
189
|
+
|
|
190
|
+
# Random drop mask (Bernoulli)
|
|
191
|
+
drop_mask = torch.bernoulli(torch.full_like(delta, drop_rate))
|
|
192
|
+
|
|
193
|
+
# Apply mask (keep 1-p, drop p)
|
|
194
|
+
dropped_delta = delta * (1 - drop_mask)
|
|
195
|
+
|
|
196
|
+
# Rescale to preserve expectation
|
|
197
|
+
rescaled_delta = dropped_delta / (1 - drop_rate)
|
|
198
|
+
|
|
199
|
+
# Reconstruct model
|
|
200
|
+
result = base_model + rescaled_delta
|
|
201
|
+
|
|
202
|
+
return result
|
|
203
|
+
|
|
204
|
+
# Example
|
|
205
|
+
base = load_model("mistralai/Mistral-7B-v0.1")
|
|
206
|
+
finetuned = load_model("WizardLM/WizardMath-7B-V1.1")
|
|
207
|
+
|
|
208
|
+
# Drop 90% of delta parameters
|
|
209
|
+
result = dare(base, finetuned, drop_rate=0.9)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### DARE + TIES (DARE-TIES)
|
|
213
|
+
|
|
214
|
+
Combine both methods: first sparsify each delta with DARE, then merge the sparsified deltas with TIES.
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
def dare_ties(base_model, finetuned_models, drop_rate=0.9, density=0.5):
|
|
218
|
+
"""DARE + TIES-Merging."""
|
|
219
|
+
# Step 1: Apply DARE to each model
|
|
220
|
+
dare_deltas = []
|
|
221
|
+
for model in finetuned_models:
|
|
222
|
+
delta = model - base_model
|
|
223
|
+
|
|
224
|
+
# DARE drop
|
|
225
|
+
drop_mask = torch.bernoulli(torch.full_like(delta, drop_rate))
|
|
226
|
+
dropped = delta * (1 - drop_mask)
|
|
227
|
+
rescaled = dropped / (1 - drop_rate)
|
|
228
|
+
|
|
229
|
+
dare_deltas.append(rescaled)
|
|
230
|
+
|
|
231
|
+
# Step 2: Apply TIES to DARE-processed deltas
|
|
232
|
+
merged = ties_merge(base_model, dare_deltas, density=density)
|
|
233
|
+
|
|
234
|
+
return merged
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
### Hyperparameters
|
|
238
|
+
|
|
239
|
+
**drop_rate** (p): Probability of dropping each delta parameter (default: 0.9)
|
|
240
|
+
- Lower (0.5-0.7): Conservative, keeps more parameters
|
|
241
|
+
- Higher (0.9-0.99): Aggressive, maximum sparsity
|
|
242
|
+
- Works well even at 0.99 for large models
|
|
243
|
+
|
|
244
|
+
**Observations**:
|
|
245
|
+
- Larger models tolerate higher drop rates
|
|
246
|
+
- DARE works best when delta parameters have small absolute values (typically <0.002); models whose deltas are larger tolerate random dropping poorly
|
|
247
|
+
- Performance improves with model size
|
|
248
|
+
|
|
249
|
+
## Linear Merging (Model Soup)
|
|
250
|
+
|
|
251
|
+
Simple weighted average of model parameters; the weights should sum to 1.
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
def linear_merge(models, weights):
|
|
255
|
+
"""Weighted average of model parameters."""
|
|
256
|
+
assert len(models) == len(weights)
|
|
257
|
+
assert sum(weights) == 1.0, "Weights should sum to 1"
|
|
258
|
+
|
|
259
|
+
merged = sum(w * model for w, model in zip(weights, models))
|
|
260
|
+
|
|
261
|
+
return merged
|
|
262
|
+
|
|
263
|
+
# Example
|
|
264
|
+
models = [model_1, model_2, model_3]
|
|
265
|
+
weights = [0.4, 0.3, 0.3]
|
|
266
|
+
merged = linear_merge(models, weights)
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## SLERP: Spherical Linear Interpolation
|
|
270
|
+
|
|
271
|
+
Interpolate along the arc of a sphere in weight space instead of along a straight line.
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
def slerp(model_1, model_2, t=0.5):
|
|
275
|
+
"""SLERP between two models."""
|
|
276
|
+
# Flatten parameters
|
|
277
|
+
p1 = torch.cat([p.flatten() for p in model_1.parameters()])
|
|
278
|
+
p2 = torch.cat([p.flatten() for p in model_2.parameters()])
|
|
279
|
+
|
|
280
|
+
# Normalize
|
|
281
|
+
p1_norm = p1 / p1.norm()
|
|
282
|
+
p2_norm = p2 / p2.norm()
|
|
283
|
+
|
|
284
|
+
# Compute angle
|
|
285
|
+
dot = (p1_norm * p2_norm).sum()
|
|
286
|
+
theta = torch.acos(torch.clamp(dot, -1.0, 1.0))
|
|
287
|
+
|
|
288
|
+
# SLERP formula
|
|
289
|
+
if theta < 1e-6:
|
|
290
|
+
# Vectors nearly parallel, use linear interpolation
|
|
291
|
+
result = (1 - t) * p1 + t * p2
|
|
292
|
+
else:
|
|
293
|
+
# Spherical interpolation
|
|
294
|
+
sin_theta = torch.sin(theta)
|
|
295
|
+
result = (torch.sin((1 - t) * theta) / sin_theta) * p1 + \
|
|
296
|
+
(torch.sin(t * theta) / sin_theta) * p2
|
|
297
|
+
|
|
298
|
+
# Reshape back to model
|
|
299
|
+
merged_model = reshape_to_model(result, model_1)
|
|
300
|
+
|
|
301
|
+
return merged_model
|
|
302
|
+
|
|
303
|
+
# Example
|
|
304
|
+
merged = slerp(model_1, model_2, t=0.5) # 50-50 blend
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
## Task Arithmetic
|
|
308
|
+
|
|
309
|
+
Add weighted task vectors (fine-tuned parameters minus base parameters) to the base model.
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
def task_arithmetic(base_model, finetuned_models, lambdas):
|
|
313
|
+
"""Task arithmetic merging."""
|
|
314
|
+
# Extract task vectors
|
|
315
|
+
task_vectors = [model - base_model for model in finetuned_models]
|
|
316
|
+
|
|
317
|
+
# Weighted sum
|
|
318
|
+
combined_vector = sum(λ * tv for λ, tv in zip(lambdas, task_vectors))
|
|
319
|
+
|
|
320
|
+
# Add to base
|
|
321
|
+
merged = base_model + combined_vector
|
|
322
|
+
|
|
323
|
+
return merged
|
|
324
|
+
|
|
325
|
+
# Example
|
|
326
|
+
base = load_model("mistralai/Mistral-7B-v0.1")
|
|
327
|
+
math_model = load_model("WizardLM/WizardMath-7B-V1.1")
|
|
328
|
+
code_model = load_model("ajibawa-2023/Code-Mistral-7B")
|
|
329
|
+
|
|
330
|
+
merged = task_arithmetic(
|
|
331
|
+
base,
|
|
332
|
+
[math_model, code_model],
|
|
333
|
+
lambdas=[0.6, 0.4]
|
|
334
|
+
)
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## Method Comparison
|
|
338
|
+
|
|
339
|
+
| Method | Pros | Cons | Best For |
|
|
340
|
+
|--------|------|------|----------|
|
|
341
|
+
| **Linear** | Simple, fast | Basic averaging | 2-3 similar models |
|
|
342
|
+
| **SLERP** | Preserves magnitude | Only 2 models | Smooth blending |
|
|
343
|
+
| **Task Arithmetic** | Intuitive, flexible | Sign conflicts | Multiple specialized models |
|
|
344
|
+
| **TIES** | Resolves conflicts | More complex | Many task-specific models |
|
|
345
|
+
| **DARE** | High sparsity | Random variance | Reducing redundancy |
|
|
346
|
+
| **DARE-TIES** | Best performance | Most complex | Production (state-of-the-art) |
|
|
347
|
+
|
|
348
|
+
## Resources
|
|
349
|
+
|
|
350
|
+
- **TIES Paper**: https://arxiv.org/abs/2306.01708
|
|
351
|
+
- **DARE Paper**: https://arxiv.org/abs/2311.03099
|
|
352
|
+
- **mergekit**: https://github.com/arcee-ai/mergekit
|