@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
# Fine-tuning for Context Extension
|
|
2
|
+
|
|
3
|
+
Complete guide to fine-tuning transformer models for longer context windows.
|
|
4
|
+
|
|
5
|
+
## Table of Contents
|
|
6
|
+
- Data Preparation
|
|
7
|
+
- Training Configuration
|
|
8
|
+
- YaRN Fine-tuning
|
|
9
|
+
- Position Interpolation Fine-tuning
|
|
10
|
+
- Evaluation
|
|
11
|
+
- Production Deployment
|
|
12
|
+
|
|
13
|
+
## Data Preparation
|
|
14
|
+
|
|
15
|
+
### Long Document Datasets
|
|
16
|
+
|
|
17
|
+
**Best datasets for context extension**:
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
# 1. PG-19 (Books)
|
|
21
|
+
from datasets import load_dataset
|
|
22
|
+
|
|
23
|
+
pg19 = load_dataset("pg19", split="train")
|
|
24
|
+
# Average length: 50k-150k tokens
|
|
25
|
+
# Quality: High (literary works)
|
|
26
|
+
|
|
27
|
+
# 2. arXiv Papers
|
|
28
|
+
arxiv = load_dataset("scientific_papers", "arxiv", split="train")
|
|
29
|
+
# Average length: 4k-15k tokens
|
|
30
|
+
# Quality: High (technical content)
|
|
31
|
+
|
|
32
|
+
# 3. Long-form GitHub Code
|
|
33
|
+
github = load_dataset("codeparrot/github-code", split="train")
|
|
34
|
+
# Filter for large files (>5k tokens)
|
|
35
|
+
|
|
36
|
+
# 4. Long Conversations
|
|
37
|
+
conversations = load_dataset("HuggingFaceH4/ultrachat_200k", split="train")
|
|
38
|
+
# Concatenate multi-turn dialogues
|
|
39
|
+
|
|
40
|
+
# 5. Wikipedia Articles (concatenated)
|
|
41
|
+
wikipedia = load_dataset("wikipedia", "20220301.en", split="train")
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Creating Training Sequences
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
def create_long_sequences(dataset, target_length=32768, tokenizer=None):
|
|
48
|
+
"""Create training sequences of target length."""
|
|
49
|
+
sequences = []
|
|
50
|
+
|
|
51
|
+
for example in dataset:
|
|
52
|
+
# Tokenize
|
|
53
|
+
tokens = tokenizer.encode(example['text'])
|
|
54
|
+
|
|
55
|
+
# If single document is long enough
|
|
56
|
+
if len(tokens) >= target_length:
|
|
57
|
+
# Split into chunks
|
|
58
|
+
for i in range(0, len(tokens) - target_length, target_length // 2):
|
|
59
|
+
sequences.append(tokens[i:i + target_length])
|
|
60
|
+
else:
|
|
61
|
+
# Concatenate multiple documents
|
|
62
|
+
buffer = tokens
|
|
63
|
+
while len(buffer) < target_length:
|
|
64
|
+
next_example = next(dataset)
|
|
65
|
+
buffer.extend(tokenizer.encode(next_example['text']))
|
|
66
|
+
|
|
67
|
+
sequences.append(buffer[:target_length])
|
|
68
|
+
|
|
69
|
+
return sequences
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Data Quality Checks
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
def validate_training_data(sequences, tokenizer, min_length=8192):
|
|
76
|
+
"""Ensure data quality for context extension."""
|
|
77
|
+
issues = []
|
|
78
|
+
|
|
79
|
+
for i, seq in enumerate(sequences):
|
|
80
|
+
# 1. Check length
|
|
81
|
+
if len(seq) < min_length:
|
|
82
|
+
issues.append(f"Sequence {i}: too short ({len(seq)} tokens)")
|
|
83
|
+
|
|
84
|
+
# 2. Check for repetition (copy-paste errors)
|
|
85
|
+
if has_excessive_repetition(seq):
|
|
86
|
+
issues.append(f"Sequence {i}: excessive repetition")
|
|
87
|
+
|
|
88
|
+
# 3. Check for truncation artifacts
|
|
89
|
+
if looks_truncated(seq, tokenizer):
|
|
90
|
+
issues.append(f"Sequence {i}: appears truncated")
|
|
91
|
+
|
|
92
|
+
if issues:
|
|
93
|
+
print(f"⚠️ Found {len(issues)} data quality issues:")
|
|
94
|
+
for issue in issues[:10]: # Show first 10
|
|
95
|
+
print(f" - {issue}")
|
|
96
|
+
|
|
97
|
+
return len(issues) == 0
|
|
98
|
+
|
|
99
|
+
def has_excessive_repetition(tokens, window=50, threshold=0.8):
|
|
100
|
+
"""Detect copy-paste or generated repetition."""
|
|
101
|
+
for i in range(len(tokens) - window * 2):
|
|
102
|
+
chunk1 = tokens[i:i + window]
|
|
103
|
+
chunk2 = tokens[i + window:i + window * 2]
|
|
104
|
+
similarity = sum(a == b for a, b in zip(chunk1, chunk2)) / window
|
|
105
|
+
if similarity > threshold:
|
|
106
|
+
return True
|
|
107
|
+
return False
|
|
108
|
+
|
|
109
|
+
def looks_truncated(tokens, tokenizer):
|
|
110
|
+
"""Check if sequence ends mid-sentence."""
|
|
111
|
+
last_20 = tokenizer.decode(tokens[-20:])
|
|
112
|
+
# Check for incomplete sentences
|
|
113
|
+
return not any(last_20.endswith(c) for c in ['.', '!', '?', '\n'])
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Training Configuration
|
|
117
|
+
|
|
118
|
+
### Position Interpolation Setup
|
|
119
|
+
|
|
120
|
+
**Minimal fine-tuning** (fastest method):
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from transformers import (
|
|
124
|
+
AutoModelForCausalLM,
|
|
125
|
+
AutoTokenizer,
|
|
126
|
+
TrainingArguments,
|
|
127
|
+
Trainer
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
# 1. Load base model
|
|
131
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
132
|
+
"meta-llama/Llama-2-7b-hf",
|
|
133
|
+
torch_dtype=torch.float16,
|
|
134
|
+
device_map="auto"
|
|
135
|
+
)
|
|
136
|
+
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
|
137
|
+
|
|
138
|
+
# 2. Configure position interpolation
|
|
139
|
+
scaling_factor = 16.0 # 2k → 32k
|
|
140
|
+
model.config.max_position_embeddings = 32768
|
|
141
|
+
model.config.rope_scaling = {
|
|
142
|
+
"type": "linear",
|
|
143
|
+
"factor": scaling_factor
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
# 3. Training arguments
|
|
147
|
+
training_args = TrainingArguments(
|
|
148
|
+
output_dir="./llama-2-7b-32k",
|
|
149
|
+
num_train_epochs=1,
|
|
150
|
+
max_steps=1000, # Only 1000 steps!
|
|
151
|
+
per_device_train_batch_size=1,
|
|
152
|
+
gradient_accumulation_steps=16,
|
|
153
|
+
learning_rate=2e-5, # Low LR
|
|
154
|
+
warmup_steps=100,
|
|
155
|
+
lr_scheduler_type="cosine",
|
|
156
|
+
logging_steps=10,
|
|
157
|
+
save_steps=500,
|
|
158
|
+
bf16=True,
|
|
159
|
+
gradient_checkpointing=True, # Reduce memory
|
|
160
|
+
dataloader_num_workers=4,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
# 4. Create trainer
|
|
164
|
+
trainer = Trainer(
|
|
165
|
+
model=model,
|
|
166
|
+
args=training_args,
|
|
167
|
+
train_dataset=long_context_dataset,
|
|
168
|
+
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
# 5. Train
|
|
172
|
+
trainer.train()
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### YaRN Setup
|
|
176
|
+
|
|
177
|
+
**State-of-the-art extension** (best quality):
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
# 1. Install YaRN
|
|
181
|
+
# git clone https://github.com/jquesnelle/yarn
|
|
182
|
+
# cd yarn && pip install -e .
|
|
183
|
+
|
|
184
|
+
# 2. Configure YaRN scaling
|
|
185
|
+
model.config.max_position_embeddings = 32768
|
|
186
|
+
model.config.rope_scaling = {
|
|
187
|
+
"type": "yarn",
|
|
188
|
+
"factor": 16.0,
|
|
189
|
+
"original_max_position_embeddings": 2048,
|
|
190
|
+
"attention_factor": 1.0,
|
|
191
|
+
"beta_fast": 32,
|
|
192
|
+
"beta_slow": 1,
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
# 3. Training arguments (fewer steps than position interpolation!)
|
|
196
|
+
training_args = TrainingArguments(
|
|
197
|
+
output_dir="./llama-2-7b-32k-yarn",
|
|
198
|
+
max_steps=400, # 400 steps (vs 1000 for PI)
|
|
199
|
+
per_device_train_batch_size=1,
|
|
200
|
+
gradient_accumulation_steps=16,
|
|
201
|
+
learning_rate=2e-5,
|
|
202
|
+
warmup_steps=50,
|
|
203
|
+
bf16=True,
|
|
204
|
+
gradient_checkpointing=True,
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
# 4. Train
|
|
208
|
+
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
|
|
209
|
+
trainer.train()
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Full Configuration Example
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
# Complete fine-tuning script
|
|
216
|
+
import torch
|
|
217
|
+
from transformers import (
|
|
218
|
+
AutoModelForCausalLM,
|
|
219
|
+
AutoTokenizer,
|
|
220
|
+
TrainingArguments,
|
|
221
|
+
Trainer,
|
|
222
|
+
DataCollatorForLanguageModeling,
|
|
223
|
+
)
|
|
224
|
+
from datasets import load_dataset
|
|
225
|
+
|
|
226
|
+
def prepare_long_context_data(dataset, tokenizer, context_length=32768):
|
|
227
|
+
"""Prepare training data."""
|
|
228
|
+
def tokenize_function(examples):
|
|
229
|
+
# Concatenate all texts
|
|
230
|
+
concatenated = "\n\n".join(examples['text'])
|
|
231
|
+
# Tokenize
|
|
232
|
+
tokenized = tokenizer(
|
|
233
|
+
concatenated,
|
|
234
|
+
truncation=False,
|
|
235
|
+
return_tensors=None,
|
|
236
|
+
)
|
|
237
|
+
# Split into chunks
|
|
238
|
+
total_length = len(tokenized['input_ids'])
|
|
239
|
+
chunks = []
|
|
240
|
+
for i in range(0, total_length - context_length, context_length // 2):
|
|
241
|
+
chunk = {
|
|
242
|
+
'input_ids': tokenized['input_ids'][i:i + context_length],
|
|
243
|
+
'attention_mask': tokenized['attention_mask'][i:i + context_length],
|
|
244
|
+
}
|
|
245
|
+
chunks.append(chunk)
|
|
246
|
+
        # datasets.map(batched=True) expects a dict of column lists,
        # not a list of dicts
        return {key: [c[key] for c in chunks] for key in ('input_ids', 'attention_mask')}
|
|
247
|
+
|
|
248
|
+
return dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
|
|
249
|
+
|
|
250
|
+
def fine_tune_long_context(
|
|
251
|
+
base_model="meta-llama/Llama-2-7b-hf",
|
|
252
|
+
target_context=32768,
|
|
253
|
+
method="yarn", # or "linear"
|
|
254
|
+
output_dir="./output",
|
|
255
|
+
max_steps=400,
|
|
256
|
+
):
|
|
257
|
+
"""Complete fine-tuning pipeline."""
|
|
258
|
+
|
|
259
|
+
# Load model and tokenizer
|
|
260
|
+
print(f"Loading {base_model}...")
|
|
261
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
262
|
+
base_model,
|
|
263
|
+
torch_dtype=torch.bfloat16,
|
|
264
|
+
device_map="auto",
|
|
265
|
+
use_cache=False # Required for gradient checkpointing
|
|
266
|
+
)
|
|
267
|
+
tokenizer = AutoTokenizer.from_pretrained(base_model)
|
|
268
|
+
tokenizer.pad_token = tokenizer.eos_token
|
|
269
|
+
|
|
270
|
+
# Configure scaling
|
|
271
|
+
original_context = model.config.max_position_embeddings
|
|
272
|
+
scaling_factor = target_context / original_context
|
|
273
|
+
|
|
274
|
+
print(f"Scaling {original_context} → {target_context} ({scaling_factor}×)")
|
|
275
|
+
model.config.max_position_embeddings = target_context
|
|
276
|
+
|
|
277
|
+
if method == "yarn":
|
|
278
|
+
model.config.rope_scaling = {
|
|
279
|
+
"type": "yarn",
|
|
280
|
+
"factor": scaling_factor,
|
|
281
|
+
"original_max_position_embeddings": original_context,
|
|
282
|
+
"attention_factor": 1.0,
|
|
283
|
+
"beta_fast": 32,
|
|
284
|
+
"beta_slow": 1,
|
|
285
|
+
}
|
|
286
|
+
else: # linear
|
|
287
|
+
model.config.rope_scaling = {
|
|
288
|
+
"type": "linear",
|
|
289
|
+
"factor": scaling_factor
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
# Enable gradient checkpointing
|
|
293
|
+
model.gradient_checkpointing_enable()
|
|
294
|
+
|
|
295
|
+
# Load and prepare data
|
|
296
|
+
print("Preparing training data...")
|
|
297
|
+
dataset = load_dataset("pg19", split="train[:1000]") # Use subset for testing
|
|
298
|
+
train_dataset = prepare_long_context_data(dataset, tokenizer, target_context)
|
|
299
|
+
|
|
300
|
+
# Training arguments
|
|
301
|
+
training_args = TrainingArguments(
|
|
302
|
+
output_dir=output_dir,
|
|
303
|
+
max_steps=max_steps,
|
|
304
|
+
per_device_train_batch_size=1,
|
|
305
|
+
gradient_accumulation_steps=16,
|
|
306
|
+
learning_rate=2e-5,
|
|
307
|
+
warmup_steps=max_steps // 10,
|
|
308
|
+
lr_scheduler_type="cosine",
|
|
309
|
+
logging_steps=10,
|
|
310
|
+
save_steps=max_steps // 4,
|
|
311
|
+
bf16=True,
|
|
312
|
+
gradient_checkpointing=True,
|
|
313
|
+
dataloader_num_workers=4,
|
|
314
|
+
remove_unused_columns=False,
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
# Trainer
|
|
318
|
+
trainer = Trainer(
|
|
319
|
+
model=model,
|
|
320
|
+
args=training_args,
|
|
321
|
+
train_dataset=train_dataset,
|
|
322
|
+
data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
|
|
323
|
+
)
|
|
324
|
+
|
|
325
|
+
# Train
|
|
326
|
+
print("Starting fine-tuning...")
|
|
327
|
+
trainer.train()
|
|
328
|
+
|
|
329
|
+
# Save
|
|
330
|
+
print(f"Saving model to {output_dir}...")
|
|
331
|
+
model.save_pretrained(output_dir)
|
|
332
|
+
tokenizer.save_pretrained(output_dir)
|
|
333
|
+
|
|
334
|
+
print("Done!")
|
|
335
|
+
|
|
336
|
+
# Usage
|
|
337
|
+
if __name__ == "__main__":
|
|
338
|
+
fine_tune_long_context(
|
|
339
|
+
base_model="meta-llama/Llama-2-7b-hf",
|
|
340
|
+
target_context=32768,
|
|
341
|
+
method="yarn",
|
|
342
|
+
max_steps=400,
|
|
343
|
+
)
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
## Evaluation
|
|
347
|
+
|
|
348
|
+
### Perplexity Evaluation
|
|
349
|
+
|
|
350
|
+
```python
|
|
351
|
+
import torch
|
|
352
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
353
|
+
from datasets import load_dataset
|
|
354
|
+
import math
|
|
355
|
+
|
|
356
|
+
def evaluate_perplexity(model, tokenizer, dataset, context_length=32768):
|
|
357
|
+
"""Evaluate perplexity on long context."""
|
|
358
|
+
model.eval()
|
|
359
|
+
total_loss = 0
|
|
360
|
+
total_tokens = 0
|
|
361
|
+
|
|
362
|
+
with torch.no_grad():
|
|
363
|
+
for example in dataset:
|
|
364
|
+
# Tokenize
|
|
365
|
+
tokens = tokenizer(
|
|
366
|
+
example['text'],
|
|
367
|
+
return_tensors='pt',
|
|
368
|
+
max_length=context_length,
|
|
369
|
+
truncation=True,
|
|
370
|
+
).to(model.device)
|
|
371
|
+
|
|
372
|
+
# Forward pass
|
|
373
|
+
outputs = model(**tokens, labels=tokens['input_ids'])
|
|
374
|
+
loss = outputs.loss
|
|
375
|
+
num_tokens = tokens['input_ids'].numel()
|
|
376
|
+
|
|
377
|
+
total_loss += loss.item() * num_tokens
|
|
378
|
+
total_tokens += num_tokens
|
|
379
|
+
|
|
380
|
+
# Compute perplexity
|
|
381
|
+
avg_loss = total_loss / total_tokens
|
|
382
|
+
perplexity = math.exp(avg_loss)
|
|
383
|
+
|
|
384
|
+
return perplexity
|
|
385
|
+
|
|
386
|
+
# Usage
|
|
387
|
+
model = AutoModelForCausalLM.from_pretrained("./llama-2-7b-32k")
|
|
388
|
+
tokenizer = AutoTokenizer.from_pretrained("./llama-2-7b-32k")
|
|
389
|
+
|
|
390
|
+
test_dataset = load_dataset("pg19", split="test[:100]")
|
|
391
|
+
ppl = evaluate_perplexity(model, tokenizer, test_dataset, context_length=32768)
|
|
392
|
+
|
|
393
|
+
print(f"Perplexity at 32k context: {ppl:.2f}")
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
### Passkey Retrieval Test
|
|
397
|
+
|
|
398
|
+
```python
|
|
399
|
+
import random

def passkey_retrieval_test(model, tokenizer, context_lengths=[4096, 8192, 16384, 32768]):
|
|
400
|
+
"""Test ability to retrieve information from different positions."""
|
|
401
|
+
results = {}
|
|
402
|
+
|
|
403
|
+
for context_len in context_lengths:
|
|
404
|
+
# Create synthetic document with passkey at random position
|
|
405
|
+
passkey = "12345"
|
|
406
|
+
position = random.randint(100, context_len - 100)
|
|
407
|
+
|
|
408
|
+
# Generate filler text
|
|
409
|
+
filler = "The quick brown fox jumps over the lazy dog. " * (context_len // 10)
|
|
410
|
+
text = filler[:position] + f"The passkey is {passkey}. " + filler[position:]
|
|
411
|
+
|
|
412
|
+
# Truncate to context length
|
|
413
|
+
    tokens = tokenizer(text, return_tensors='pt', max_length=context_len, truncation=True)
    text = tokenizer.decode(tokens['input_ids'][0], skip_special_tokens=True)
|
|
414
|
+
|
|
415
|
+
# Query
|
|
416
|
+
prompt = text + "\nWhat is the passkey?"
|
|
417
|
+
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
|
|
418
|
+
|
|
419
|
+
# Generate
|
|
420
|
+
outputs = model.generate(**inputs, max_new_tokens=10)
|
|
421
|
+
        # Decode only the newly generated tokens — the prompt itself contains
        # the passkey, so decoding the full sequence would trivially pass
        response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
|
|
422
|
+
|
|
423
|
+
# Check if passkey retrieved
|
|
424
|
+
success = passkey in response
|
|
425
|
+
results[context_len] = success
|
|
426
|
+
|
|
427
|
+
print(f"Context {context_len}: {'✓' if success else '✗'}")
|
|
428
|
+
|
|
429
|
+
return results
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
### Long Document Q&A
|
|
433
|
+
|
|
434
|
+
```python
|
|
435
|
+
from datasets import load_dataset
|
|
436
|
+
|
|
437
|
+
def test_long_qa(model, tokenizer, max_length=32768):
|
|
438
|
+
"""Test on long-form QA dataset."""
|
|
439
|
+
# Load dataset
|
|
440
|
+
dataset = load_dataset("narrativeqa", split="test[:100]")
|
|
441
|
+
|
|
442
|
+
correct = 0
|
|
443
|
+
total = 0
|
|
444
|
+
|
|
445
|
+
for example in dataset:
|
|
446
|
+
# Long document
|
|
447
|
+
document = example['document']['text']
|
|
448
|
+
question = example['question']['text']
|
|
449
|
+
gold_answers = example['answers']
|
|
450
|
+
|
|
451
|
+
# Create prompt
|
|
452
|
+
prompt = f"Document:\n{document}\n\nQuestion: {question}\n\nAnswer:"
|
|
453
|
+
|
|
454
|
+
# Tokenize (may exceed original context)
|
|
455
|
+
inputs = tokenizer(
|
|
456
|
+
prompt,
|
|
457
|
+
return_tensors='pt',
|
|
458
|
+
max_length=max_length,
|
|
459
|
+
truncation=True
|
|
460
|
+
).to(model.device)
|
|
461
|
+
|
|
462
|
+
# Generate
|
|
463
|
+
outputs = model.generate(
|
|
464
|
+
**inputs,
|
|
465
|
+
max_new_tokens=50,
|
|
466
|
+
temperature=0.7,
|
|
467
|
+
)
|
|
468
|
+
answer = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
|
|
469
|
+
|
|
470
|
+
# Check correctness
|
|
471
|
+
        if any(gold.lower() in answer.lower() for gold in gold_answers):
|
|
472
|
+
correct += 1
|
|
473
|
+
total += 1
|
|
474
|
+
|
|
475
|
+
accuracy = correct / total
|
|
476
|
+
print(f"Long QA Accuracy: {accuracy:.1%}")
|
|
477
|
+
return accuracy
|
|
478
|
+
```
|
|
479
|
+
|
|
480
|
+
## Best Practices
|
|
481
|
+
|
|
482
|
+
### 1. Gradual Scaling
|
|
483
|
+
|
|
484
|
+
```python
|
|
485
|
+
# Don't jump directly to 128k!
|
|
486
|
+
# Scale incrementally:
|
|
487
|
+
|
|
488
|
+
# Step 1: 2k → 8k
|
|
489
|
+
fine_tune(model, target=8192, steps=200)
|
|
490
|
+
|
|
491
|
+
# Step 2: 8k → 16k
|
|
492
|
+
fine_tune(model, target=16384, steps=200)
|
|
493
|
+
|
|
494
|
+
# Step 3: 16k → 32k
|
|
495
|
+
fine_tune(model, target=32768, steps=400)
|
|
496
|
+
|
|
497
|
+
# Each step builds on previous, reducing total training needed
|
|
498
|
+
```
|
|
499
|
+
|
|
500
|
+
### 2. Learning Rate Tuning
|
|
501
|
+
|
|
502
|
+
```python
|
|
503
|
+
# Position Interpolation: Lower LR
|
|
504
|
+
lr_pi = 2e-5
|
|
505
|
+
|
|
506
|
+
# YaRN: Can use slightly higher LR
|
|
507
|
+
lr_yarn = 5e-5
|
|
508
|
+
|
|
509
|
+
# Rule: Larger scaling factors need lower LR
|
|
510
|
+
lr = base_lr / (scaling_factor ** 0.5)
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
### 3. Gradient Checkpointing
|
|
514
|
+
|
|
515
|
+
```python
|
|
516
|
+
# Essential for long context (saves ~50% memory)
|
|
517
|
+
model.gradient_checkpointing_enable()
|
|
518
|
+
|
|
519
|
+
# Trade-off: ~20% slower training, but fits in memory
|
|
520
|
+
```
|
|
521
|
+
|
|
522
|
+
### 4. Flash Attention
|
|
523
|
+
|
|
524
|
+
```python
|
|
525
|
+
# 2-3× speedup for long sequences
|
|
526
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
527
|
+
"meta-llama/Llama-2-7b-hf",
|
|
528
|
+
attn_implementation="flash_attention_2", # Flash Attention 2
|
|
529
|
+
torch_dtype=torch.bfloat16
|
|
530
|
+
)
|
|
531
|
+
```
|
|
532
|
+
|
|
533
|
+
## Production Deployment
|
|
534
|
+
|
|
535
|
+
### Save and Upload
|
|
536
|
+
|
|
537
|
+
```python
|
|
538
|
+
# Save fine-tuned model
|
|
539
|
+
model.save_pretrained("./llama-2-7b-32k-yarn")
|
|
540
|
+
tokenizer.save_pretrained("./llama-2-7b-32k-yarn")
|
|
541
|
+
|
|
542
|
+
# Upload to HuggingFace Hub
|
|
543
|
+
from huggingface_hub import HfApi
|
|
544
|
+
|
|
545
|
+
api = HfApi()
|
|
546
|
+
api.upload_folder(
|
|
547
|
+
folder_path="./llama-2-7b-32k-yarn",
|
|
548
|
+
repo_id="your-username/llama-2-7b-32k-yarn",
|
|
549
|
+
repo_type="model",
|
|
550
|
+
)
|
|
551
|
+
```
|
|
552
|
+
|
|
553
|
+
### Inference Configuration
|
|
554
|
+
|
|
555
|
+
```python
|
|
556
|
+
# Load for inference
|
|
557
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
558
|
+
"your-username/llama-2-7b-32k-yarn",
|
|
559
|
+
torch_dtype=torch.float16,
|
|
560
|
+
device_map="auto",
|
|
561
|
+
max_memory={0: "40GB", "cpu": "100GB"} # Offload to CPU if needed
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
# Process long document
|
|
565
|
+
long_text = "..." * 30000 # 30k tokens
|
|
566
|
+
inputs = tokenizer(long_text, return_tensors="pt", truncation=False).to('cuda')
|
|
567
|
+
|
|
568
|
+
outputs = model.generate(
|
|
569
|
+
**inputs,
|
|
570
|
+
max_new_tokens=512,
|
|
571
|
+
do_sample=True,
|
|
572
|
+
temperature=0.7,
|
|
573
|
+
top_p=0.9,
|
|
574
|
+
)
|
|
575
|
+
|
|
576
|
+
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
|
577
|
+
```
|
|
578
|
+
|
|
579
|
+
## Troubleshooting
|
|
580
|
+
|
|
581
|
+
### Issue: Out of Memory
|
|
582
|
+
|
|
583
|
+
**Solutions**:
|
|
584
|
+
1. Enable gradient checkpointing
|
|
585
|
+
2. Reduce batch size to 1
|
|
586
|
+
3. Increase gradient accumulation steps
|
|
587
|
+
4. Use bfloat16 or float16
|
|
588
|
+
5. Use Flash Attention
|
|
589
|
+
|
|
590
|
+
### Issue: Poor Extrapolation
|
|
591
|
+
|
|
592
|
+
**Solutions**:
|
|
593
|
+
1. Use YaRN instead of linear scaling
|
|
594
|
+
2. Increase fine-tuning steps
|
|
595
|
+
3. Use higher-quality long-form data
|
|
596
|
+
4. Gradual scaling (8k → 16k → 32k)
|
|
597
|
+
|
|
598
|
+
### Issue: Training Instability
|
|
599
|
+
|
|
600
|
+
**Solutions**:
|
|
601
|
+
1. Lower learning rate
|
|
602
|
+
2. Increase warmup steps
|
|
603
|
+
3. Use gradient clipping
|
|
604
|
+
4. Check data quality
|
|
605
|
+
|
|
606
|
+
## Resources
|
|
607
|
+
|
|
608
|
+
- **Position Interpolation Paper**: https://arxiv.org/abs/2306.15595
|
|
609
|
+
- **YaRN Paper**: https://arxiv.org/abs/2309.00071
|
|
610
|
+
- **Together AI Guide**: https://www.together.ai/blog/llama-2-7b-32k
|
|
611
|
+
- **HuggingFace Long Context Guide**: https://huggingface.co/blog/long-range-transformers
|