@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: rwkv-architecture
|
|
3
|
+
description: RNN+Transformer hybrid with O(n) inference. Linear time, infinite context, no KV cache. Train like GPT (parallel), infer like RNN (sequential). Linux Foundation AI project. Production at Windows, Office, NeMo. RWKV-7 (March 2025). Models up to 14B parameters.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [RWKV, Model Architecture, RNN, Transformer Hybrid, Linear Complexity, Infinite Context, Efficient Inference, Linux Foundation, Alternative Architecture]
|
|
8
|
+
dependencies: [rwkv, torch, transformers]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# RWKV - Receptance Weighted Key Value
|
|
12
|
+
|
|
13
|
+
## Quick start
|
|
14
|
+
|
|
15
|
+
RWKV (RwaKuv) combines Transformer parallelization (training) with RNN efficiency (inference).
|
|
16
|
+
|
|
17
|
+
**Installation**:
|
|
18
|
+
```bash
|
|
19
|
+
# Install PyTorch
|
|
20
|
+
pip install torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu121
|
|
21
|
+
|
|
22
|
+
# Install dependencies
|
|
23
|
+
pip install pytorch-lightning==1.9.5 deepspeed wandb ninja --upgrade
|
|
24
|
+
|
|
25
|
+
# Install RWKV
|
|
26
|
+
pip install rwkv
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**Basic usage** (GPT mode + RNN mode):
|
|
30
|
+
```python
|
|
31
|
+
import os
|
|
32
|
+
from rwkv.model import RWKV
|
|
33
|
+
|
|
34
|
+
os.environ["RWKV_JIT_ON"] = '1'
|
|
35
|
+
os.environ["RWKV_CUDA_ON"] = '1' # Use CUDA kernel for speed
|
|
36
|
+
|
|
37
|
+
# Load model
|
|
38
|
+
model = RWKV(
|
|
39
|
+
model='/path/to/RWKV-4-Pile-1B5-20220903-8040',
|
|
40
|
+
strategy='cuda fp16'
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# GPT mode (parallel processing)
|
|
44
|
+
out, state = model.forward([187, 510, 1563, 310, 247], None)
|
|
45
|
+
print(out.detach().cpu().numpy()) # Logits
|
|
46
|
+
|
|
47
|
+
# RNN mode (sequential processing, same result)
|
|
48
|
+
out, state = model.forward([187, 510], None) # First 2 tokens
|
|
49
|
+
out, state = model.forward([1563], state) # Next token
|
|
50
|
+
out, state = model.forward([310, 247], state) # Last tokens
|
|
51
|
+
print(out.detach().cpu().numpy()) # Same logits as above!
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Common workflows
|
|
55
|
+
|
|
56
|
+
### Workflow 1: Text generation (streaming)
|
|
57
|
+
|
|
58
|
+
**Efficient token-by-token generation**:
|
|
59
|
+
```python
|
|
60
|
+
from rwkv.model import RWKV
|
|
61
|
+
from rwkv.utils import PIPELINE
|
|
62
|
+
|
|
63
|
+
model = RWKV(model='RWKV-4-Pile-14B-20230313-ctx8192-test1050', strategy='cuda fp16')
|
|
64
|
+
pipeline = PIPELINE(model, "20B_tokenizer.json")
|
|
65
|
+
|
|
66
|
+
# Initial prompt
|
|
67
|
+
prompt = "The future of AI is"
|
|
68
|
+
state = None
|
|
69
|
+
|
|
70
|
+
# Generate token by token
|
|
71
|
+
for token in prompt:
|
|
72
|
+
out, state = pipeline.model.forward(pipeline.encode(token), state)
|
|
73
|
+
|
|
74
|
+
# Continue generation
|
|
75
|
+
for _ in range(100):
|
|
76
|
+
out, state = pipeline.model.forward(None, state)
|
|
77
|
+
token = pipeline.sample_logits(out)
|
|
78
|
+
print(pipeline.decode(token), end='', flush=True)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Key advantage**: Constant memory per token (no growing KV cache)
|
|
82
|
+
|
|
83
|
+
### Workflow 2: Long context processing (infinite context)
|
|
84
|
+
|
|
85
|
+
**Process million-token sequences**:
|
|
86
|
+
```python
|
|
87
|
+
model = RWKV(model='RWKV-4-Pile-14B', strategy='cuda fp16')
|
|
88
|
+
|
|
89
|
+
# Process very long document
|
|
90
|
+
state = None
|
|
91
|
+
long_document = load_document() # e.g., 1M tokens
|
|
92
|
+
|
|
93
|
+
# Stream through entire document
|
|
94
|
+
for chunk in chunks(long_document, chunk_size=1024):
|
|
95
|
+
out, state = model.forward(chunk, state)
|
|
96
|
+
|
|
97
|
+
# State now contains information from entire 1M token document
|
|
98
|
+
# Memory usage: O(1) (constant, not O(n)!)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Workflow 3: Fine-tuning RWKV
|
|
102
|
+
|
|
103
|
+
**Standard fine-tuning workflow**:
|
|
104
|
+
```python
|
|
105
|
+
# Training script
|
|
106
|
+
import pytorch_lightning as pl
|
|
107
|
+
from rwkv.model import RWKV
|
|
108
|
+
from rwkv.trainer import RWKVTrainer
|
|
109
|
+
|
|
110
|
+
# Configure model
|
|
111
|
+
config = {
|
|
112
|
+
'n_layer': 24,
|
|
113
|
+
'n_embd': 1024,
|
|
114
|
+
'vocab_size': 50277,
|
|
115
|
+
'ctx_len': 1024
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
# Setup trainer
|
|
119
|
+
trainer = pl.Trainer(
|
|
120
|
+
accelerator='gpu',
|
|
121
|
+
devices=8,
|
|
122
|
+
precision='bf16',
|
|
123
|
+
strategy='deepspeed_stage_2',
|
|
124
|
+
max_epochs=1
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
# Train
|
|
128
|
+
model = RWKV(config)
|
|
129
|
+
trainer.fit(model, train_dataloader)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Workflow 4: RWKV vs Transformer comparison
|
|
133
|
+
|
|
134
|
+
**Memory comparison** (1M token sequence):
|
|
135
|
+
```python
|
|
136
|
+
# Transformer (GPT)
|
|
137
|
+
# Memory: O(n²) for attention
|
|
138
|
+
# KV cache: 1M × hidden_dim × n_layers × 2 (keys + values)
|
|
139
|
+
# Example: 1M × 4096 × 24 × 2 = ~400GB (impractical!)
|
|
140
|
+
|
|
141
|
+
# RWKV
|
|
142
|
+
# Memory: O(1) per token
|
|
143
|
+
# State: hidden_dim × n_layers = 4096 × 24 = ~400KB
|
|
144
|
+
# 1,000,000× more efficient!
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
**Speed comparison** (inference):
|
|
148
|
+
```python
|
|
149
|
+
# Transformer: O(n) per token (quadratic overall)
|
|
150
|
+
# First token: 1 computation
|
|
151
|
+
# Second token: 2 computations
|
|
152
|
+
# ...
|
|
153
|
+
# 1000th token: 1000 computations
|
|
154
|
+
|
|
155
|
+
# RWKV: O(1) per token (linear overall)
|
|
156
|
+
# Every token: 1 computation
|
|
157
|
+
# 1000th token: 1 computation (same as first!)
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## When to use vs alternatives
|
|
161
|
+
|
|
162
|
+
**Use RWKV when**:
|
|
163
|
+
- Need very long context (100K+ tokens)
|
|
164
|
+
- Want constant memory usage
|
|
165
|
+
- Building streaming applications
|
|
166
|
+
- Need RNN efficiency with Transformer performance
|
|
167
|
+
- Memory-constrained deployment
|
|
168
|
+
|
|
169
|
+
**Key advantages**:
|
|
170
|
+
- **Linear time**: O(n) vs O(n²) for Transformers
|
|
171
|
+
- **No KV cache**: Constant memory per token
|
|
172
|
+
- **Infinite context**: No fixed window limit
|
|
173
|
+
- **Parallelizable training**: Like GPT
|
|
174
|
+
- **Sequential inference**: Like RNN
|
|
175
|
+
|
|
176
|
+
**Use alternatives instead**:
|
|
177
|
+
- **Transformers**: Need absolute best performance, have compute
|
|
178
|
+
- **Mamba**: Want state-space models
|
|
179
|
+
- **RetNet**: Need retention mechanism
|
|
180
|
+
- **Hyena**: Want convolution-based approach
|
|
181
|
+
|
|
182
|
+
## Common issues
|
|
183
|
+
|
|
184
|
+
**Issue: Out of memory during training**
|
|
185
|
+
|
|
186
|
+
Use gradient checkpointing and DeepSpeed:
|
|
187
|
+
```python
|
|
188
|
+
trainer = pl.Trainer(
|
|
189
|
+
strategy='deepspeed_stage_3', # Full ZeRO-3
|
|
190
|
+
precision='bf16'
|
|
191
|
+
)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
**Issue: Slow inference**
|
|
195
|
+
|
|
196
|
+
Enable CUDA kernel:
|
|
197
|
+
```python
|
|
198
|
+
os.environ["RWKV_CUDA_ON"] = '1'
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
**Issue: Model not loading**
|
|
202
|
+
|
|
203
|
+
Check model path and strategy:
|
|
204
|
+
```python
|
|
205
|
+
model = RWKV(
|
|
206
|
+
model='/absolute/path/to/model.pth',
|
|
207
|
+
strategy='cuda fp16' # Or 'cpu fp32' for CPU
|
|
208
|
+
)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
**Issue: State management in RNN mode**
|
|
212
|
+
|
|
213
|
+
Always pass state between forward calls:
|
|
214
|
+
```python
|
|
215
|
+
# WRONG: State lost
|
|
216
|
+
out1, _ = model.forward(tokens1, None)
|
|
217
|
+
out2, _ = model.forward(tokens2, None) # No context from tokens1!
|
|
218
|
+
|
|
219
|
+
# CORRECT: State preserved
|
|
220
|
+
out1, state = model.forward(tokens1, None)
|
|
221
|
+
out2, state = model.forward(tokens2, state) # Has context from tokens1
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## Advanced topics
|
|
225
|
+
|
|
226
|
+
**Time-mixing and channel-mixing**: See [references/architecture-details.md](references/architecture-details.md) for WKV operation, time-decay mechanism, and receptance gates.
|
|
227
|
+
|
|
228
|
+
**State management**: See [references/state-management.md](references/state-management.md) for att_x_prev, att_kv, ffn_x_prev states, and numerical stability considerations.
|
|
229
|
+
|
|
230
|
+
**RWKV-7 improvements**: See [references/rwkv7.md](references/rwkv7.md) for latest architectural improvements (March 2025) and multimodal capabilities.
|
|
231
|
+
|
|
232
|
+
## Hardware requirements
|
|
233
|
+
|
|
234
|
+
- **GPU**: NVIDIA (CUDA 11.6+) or CPU
|
|
235
|
+
- **VRAM** (FP16):
|
|
236
|
+
- 169M model: 1GB
|
|
237
|
+
- 430M model: 2GB
|
|
238
|
+
- 1.5B model: 4GB
|
|
239
|
+
- 3B model: 8GB
|
|
240
|
+
- 7B model: 16GB
|
|
241
|
+
- 14B model: 32GB
|
|
242
|
+
- **Inference**: O(1) memory per token
|
|
243
|
+
- **Training**: Parallelizable like GPT
|
|
244
|
+
|
|
245
|
+
**Performance** (vs Transformers):
|
|
246
|
+
- **Speed**: Similar training, faster inference
|
|
247
|
+
- **Memory**: 1000× less for long sequences
|
|
248
|
+
- **Scaling**: Linear vs quadratic
|
|
249
|
+
|
|
250
|
+
## Resources
|
|
251
|
+
|
|
252
|
+
- Paper (RWKV): https://arxiv.org/abs/2305.13048 (May 2023)
|
|
253
|
+
- Paper (RWKV-7): https://arxiv.org/abs/2503.14456 (March 2025)
|
|
254
|
+
- GitHub: https://github.com/BlinkDL/RWKV-LM ⭐ 12,000+
|
|
255
|
+
- Docs: https://wiki.rwkv.com/
|
|
256
|
+
- Models: https://huggingface.co/BlinkDL
|
|
257
|
+
- Linux Foundation AI: Official project
|
|
258
|
+
- Production: Microsoft Windows, Office integration, NeMo support
|
|
259
|
+
|
|
260
|
+
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
# RWKV Architecture Details
|
|
2
|
+
|
|
3
|
+
## Time-Mixing and Channel-Mixing Blocks
|
|
4
|
+
|
|
5
|
+
RWKV alternates between **Time-Mixing** (sequence processing) and **Channel-Mixing** (feature processing) blocks.
|
|
6
|
+
|
|
7
|
+
### Time-Mixing Block (WKV Operation)
|
|
8
|
+
|
|
9
|
+
The core innovation is the **WKV (Weighted Key-Value)** mechanism:
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
# Traditional Attention (O(n²))
|
|
13
|
+
scores = Q @ K.T / sqrt(d) # n×n matrix
|
|
14
|
+
attention = softmax(scores)
|
|
15
|
+
output = attention @ V
|
|
16
|
+
|
|
17
|
+
# RWKV Time-Mixing (O(n))
|
|
18
|
+
# Compute WKV in linear time using recurrence
|
|
19
|
+
for t in range(T):
|
|
20
|
+
wkv[t] = (exp(u) * k[t] * v[t] + aa[t]) / (exp(u) * k[t] + ab[t])
|
|
21
|
+
aa[t+1] = exp(-exp(w)) * aa[t] + k[t] * v[t]  # decayed numerator
|
|
22
|
+
ab[t+1] = exp(-exp(w)) * ab[t] + k[t]         # decayed denominator
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
**Full Time-Mixing implementation**:
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
class RWKV_TimeMix(nn.Module):
|
|
29
|
+
def __init__(self, d_model, n_layer):
|
|
30
|
+
super().__init__()
|
|
31
|
+
self.d_model = d_model
|
|
32
|
+
|
|
33
|
+
# Linear projections
|
|
34
|
+
self.key = nn.Linear(d_model, d_model, bias=False)
|
|
35
|
+
self.value = nn.Linear(d_model, d_model, bias=False)
|
|
36
|
+
self.receptance = nn.Linear(d_model, d_model, bias=False)
|
|
37
|
+
self.output = nn.Linear(d_model, d_model, bias=False)
|
|
38
|
+
|
|
39
|
+
# Time-mixing parameters
|
|
40
|
+
self.time_mix_k = nn.Parameter(torch.ones(1, 1, d_model))
|
|
41
|
+
self.time_mix_v = nn.Parameter(torch.ones(1, 1, d_model))
|
|
42
|
+
self.time_mix_r = nn.Parameter(torch.ones(1, 1, d_model))
|
|
43
|
+
|
|
44
|
+
# Time-decay and bonus
|
|
45
|
+
self.time_decay = nn.Parameter(torch.ones(d_model)) # w
|
|
46
|
+
self.time_first = nn.Parameter(torch.ones(d_model)) # u
|
|
47
|
+
|
|
48
|
+
def forward(self, x, state=None):
|
|
49
|
+
B, T, C = x.shape
|
|
50
|
+
|
|
51
|
+
# Time-shift mixing (interpolate with previous token)
|
|
52
|
+
if state is None:
|
|
53
|
+
state = torch.zeros(B, C, 3, device=x.device) # [aa, ab, x_prev]
|
|
54
|
+
|
|
55
|
+
x_prev = state[:, :, 2].unsqueeze(1) # Previous x
|
|
56
|
+
xk = x * self.time_mix_k + x_prev * (1 - self.time_mix_k)
|
|
57
|
+
xv = x * self.time_mix_v + x_prev * (1 - self.time_mix_v)
|
|
58
|
+
xr = x * self.time_mix_r + x_prev * (1 - self.time_mix_r)
|
|
59
|
+
|
|
60
|
+
# Compute k, v, r
|
|
61
|
+
k = self.key(xk)
|
|
62
|
+
v = self.value(xv)
|
|
63
|
+
r = self.receptance(xr)
|
|
64
|
+
|
|
65
|
+
# WKV computation (parallelizable or sequential)
|
|
66
|
+
wkv, (state_aa, state_ab) = self.wkv(k, v, state[:, :, :2])
|
|
67
|
+
|
|
68
|
+
# Apply receptance gate and output projection
|
|
69
|
+
out = self.output(torch.sigmoid(r) * wkv)
|
|
70
|
+
|
|
71
|
+
# Update state
|
|
72
|
+
new_state = torch.stack([state_aa, state_ab, x[:, -1]], dim=2)
|
|
73
|
+
|
|
74
|
+
return out, new_state
|
|
75
|
+
|
|
76
|
+
def wkv(self, k, v, state):
|
|
77
|
+
# Parallel implementation (training)
|
|
78
|
+
# Sequential implementation (inference) - see below
|
|
79
|
+
...
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### WKV Parallel Algorithm (Training)
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
def wkv_forward(w, u, k, v):
|
|
86
|
+
"""
|
|
87
|
+
Parallel WKV computation for training.
|
|
88
|
+
w: time_decay (d_model,)
|
|
89
|
+
u: time_first (d_model,)
|
|
90
|
+
k: keys (batch, seq_len, d_model)
|
|
91
|
+
v: values (batch, seq_len, d_model)
|
|
92
|
+
"""
|
|
93
|
+
B, T, C = k.shape
|
|
94
|
+
|
|
95
|
+
# Compute cumulative sums with exponential decay
|
|
96
|
+
# This is the key to O(n) parallel computation
|
|
97
|
+
w = -torch.exp(w) # Negative for decay
|
|
98
|
+
|
|
99
|
+
# Associative scan operation
|
|
100
|
+
wkv = torch.zeros(B, T, C, device=k.device)
|
|
101
|
+
aa = torch.zeros(B, C, device=k.device)  # running numerator
ab = torch.zeros(B, C, device=k.device)  # running denominator
|
|
102
|
+
|
|
103
|
+
for t in range(T):
|
|
104
|
+
kv = k[:, t] * v[:, t]
|
|
105
|
+
wkv[:, t] = (torch.exp(u) * kv + aa) / (torch.exp(u) * k[:, t] + ab)
|
|
106
|
+
aa = torch.exp(w) * aa + kv
ab = torch.exp(w) * ab + k[:, t]
|
|
107
|
+
|
|
108
|
+
return wkv
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### WKV Sequential Algorithm (Inference)
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
def wkv_inference(w, u, k, v, state):
|
|
115
|
+
"""
|
|
116
|
+
Sequential WKV for O(1) per-token inference.
|
|
117
|
+
state: (aa, ab) from previous step
|
|
118
|
+
"""
|
|
119
|
+
w = torch.exp(-torch.exp(w))  # time_decay -> multiplier in (0, 1)
|
|
120
|
+
u = torch.exp(u) # time_first
|
|
121
|
+
|
|
122
|
+
# Unpack state
|
|
123
|
+
aa, ab = state # aa = numerator, ab = denominator
|
|
124
|
+
|
|
125
|
+
# Compute WKV for current token
|
|
126
|
+
kv = k * v
|
|
127
|
+
wkv = (u * kv + aa) / (u * k + ab)
|
|
128
|
+
|
|
129
|
+
# Update state for next token
|
|
130
|
+
new_aa = w * aa + kv
|
|
131
|
+
new_ab = w * ab + k
|
|
132
|
+
|
|
133
|
+
return wkv, (new_aa, new_ab)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Channel-Mixing Block
|
|
137
|
+
|
|
138
|
+
Replaces Transformer FFN with time-shifted variant:
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
class RWKV_ChannelMix(nn.Module):
|
|
142
|
+
def __init__(self, d_model, hidden_ratio=4):
|
|
143
|
+
super().__init__()
|
|
144
|
+
self.d_model = d_model
|
|
145
|
+
self.hidden = d_model * hidden_ratio
|
|
146
|
+
|
|
147
|
+
# Time-mixing for channel
|
|
148
|
+
self.time_mix_k = nn.Parameter(torch.ones(1, 1, d_model))
|
|
149
|
+
self.time_mix_r = nn.Parameter(torch.ones(1, 1, d_model))
|
|
150
|
+
|
|
151
|
+
# FFN layers
|
|
152
|
+
self.key = nn.Linear(d_model, self.hidden, bias=False)
|
|
153
|
+
self.receptance = nn.Linear(d_model, d_model, bias=False)
|
|
154
|
+
self.value = nn.Linear(self.hidden, d_model, bias=False)
|
|
155
|
+
|
|
156
|
+
def forward(self, x, x_prev):
|
|
157
|
+
# Time-shift mixing
|
|
158
|
+
xk = x * self.time_mix_k + x_prev * (1 - self.time_mix_k)
|
|
159
|
+
xr = x * self.time_mix_r + x_prev * (1 - self.time_mix_r)
|
|
160
|
+
|
|
161
|
+
# Channel mixing
|
|
162
|
+
k = self.key(xk)
|
|
163
|
+
k = torch.square(torch.relu(k)) # Squared ReLU activation
|
|
164
|
+
kv = self.value(k)
|
|
165
|
+
|
|
166
|
+
# Receptance gate
|
|
167
|
+
r = torch.sigmoid(self.receptance(xr))
|
|
168
|
+
|
|
169
|
+
return r * kv
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## RWKV Block Structure
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
class RWKV_Block(nn.Module):
|
|
176
|
+
def __init__(self, d_model, n_layer):
|
|
177
|
+
super().__init__()
|
|
178
|
+
self.ln1 = nn.LayerNorm(d_model)
|
|
179
|
+
self.ln2 = nn.LayerNorm(d_model)
|
|
180
|
+
self.att = RWKV_TimeMix(d_model, n_layer)
|
|
181
|
+
self.ffn = RWKV_ChannelMix(d_model)
|
|
182
|
+
|
|
183
|
+
def forward(self, x, state):
|
|
184
|
+
# Time-mixing with residual
|
|
185
|
+
att_out, new_state = self.att(self.ln1(x), state)
|
|
186
|
+
x = x + att_out
|
|
187
|
+
|
|
188
|
+
# Channel-mixing with residual
|
|
189
|
+
ffn_out = self.ffn(self.ln2(x), state[:, :, 2])  # NOTE: simplified — full models keep a separate ffn_x_prev slot (see State Shape)
|
|
190
|
+
x = x + ffn_out
|
|
191
|
+
|
|
192
|
+
return x, new_state
|
|
193
|
+
|
|
194
|
+
# Full RWKV model
|
|
195
|
+
model = nn.Sequential(
|
|
196
|
+
Embedding(...),
|
|
197
|
+
*[RWKV_Block(d_model, i) for i in range(n_layers)],
|
|
198
|
+
LayerNorm(d_model),
|
|
199
|
+
LMHead(...)
|
|
200
|
+
)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## Time-Decay Mechanism
|
|
204
|
+
|
|
205
|
+
The **time_decay** parameter `w` controls how fast information decays:
|
|
206
|
+
|
|
207
|
+
```python
|
|
208
|
+
# Initialization (RWKV-4)
|
|
209
|
+
time_decay = torch.ones(n_layers, d_model)
|
|
210
|
+
for i in range(n_layers):
|
|
211
|
+
for j in range(d_model):
|
|
212
|
+
# Logarithmic spacing
|
|
213
|
+
ratio = (i + 1) / n_layers
|
|
214
|
+
time_decay[i, j] = -5.0 + 8.0 * ratio + 0.3 * (j / d_model)
|
|
215
|
+
|
|
216
|
+
# Effect on memory
|
|
217
|
+
w = -exp(time_decay) # Range: [-exp(-5), -exp(3)] ≈ [-0.007, -20]
|
|
218
|
+
# Smaller w = slower decay = longer memory
|
|
219
|
+
# Larger w = faster decay = shorter memory
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
**Layer-wise decay pattern**:
|
|
223
|
+
- Early layers (shallow): Fast decay, capture local patterns
|
|
224
|
+
- Later layers (deep): Slow decay, capture long-range dependencies
|
|
225
|
+
|
|
226
|
+
## Receptance Gate
|
|
227
|
+
|
|
228
|
+
The **receptance** mechanism controls information flow:
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
r = sigmoid(receptance(x)) # Range [0, 1]
|
|
232
|
+
output = r * wkv # Gate the WKV output
|
|
233
|
+
|
|
234
|
+
# High receptance (r ≈ 1): Pass information through
|
|
235
|
+
# Low receptance (r ≈ 0): Block information
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
**Purpose**: Similar to LSTM forget gate, but learned per-token
|
|
239
|
+
|
|
240
|
+
## RWKV-4 vs RWKV-5 vs RWKV-6 vs RWKV-7
|
|
241
|
+
|
|
242
|
+
### RWKV-4 (Original)
|
|
243
|
+
```python
|
|
244
|
+
# Time-shift with previous token
|
|
245
|
+
xx = x * time_mix + x_prev * (1 - time_mix)
|
|
246
|
+
k, v, r = key(xx), value(xx), receptance(xx)
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
### RWKV-5 (2023)
|
|
250
|
+
```python
|
|
251
|
+
# Separate time-mix for k, v, r
|
|
252
|
+
xk = x * time_mix_k + x_prev * (1 - time_mix_k)
|
|
253
|
+
xv = x * time_mix_v + x_prev * (1 - time_mix_v)
|
|
254
|
+
xr = x * time_mix_r + x_prev * (1 - time_mix_r)
|
|
255
|
+
k, v, r = key(xk), value(xv), receptance(xr)
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### RWKV-6 (2024)
|
|
259
|
+
- Added **multi-head time-mixing** (like multi-head attention)
|
|
260
|
+
- Separate time-decay per head
|
|
261
|
+
- Improved stability for large models
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
# Per-head processing
|
|
265
|
+
for h in range(n_heads):
|
|
266
|
+
k_h = key[h](x) # Separate projection per head
|
|
267
|
+
w_h = time_decay[h] # Separate decay per head
|
|
268
|
+
wkv_h = wkv(k_h, v_h, w_h)
|
|
269
|
+
output = concat(wkv_0, wkv_1, ..., wkv_H)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
### RWKV-7 (March 2025)
|
|
273
|
+
- **Multimodal support** (vision + language)
|
|
274
|
+
- Improved numerical stability
|
|
275
|
+
- Better scaling to 14B+ parameters
|
|
276
|
+
|
|
277
|
+
## Numerical Stability
|
|
278
|
+
|
|
279
|
+
### Issue: Exponential Overflow
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
# Problem: exp(wkv) can overflow
|
|
283
|
+
wkv = exp(u * kv) / exp(u * k) # Can overflow!
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
### Solution: Log-space Computation
|
|
287
|
+
|
|
288
|
+
```python
|
|
289
|
+
# Stable implementation: rescale by the running max exponent (pp carried in state)
|
|
290
|
+
p = torch.maximum(pp, u + k)  # new max exponent
|
|
291
|
+
e1 = torch.exp(pp - p); e2 = torch.exp(u + k - p)
|
|
292
|
+
wkv = (e1 * aa + e2 * v) / (e1 * ab + e2)  # numerically stable
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Gradient Clipping
|
|
296
|
+
|
|
297
|
+
```python
|
|
298
|
+
# Recommended for training stability
|
|
299
|
+
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
## State Management
|
|
303
|
+
|
|
304
|
+
### State Shape
|
|
305
|
+
|
|
306
|
+
```python
|
|
307
|
+
# For batch inference
|
|
308
|
+
state = torch.zeros(
|
|
309
|
+
batch_size,
|
|
310
|
+
n_layers,
|
|
311
|
+
4, # (att_aa, att_ab, att_x_prev, ffn_x_prev)
|
|
312
|
+
d_model
|
|
313
|
+
)
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
### State Initialization
|
|
317
|
+
|
|
318
|
+
```python
|
|
319
|
+
# Zero initialization (standard)
|
|
320
|
+
state = None # Model creates zero state
|
|
321
|
+
|
|
322
|
+
# Warm state (from previous conversation)
|
|
323
|
+
_, state = model.forward(previous_context, None)
|
|
324
|
+
# Use `state` for next turn
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
### State Serialization
|
|
328
|
+
|
|
329
|
+
```python
|
|
330
|
+
# Save conversation state
|
|
331
|
+
torch.save(state, 'conversation_state.pt')
|
|
332
|
+
|
|
333
|
+
# Resume conversation
|
|
334
|
+
state = torch.load('conversation_state.pt')
|
|
335
|
+
out, state = model.forward(new_tokens, state)
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
## Resources
|
|
339
|
+
|
|
340
|
+
- Paper (RWKV): https://arxiv.org/abs/2305.13048 (May 2023)
|
|
341
|
+
- Paper (RWKV-7): https://arxiv.org/abs/2503.14456 (March 2025)
|
|
342
|
+
- GitHub: https://github.com/BlinkDL/RWKV-LM
|
|
343
|
+
- Math derivation: https://wiki.rwkv.com/
|
|
344
|
+
- CUDA kernels: https://github.com/BlinkDL/RWKV-CUDA
|