@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,536 @@
1
+ ---
2
+ name: long-context
3
+ description: Extend context windows of transformer models using RoPE, YaRN, ALiBi, and position interpolation techniques. Use when processing long documents (32k-128k+ tokens), extending pre-trained models beyond original context limits, or implementing efficient positional encodings. Covers rotary embeddings, attention biases, interpolation methods, and extrapolation strategies for LLMs.
4
+ version: 1.0.0
5
+ author: Synthetic Sciences
6
+ license: MIT
7
+ tags: [Emerging Techniques, Long Context, RoPE, YaRN, ALiBi, Position Interpolation, Extended Context, Rotary Embeddings, Attention Bias, Context Extension, Positional Encoding]
8
+ dependencies: [transformers, torch, flash-attn]
9
+ ---
10
+
11
+ # Long Context: Extending Transformer Context Windows
12
+
13
+ ## When to Use This Skill
14
+
15
+ Use Long Context techniques when you need to:
16
+ - **Process long documents** (32k, 64k, 128k+ tokens) with transformer models
17
+ - **Extend context windows** of pre-trained models (LLaMA, Mistral, etc.)
18
+ - **Implement efficient positional encodings** (RoPE, ALiBi)
19
+ - **Train models** with length extrapolation capabilities
20
+ - **Deploy models** that handle variable-length inputs efficiently
21
+ - **Fine-tune** existing models for longer contexts with minimal compute
22
+
23
+ **Key Techniques**: RoPE (Rotary Position Embeddings), YaRN, ALiBi (Attention with Linear Biases), Position Interpolation
24
+
25
+ **Papers**: RoFormer (arXiv 2104.09864), YaRN (arXiv 2309.00071), ALiBi (arXiv 2108.12409), Position Interpolation (arXiv 2306.15595)
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ # HuggingFace Transformers (includes RoPE, YaRN support)
31
+ pip install transformers torch
32
+
33
+ # For custom implementations
34
+ pip install einops # Tensor operations
35
+ pip install rotary-embedding-torch # Standalone RoPE
36
+
37
+ # Optional: FlashAttention for efficiency
38
+ pip install flash-attn --no-build-isolation
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ### RoPE (Rotary Position Embeddings)
44
+
45
+ ```python
46
+ import torch
47
+ import torch.nn as nn
48
+
49
+ class RotaryEmbedding(nn.Module):
50
+ """Rotary Position Embeddings (RoPE)."""
51
+
52
+ def __init__(self, dim, max_seq_len=8192, base=10000):
53
+ super().__init__()
54
+ # Compute inverse frequencies
55
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
56
+ self.register_buffer("inv_freq", inv_freq)
57
+ self.max_seq_len = max_seq_len
58
+
59
+ def forward(self, seq_len, device):
60
+ # Position indices
61
+ t = torch.arange(seq_len, device=device).type_as(self.inv_freq)
62
+
63
+ # Compute frequencies
64
+ freqs = torch.outer(t, self.inv_freq) # (seq_len, dim/2)
65
+
66
+ # Compute sin and cos
67
+ emb = torch.cat((freqs, freqs), dim=-1) # (seq_len, dim)
68
+ return emb.cos(), emb.sin()
69
+
70
+ def rotate_half(x):
71
+ """Rotate half the hidden dimensions."""
72
+ x1, x2 = x.chunk(2, dim=-1)
73
+ return torch.cat((-x2, x1), dim=-1)
74
+
75
+ def apply_rotary_pos_emb(q, k, cos, sin):
76
+ """Apply rotary embeddings to queries and keys."""
77
+ # q, k shape: (batch, heads, seq_len, dim)
78
+ q_embed = (q * cos) + (rotate_half(q) * sin)
79
+ k_embed = (k * cos) + (rotate_half(k) * sin)
80
+ return q_embed, k_embed
81
+
82
+ # Usage
83
+ rope = RotaryEmbedding(dim=64, max_seq_len=8192)
84
+ cos, sin = rope(seq_len=2048, device='cuda')
85
+
86
+ # In attention layer
87
+ q_rotated, k_rotated = apply_rotary_pos_emb(query, key, cos, sin)
88
+ ```
89
+
90
+ ### ALiBi (Attention with Linear Biases)
91
+
92
+ ```python
93
+ def get_alibi_slopes(num_heads):
94
+ """Get ALiBi slope values for each attention head."""
95
+ def get_slopes_power_of_2(n):
96
+ start = 2 ** (-(2 ** -(math.log2(n) - 3)))
97
+ ratio = start
98
+ return [start * (ratio ** i) for i in range(n)]
99
+
100
+ if math.log2(num_heads).is_integer():
101
+ return get_slopes_power_of_2(num_heads)
102
+ else:
103
+ # Closest power of 2
104
+ closest_power = 2 ** math.floor(math.log2(num_heads))
105
+ slopes = get_slopes_power_of_2(closest_power)
106
+ # Add extra slopes
107
+ extra = get_slopes_power_of_2(2 * closest_power)
108
+ slopes.extend(extra[0::2][:num_heads - closest_power])
109
+ return slopes
110
+
111
+ def create_alibi_bias(seq_len, num_heads):
112
+ """Create ALiBi attention bias."""
113
+ # Distance matrix
114
+ context_position = torch.arange(seq_len)
115
+ memory_position = torch.arange(seq_len)
116
+ relative_position = memory_position[None, :] - context_position[:, None]
117
+
118
+ # Get slopes
119
+ slopes = torch.tensor(get_alibi_slopes(num_heads))
120
+
121
+ # Apply slopes to distances
122
+ alibi = slopes[:, None, None] * relative_position[None, :, :]
123
+ return alibi # (num_heads, seq_len, seq_len)
124
+
125
+ # Usage in attention
126
+ num_heads = 8
127
+ seq_len = 2048
128
+ alibi_bias = create_alibi_bias(seq_len, num_heads).to('cuda')
129
+
130
+ # Add bias to attention scores
131
+ # attn_scores shape: (batch, num_heads, seq_len, seq_len)
132
+ attn_scores = attn_scores + alibi_bias
133
+ attn_weights = torch.softmax(attn_scores, dim=-1)
134
+ ```
135
+
136
+ ### Position Interpolation for LLaMA
137
+
138
+ ```python
139
+ from transformers import LlamaForCausalLM, LlamaTokenizer
140
+
141
+ # Original context: 2048 tokens
142
+ model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
143
+
144
+ # Extend to 32k with position interpolation
145
+ # Modify RoPE base frequency
146
+ model.config.rope_scaling = {
147
+ "type": "linear",
148
+ "factor": 16.0 # 2048 * 16 = 32768
149
+ }
150
+
151
+ # Or use dynamic scaling
152
+ model.config.rope_scaling = {
153
+ "type": "dynamic",
154
+ "factor": 16.0
155
+ }
156
+
157
+ # Fine-tune with long documents (minimal steps needed)
158
+ # Position interpolation works out-of-the-box after this config change
159
+ ```
160
+
161
+ ## Core Concepts
162
+
163
+ ### 1. RoPE (Rotary Position Embeddings)
164
+
165
+ **How it works:**
166
+ - Encodes absolute position via rotation matrix
167
+ - Provides relative position dependency in attention
168
+ - Enables length extrapolation
169
+
170
+ **Mathematical formulation:**
171
+ ```
172
+ q_m = (W_q * x_m) * e^(imθ)
173
+ k_n = (W_k * x_n) * e^(inθ)
174
+
175
+ where θ_j = base^(-2j/d) for j ∈ [0, d/2)
176
+ ```
177
+
178
+ **Advantages:**
179
+ - Decaying inter-token dependency with distance
180
+ - Compatible with linear attention
181
+ - Better extrapolation than absolute position encodings
182
+
183
+ ### 2. YaRN (Yet another RoPE extensioN)
184
+
185
+ **Key innovation:**
186
+ - NTK-aware interpolation (Neural Tangent Kernel)
187
+ - Attention temperature scaling
188
+ - Efficient context extension (10× fewer tokens than baselines)
189
+
190
+ **Parameters:**
191
+ ```python
192
+ # YaRN configuration
193
+ yarn_config = {
194
+ "scale": 16, # Extension factor
195
+ "original_max_position": 2048, # Base context
196
+ "extrapolation_factor": 1.0, # NTK parameter
197
+ "attn_factor": 1.0, # Attention scaling
198
+ "beta_fast": 32, # High-frequency scale
199
+ "beta_slow": 1, # Low-frequency scale
200
+ }
201
+ ```
202
+
203
+ **Performance:**
204
+ - Extends LLaMA to 128k tokens
205
+ - 2.5× fewer training steps than baselines
206
+ - State-of-the-art context window extension
207
+
208
+ ### 3. ALiBi (Attention with Linear Biases)
209
+
210
+ **Core idea:**
211
+ - No positional embeddings added to tokens
212
+ - Apply distance penalty directly to attention scores
213
+ - Bias proportional to key-query distance
214
+
215
+ **Formula:**
216
+ ```
217
+ attention_bias[i, j] = -m * |i - j|
218
+
219
+ where m = slope for each attention head
220
+ ```
221
+
222
+ **Advantages:**
223
+ - 11% faster training vs sinusoidal embeddings
224
+ - 11% less memory usage
225
+ - Strong length extrapolation (train 1k, test 2k+)
226
+ - Inductive bias towards recency
227
+
228
+ ### 4. Position Interpolation
229
+
230
+ **Technique:**
231
+ - Linearly down-scale position indices
232
+ - Interpolate within trained range (vs extrapolate beyond)
233
+ - Minimal fine-tuning required
234
+
235
+ **Formula:**
236
+ ```
237
+ # Original: position indices [0, 1, 2, ..., L]
238
+ # Extended: position indices [0, 0.5, 1.0, ..., L/2]
239
+ # (for 2× extension)
240
+
241
+ scaled_position[i] = i / extension_factor
242
+ ```
243
+
244
+ **Results:**
245
+ - LLaMA 7B-65B extended to 32k tokens
246
+ - 1000 fine-tuning steps sufficient
247
+ - Interpolation error bound ~600× smaller than extrapolation's
248
+
249
+ ## Method Comparison
250
+
251
+ | Method | Max Context | Training Needed | Memory | Extrapolation | Best For |
252
+ |--------|-------------|-----------------|--------|---------------|----------|
253
+ | **RoPE** | 8k-32k | Full pre-training | Moderate | Good | New models |
254
+ | **YaRN** | 32k-128k | Minimal (10× efficient) | Moderate | Excellent | Extending existing models |
255
+ | **ALiBi** | Unlimited | Full pre-training | Low (-11%) | Excellent | Training from scratch |
256
+ | **Position Interpolation** | 32k+ | Minimal (1k steps) | Moderate | Poor (by design) | Quick extension |
257
+
258
+ ## Implementation Patterns
259
+
260
+ ### HuggingFace Transformers Integration
261
+
262
+ ```python
263
+ from transformers import AutoModelForCausalLM, AutoConfig
264
+
265
+ # RoPE with YaRN scaling
266
+ config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
267
+ config.rope_scaling = {
268
+ "type": "yarn",
269
+ "factor": 8.0,
270
+ "original_max_position_embeddings": 8192,
271
+ "attention_factor": 1.0
272
+ }
273
+
274
+ model = AutoModelForCausalLM.from_config(config)
275
+
276
+ # Position interpolation (simpler)
277
+ config.rope_scaling = {
278
+ "type": "linear",
279
+ "factor": 4.0
280
+ }
281
+
282
+ # Dynamic scaling (adjusts based on input length)
283
+ config.rope_scaling = {
284
+ "type": "dynamic",
285
+ "factor": 8.0
286
+ }
287
+ ```
288
+
289
+ ### Custom RoPE Implementation
290
+
291
+ ```python
292
+ class LongContextAttention(nn.Module):
293
+ """Multi-head attention with RoPE."""
294
+
295
+ def __init__(self, hidden_size, num_heads, max_seq_len=32768):
296
+ super().__init__()
297
+ self.num_heads = num_heads
298
+ self.head_dim = hidden_size // num_heads
299
+
300
+ # Q, K, V projections
301
+ self.q_proj = nn.Linear(hidden_size, hidden_size)
302
+ self.k_proj = nn.Linear(hidden_size, hidden_size)
303
+ self.v_proj = nn.Linear(hidden_size, hidden_size)
304
+ self.o_proj = nn.Linear(hidden_size, hidden_size)
305
+
306
+ # RoPE
307
+ self.rotary_emb = RotaryEmbedding(
308
+ dim=self.head_dim,
309
+ max_seq_len=max_seq_len
310
+ )
311
+
312
+ def forward(self, hidden_states):
313
+ batch_size, seq_len, _ = hidden_states.shape
314
+
315
+ # Project to Q, K, V
316
+ q = self.q_proj(hidden_states)
317
+ k = self.k_proj(hidden_states)
318
+ v = self.v_proj(hidden_states)
319
+
320
+ # Reshape for multi-head
321
+ q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
322
+ k = k.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
323
+ v = v.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
324
+
325
+ # Apply RoPE
326
+ cos, sin = self.rotary_emb(seq_len, device=hidden_states.device)
327
+ q, k = apply_rotary_pos_emb(q, k, cos, sin)
328
+
329
+ # Standard attention
330
+ attn_output = F.scaled_dot_product_attention(q, k, v)
331
+
332
+ # Reshape and project
333
+ attn_output = attn_output.transpose(1, 2).contiguous()
334
+ attn_output = attn_output.view(batch_size, seq_len, -1)
335
+ output = self.o_proj(attn_output)
336
+
337
+ return output
338
+ ```
339
+
340
+ ## Fine-tuning for Long Context
341
+
342
+ ### Minimal Fine-tuning (Position Interpolation)
343
+
344
+ ```python
345
+ from transformers import Trainer, TrainingArguments
346
+
347
+ # Extend model config
348
+ model.config.max_position_embeddings = 32768
349
+ model.config.rope_scaling = {"type": "linear", "factor": 16.0}
350
+
351
+ # Training args (minimal steps needed)
352
+ training_args = TrainingArguments(
353
+ output_dir="./llama-32k",
354
+ num_train_epochs=1,
355
+ max_steps=1000, # Only 1000 steps!
356
+ per_device_train_batch_size=1,
357
+ gradient_accumulation_steps=16,
358
+ learning_rate=2e-5,
359
+ warmup_steps=100,
360
+ logging_steps=10,
361
+ save_steps=500,
362
+ )
363
+
364
+ # Train on long documents
365
+ trainer = Trainer(
366
+ model=model,
367
+ args=training_args,
368
+ train_dataset=long_document_dataset, # 32k token sequences
369
+ )
370
+
371
+ trainer.train()
372
+ ```
373
+
374
+ ### YaRN Fine-tuning
375
+
376
+ ```bash
377
+ # Clone YaRN implementation
378
+ git clone https://github.com/jquesnelle/yarn
379
+ cd yarn
380
+
381
+ # Fine-tune LLaMA with YaRN
382
+ python scripts/train.py \
383
+ --model meta-llama/Llama-2-7b-hf \
384
+ --scale 16 \
385
+ --rope_theta 10000 \
386
+ --max_length 32768 \
387
+ --batch_size 1 \
388
+ --gradient_accumulation 16 \
389
+ --steps 400 \
390
+ --learning_rate 2e-5
391
+ ```
392
+
393
+ ## Best Practices
394
+
395
+ ### 1. Choose the Right Method
396
+
397
+ ```python
398
+ # For NEW models (training from scratch)
399
+ use_method = "ALiBi" # Best extrapolation, lowest memory
400
+
401
+ # For EXTENDING existing RoPE models
402
+ use_method = "YaRN" # Most efficient extension (10× less data)
403
+
404
+ # For QUICK extension with minimal compute
405
+ use_method = "Position Interpolation" # 1000 steps
406
+
407
+ # For MODERATE extension with good efficiency
408
+ use_method = "Linear RoPE Scaling" # Built-in, simple
409
+ ```
410
+
411
+ ### 2. Scaling Factor Selection
412
+
413
+ ```python
414
+ # Conservative (safer, better quality)
415
+ scaling_factor = 2.0 # 8k → 16k
416
+
417
+ # Moderate (good balance)
418
+ scaling_factor = 4.0 # 8k → 32k
419
+
420
+ # Aggressive (requires more fine-tuning)
421
+ scaling_factor = 8.0 # 8k → 64k
422
+ scaling_factor = 16.0 # 8k → 128k
423
+
424
+ # Rule: Larger factors need more fine-tuning steps
425
+ steps_needed = 100 * scaling_factor # Rough estimate
426
+ ```
427
+
428
+ ### 3. Fine-tuning Data
429
+
430
+ ```python
431
+ # ✅ Good: Long documents matching target length
432
+ train_data = [
433
+ {"text": long_doc_32k_tokens}, # Full 32k
434
+ {"text": long_doc_24k_tokens}, # Varied lengths
435
+ {"text": long_doc_16k_tokens},
436
+ ]
437
+
438
+ # ❌ Bad: Short documents (won't learn long context)
439
+ train_data = [
440
+ {"text": short_doc_2k_tokens},
441
+ ]
442
+
443
+ # Use datasets like:
444
+ # - PG-19 (books, long texts)
445
+ # - arXiv papers
446
+ # - Long-form conversations
447
+ # - GitHub repositories (concatenated files)
448
+ ```
449
+
450
+ ### 4. Avoid Common Pitfalls
451
+
452
+ ```python
453
+ # ❌ Bad: Applying position interpolation without fine-tuning
454
+ model.config.rope_scaling = {"type": "linear", "factor": 16.0}
455
+ # Model will perform poorly without fine-tuning!
456
+
457
+ # ✅ Good: Fine-tune after scaling
458
+ model.config.rope_scaling = {"type": "linear", "factor": 16.0}
459
+ fine_tune(model, long_documents, steps=1000)
460
+
461
+ # ❌ Bad: Too aggressive scaling without data
462
+ scale_to_1M_tokens() # Won't work without massive fine-tuning
463
+
464
+ # ✅ Good: Incremental scaling
465
+ # 8k → 16k → 32k → 64k (fine-tune at each step)
466
+ ```
467
+
468
+ ## Production Deployment
469
+
470
+ ### Inference with Long Context
471
+
472
+ ```python
473
+ from transformers import AutoModelForCausalLM, AutoTokenizer
474
+
475
+ # Load long-context model
476
+ model = AutoModelForCausalLM.from_pretrained(
477
+ "togethercomputer/LLaMA-2-7B-32K", # 32k context
478
+ torch_dtype=torch.float16,
479
+ device_map="auto"
480
+ )
481
+ tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
482
+
483
+ # Process long document
484
+ long_text = "..." * 30000  # illustrative placeholder, roughly 30k tokens
485
+ inputs = tokenizer(long_text, return_tensors="pt", truncation=False).to('cuda')
486
+
487
+ # Generate
488
+ outputs = model.generate(
489
+ **inputs,
490
+ max_new_tokens=512,
491
+ temperature=0.7,
492
+ )
493
+
494
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
495
+ ```
496
+
497
+ ### Memory Optimization
498
+
499
+ ```python
500
+ # Use gradient checkpointing for fine-tuning
501
+ model.gradient_checkpointing_enable()
502
+
503
+ # Use Flash Attention 2
504
+ model = AutoModelForCausalLM.from_pretrained(
505
+ "meta-llama/Llama-2-7b-hf",
506
+ attn_implementation="flash_attention_2", # 2-3× faster
507
+ torch_dtype=torch.float16
508
+ )
509
+
510
+ # Use paged attention (vLLM)
511
+ from vllm import LLM
512
+
513
+ llm = LLM(
514
+ model="togethercomputer/LLaMA-2-7B-32K",
515
+ max_model_len=32768, # 32k context
516
+ gpu_memory_utilization=0.9
517
+ )
518
+ ```
519
+
520
+ ## Resources
521
+
522
+ - **RoPE Paper**: https://arxiv.org/abs/2104.09864 (RoFormer)
523
+ - **YaRN Paper**: https://arxiv.org/abs/2309.00071
524
+ - **ALiBi Paper**: https://arxiv.org/abs/2108.12409 (Train Short, Test Long)
525
+ - **Position Interpolation**: https://arxiv.org/abs/2306.15595
526
+ - **HuggingFace RoPE Utils**: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_rope_utils.py
527
+ - **YaRN Implementation**: https://github.com/jquesnelle/yarn
528
+ - **Together AI Blog**: https://www.together.ai/blog/llama-2-7b-32k
529
+
530
+ ## See Also
531
+
532
+ - `references/rope.md` - Detailed RoPE implementation and theory
533
+ - `references/extension_methods.md` - YaRN, ALiBi, Position Interpolation comparisons
534
+ - `references/fine_tuning.md` - Complete fine-tuning guide for context extension
535
+
536
+