@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,611 @@
1
+ # Fine-tuning for Context Extension
2
+
3
+ Complete guide to fine-tuning transformer models for longer context windows.
4
+
5
+ ## Table of Contents
6
+ - Data Preparation
7
+ - Training Configuration
8
+ - YaRN Fine-tuning
9
+ - Position Interpolation Fine-tuning
10
+ - Evaluation
11
+ - Production Deployment
12
+
13
+ ## Data Preparation
14
+
15
+ ### Long Document Datasets
16
+
17
+ **Best datasets for context extension**:
18
+
19
+ ```python
20
+ # 1. PG-19 (Books)
21
+ from datasets import load_dataset
22
+
23
+ pg19 = load_dataset("pg19", split="train")
24
+ # Average length: 50k-150k tokens
25
+ # Quality: High (literary works)
26
+
27
+ # 2. arXiv Papers
28
+ arxiv = load_dataset("scientific_papers", "arxiv", split="train")
29
+ # Average length: 4k-15k tokens
30
+ # Quality: High (technical content)
31
+
32
+ # 3. Long-form GitHub Code
33
+ github = load_dataset("codeparrot/github-code", split="train")
34
+ # Filter for large files (>5k tokens)
35
+
36
+ # 4. Long Conversations
37
+ conversations = load_dataset("HuggingFaceH4/ultrachat_200k", split="train")
38
+ # Concatenate multi-turn dialogues
39
+
40
+ # 5. Wikipedia Articles (concatenated)
41
+ wikipedia = load_dataset("wikipedia", "20220301.en", split="train")
42
+ ```
43
+
44
+ ### Creating Training Sequences
45
+
46
+ ```python
47
+ def create_long_sequences(dataset, target_length=32768, tokenizer=None):
48
+ """Create training sequences of target length."""
49
+ sequences = []
50
+
51
+ for example in dataset:
52
+ # Tokenize
53
+ tokens = tokenizer.encode(example['text'])
54
+
55
+ # If single document is long enough
56
+ if len(tokens) >= target_length:
57
+ # Split into chunks
58
+ for i in range(0, len(tokens) - target_length, target_length // 2):
59
+ sequences.append(tokens[i:i + target_length])
60
+ else:
61
+ # Concatenate multiple documents
62
+ buffer = tokens
63
+ while len(buffer) < target_length:
64
+ next_example = next(dataset)
65
+ buffer.extend(tokenizer.encode(next_example['text']))
66
+
67
+ sequences.append(buffer[:target_length])
68
+
69
+ return sequences
70
+ ```
71
+
72
+ ### Data Quality Checks
73
+
74
+ ```python
75
+ def validate_training_data(sequences, tokenizer, min_length=8192):
76
+ """Ensure data quality for context extension."""
77
+ issues = []
78
+
79
+ for i, seq in enumerate(sequences):
80
+ # 1. Check length
81
+ if len(seq) < min_length:
82
+ issues.append(f"Sequence {i}: too short ({len(seq)} tokens)")
83
+
84
+ # 2. Check for repetition (copy-paste errors)
85
+ if has_excessive_repetition(seq):
86
+ issues.append(f"Sequence {i}: excessive repetition")
87
+
88
+ # 3. Check for truncation artifacts
89
+ if looks_truncated(seq, tokenizer):
90
+ issues.append(f"Sequence {i}: appears truncated")
91
+
92
+ if issues:
93
+ print(f"⚠️ Found {len(issues)} data quality issues:")
94
+ for issue in issues[:10]: # Show first 10
95
+ print(f" - {issue}")
96
+
97
+ return len(issues) == 0
98
+
99
+ def has_excessive_repetition(tokens, window=50, threshold=0.8):
100
+ """Detect copy-paste or generated repetition."""
101
+ for i in range(len(tokens) - window * 2):
102
+ chunk1 = tokens[i:i + window]
103
+ chunk2 = tokens[i + window:i + window * 2]
104
+ similarity = sum(a == b for a, b in zip(chunk1, chunk2)) / window
105
+ if similarity > threshold:
106
+ return True
107
+ return False
108
+
109
+ def looks_truncated(tokens, tokenizer):
110
+ """Check if sequence ends mid-sentence."""
111
+ last_20 = tokenizer.decode(tokens[-20:])
112
+ # Check for incomplete sentences
113
+ return not any(last_20.endswith(c) for c in ['.', '!', '?', '\n'])
114
+ ```
115
+
116
+ ## Training Configuration
117
+
118
+ ### Position Interpolation Setup
119
+
120
+ **Minimal fine-tuning** (fastest method):
121
+
122
+ ```python
123
+ from transformers import (
124
+ AutoModelForCausalLM,
125
+ AutoTokenizer,
126
+ TrainingArguments,
127
+ Trainer
128
+ )
129
+
130
+ # 1. Load base model
131
+ model = AutoModelForCausalLM.from_pretrained(
132
+ "meta-llama/Llama-2-7b-hf",
133
+ torch_dtype=torch.float16,
134
+ device_map="auto"
135
+ )
136
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
137
+
138
+ # 2. Configure position interpolation
139
+ scaling_factor = 16.0 # 2k → 32k
140
+ model.config.max_position_embeddings = 32768
141
+ model.config.rope_scaling = {
142
+ "type": "linear",
143
+ "factor": scaling_factor
144
+ }
145
+
146
+ # 3. Training arguments
147
+ training_args = TrainingArguments(
148
+ output_dir="./llama-2-7b-32k",
149
+ num_train_epochs=1,
150
+ max_steps=1000, # Only 1000 steps!
151
+ per_device_train_batch_size=1,
152
+ gradient_accumulation_steps=16,
153
+ learning_rate=2e-5, # Low LR
154
+ warmup_steps=100,
155
+ lr_scheduler_type="cosine",
156
+ logging_steps=10,
157
+ save_steps=500,
158
+ bf16=True,
159
+ gradient_checkpointing=True, # Reduce memory
160
+ dataloader_num_workers=4,
161
+ )
162
+
163
+ # 4. Create trainer
164
+ trainer = Trainer(
165
+ model=model,
166
+ args=training_args,
167
+ train_dataset=long_context_dataset,
168
+ data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
169
+ )
170
+
171
+ # 5. Train
172
+ trainer.train()
173
+ ```
174
+
175
+ ### YaRN Setup
176
+
177
+ **State-of-the-art extension** (best quality):
178
+
179
+ ```python
180
+ # 1. Install YaRN
181
+ # git clone https://github.com/jquesnelle/yarn
182
+ # cd yarn && pip install -e .
183
+
184
+ # 2. Configure YaRN scaling
185
+ model.config.max_position_embeddings = 32768
186
+ model.config.rope_scaling = {
187
+ "type": "yarn",
188
+ "factor": 16.0,
189
+ "original_max_position_embeddings": 2048,
190
+ "attention_factor": 1.0,
191
+ "beta_fast": 32,
192
+ "beta_slow": 1,
193
+ }
194
+
195
+ # 3. Training arguments (fewer steps than position interpolation!)
196
+ training_args = TrainingArguments(
197
+ output_dir="./llama-2-7b-32k-yarn",
198
+ max_steps=400, # 400 steps (vs 1000 for PI)
199
+ per_device_train_batch_size=1,
200
+ gradient_accumulation_steps=16,
201
+ learning_rate=2e-5,
202
+ warmup_steps=50,
203
+ bf16=True,
204
+ gradient_checkpointing=True,
205
+ )
206
+
207
+ # 4. Train
208
+ trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
209
+ trainer.train()
210
+ ```
211
+
212
+ ### Full Configuration Example
213
+
214
+ ```python
215
+ # Complete fine-tuning script
216
+ import torch
217
+ from transformers import (
218
+ AutoModelForCausalLM,
219
+ AutoTokenizer,
220
+ TrainingArguments,
221
+ Trainer,
222
+ DataCollatorForLanguageModeling,
223
+ )
224
+ from datasets import load_dataset
225
+
226
+ def prepare_long_context_data(dataset, tokenizer, context_length=32768):
227
+ """Prepare training data."""
228
+ def tokenize_function(examples):
229
+ # Concatenate all texts
230
+ concatenated = "\n\n".join(examples['text'])
231
+ # Tokenize
232
+ tokenized = tokenizer(
233
+ concatenated,
234
+ truncation=False,
235
+ return_tensors=None,
236
+ )
237
+ # Split into chunks
238
+ total_length = len(tokenized['input_ids'])
239
+ chunks = []
240
+ for i in range(0, total_length - context_length, context_length // 2):
241
+ chunk = {
242
+ 'input_ids': tokenized['input_ids'][i:i + context_length],
243
+ 'attention_mask': tokenized['attention_mask'][i:i + context_length],
244
+ }
245
+ chunks.append(chunk)
246
+ return chunks
247
+
248
+ return dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
249
+
250
+ def fine_tune_long_context(
251
+ base_model="meta-llama/Llama-2-7b-hf",
252
+ target_context=32768,
253
+ method="yarn", # or "linear"
254
+ output_dir="./output",
255
+ max_steps=400,
256
+ ):
257
+ """Complete fine-tuning pipeline."""
258
+
259
+ # Load model and tokenizer
260
+ print(f"Loading {base_model}...")
261
+ model = AutoModelForCausalLM.from_pretrained(
262
+ base_model,
263
+ torch_dtype=torch.bfloat16,
264
+ device_map="auto",
265
+ use_cache=False # Required for gradient checkpointing
266
+ )
267
+ tokenizer = AutoTokenizer.from_pretrained(base_model)
268
+ tokenizer.pad_token = tokenizer.eos_token
269
+
270
+ # Configure scaling
271
+ original_context = model.config.max_position_embeddings
272
+ scaling_factor = target_context / original_context
273
+
274
+ print(f"Scaling {original_context} → {target_context} ({scaling_factor}×)")
275
+ model.config.max_position_embeddings = target_context
276
+
277
+ if method == "yarn":
278
+ model.config.rope_scaling = {
279
+ "type": "yarn",
280
+ "factor": scaling_factor,
281
+ "original_max_position_embeddings": original_context,
282
+ "attention_factor": 1.0,
283
+ "beta_fast": 32,
284
+ "beta_slow": 1,
285
+ }
286
+ else: # linear
287
+ model.config.rope_scaling = {
288
+ "type": "linear",
289
+ "factor": scaling_factor
290
+ }
291
+
292
+ # Enable gradient checkpointing
293
+ model.gradient_checkpointing_enable()
294
+
295
+ # Load and prepare data
296
+ print("Preparing training data...")
297
+ dataset = load_dataset("pg19", split="train[:1000]") # Use subset for testing
298
+ train_dataset = prepare_long_context_data(dataset, tokenizer, target_context)
299
+
300
+ # Training arguments
301
+ training_args = TrainingArguments(
302
+ output_dir=output_dir,
303
+ max_steps=max_steps,
304
+ per_device_train_batch_size=1,
305
+ gradient_accumulation_steps=16,
306
+ learning_rate=2e-5,
307
+ warmup_steps=max_steps // 10,
308
+ lr_scheduler_type="cosine",
309
+ logging_steps=10,
310
+ save_steps=max_steps // 4,
311
+ bf16=True,
312
+ gradient_checkpointing=True,
313
+ dataloader_num_workers=4,
314
+ remove_unused_columns=False,
315
+ )
316
+
317
+ # Trainer
318
+ trainer = Trainer(
319
+ model=model,
320
+ args=training_args,
321
+ train_dataset=train_dataset,
322
+ data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
323
+ )
324
+
325
+ # Train
326
+ print("Starting fine-tuning...")
327
+ trainer.train()
328
+
329
+ # Save
330
+ print(f"Saving model to {output_dir}...")
331
+ model.save_pretrained(output_dir)
332
+ tokenizer.save_pretrained(output_dir)
333
+
334
+ print("Done!")
335
+
336
+ # Usage
337
+ if __name__ == "__main__":
338
+ fine_tune_long_context(
339
+ base_model="meta-llama/Llama-2-7b-hf",
340
+ target_context=32768,
341
+ method="yarn",
342
+ max_steps=400,
343
+ )
344
+ ```
345
+
346
+ ## Evaluation
347
+
348
+ ### Perplexity Evaluation
349
+
350
+ ```python
351
+ import torch
352
+ from transformers import AutoModelForCausalLM, AutoTokenizer
353
+ from datasets import load_dataset
354
+ import math
355
+
356
+ def evaluate_perplexity(model, tokenizer, dataset, context_length=32768):
357
+ """Evaluate perplexity on long context."""
358
+ model.eval()
359
+ total_loss = 0
360
+ total_tokens = 0
361
+
362
+ with torch.no_grad():
363
+ for example in dataset:
364
+ # Tokenize
365
+ tokens = tokenizer(
366
+ example['text'],
367
+ return_tensors='pt',
368
+ max_length=context_length,
369
+ truncation=True,
370
+ ).to(model.device)
371
+
372
+ # Forward pass
373
+ outputs = model(**tokens, labels=tokens['input_ids'])
374
+ loss = outputs.loss
375
+ num_tokens = tokens['input_ids'].numel()
376
+
377
+ total_loss += loss.item() * num_tokens
378
+ total_tokens += num_tokens
379
+
380
+ # Compute perplexity
381
+ avg_loss = total_loss / total_tokens
382
+ perplexity = math.exp(avg_loss)
383
+
384
+ return perplexity
385
+
386
+ # Usage
387
+ model = AutoModelForCausalLM.from_pretrained("./llama-2-7b-32k")
388
+ tokenizer = AutoTokenizer.from_pretrained("./llama-2-7b-32k")
389
+
390
+ test_dataset = load_dataset("pg19", split="test[:100]")
391
+ ppl = evaluate_perplexity(model, tokenizer, test_dataset, context_length=32768)
392
+
393
+ print(f"Perplexity at 32k context: {ppl:.2f}")
394
+ ```
395
+
396
+ ### Passkey Retrieval Test
397
+
398
+ ```python
399
+ def passkey_retrieval_test(model, tokenizer, context_lengths=[4096, 8192, 16384, 32768]):
400
+ """Test ability to retrieve information from different positions."""
401
+ results = {}
402
+
403
+ for context_len in context_lengths:
404
+ # Create synthetic document with passkey at random position
405
+ passkey = "12345"
406
+ position = random.randint(100, context_len - 100)
407
+
408
+ # Generate filler text
409
+ filler = "The quick brown fox jumps over the lazy dog. " * (context_len // 10)
410
+ text = filler[:position] + f"The passkey is {passkey}. " + filler[position:]
411
+
412
+ # Truncate to context length
413
+ tokens = tokenizer(text, return_tensors='pt', max_length=context_len, truncation=True)
414
+
415
+ # Query
416
+ prompt = text + "\nWhat is the passkey?"
417
+ inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
418
+
419
+ # Generate
420
+ outputs = model.generate(**inputs, max_new_tokens=10)
421
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
422
+
423
+ # Check if passkey retrieved
424
+ success = passkey in response
425
+ results[context_len] = success
426
+
427
+ print(f"Context {context_len}: {'✓' if success else '✗'}")
428
+
429
+ return results
430
+ ```
431
+
432
+ ### Long Document Q&A
433
+
434
+ ```python
435
+ from datasets import load_dataset
436
+
437
+ def test_long_qa(model, tokenizer, max_length=32768):
438
+ """Test on long-form QA dataset."""
439
+ # Load dataset
440
+ dataset = load_dataset("narrativeqa", split="test[:100]")
441
+
442
+ correct = 0
443
+ total = 0
444
+
445
+ for example in dataset:
446
+ # Long document
447
+ document = example['document']['text']
448
+ question = example['question']['text']
449
+ gold_answers = example['answers']
450
+
451
+ # Create prompt
452
+ prompt = f"Document:\n{document}\n\nQuestion: {question}\n\nAnswer:"
453
+
454
+ # Tokenize (may exceed original context)
455
+ inputs = tokenizer(
456
+ prompt,
457
+ return_tensors='pt',
458
+ max_length=max_length,
459
+ truncation=True
460
+ ).to(model.device)
461
+
462
+ # Generate
463
+ outputs = model.generate(
464
+ **inputs,
465
+ max_new_tokens=50,
466
+ temperature=0.7,
467
+ )
468
+ answer = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
469
+
470
+ # Check correctness
471
+ if any(gold in answer.lower() for gold in gold_answers):
472
+ correct += 1
473
+ total += 1
474
+
475
+ accuracy = correct / total
476
+ print(f"Long QA Accuracy: {accuracy:.1%}")
477
+ return accuracy
478
+ ```
479
+
480
+ ## Best Practices
481
+
482
+ ### 1. Gradual Scaling
483
+
484
+ ```python
485
+ # Don't jump directly to 128k!
486
+ # Scale incrementally:
487
+
488
+ # Step 1: 4k → 8k
489
+ fine_tune(model, target=8192, steps=200)
490
+
491
+ # Step 2: 8k → 16k
492
+ fine_tune(model, target=16384, steps=200)
493
+
494
+ # Step 3: 16k → 32k
495
+ fine_tune(model, target=32768, steps=400)
496
+
497
+ # Each step builds on previous, reducing total training needed
498
+ ```
499
+
500
+ ### 2. Learning Rate Tuning
501
+
502
+ ```python
503
+ # Position Interpolation: Lower LR
504
+ lr_pi = 2e-5
505
+
506
+ # YaRN: Can use slightly higher LR
507
+ lr_yarn = 5e-5
508
+
509
+ # Rule: Larger scaling factors need lower LR
510
+ lr = base_lr / sqrt(scaling_factor)
511
+ ```
512
+
513
+ ### 3. Gradient Checkpointing
514
+
515
+ ```python
516
+ # Essential for long context (saves ~50% memory)
517
+ model.gradient_checkpointing_enable()
518
+
519
+ # Trade-off: ~20% slower training, but fits in memory
520
+ ```
521
+
522
+ ### 4. Flash Attention
523
+
524
+ ```python
525
+ # 2-3× speedup for long sequences
526
+ model = AutoModelForCausalLM.from_pretrained(
527
+ "meta-llama/Llama-2-7b-hf",
528
+ attn_implementation="flash_attention_2", # Flash Attention 2
529
+ torch_dtype=torch.bfloat16
530
+ )
531
+ ```
532
+
533
+ ## Production Deployment
534
+
535
+ ### Save and Upload
536
+
537
+ ```python
538
+ # Save fine-tuned model
539
+ model.save_pretrained("./llama-2-7b-32k-yarn")
540
+ tokenizer.save_pretrained("./llama-2-7b-32k-yarn")
541
+
542
+ # Upload to HuggingFace Hub
543
+ from huggingface_hub import HfApi
544
+
545
+ api = HfApi()
546
+ api.upload_folder(
547
+ folder_path="./llama-2-7b-32k-yarn",
548
+ repo_id="your-username/llama-2-7b-32k-yarn",
549
+ repo_type="model",
550
+ )
551
+ ```
552
+
553
+ ### Inference Configuration
554
+
555
+ ```python
556
+ # Load for inference
557
+ model = AutoModelForCausalLM.from_pretrained(
558
+ "your-username/llama-2-7b-32k-yarn",
559
+ torch_dtype=torch.float16,
560
+ device_map="auto",
561
+ max_memory={0: "40GB", "cpu": "100GB"} # Offload to CPU if needed
562
+ )
563
+
564
+ # Process long document
565
+ long_text = "..." * 30000 # 30k tokens
566
+ inputs = tokenizer(long_text, return_tensors="pt", truncation=False).to('cuda')
567
+
568
+ outputs = model.generate(
569
+ **inputs,
570
+ max_new_tokens=512,
571
+ do_sample=True,
572
+ temperature=0.7,
573
+ top_p=0.9,
574
+ )
575
+
576
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
577
+ ```
578
+
579
+ ## Troubleshooting
580
+
581
+ ### Issue: Out of Memory
582
+
583
+ **Solutions**:
584
+ 1. Enable gradient checkpointing
585
+ 2. Reduce batch size to 1
586
+ 3. Increase gradient accumulation steps
587
+ 4. Use bfloat16 or float16
588
+ 5. Use Flash Attention
589
+
590
+ ### Issue: Poor Extrapolation
591
+
592
+ **Solutions**:
593
+ 1. Use YaRN instead of linear scaling
594
+ 2. Increase fine-tuning steps
595
+ 3. Use higher-quality long-form data
596
+ 4. Gradual scaling (8k → 16k → 32k)
597
+
598
+ ### Issue: Training Instability
599
+
600
+ **Solutions**:
601
+ 1. Lower learning rate
602
+ 2. Increase warmup steps
603
+ 3. Use gradient clipping
604
+ 4. Check data quality
605
+
606
+ ## Resources
607
+
608
+ - **Position Interpolation Paper**: https://arxiv.org/abs/2306.15595
609
+ - **YaRN Paper**: https://arxiv.org/abs/2309.00071
610
+ - **Together AI Guide**: https://www.together.ai/blog/llama-2-7b-32k
611
+ - **HuggingFace Long Context Guide**: https://huggingface.co/blog/long-range-transformers