@synsci/cli-darwin-arm64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,572 @@
1
+ ---
2
+ name: grpo-rl-training
3
+ description: Expert guidance for GRPO/RL fine-tuning with TRL for reasoning and task-specific model training
4
+ version: 1.0.0
5
+ author: Synthetic Sciences
6
+ license: MIT
7
+ tags: [Post-Training, Reinforcement Learning, GRPO, TRL, RLHF, Reward Modeling, Reasoning, DPO, PPO, Structured Output]
8
+ dependencies: [transformers>=4.47.0, trl>=0.14.0, datasets>=3.2.0, peft>=0.14.0, torch]
9
+ ---
10
+
11
+ # GRPO/RL Training with TRL
12
+
13
+ Expert-level guidance for implementing Group Relative Policy Optimization (GRPO) using the Transformer Reinforcement Learning (TRL) library. This skill provides battle-tested patterns, critical insights, and production-ready workflows for fine-tuning language models with custom reward functions.
14
+
15
+ ## When to Use This Skill
16
+
17
+ Use GRPO training when you need to:
18
+ - **Enforce specific output formats** (e.g., XML tags, JSON, structured reasoning)
19
+ - **Teach verifiable tasks** with objective correctness metrics (math, coding, fact-checking)
20
+ - **Improve reasoning capabilities** by rewarding chain-of-thought patterns
21
+ - **Align models to domain-specific behaviors** without labeled preference data
22
+ - **Optimize for multiple objectives** simultaneously (format + correctness + style)
23
+
24
+ **Do NOT use GRPO for:**
25
+ - Simple supervised fine-tuning tasks (use SFT instead)
26
+ - Tasks without clear reward signals
27
+ - When you already have high-quality preference pairs (use DPO/PPO instead)
28
+
29
+ ---
30
+
31
+ ## Core Concepts
32
+
33
+ ### 1. GRPO Algorithm Fundamentals
34
+
35
+ **Key Mechanism:**
36
+ - Generates **multiple completions** for each prompt (group size: 4-16)
37
+ - Compares completions within each group using reward functions
38
+ - Updates policy to favor higher-rewarded responses relative to the group
39
+
40
+ **Critical Difference from PPO:**
41
+ - No separate reward model needed
42
+ - More sample-efficient (learns from within-group comparisons)
43
+ - Simpler to implement and debug
44
+
45
+ **Mathematical Intuition:**
46
+ ```
47
+ For each prompt p:
48
+ 1. Generate N completions: {c₁, c₂, ..., cₙ}
49
+ 2. Compute rewards: {r₁, r₂, ..., rₙ}
50
+ 3. Learn to increase probability of high-reward completions
51
+ relative to low-reward ones in the same group
52
+ ```
53
+
54
+ ### 2. Reward Function Design Philosophy
55
+
56
+ **Golden Rules:**
57
+ 1. **Compose multiple reward functions** - Each handles one aspect (format, correctness, style)
58
+ 2. **Scale rewards appropriately** - Higher weight = stronger signal
59
+ 3. **Use incremental rewards** - Partial credit for partial compliance
60
+ 4. **Test rewards independently** - Debug each reward function in isolation
61
+
62
+ **Reward Function Types:**
63
+
64
+ | Type | Use Case | Example Weight |
65
+ |------|----------|----------------|
66
+ | **Correctness** | Verifiable tasks (math, code) | 2.0 (highest) |
67
+ | **Format** | Strict structure enforcement | 0.5-1.0 |
68
+ | **Length** | Encourage verbosity/conciseness | 0.1-0.5 |
69
+ | **Style** | Penalize unwanted patterns | -0.5 to 0.5 |
70
+
71
+ ---
72
+
73
+ ## Implementation Workflow
74
+
75
+ ### Step 1: Dataset Preparation
76
+
77
+ **Critical Requirements:**
78
+ - Prompts in chat format (list of dicts with 'role' and 'content')
79
+ - Include system prompts to set expectations
80
+ - For verifiable tasks, include ground truth answers as additional columns
81
+
82
+ **Example Structure:**
83
+ ```python
84
+ from datasets import load_dataset, Dataset
85
+
86
+ SYSTEM_PROMPT = """
87
+ Respond in the following format:
88
+ <reasoning>
89
+ [Your step-by-step thinking]
90
+ </reasoning>
91
+ <answer>
92
+ [Final answer]
93
+ </answer>
94
+ """
95
+
96
+ def prepare_dataset(raw_data):
97
+ """
98
+ Transform raw data into GRPO-compatible format.
99
+
100
+ Returns: Dataset with columns:
101
+ - 'prompt': List[Dict] with role/content (system + user messages)
102
+ - 'answer': str (ground truth, optional but recommended)
103
+ """
104
+ return raw_data.map(lambda x: {
105
+ 'prompt': [
106
+ {'role': 'system', 'content': SYSTEM_PROMPT},
107
+ {'role': 'user', 'content': x['question']}
108
+ ],
109
+ 'answer': extract_answer(x['raw_answer'])
110
+ })
111
+ ```
112
+
113
+ **Pro Tips:**
114
+ - Use one-shot or few-shot examples in system prompt for complex formats
115
+ - Keep prompts concise (max_prompt_length: 256-512 tokens)
116
+ - Validate data quality before training (garbage in = garbage out)
117
+
118
+ ### Step 2: Reward Function Implementation
119
+
120
+ **Template Structure:**
121
+ ```python
122
+ def reward_function_name(
123
+ prompts, # List[List[Dict]]: Original prompts
124
+ completions, # List[List[Dict]]: Model generations
125
+ answer=None, # Optional: Ground truth from dataset
126
+ **kwargs # Additional dataset columns
127
+ ) -> list[float]:
128
+ """
129
+ Evaluate completions and return rewards.
130
+
131
+ Returns: List of floats (one per completion)
132
+ """
133
+ # Extract completion text
134
+ responses = [comp[0]['content'] for comp in completions]
135
+
136
+ # Compute rewards
137
+ rewards = []
138
+ for response in responses:
139
+ score = compute_score(response)
140
+ rewards.append(score)
141
+
142
+ return rewards
143
+ ```
144
+
145
+ **Example 1: Correctness Reward (Math/Coding)**
146
+ ```python
147
+ def correctness_reward(prompts, completions, answer, **kwargs):
148
+ """Reward correct answers with high score."""
149
+ responses = [comp[0]['content'] for comp in completions]
150
+ extracted = [extract_final_answer(r) for r in responses]
151
+ return [2.0 if ans == gt else 0.0
152
+ for ans, gt in zip(extracted, answer)]
153
+ ```
154
+
155
+ **Example 2: Format Reward (Structured Output)**
156
+ ```python
157
+ import re
158
+
159
+ def format_reward(completions, **kwargs):
160
+ """Reward XML-like structured format."""
161
+ pattern = r'<reasoning>.*?</reasoning>\s*<answer>.*?</answer>'
162
+ responses = [comp[0]['content'] for comp in completions]
163
+ return [1.0 if re.search(pattern, r, re.DOTALL) else 0.0
164
+ for r in responses]
165
+ ```
166
+
167
+ **Example 3: Incremental Format Reward (Partial Credit)**
168
+ ```python
169
+ def incremental_format_reward(completions, **kwargs):
170
+ """Award partial credit for format compliance."""
171
+ responses = [comp[0]['content'] for comp in completions]
172
+ rewards = []
173
+
174
+ for r in responses:
175
+ score = 0.0
176
+ if '<reasoning>' in r:
177
+ score += 0.25
178
+ if '</reasoning>' in r:
179
+ score += 0.25
180
+ if '<answer>' in r:
181
+ score += 0.25
182
+ if '</answer>' in r:
183
+ score += 0.25
184
+ # Penalize extra text after closing tag
185
+ if r.count('</answer>') == 1:
186
+ extra_text = r.split('</answer>')[-1].strip()
187
+ score -= len(extra_text) * 0.001
188
+ rewards.append(score)
189
+
190
+ return rewards
191
+ ```
192
+
193
+ **Critical Insight:**
194
+ Combine 3-5 reward functions for robust training. Order matters less than diversity of signals.
195
+
196
+ ### Step 3: Training Configuration
197
+
198
+ **Memory-Optimized Config (Small GPU)**
199
+ ```python
200
+ from trl import GRPOConfig
201
+
202
+ training_args = GRPOConfig(
203
+ output_dir="outputs/grpo-model",
204
+
205
+ # Learning rate
206
+ learning_rate=5e-6, # Lower = more stable
207
+ adam_beta1=0.9,
208
+ adam_beta2=0.99,
209
+ weight_decay=0.1,
210
+ warmup_ratio=0.1,
211
+ lr_scheduler_type='cosine',
212
+
213
+ # Batch settings
214
+ per_device_train_batch_size=1,
215
+ gradient_accumulation_steps=4, # Effective batch = 4
216
+
217
+ # GRPO-specific
218
+ num_generations=8, # Group size: 8-16 recommended
219
+ max_prompt_length=256,
220
+ max_completion_length=512,
221
+
222
+ # Training duration
223
+ num_train_epochs=1,
224
+    max_steps=-1,                        # Default (train by epochs); or set a fixed step count (e.g., 500)
225
+
226
+ # Optimization
227
+ bf16=True, # Faster on A100/H100
228
+ optim="adamw_8bit", # Memory-efficient optimizer
229
+ max_grad_norm=0.1,
230
+
231
+ # Logging
232
+ logging_steps=1,
233
+ save_steps=100,
234
+ report_to="wandb", # Or "none" for no logging
235
+ )
236
+ ```
237
+
238
+ **High-Performance Config (Large GPU)**
239
+ ```python
240
+ training_args = GRPOConfig(
241
+ output_dir="outputs/grpo-model",
242
+ learning_rate=1e-5,
243
+ per_device_train_batch_size=4,
244
+ gradient_accumulation_steps=2,
245
+ num_generations=16, # Larger groups = better signal
246
+ max_prompt_length=512,
247
+ max_completion_length=1024,
248
+ num_train_epochs=1,
249
+ bf16=True,
250
+ use_vllm=True, # Fast generation with vLLM
251
+ logging_steps=10,
252
+ )
253
+ ```
254
+
255
+ **Critical Hyperparameters:**
256
+
257
+ | Parameter | Impact | Tuning Advice |
258
+ |-----------|--------|---------------|
259
+ | `num_generations` | Group size for comparison | Start with 8, increase to 16 if GPU allows |
260
+ | `learning_rate` | Convergence speed/stability | 5e-6 (safe), 1e-5 (faster, riskier) |
261
+ | `max_completion_length` | Output verbosity | Match your task (512 for reasoning, 256 for short answers) |
262
+ | `gradient_accumulation_steps` | Effective batch size | Increase if GPU memory limited |
263
+
264
+ ### Step 4: Model Setup and Training
265
+
266
+ **Standard Setup (Transformers)**
267
+ ```python
268
+ import torch
269
+ from transformers import AutoModelForCausalLM, AutoTokenizer
270
+ from peft import LoraConfig
271
+ from trl import GRPOTrainer
272
+
273
+ # Load model
274
+ model_name = "Qwen/Qwen2.5-1.5B-Instruct"
275
+ model = AutoModelForCausalLM.from_pretrained(
276
+ model_name,
277
+ torch_dtype=torch.bfloat16,
278
+ attn_implementation="flash_attention_2", # 2-3x faster
279
+ device_map="auto"
280
+ )
281
+
282
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
283
+ tokenizer.pad_token = tokenizer.eos_token
284
+
285
+ # Optional: LoRA for parameter-efficient training
286
+ peft_config = LoraConfig(
287
+ r=16, # Rank (higher = more capacity)
288
+ lora_alpha=32, # Scaling factor (typically 2*r)
289
+ target_modules=[
290
+ "q_proj", "k_proj", "v_proj", "o_proj",
291
+ "gate_proj", "up_proj", "down_proj"
292
+ ],
293
+ task_type="CAUSAL_LM",
294
+ lora_dropout=0.05,
295
+ )
296
+
297
+ # Initialize trainer
298
+ trainer = GRPOTrainer(
299
+ model=model,
300
+ processing_class=tokenizer,
301
+ reward_funcs=[
302
+ incremental_format_reward,
303
+ format_reward,
304
+ correctness_reward,
305
+ ],
306
+ args=training_args,
307
+ train_dataset=dataset,
308
+ peft_config=peft_config, # Remove for full fine-tuning
309
+ )
310
+
311
+ # Train
312
+ trainer.train()
313
+
314
+ # Save
315
+ trainer.save_model("final_model")
316
+ ```
317
+
318
+ **Unsloth Setup (2-3x Faster)**
319
+ ```python
320
+ from unsloth import FastLanguageModel
321
+
322
+ model, tokenizer = FastLanguageModel.from_pretrained(
323
+ model_name="google/gemma-3-1b-it",
324
+ max_seq_length=1024,
325
+ load_in_4bit=True,
326
+ fast_inference=True,
327
+ max_lora_rank=32,
328
+ )
329
+
330
+ model = FastLanguageModel.get_peft_model(
331
+ model,
332
+ r=32,
333
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
334
+ "gate_proj", "up_proj", "down_proj"],
335
+ lora_alpha=32,
336
+ use_gradient_checkpointing="unsloth",
337
+ )
338
+
339
+ # Rest is identical to standard setup
340
+ trainer = GRPOTrainer(model=model, ...)
341
+ trainer.train()
342
+ ```
343
+
344
+ ---
345
+
346
+ ## Critical Training Insights
347
+
348
+ ### 1. Loss Behavior (EXPECTED PATTERN)
349
+ - **Loss starts near 0 and INCREASES during training**
350
+ - This is CORRECT - loss measures KL divergence from initial policy
351
+ - Model is learning (diverging from original behavior to optimize rewards)
352
+ - Monitor reward metrics instead of loss for progress
353
+
354
+ ### 2. Reward Tracking
355
+ Key metrics to watch:
356
+ - `reward`: Average across all completions
357
+ - `reward_std`: Diversity within groups (should remain > 0)
358
+ - `kl`: KL divergence from reference (should grow moderately)
359
+
360
+ **Healthy Training Pattern:**
361
+ ```
362
+ Step Reward Reward_Std KL
363
+ 100 0.5 0.3 0.02
364
+ 200 0.8 0.25 0.05
365
+ 300 1.2 0.2 0.08 ← Good progression
366
+ 400 1.5 0.15 0.12
367
+ ```
368
+
369
+ **Warning Signs:**
370
+ - Reward std → 0 (model collapsing to single response)
371
+ - KL exploding (> 0.5) — model diverging too far from the reference policy; reduce the learning rate
372
+ - Reward stuck (reward functions too harsh or model capacity issue)
373
+
374
+ ### 3. Common Pitfalls and Solutions
375
+
376
+ | Problem | Symptom | Solution |
377
+ |---------|---------|----------|
378
+ | **Mode collapse** | All completions identical | Increase `num_generations`, add diversity penalty |
379
+ | **No learning** | Flat rewards | Check reward function logic, increase LR |
380
+ | **OOM errors** | GPU memory exceeded | Reduce `num_generations`, enable gradient checkpointing |
381
+ | **Slow training** | < 1 it/s | Enable `use_vllm=True`, use Unsloth, reduce seq length |
382
+ | **Format ignored** | Model doesn't follow structure | Increase format reward weight, add incremental rewards |
383
+
384
+ ---
385
+
386
+ ## Advanced Patterns
387
+
388
+ ### 1. Multi-Stage Training
389
+ For complex tasks, train in stages:
390
+
391
+ ```python
392
+ # Stage 1: Format compliance (epochs=1)
393
+ trainer_stage1 = GRPOTrainer(
394
+ model=model,
395
+ reward_funcs=[incremental_format_reward, format_reward],
396
+ ...
397
+ )
398
+ trainer_stage1.train()
399
+
400
+ # Stage 2: Correctness (epochs=1)
401
+ trainer_stage2 = GRPOTrainer(
402
+ model=model,
403
+ reward_funcs=[format_reward, correctness_reward],
404
+ ...
405
+ )
406
+ trainer_stage2.train()
407
+ ```
408
+
409
+ ### 2. Adaptive Reward Scaling
410
+ ```python
411
+ class AdaptiveReward:
412
+ def __init__(self, base_reward_func, initial_weight=1.0):
413
+ self.func = base_reward_func
414
+ self.weight = initial_weight
415
+
416
+ def __call__(self, *args, **kwargs):
417
+ rewards = self.func(*args, **kwargs)
418
+ return [r * self.weight for r in rewards]
419
+
420
+ def adjust_weight(self, success_rate):
421
+ """Increase weight if model struggling, decrease if succeeding."""
422
+ if success_rate < 0.3:
423
+ self.weight *= 1.2
424
+ elif success_rate > 0.8:
425
+ self.weight *= 0.9
426
+ ```
427
+
428
+ ### 3. Custom Dataset Integration
429
+ ```python
430
+ def load_custom_knowledge_base(csv_path):
431
+ """Example: School communication platform docs."""
432
+ import pandas as pd
433
+ df = pd.read_csv(csv_path)
434
+
435
+ dataset = Dataset.from_pandas(df).map(lambda x: {
436
+ 'prompt': [
437
+ {'role': 'system', 'content': CUSTOM_SYSTEM_PROMPT},
438
+ {'role': 'user', 'content': x['question']}
439
+ ],
440
+ 'answer': x['expert_answer']
441
+ })
442
+ return dataset
443
+ ```
444
+
445
+ ---
446
+
447
+ ## Deployment and Inference
448
+
449
+ ### Save and Merge LoRA
450
+ ```python
451
+ # Merge LoRA adapters into base model
452
+ if hasattr(trainer.model, 'merge_and_unload'):
453
+ merged_model = trainer.model.merge_and_unload()
454
+ merged_model.save_pretrained("production_model")
455
+ tokenizer.save_pretrained("production_model")
456
+ ```
457
+
458
+ ### Inference Example
459
+ ```python
460
+ from transformers import pipeline
461
+
462
+ generator = pipeline(
463
+ "text-generation",
464
+ model="production_model",
465
+ tokenizer=tokenizer
466
+ )
467
+
468
+ result = generator(
469
+ [
470
+ {'role': 'system', 'content': SYSTEM_PROMPT},
471
+ {'role': 'user', 'content': "What is 15 + 27?"}
472
+ ],
473
+ max_new_tokens=256,
474
+ do_sample=True,
475
+ temperature=0.7,
476
+ top_p=0.9
477
+ )
478
+ print(result[0]['generated_text'])
479
+ ```
480
+
481
+ ---
482
+
483
+ ## Best Practices Checklist
484
+
485
+ **Before Training:**
486
+ - [ ] Validate dataset format (prompts as List[Dict])
487
+ - [ ] Test reward functions on sample data
488
+ - [ ] Calculate expected max_prompt_length from data
489
+ - [ ] Choose appropriate num_generations based on GPU memory
490
+ - [ ] Set up logging (wandb recommended)
491
+
492
+ **During Training:**
493
+ - [ ] Monitor reward progression (should increase)
494
+ - [ ] Check reward_std (should stay > 0.1)
495
+ - [ ] Watch for OOM errors (reduce batch size if needed)
496
+ - [ ] Sample generations every 50-100 steps
497
+ - [ ] Validate format compliance on holdout set
498
+
499
+ **After Training:**
500
+ - [ ] Merge LoRA weights if using PEFT
501
+ - [ ] Test on diverse prompts
502
+ - [ ] Compare to baseline model
503
+ - [ ] Document reward weights and hyperparameters
504
+ - [ ] Save reproducibility config
505
+
506
+ ---
507
+
508
+ ## Troubleshooting Guide
509
+
510
+ ### Debugging Workflow
511
+ 1. **Isolate reward functions** - Test each independently
512
+ 2. **Check data distribution** - Ensure diversity in prompts
513
+ 3. **Reduce complexity** - Start with single reward, add gradually
514
+ 4. **Monitor generations** - Print samples every N steps
515
+ 5. **Validate extraction logic** - Ensure answer parsing works
516
+
517
+ ### Quick Fixes
518
+ ```python
519
+ # Debug reward function
520
+ def debug_reward(completions, **kwargs):
521
+ responses = [comp[0]['content'] for comp in completions]
522
+ for i, r in enumerate(responses[:2]): # Print first 2
523
+ print(f"Response {i}: {r[:200]}...")
524
+ return [1.0] * len(responses) # Dummy rewards
525
+
526
+ # Test without training
527
+ trainer = GRPOTrainer(..., reward_funcs=[debug_reward])
528
+ trainer.generate_completions(dataset[:1])  # Generate without updating (NOTE: verify this helper exists in your TRL version)
529
+ ```
530
+
531
+ ---
532
+
533
+ ## References and Resources
534
+
535
+ **Official Documentation:**
536
+ - TRL GRPO Trainer: https://huggingface.co/docs/trl/grpo_trainer
537
+ - DeepSeek R1 Paper: https://arxiv.org/abs/2501.12948
538
+ - Unsloth Docs: https://docs.unsloth.ai/
539
+
540
+ **Example Repositories:**
541
+ - Open R1 Implementation: https://github.com/huggingface/open-r1
542
+ - TRL Examples: https://github.com/huggingface/trl/tree/main/examples
543
+
544
+ **Recommended Reading:**
545
+ - Progressive Disclosure Pattern for agent instructions
546
+ - Reward shaping in RL (Ng et al.)
547
+ - LoRA paper (Hu et al., 2021)
548
+
549
+ ---
550
+
551
+ ## Usage Instructions for Agents
552
+
553
+ When this skill is loaded:
554
+
555
+ 1. **Read this entire file** before implementing GRPO training
556
+ 2. **Start with the simplest reward function** (e.g., length-based) to validate setup
557
+ 3. **Use the templates** in `templates/` directory as starting points
558
+ 4. **Reference examples** in `examples/` for task-specific implementations
559
+ 5. **Follow the workflow** sequentially (don't skip steps)
560
+ 6. **Debug incrementally** - add one reward function at a time
561
+
562
+ **Critical Reminders:**
563
+ - Always use multiple reward functions (3-5 is optimal)
564
+ - Monitor reward metrics, not loss
565
+ - Test reward functions before training
566
+ - Start small (num_generations=4-8, matching the config guidance above), scale up gradually
567
+ - Save checkpoints frequently (every 100 steps)
568
+
569
+ This skill is designed for **expert-level implementation**. Beginners should start with supervised fine-tuning before attempting GRPO.
570
+
571
+
572
+