@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,454 @@
1
+ # Multi-Node Training
2
+
3
+ Complete guide to distributed Ray cluster training with OpenRLHF across multiple machines.
4
+
5
+ ## Overview
6
+
7
+ OpenRLHF uses Ray for distributed scheduling, allowing the Actor, Critic, Reward, and Reference models to span multiple nodes. It supports fault tolerance through checkpointing and automatic task rescheduling.
8
+
9
+ ## Ray Cluster Setup
10
+
11
+ ### 1. Start Head Node (Master Machine)
12
+
13
+ **In Docker container**:
14
+ ```bash
15
+ # Launch container on master node
16
+ docker run --runtime=nvidia -it --rm --shm-size="10g" \
17
+ --cap-add=SYS_ADMIN -v $PWD:/openrlhf \
18
+ nvcr.io/nvidia/pytorch:25.02-py3 bash
19
+
20
+ # Start Ray head node
21
+ ray start --head --node-ip-address 0.0.0.0 --num-gpus 8
22
+ ```
23
+
24
+ **Output**:
25
+ ```
26
+ Ray runtime started.
27
+ Dashboard: http://0.0.0.0:8265
28
+ ```
29
+
30
+ ### 2. Connect Worker Nodes
31
+
32
+ **On each worker machine**:
33
+ ```bash
34
+ # Launch container
35
+ docker run --runtime=nvidia -it --rm --shm-size="10g" \
36
+ --cap-add=SYS_ADMIN -v $PWD:/openrlhf \
37
+ nvcr.io/nvidia/pytorch:25.02-py3 bash
38
+
39
+ # Connect to head node
40
+ ray start --address {MASTER-NODE-IP}:6379 --num-gpus 8
41
+ ```
42
+
43
+ **Replace `{MASTER-NODE-IP}`** with the head node's IP address.
44
+
45
+ ### 3. Verify Cluster
46
+
47
+ ```bash
48
+ # On head node
49
+ ray status
50
+ ```
51
+
52
+ **Output**:
53
+ ```
54
+ Nodes: 4
55
+ - 1 head node (8 GPUs)
56
+ - 3 worker nodes (8 GPUs each)
57
+ Total GPUs: 32
58
+ ```
59
+
60
+ ## Distributed Training Configuration
61
+
62
+ ### Multi-Node PPO Training
63
+
64
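+ **4-node cluster (32 GPUs)** - 70B model (note: the 2 vLLM engines × tensor-parallel size 4 below request 8 GPUs in addition to the 32 used by the four models, unless models are colocated — verify your cluster has enough GPUs):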
65
+ ```bash
66
+ ray job submit --address="http://127.0.0.1:8265" \
67
+ --runtime-env-json='{"working_dir": "/openrlhf"}' \
68
+ -- python3 -m openrlhf.cli.train_ppo_ray \
69
+ --ref_num_nodes 1 --ref_num_gpus_per_node 8 \
70
+ --reward_num_nodes 1 --reward_num_gpus_per_node 8 \
71
+ --critic_num_nodes 1 --critic_num_gpus_per_node 8 \
72
+ --actor_num_nodes 1 --actor_num_gpus_per_node 8 \
73
+ --vllm_num_engines 2 --vllm_tensor_parallel_size 4 \
74
+ --pretrain meta-llama/Llama-2-70b-hf \
75
+ --reward_pretrain ./reward-model-70b \
76
+ --save_path ./output/llama-70b-ppo \
77
+ --ckpt_path ./checkpoints/llama-70b-ppo \
78
+ --save_steps 100 --logging_steps 1 \
79
+ --micro_train_batch_size 2 --train_batch_size 128 \
80
+ --micro_rollout_batch_size 4 --rollout_batch_size 1024 \
81
+ --max_epochs 1 --prompt_max_len 1024 --generate_max_len 1024 \
82
+ --zero_stage 3 --bf16 \
83
+ --actor_learning_rate 5e-7 --critic_learning_rate 9e-6 \
84
+ --init_kl_coef 0.01 --normalize_reward \
85
+ --gradient_checkpointing --flash_attn
86
+ ```
87
+
88
+ **GPU allocation**:
89
+ - **Node 1**: Reference model (8 GPUs)
90
+ - **Node 2**: Reward model (8 GPUs)
91
+ - **Node 3**: Critic model (8 GPUs)
92
+ - **Node 4**: Actor model (8 GPUs)
93
+
94
+ ### Model Distribution Arguments
95
+
96
+ **Per-model configuration**:
97
+ ```bash
98
+ # Actor model
99
+ --actor_num_nodes 2 # 2 nodes for actor
100
+ --actor_num_gpus_per_node 8 # 8 GPUs per node = 16 GPUs total
101
+
102
+ # Critic model
103
+ --critic_num_nodes 1
104
+ --critic_num_gpus_per_node 8
105
+
106
+ # Reward model
107
+ --reward_num_nodes 1
108
+ --reward_num_gpus_per_node 8
109
+
110
+ # Reference model
111
+ --ref_num_nodes 1
112
+ --ref_num_gpus_per_node 8
113
+ ```
114
+
115
+ ### Hybrid Engine (Colocated Models)
116
+
117
+ **Share GPUs across models**:
118
+ ```bash
119
+ # Colocate all models on same GPUs
120
+ --colocate_all_models
121
+
122
+ # Or colocate specific pairs
123
+ --colocate_actor_ref # Actor + Reference
124
+ --colocate_critic_reward # Critic + Reward
125
+ ```
126
+
127
+ **Example (2-node, 16 GPUs)**:
128
+ ```bash
129
+ ray job submit --address="http://127.0.0.1:8265" \
130
+ -- python3 -m openrlhf.cli.train_ppo_ray \
131
+ --colocate_all_models \
132
+ --vllm_enable_sleep --deepspeed_enable_sleep \
133
+ --actor_num_nodes 2 --actor_num_gpus_per_node 8 \
134
+ --critic_num_nodes 0 --critic_num_gpus_per_node 0 \
135
+ --reward_num_nodes 0 --reward_num_gpus_per_node 0 \
136
+ --ref_num_nodes 0 --ref_num_gpus_per_node 0 \
137
+ --vllm_num_engines 4 --vllm_tensor_parallel_size 4 \
138
+ # ... other args
139
+ ```
140
+
141
+ **Result**: All models share 16 GPUs via sleep/wake cycles.
142
+
143
+ ## vLLM Configuration
144
+
145
+ ### Tensor Parallelism
146
+
147
+ **Multi-GPU per engine**:
148
+ ```bash
149
+ --vllm_num_engines 4 # 4 engines
150
+ --vllm_tensor_parallel_size 4 # 4 GPUs each = 16 GPUs total
151
+ ```
152
+
153
+ ### GPU Memory Management
154
+
155
+ ```bash
156
+ --vllm_gpu_memory_utilization 0.5 # Let vLLM use 50% of each GPU's memory
157
+ ```
158
+
159
+ **Calculation**:
160
+ - A100 80GB × 0.5 = 40GB for vLLM
161
+ - Remaining 40GB for other models (if colocated)
162
+
163
+ ## Checkpointing
164
+
165
+ ### Enable Checkpointing
166
+
167
+ **Basic checkpointing**:
168
+ ```bash
169
+ --save_path ./output/model # Final save path
170
+ --ckpt_path ./checkpoints/model # Checkpoint directory
171
+ --save_steps 100 # Save every 100 steps
172
+ --save_value_network # Also save critic
173
+ ```
174
+
175
+ **HuggingFace format**:
176
+ ```bash
177
+ --save_hf_ckpt # Save as HuggingFace model (easier loading)
178
+ ```
179
+
180
+ **DeepSpeed universal checkpoint**:
181
+ ```bash
182
+ --use_ds_universal_ckpt # Compatible across ZeRO stages
183
+ ```
184
+
185
+ ### Checkpoint Content
186
+
187
+ **Saved state**:
188
+ ```python
189
+ {
190
+ "global_step": 1000,
191
+ "episode": 10,
192
+ "data_loader_state_dict": {...},
193
+ "actor_model": {...}, # DeepSpeed checkpoint
194
+ "critic_model": {...} # If --save_value_network
195
+ }
196
+ ```
197
+
198
+ **Files created**:
199
+ ```
200
+ checkpoints/llama-70b-ppo/
201
+ ├── global_step_1000/
202
+ │ ├── actor/
203
+ │ │ ├── mp_rank_00_model_states.pt
204
+ │ │ ├── zero_pp_rank_0_mp_rank_00_optim_states.pt
205
+ │ │ └── ...
206
+ │ └── critic/ (if --save_value_network)
207
+ │ └── ...
208
+ └── hf_ckpt/ (if --save_hf_ckpt)
209
+ ├── config.json
210
+ ├── pytorch_model.bin
211
+ └── ...
212
+ ```
213
+
214
+ ### Resume Training
215
+
216
+ **From checkpoint**:
217
+ ```bash
218
+ ray job submit --address="http://127.0.0.1:8265" \
219
+ -- python3 -m openrlhf.cli.train_ppo_ray \
220
+ --load_checkpoint \
220
+ --ckpt_path ./checkpoints/llama-70b-ppo \
221
+ # --load_checkpoint enables resume; --ckpt_path is the checkpoint dir; ... other args (must match original)
223
+ ```
224
+
225
+ **Resume logic**:
226
+ 1. `PPOTrainer.fit()` checks for existing checkpoints
227
+ 2. Loads latest checkpoint from `ckpt_path`
228
+ 3. Restores `global_step`, `episode`, dataloader state
229
+ 4. Continues training from that point
230
+
231
+ ## Fault Tolerance
232
+
233
+ ### Automatic Task Rescheduling
234
+
235
+ **Ray's built-in fault tolerance**:
236
+ - If a worker node fails → Ray reschedules its tasks on the remaining available nodes
237
+ - Requires sufficient resources on remaining nodes
238
+ - May need to reinitialize some components
239
+
240
+ ### DeepSpeed Sleep Mode Protection
241
+
242
+ **Prevents OOM-related failures**:
243
+ ```bash
244
+ --deepspeed_enable_sleep # Offload to CPU when not training
245
+ ```
246
+
247
+ **Sleep/wake cycle**:
248
+ 1. Model offloaded to CPU after training
249
+ 2. Frees GPU memory for other components
250
+ 3. Reloaded from CPU before next training step
251
+ 4. Synchronized via Ray barriers
252
+
253
+ **OOM prevention**:
254
+ - Models don't compete for GPU memory
255
+ - Sequential loading prevents concurrent OOM
256
+ - Barriers ensure synchronization
257
+
258
+ ### Checkpoint-Based Recovery
259
+
260
+ **Recover from catastrophic failure**:
261
+ 1. Training interrupted (node crash, OOM, etc.)
262
+ 2. Restart Ray cluster
263
+ 3. Resume with `--load_checkpoint`
264
+ 4. Training continues from last saved step
265
+
266
+ **Best practice**:
267
+ ```bash
268
+ --save_steps 100 # Frequent checkpointing (every 100 steps)
269
+ ```
270
+
271
+ ## Monitoring
272
+
273
+ ### Ray Dashboard
274
+
275
+ **Access dashboard**:
276
+ ```
277
+ http://{HEAD-NODE-IP}:8265
278
+ ```
279
+
280
+ **Monitor**:
281
+ - Node status (active, idle, failed)
282
+ - GPU utilization per node
283
+ - Task scheduling (which models on which nodes)
284
+ - Resource usage (memory, CPU, GPU)
285
+
286
+ ### Weights & Biases Integration
287
+
288
+ **Enable W&B logging**:
289
+ ```bash
290
+ --use_wandb {your-wandb-token}
291
+ --wandb_org your-org
292
+ --wandb_project llama-70b-ppo
293
+ ```
294
+
295
+ **Metrics logged**:
296
+ - Training loss per step
297
+ - Reward scores
298
+ - KL divergence
299
+ - GPU utilization per node
300
+
301
+ ## Performance Optimization
302
+
303
+ ### InfiniBand for Multi-Node
304
+
305
+ **For nodes with InfiniBand**:
306
+ ```bash
307
+ # Set these environment variables on every node before starting Ray
308
+ export NCCL_IB_HCA=mlx5_0 # InfiniBand device
309
+ export NCCL_SOCKET_IFNAME=ib0
310
+ export NCCL_IB_DISABLE=0
311
+
312
+ ray start --head --node-ip-address 0.0.0.0 --num-gpus 8
313
+ ```
314
+
315
+ **Performance gain**: 2-3× faster multi-node communication
316
+
317
+ ### Gradient Checkpointing
318
+
319
+ **Reduce memory, enable larger models**:
320
+ ```bash
321
+ --gradient_checkpointing # Trade compute for memory
322
+ ```
323
+
324
+ ### Flash Attention 2
325
+
326
+ **Faster attention, lower memory**:
327
+ ```bash
328
+ --flash_attn # Requires FlashAttention installed
329
+ ```
330
+
331
+ ### Packing Samples
332
+
333
+ **Improve GPU utilization**:
334
+ ```bash
335
+ --packing_samples # Pack multiple samples per batch
336
+ ```
337
+
338
+ ## Troubleshooting
339
+
340
+ ### Ray Connection Issues
341
+
342
+ **Symptom**: Worker nodes can't connect to head
343
+
344
+ **Solution**: Check firewall/network
345
+ ```bash
346
+ # On head node, ensure ports open
347
+ # Default ports: 6379 (GCS; Redis in older Ray versions), 8265 (Dashboard), 10001 (Ray Client), 10002-19999 (worker processes)
348
+
349
+ # Test connection from worker
350
+ telnet {HEAD-NODE-IP} 6379
351
+ ```
352
+
353
+ ### Node Failures During Training
354
+
355
+ **Symptom**: Ray reports node failure
356
+
357
+ **Solution 1** - Resume from checkpoint:
358
+ ```bash
359
+ # Fix failed node or remove from cluster
360
+ ray stop # On failed node
361
+ # Then resume training with --load_checkpoint
362
+ ```
363
+
364
+ **Solution 2** - Adjust resources:
365
+ ```bash
366
+ # Reduce nodes if some failed
367
+ --actor_num_nodes 1 # Instead of 2
368
+ ```
369
+
370
+ ### OOM on Multi-Node
371
+
372
+ **Symptom**: OOM despite multi-node setup
373
+
374
+ **Solution 1** - Reduce batch sizes:
375
+ ```bash
376
+ --micro_train_batch_size 1 # Reduce from 2
377
+ --micro_rollout_batch_size 2 # Reduce from 4
378
+ ```
379
+
380
+ **Solution 2** - Enable sleep modes:
381
+ ```bash
382
+ --vllm_enable_sleep
383
+ --deepspeed_enable_sleep
384
+ ```
385
+
386
+ **Solution 3** - Increase ZeRO stage:
387
+ ```bash
388
+ --zero_stage 3 # Maximum sharding
389
+ ```
390
+
391
+ ### Checkpoint Loading Fails
392
+
393
+ **Symptom**: `FileNotFoundError` when resuming
394
+
395
+ **Check checkpoint path**:
396
+ ```bash
397
+ ls -la ./checkpoints/llama-70b-ppo/
398
+ # Verify global_step_* directories exist
399
+ ```
400
+
401
+ **Solution**: Ensure `--ckpt_path` matches save location
402
+ ```bash
403
+ --ckpt_path ./checkpoints/llama-70b-ppo # Same as during save
404
+ ```
405
+
406
+ ## Complete Multi-Node Example
407
+
408
+ ### 8-node cluster (64 GPUs) - 70B model
409
+
410
+ **Head node (Node 1)**:
411
+ ```bash
412
+ ray start --head --node-ip-address 10.0.0.1 --num-gpus 8
413
+ ```
414
+
415
+ **Worker nodes (Nodes 2-8)**:
416
+ ```bash
417
+ ray start --address 10.0.0.1:6379 --num-gpus 8
418
+ ```
419
+
420
+ **Submit job**:
421
+ ```bash
422
+ ray job submit --address="http://10.0.0.1:8265" \
423
+ --runtime-env-json='{"working_dir": "/openrlhf"}' \
424
+ -- python3 -m openrlhf.cli.train_ppo_ray \
425
+ --ref_num_nodes 2 --ref_num_gpus_per_node 8 \
426
+ --reward_num_nodes 2 --reward_num_gpus_per_node 8 \
427
+ --critic_num_nodes 2 --critic_num_gpus_per_node 8 \
428
+ --actor_num_nodes 2 --actor_num_gpus_per_node 8 \
429
+ --vllm_num_engines 4 --vllm_tensor_parallel_size 4 \
430
+ --pretrain meta-llama/Llama-2-70b-hf \
431
+ --reward_pretrain ./reward-70b \
432
+ --save_path ./output/llama-70b-ppo \
433
+ --ckpt_path ./checkpoints/llama-70b-ppo \
434
+ --save_steps 100 --save_hf_ckpt \
435
+ --micro_train_batch_size 1 --train_batch_size 128 \
436
+ --micro_rollout_batch_size 2 --rollout_batch_size 1024 \
437
+ --max_epochs 1 --bf16 --zero_stage 3 \
438
+ --actor_learning_rate 5e-7 --critic_learning_rate 9e-6 \
439
+ --gradient_checkpointing --flash_attn --packing_samples \
440
+ --use_wandb {token} --wandb_project llama-70b-ppo
441
+ ```
442
+
443
+ **GPU allocation**:
444
+ - Reference: 16 GPUs (2 nodes × 8)
445
+ - Reward: 16 GPUs (2 nodes × 8)
446
+ - Critic: 16 GPUs (2 nodes × 8)
447
+ - Actor: 16 GPUs (2 nodes × 8)
448
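+ - **Total**: 64 GPUs for the four training models. Note: the 4 vLLM engines (4 × tensor-parallel 4 = 16 GPUs) need additional GPUs unless models are colocated (e.g. `--colocate_all_models` with sleep modes enabled).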
449
+
450
+ ## References
451
+
452
+ - Ray Docs: https://docs.ray.io/
453
+ - OpenRLHF: https://github.com/OpenRLHF/OpenRLHF
454
+ - DeepSpeed ZeRO: https://www.deepspeed.ai/tutorials/zero/