@synsci/cli-darwin-arm64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,451 @@
1
+ # Distributed Training
2
+
3
+ Guide to FSDP (Fully Sharded Data Parallel) distributed training in LitGPT for scaling to multiple GPUs and nodes.
4
+
5
+ ## Overview
6
+
7
+ LitGPT uses **Lightning Fabric** with **FSDP** to distribute training across multiple GPUs. FSDP shards model parameters, gradients, and optimizer states to enable training models larger than single-GPU memory.
8
+
9
+ **When to use FSDP**:
10
+ - Model doesn't fit on single GPU
11
+ - Want faster training with multi-GPU
12
+ - Training models >7B parameters
13
+ - Need to scale across multiple nodes
14
+
15
+ ## Quick Start
16
+
17
+ ### Single Node Multi-GPU
18
+
19
+ ```bash
20
+ # Train Llama 2 7B on 4 GPUs
21
+ litgpt finetune_lora meta-llama/Llama-2-7b-hf \
22
+ --devices 4 \
23
+ --data JSON \
24
+ --data.json_path data/alpaca.json
25
+ ```
26
+
27
+ FSDP is **automatically enabled** when `devices > 1`.
28
+
29
+ ### Multi-Node Training
30
+
31
+ ```bash
32
+ # Train on 2 nodes with 8 GPUs each (16 total)
33
+ litgpt finetune_lora meta-llama/Llama-2-70b-hf \
34
+ --devices 8 \
35
+ --num_nodes 2 \
36
+ --data JSON \
37
+ --data.json_path data/alpaca.json
38
+ ```
39
+
40
+ ## FSDP Configuration
41
+
42
+ ### Default FSDP Strategy
43
+
44
+ When multiple devices are used, LitGPT applies this FSDP configuration:
45
+
46
+ ```python
47
+ from lightning.fabric.strategies import FSDPStrategy
48
+ from litgpt.model import Block
49
+
50
+ strategy = FSDPStrategy(
51
+ auto_wrap_policy={Block},
52
+ state_dict_type="full",
53
+ sharding_strategy="HYBRID_SHARD"
54
+ )
55
+ ```
56
+
57
+ **Parameters**:
58
+ - `auto_wrap_policy={Block}`: Automatically wraps each transformer `Block` with FSDP
59
+ - `state_dict_type="full"`: Saves full model (assembled on rank 0) for easy deployment
60
+ - `sharding_strategy="HYBRID_SHARD"`: Fully shards parameters, gradients, and optimizer states within each node, while replicating across nodes (reduces inter-node communication)
61
+
62
+ ### Sharding Strategies
63
+
64
+ | Strategy | Shards | Communication | Use Case |
65
+ |----------|--------|---------------|----------|
66
+ | `FULL_SHARD` (ZeRO-3) | Params + Grads + Optim | All-gather before forward/backward | Maximum memory savings |
67
+ | `SHARD_GRAD_OP` (ZeRO-2) | Grads + Optim only | Reduce-scatter after backward | Faster than FULL_SHARD |
68
+ | `HYBRID_SHARD` (default) | All (hybrid across nodes) | Optimized for multi-node | Best for clusters |
69
+ | `NO_SHARD` | None | Gradient all-reduce (DDP-equivalent) | Model fits on each GPU; no sharding needed |
70
+
71
+ **Recommendation**: Use default `HYBRID_SHARD` for multi-node, or `FULL_SHARD` for single-node multi-GPU.
72
+
73
+ ### State Dict Types
74
+
75
+ | Type | Behavior | Use Case |
76
+ |------|----------|----------|
77
+ | `full` (default) | Gathers all shards on rank 0, saves single file | Easy deployment, inference |
78
+ | `sharded` | Each rank saves its shard separately | Faster checkpointing, resume training |
79
+
80
+ ### Auto-Wrap Policy
81
+
82
+ FSDP wraps model components based on `auto_wrap_policy`:
83
+
84
+ ```python
85
+ auto_wrap_policy={Block} # Wrap each transformer block
86
+ ```
87
+
88
+ This means each `Block` (transformer layer) is independently sharded across GPUs. For a 32-layer model on 4 GPUs, each GPU holds a 1/4 shard of every block's parameters (not whole layers); full parameters for a block are gathered only when that block runs.
89
+
90
+ ## Thunder FSDP (Advanced)
91
+
92
+ LitGPT includes an experimental **Thunder** extension with enhanced FSDP:
93
+
94
+ ```bash
95
+ litgpt pretrain tiny-llama-1.1b \
96
+ --devices 8 \
97
+ --num_nodes 1 \
98
+ --compiler thunder \
99
+ --strategy fsdp
100
+ ```
101
+
102
+ ### Thunder FSDP Configuration
103
+
104
+ ```python
105
+ from extensions.thunder.pretrain import ThunderFSDPStrategy
106
+
107
+ strategy = ThunderFSDPStrategy(
108
+ sharding_strategy="ZERO3",
109
+ bucketing_strategy="BLOCK",
110
+ state_dict_type="full",
111
+ jit=False,
112
+ )
113
+ ```
114
+
115
+ **Additional Parameters**:
116
+ - `sharding_strategy`: `"ZERO3"` (full shard), `"ZERO2"` (grad/optim only)
117
+ - `bucketing_strategy`: `"BLOCK"` (combine ops per block), `"LAYER"` (per layer), `"NONE"` (no bucketing)
118
+ - `jit`: Whether to apply `thunder.jit(model)` for optimization
119
+ - `executors`: Tuple of Thunder executors to enable
120
+
121
+ **Bucketing Strategy**:
122
+ - `"BLOCK"` (default): Combines collective operations for layer blocks → fewer communication calls
123
+ - `"LAYER"`: Combines per layer class
124
+ - `"NONE"`: No bucketing → more fine-grained but more overhead
125
+
126
+ ## Pretraining with FSDP
127
+
128
+ ### Single Node
129
+
130
+ ```bash
131
+ litgpt pretrain tiny-llama-1.1b \
132
+ --devices 8 \
133
+ --train.global_batch_size 512 \
134
+ --train.micro_batch_size 8 \
135
+ --data Alpaca2k
136
+ ```
137
+
138
+ **Memory calculation**:
139
+ - TinyLlama 1.1B: ~4GB model + ~4GB gradients + ~8GB optimizer = 16GB per GPU without FSDP
140
+ - With FSDP on 8 GPUs: 16GB / 8 = 2GB per GPU ✅ Fits easily
141
+
142
+ ### Multi-Node
143
+
144
+ ```bash
145
+ # Launch on 4 nodes with 8 GPUs each (32 total)
146
+ litgpt pretrain llama-2-7b \
147
+ --devices 8 \
148
+ --num_nodes 4 \
149
+ --train.global_batch_size 1024 \
150
+ --train.micro_batch_size 2 \
151
+ --data RedPajama
152
+ ```
153
+
154
+ **Memory calculation**:
155
+ - Llama 2 7B: ~28GB model + ~28GB gradients + ~56GB optimizer = 112GB total
156
+ - With FSDP on 32 GPUs: 112GB / 32 = 3.5GB per GPU ✅
157
+
158
+ ## Fine-tuning with FSDP
159
+
160
+ ### LoRA Fine-tuning (Recommended)
161
+
162
+ LoRA fine-tuning with FSDP for >7B models:
163
+
164
+ ```bash
165
+ # Llama 2 70B LoRA on 8 GPUs
166
+ litgpt finetune_lora meta-llama/Llama-2-70b-hf \
167
+ --devices 8 \
168
+ --data JSON \
169
+ --data.json_path data/alpaca.json \
170
+ --train.global_batch_size 16 \
171
+ --train.micro_batch_size 1 \
172
+ --lora_r 8
173
+ ```
174
+
175
+ **Why LoRA with FSDP**:
176
+ - Base model sharded with FSDP (memory efficient)
177
+ - Only LoRA adapters trained (fast)
178
+ - Best of both worlds for large models
179
+
180
+ ### Full Fine-tuning
181
+
182
+ Full fine-tuning with FSDP:
183
+
184
+ ```bash
185
+ # Llama 2 7B full fine-tune on 4 GPUs
186
+ litgpt finetune_full meta-llama/Llama-2-7b-hf \
187
+ --devices 4 \
188
+ --data JSON \
189
+ --data.json_path data/alpaca.json \
190
+ --train.global_batch_size 16 \
191
+ --train.micro_batch_size 1 \
192
+ --train.learning_rate 3e-5
193
+ ```
194
+
195
+ ## Mixed Precision
196
+
197
+ FSDP works with mixed precision for memory savings and speedup:
198
+
199
+ ```bash
200
+ # BF16 mixed precision (recommended for A100/H100)
201
+ litgpt pretrain tiny-llama-1.1b \
202
+ --devices 8 \
203
+ --precision bf16-mixed
204
+
205
+ # FP16 mixed precision (V100 compatible)
206
+ litgpt pretrain tiny-llama-1.1b \
207
+ --devices 8 \
208
+ --precision 16-mixed
209
+ ```
210
+
211
+ **Precision options**:
212
+ - `bf16-mixed`: BF16 for computation, FP32 for master weights (best for Ampere+)
213
+ - `16-mixed`: FP16 for computation, FP32 for master weights (V100)
214
+ - `32-true`: Full FP32 (debugging only, slow)
215
+
216
+ ## Gradient Accumulation
217
+
218
+ Simulate larger batch sizes with gradient accumulation:
219
+
220
+ ```bash
221
+ # Simulate global_batch_size=512 with micro_batch_size=2
222
+ litgpt pretrain tiny-llama-1.1b \
223
+ --devices 8 \
224
+ --train.global_batch_size 512 \
225
+ --train.micro_batch_size 2
226
+ # Accumulates over 512/(8*2) = 32 steps per optimizer update
227
+ ```
228
+
229
+ **Formula**:
230
+ ```
231
+ Gradient accumulation steps = global_batch_size / (devices × micro_batch_size)
232
+ ```
233
+
234
+ ## Memory Optimization
235
+
236
+ ### Out of Memory? Try These
237
+
238
+ 1. **Increase devices**:
239
+ ```bash
240
+ --devices 8 # Instead of 4
241
+ ```
242
+
243
+ 2. **Reduce micro batch size**:
244
+ ```bash
245
+ --train.micro_batch_size 1 # Instead of 2
246
+ ```
247
+
248
+ 3. **Lower precision**:
249
+ ```bash
250
+ --precision bf16-mixed # Instead of 32-true
251
+ ```
252
+
253
+ 4. **Use FULL_SHARD**:
254
+ ```python
255
+ strategy = FSDPStrategy(
256
+ sharding_strategy="FULL_SHARD" # Maximum memory savings
257
+ )
258
+ ```
259
+
260
+ 5. **Enable activation checkpointing** (implemented in model):
261
+ ```python
262
+ # Recomputes activations during backward pass
263
+ # Trades compute for memory
264
+ ```
265
+
266
+ 6. **Use QLoRA**:
267
+ ```bash
268
+ litgpt finetune_lora meta-llama/Llama-2-7b-hf \
269
+ --quantize bnb.nf4 \
270
+ --devices 1 # May not need FSDP with quantization
271
+ ```
272
+
273
+ ## Checkpointing
274
+
275
+ ### Save Checkpoints
276
+
277
+ FSDP automatically handles checkpoint saving:
278
+
279
+ ```bash
280
+ litgpt pretrain tiny-llama-1.1b \
281
+ --devices 8 \
282
+ --out_dir checkpoints/tinyllama-pretrain
283
+ # Saves to: checkpoints/tinyllama-pretrain/final/lit_model.pth
284
+ ```
285
+
286
+ With `state_dict_type="full"` (default), rank 0 assembles full model and saves single file.
287
+
288
+ ### Resume Training
289
+
290
+ ```bash
291
+ litgpt pretrain tiny-llama-1.1b \
292
+ --devices 8 \
293
+ --resume checkpoints/tinyllama-pretrain/
294
+ # Automatically loads latest checkpoint
295
+ ```
296
+
297
+ ### Convert to HuggingFace
298
+
299
+ ```bash
300
+ python scripts/convert_lit_checkpoint.py \
301
+ --checkpoint_path checkpoints/tinyllama-pretrain/final/lit_model.pth \
302
+ --output_dir models/tinyllama-hf
303
+ ```
304
+
305
+ ## Performance Tuning
306
+
307
+ ### Communication Backends
308
+
309
+ LitGPT uses NCCL for GPU communication:
310
+
311
+ ```bash
312
+ # Default (NCCL auto-configured)
313
+ litgpt pretrain tiny-llama-1.1b --devices 8
314
+
315
+ # Explicit NCCL settings (advanced)
316
+ NCCL_DEBUG=INFO \
317
+ NCCL_IB_DISABLE=0 \
318
+ litgpt pretrain tiny-llama-1.1b --devices 8
319
+ ```
320
+
321
+ **NCCL Environment Variables**:
322
+ - `NCCL_DEBUG=INFO`: Enable debug logging
323
+ - `NCCL_IB_DISABLE=0`: Use InfiniBand (if available)
324
+ - `NCCL_SOCKET_IFNAME=eth0`: Specify network interface
325
+
326
+ ### Multi-Node Setup
327
+
328
+ **Option 1: SLURM**
329
+
330
+ ```bash
331
+ #!/bin/bash
332
+ #SBATCH --nodes=4
333
+ #SBATCH --gpus-per-node=8
334
+ #SBATCH --ntasks-per-node=1
335
+
336
+ srun litgpt pretrain llama-2-7b \
337
+ --devices 8 \
338
+ --num_nodes 4 \
339
+ --data RedPajama
340
+ ```
341
+
342
+ **Option 2: torchrun**
343
+
344
+ ```bash
345
+ # On each node, run:
346
+ torchrun \
347
+ --nproc_per_node=8 \
348
+ --nnodes=4 \
349
+ --node_rank=$NODE_RANK \
350
+ --master_addr=$MASTER_ADDR \
351
+ --master_port=29500 \
352
+ -m litgpt pretrain llama-2-7b
353
+ ```
354
+
355
+ ### Profiling
356
+
357
+ Enable profiling to identify bottlenecks:
358
+
359
+ ```bash
360
+ litgpt pretrain tiny-llama-1.1b \
361
+ --devices 8 \
362
+ --train.max_steps 100 \
363
+ --profile
364
+ # Generates profiling report
365
+ ```
366
+
367
+ ## Example Configurations
368
+
369
+ ### Llama 2 7B on 4× A100 (40GB)
370
+
371
+ ```bash
372
+ litgpt finetune_lora meta-llama/Llama-2-7b-hf \
373
+ --devices 4 \
374
+ --precision bf16-mixed \
375
+ --train.global_batch_size 64 \
376
+ --train.micro_batch_size 4 \
377
+ --train.max_seq_length 2048 \
378
+ --lora_r 8 \
379
+ --data JSON \
380
+ --data.json_path data/alpaca.json
381
+ ```
382
+
383
+ **Memory per GPU**: ~20GB
384
+ **Throughput**: ~5 samples/sec
385
+
386
+ ### Llama 2 70B on 8× A100 (80GB)
387
+
388
+ ```bash
389
+ litgpt finetune_lora meta-llama/Llama-2-70b-hf \
390
+ --devices 8 \
391
+ --precision bf16-mixed \
392
+ --train.global_batch_size 32 \
393
+ --train.micro_batch_size 1 \
394
+ --train.max_seq_length 2048 \
395
+ --lora_r 8 \
396
+ --data JSON \
397
+ --data.json_path data/alpaca.json
398
+ ```
399
+
400
+ **Memory per GPU**: ~70GB
401
+ **Throughput**: ~1 sample/sec
402
+
403
+ ### Llama 3.1 405B on 64× H100 (80GB)
404
+
405
+ ```bash
406
+ litgpt finetune_lora meta-llama/Llama-3.1-405B \
407
+ --devices 8 \
408
+ --num_nodes 8 \
409
+ --precision bf16-mixed \
410
+ --train.global_batch_size 128 \
411
+ --train.micro_batch_size 1 \
412
+ --train.max_seq_length 4096 \
413
+ --lora_r 16 \
414
+ --data JSON \
415
+ --data.json_path data/alpaca.json
416
+ ```
417
+
418
+ **Memory per GPU**: ~60GB
419
+ **Requires**: 64 H100 GPUs (8 nodes × 8 GPUs)
420
+
421
+ ## Troubleshooting
422
+
423
+ ### "CUDA out of memory"
424
+
425
+ 1. Reduce `micro_batch_size`
426
+ 2. Increase `devices` (more sharding)
427
+ 3. Lower `max_seq_length`
428
+ 4. Use `bf16-mixed` precision
429
+ 5. Try QLoRA (`--quantize bnb.nf4`)
430
+
431
+ ### "NCCL error" or Slow Communication
432
+
433
+ 1. Check network connectivity between nodes
434
+ 2. Enable InfiniBand: `NCCL_IB_DISABLE=0`
435
+ 3. Verify NCCL version: `python -c "import torch; print(torch.cuda.nccl.version())"`
436
+ 4. Test with NCCL tests: `$NCCL_HOME/build/all_reduce_perf -b 8 -e 128M`
437
+
438
+ ### Training Slower Than Expected
439
+
440
+ 1. Profile with `--profile`
441
+ 2. Check GPU utilization: `nvidia-smi dmon`
442
+ 3. Verify data loading isn't bottleneck
443
+ 4. Increase `micro_batch_size` if memory allows
444
+ 5. Use Thunder FSDP with bucketing
445
+
446
+ ## References
447
+
448
+ - FSDP configuration: `litgpt/pretrain.py:setup()`
449
+ - Thunder FSDP: `extensions/thunder/pretrain.py`
450
+ - Memory optimization guide: `tutorials/oom.md`
451
+ - Lightning Fabric docs: https://lightning.ai/docs/fabric/