@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373)
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,358 @@
1
+ ---
2
+ name: distributed-llm-pretraining-torchtitan
3
+ description: Provides PyTorch-native distributed LLM pretraining using torchtitan with 4D parallelism (FSDP2, TP, PP, CP). Use when pretraining Llama 3.1, DeepSeek V3, or custom models at scale from 8 to 512+ GPUs with Float8, torch.compile, and distributed checkpointing.
4
+ version: 1.0.0
5
+ author: Synthetic Sciences
6
+ license: MIT
7
+ tags: [Model Architecture, Distributed Training, TorchTitan, FSDP2, Tensor Parallel, Pipeline Parallel, Context Parallel, Float8, Llama, Pretraining]
8
+ dependencies: [torch>=2.6.0, torchtitan>=0.2.0, torchao>=0.5.0]
9
+ ---
10
+
11
+ # TorchTitan - PyTorch Native Distributed LLM Pretraining
12
+
13
+ ## Quick start
14
+
15
+ TorchTitan is PyTorch's official platform for large-scale LLM pretraining with composable 4D parallelism (FSDP2, TP, PP, CP), achieving 65%+ speedups over baselines on H100 GPUs.
16
+
17
+ **Installation**:
18
+ ```bash
19
+ # From PyPI (stable)
20
+ pip install torchtitan
21
+
22
+ # From source (latest features, requires PyTorch nightly)
23
+ git clone https://github.com/pytorch/torchtitan
24
+ cd torchtitan
25
+ pip install -r requirements.txt
26
+ ```
27
+
28
+ **Download tokenizer**:
29
+ ```bash
30
+ # Get HF token from https://huggingface.co/settings/tokens
31
+ python scripts/download_hf_assets.py --repo_id meta-llama/Llama-3.1-8B --assets tokenizer --hf_token=...
32
+ ```
33
+
34
+ **Start training on 8 GPUs**:
35
+ ```bash
36
+ CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh
37
+ ```
38
+
39
+ ## Common workflows
40
+
41
+ ### Workflow 1: Pretrain Llama 3.1 8B on single node
42
+
43
+ Copy this checklist:
44
+
45
+ ```
46
+ Single Node Pretraining:
47
+ - [ ] Step 1: Download tokenizer
48
+ - [ ] Step 2: Configure training
49
+ - [ ] Step 3: Launch training
50
+ - [ ] Step 4: Monitor and checkpoint
51
+ ```
52
+
53
+ **Step 1: Download tokenizer**
54
+
55
+ ```bash
56
+ python scripts/download_hf_assets.py \
57
+ --repo_id meta-llama/Llama-3.1-8B \
58
+ --assets tokenizer \
59
+ --hf_token=YOUR_HF_TOKEN
60
+ ```
61
+
62
+ **Step 2: Configure training**
63
+
64
+ Edit or create a TOML config file:
65
+
66
+ ```toml
67
+ # llama3_8b_custom.toml
68
+ [job]
69
+ dump_folder = "./outputs"
70
+ description = "Llama 3.1 8B training"
71
+
72
+ [model]
73
+ name = "llama3"
74
+ flavor = "8B"
75
+ hf_assets_path = "./assets/hf/Llama-3.1-8B"
76
+
77
+ [optimizer]
78
+ name = "AdamW"
79
+ lr = 3e-4
80
+
81
+ [lr_scheduler]
82
+ warmup_steps = 200
83
+
84
+ [training]
85
+ local_batch_size = 2
86
+ seq_len = 8192
87
+ max_norm = 1.0
88
+ steps = 1000
89
+ dataset = "c4"
90
+
91
+ [parallelism]
92
+ data_parallel_shard_degree = -1 # Use all GPUs for FSDP
93
+
94
+ [activation_checkpoint]
95
+ mode = "selective"
96
+ selective_ac_option = "op"
97
+
98
+ [checkpoint]
99
+ enable = true
100
+ folder = "checkpoint"
101
+ interval = 500
102
+ ```
103
+
104
+ **Step 3: Launch training**
105
+
106
+ ```bash
107
+ # 8 GPUs on single node
108
+ CONFIG_FILE="./llama3_8b_custom.toml" ./run_train.sh
109
+
110
+ # Or explicitly with torchrun
111
+ torchrun --nproc_per_node=8 \
112
+ -m torchtitan.train \
113
+ --job.config_file ./llama3_8b_custom.toml
114
+ ```
115
+
116
+ **Step 4: Monitor and checkpoint**
117
+
118
+ TensorBoard logs are saved to `./outputs/tb/`:
119
+ ```bash
120
+ tensorboard --logdir ./outputs/tb
121
+ ```
122
+
123
+ ### Workflow 2: Multi-node training with SLURM
124
+
125
+ ```
126
+ Multi-Node Training:
127
+ - [ ] Step 1: Configure parallelism for scale
128
+ - [ ] Step 2: Set up SLURM script
129
+ - [ ] Step 3: Submit job
130
+ - [ ] Step 4: Resume from checkpoint
131
+ ```
132
+
133
+ **Step 1: Configure parallelism for scale**
134
+
135
+ For 70B model on 256 GPUs (32 nodes):
136
+ ```toml
137
+ [parallelism]
138
+ data_parallel_shard_degree = 32 # FSDP across 32 ranks
139
+ tensor_parallel_degree = 8 # TP within node
140
+ pipeline_parallel_degree = 1 # No PP for 70B
141
+ context_parallel_degree = 1 # Increase for long sequences
142
+ ```
143
+
144
+ **Step 2: Set up SLURM script**
145
+
146
+ ```bash
147
+ #!/bin/bash
148
+ #SBATCH --job-name=llama70b
149
+ #SBATCH --nodes=32
150
+ #SBATCH --ntasks-per-node=8
151
+ #SBATCH --gpus-per-node=8
152
+
153
+ srun torchrun \
154
+ --nnodes=32 \
155
+ --nproc_per_node=8 \
156
+ --rdzv_backend=c10d \
157
+ --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
158
+ -m torchtitan.train \
159
+ --job.config_file ./llama3_70b.toml
160
+ ```
161
+
162
+ **Step 3: Submit job**
163
+
164
+ ```bash
165
+ sbatch multinode_trainer.slurm
166
+ ```
167
+
168
+ **Step 4: Resume from checkpoint**
169
+
170
+ Training auto-resumes if checkpoint exists in configured folder.
171
+
172
+ ### Workflow 3: Enable Float8 training for H100s
173
+
174
+ Float8 provides 30-50% speedup on H100 GPUs.
175
+
176
+ ```
177
+ Float8 Training:
178
+ - [ ] Step 1: Install torchao
179
+ - [ ] Step 2: Configure Float8
180
+ - [ ] Step 3: Launch with compile
181
+ ```
182
+
183
+ **Step 1: Install torchao**
184
+
185
+ ```bash
186
+ USE_CPP=0 pip install git+https://github.com/pytorch/ao.git
187
+ ```
188
+
189
+ **Step 2: Configure Float8**
190
+
191
+ Add to your TOML config:
192
+ ```toml
193
+ [model]
194
+ converters = ["quantize.linear.float8"]
195
+
196
+ [quantize.linear.float8]
197
+ enable_fsdp_float8_all_gather = true
198
+ precompute_float8_dynamic_scale_for_fsdp = true
199
+ filter_fqns = ["output"] # Exclude output layer
200
+
201
+ [compile]
202
+ enable = true
203
+ components = ["model", "loss"]
204
+ ```
205
+
206
+ **Step 3: Launch with compile**
207
+
208
+ ```bash
209
+ CONFIG_FILE="./llama3_8b.toml" ./run_train.sh \
210
+ --model.converters="quantize.linear.float8" \
211
+ --quantize.linear.float8.enable_fsdp_float8_all_gather \
212
+ --compile.enable
213
+ ```
214
+
215
+ ### Workflow 4: 4D parallelism for 405B models
216
+
217
+ ```
218
+ 4D Parallelism (FSDP + TP + PP + CP):
219
+ - [ ] Step 1: Create seed checkpoint
220
+ - [ ] Step 2: Configure 4D parallelism
221
+ - [ ] Step 3: Launch on 512 GPUs
222
+ ```
223
+
224
+ **Step 1: Create seed checkpoint**
225
+
226
+ Required for consistent initialization across PP stages:
227
+ ```bash
228
+ NGPU=1 CONFIG_FILE=./llama3_405b.toml ./run_train.sh \
229
+ --checkpoint.enable \
230
+ --checkpoint.create_seed_checkpoint \
231
+ --parallelism.data_parallel_shard_degree 1 \
232
+ --parallelism.tensor_parallel_degree 1 \
233
+ --parallelism.pipeline_parallel_degree 1
234
+ ```
235
+
236
+ **Step 2: Configure 4D parallelism**
237
+
238
+ ```toml
239
+ [parallelism]
240
+ data_parallel_shard_degree = 8 # FSDP
241
+ tensor_parallel_degree = 8 # TP within node
242
+ pipeline_parallel_degree = 8 # PP across nodes
243
+ context_parallel_degree = 1     # Increase for long sequences
244
+
245
+ [training]
246
+ local_batch_size = 32
247
+ seq_len = 8192
248
+ ```
249
+
250
+ **Step 3: Launch on 512 GPUs**
251
+
252
+ ```bash
253
+ # 64 nodes x 8 GPUs = 512 GPUs
254
+ srun torchrun --nnodes=64 --nproc_per_node=8 \
255
+ -m torchtitan.train \
256
+ --job.config_file ./llama3_405b.toml
257
+ ```
258
+
259
+ ## When to use vs alternatives
260
+
261
+ **Use TorchTitan when:**
262
+ - Pretraining LLMs from scratch (8B to 405B+)
263
+ - Need PyTorch-native solution without third-party dependencies
264
+ - Require composable 4D parallelism (FSDP2, TP, PP, CP)
265
+ - Training on H100s with Float8 support
266
+ - Want interoperable checkpoints with torchtune/HuggingFace
267
+
268
+ **Use alternatives instead:**
269
+ - **Megatron-LM**: Maximum performance for NVIDIA-only deployments
270
+ - **DeepSpeed**: Broader ZeRO optimization ecosystem, inference support
271
+ - **Axolotl/TRL**: Fine-tuning rather than pretraining
272
+ - **LitGPT**: Educational, smaller-scale training
273
+
274
+ ## Common issues
275
+
276
+ **Issue: Out of memory on large models**
277
+
278
+ Enable activation checkpointing and reduce batch size:
279
+ ```toml
280
+ [activation_checkpoint]
281
+ mode = "full" # Instead of "selective"
282
+
283
+ [training]
284
+ local_batch_size = 1
285
+ ```
286
+
287
+ Or use gradient accumulation:
288
+ ```toml
289
+ [training]
290
+ local_batch_size = 1
291
+ global_batch_size = 32 # Accumulates gradients
292
+ ```
293
+
294
+ **Issue: TP causes high memory with async collectives**
295
+
296
+ Set environment variable:
297
+ ```bash
298
+ export TORCH_NCCL_AVOID_RECORD_STREAMS=1
299
+ ```
300
+
301
+ **Issue: Float8 training not faster**
302
+
303
+ Float8 only benefits large GEMMs. Filter small layers:
304
+ ```toml
305
+ [quantize.linear.float8]
306
+ filter_fqns = ["attention.wk", "attention.wv", "output", "auto_filter_small_kn"]
307
+ ```
308
+
309
+ **Issue: Checkpoint loading fails after parallelism change**
310
+
311
+ Use DCP's resharding capability:
312
+ ```bash
313
+ # Convert sharded checkpoint to single file
314
+ python -m torch.distributed.checkpoint.format_utils \
315
+ dcp_to_torch checkpoint/step-1000 checkpoint.pt
316
+ ```
317
+
318
+ **Issue: Pipeline parallelism initialization**
319
+
320
+ Create seed checkpoint first (see Workflow 4, Step 1).
321
+
322
+ ## Supported models
323
+
324
+ | Model | Sizes | Status |
325
+ |-------|-------|--------|
326
+ | Llama 3.1 | 8B, 70B, 405B | Production |
327
+ | Llama 4 | Various | Experimental |
328
+ | DeepSeek V3 | 16B, 236B, 671B (MoE) | Experimental |
329
+ | GPT-OSS | 20B, 120B (MoE) | Experimental |
330
+ | Qwen 3 | Various | Experimental |
331
+ | Flux | Diffusion | Experimental |
332
+
333
+ ## Performance benchmarks (H100)
334
+
335
+ | Model | GPUs | Parallelism | TPS/GPU | Techniques |
336
+ |-------|------|-------------|---------|------------|
337
+ | Llama 8B | 8 | FSDP | 5,762 | Baseline |
338
+ | Llama 8B | 8 | FSDP+compile+FP8 | 8,532 | +48% |
339
+ | Llama 70B | 256 | FSDP+TP+AsyncTP | 876 | 2D parallel |
340
+ | Llama 405B | 512 | FSDP+TP+PP | 128 | 3D parallel |
341
+
342
+ ## Advanced topics
343
+
344
+ **FSDP2 configuration**: See [references/fsdp.md](references/fsdp.md) for detailed FSDP2 vs FSDP1 comparison and ZeRO equivalents.
345
+
346
+ **Float8 training**: See [references/float8.md](references/float8.md) for tensorwise vs rowwise scaling recipes.
347
+
348
+ **Checkpointing**: See [references/checkpoint.md](references/checkpoint.md) for HuggingFace conversion and async checkpointing.
349
+
350
+ **Adding custom models**: See [references/custom-models.md](references/custom-models.md) for TrainSpec protocol.
351
+
352
+ ## Resources
353
+
354
+ - GitHub: https://github.com/pytorch/torchtitan
355
+ - Paper: https://arxiv.org/abs/2410.06511
356
+ - ICLR 2025: https://iclr.cc/virtual/2025/poster/29620
357
+ - PyTorch Forum: https://discuss.pytorch.org/c/distributed/torchtitan/44
358
+
@@ -0,0 +1,181 @@
1
+ # Checkpointing in TorchTitan
2
+
3
+ TorchTitan uses PyTorch Distributed Checkpoint (DCP) for fault-tolerant, interoperable checkpointing.
4
+
5
+ ## Basic Configuration
6
+
7
+ ```toml
8
+ [checkpoint]
9
+ enable = true
10
+ folder = "checkpoint"
11
+ interval = 500
12
+ ```
13
+
14
+ ## Save Model Only (Smaller Checkpoints)
15
+
16
+ Exclude optimizer state and training metadata:
17
+
18
+ ```toml
19
+ [checkpoint]
20
+ enable = true
21
+ last_save_model_only = true
22
+ export_dtype = "bfloat16" # Optional: export in lower precision
23
+ ```
24
+
25
+ ## Excluding Keys from Loading
26
+
27
+ Partial checkpoint loading for modified settings:
28
+
29
+ ```toml
30
+ [checkpoint]
31
+ enable = true
32
+ exclude_from_loading = ["data_loader", "lr_scheduler"]
33
+ ```
34
+
35
+ CLI equivalent:
36
+ ```bash
37
+ --checkpoint.exclude_from_loading data_loader,lr_scheduler
38
+ ```
39
+
40
+ ## Creating Seed Checkpoints
41
+
42
+ Required for Pipeline Parallelism to ensure consistent initialization:
43
+
44
+ ```bash
45
+ NGPU=1 CONFIG_FILE=<path_to_config> ./run_train.sh \
46
+ --checkpoint.enable \
47
+ --checkpoint.create_seed_checkpoint \
48
+ --parallelism.data_parallel_replicate_degree 1 \
49
+ --parallelism.data_parallel_shard_degree 1 \
50
+ --parallelism.tensor_parallel_degree 1 \
51
+ --parallelism.pipeline_parallel_degree 1 \
52
+ --parallelism.context_parallel_degree 1 \
53
+ --parallelism.expert_parallel_degree 1
54
+ ```
55
+
56
+ This initializes on single CPU for reproducible initialization across any GPU count.
57
+
58
+ ## Async Checkpointing
59
+
60
+ Reduce checkpoint overhead with async writes:
61
+
62
+ ```toml
63
+ [checkpoint]
64
+ enable = true
65
+ async_mode = "async" # Options: "disabled", "async", "async_with_pinned_mem"
66
+ ```
67
+
68
+ ## HuggingFace Conversion
69
+
70
+ ### During Training
71
+
72
+ Save directly in HuggingFace format:
73
+
74
+ ```toml
75
+ [checkpoint]
76
+ last_save_in_hf = true
77
+ last_save_model_only = true
78
+ ```
79
+
80
+ Load from HuggingFace:
81
+
82
+ ```toml
83
+ [checkpoint]
84
+ initial_load_in_hf = true
85
+
86
+ [model]
87
+ hf_assets_path = "./path/to/hf/checkpoint"
88
+ ```
89
+
90
+ ### Offline Conversion
91
+
92
+ Convert without running training:
93
+
94
+ ```bash
95
+ # HuggingFace -> TorchTitan
96
+ python ./scripts/checkpoint_conversion/convert_from_hf.py \
97
+ <input_dir> <output_dir> \
98
+ --model_name llama3 \
99
+ --model_flavor 8B
100
+
101
+ # TorchTitan -> HuggingFace
102
+ python ./scripts/checkpoint_conversion/convert_to_hf.py \
103
+ <input_dir> <output_dir> \
104
+ --hf_assets_path ./assets/hf/Llama3.1-8B \
105
+ --model_name llama3 \
106
+ --model_flavor 8B
107
+ ```
108
+
109
+ ### Example
110
+
111
+ ```bash
112
+ python ./scripts/convert_from_hf.py \
113
+ ~/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/8cde5ca8380496c9a6cc7ef3a8b46a0372a1d920/ \
114
+ ./initial_load_path/ \
115
+ --model_name llama3 \
116
+ --model_flavor 8B
117
+ ```
118
+
119
+ ## Converting to Single .pt File
120
+
121
+ Convert DCP sharded checkpoint to single PyTorch file:
122
+
123
+ ```bash
124
+ python -m torch.distributed.checkpoint.format_utils \
125
+ dcp_to_torch \
126
+ torchtitan/outputs/checkpoint/step-1000 \
127
+ checkpoint.pt
128
+ ```
129
+
130
+ ## Checkpoint Structure
131
+
132
+ DCP saves sharded checkpoints that can be resharded for different parallelism configurations:
133
+
134
+ ```
135
+ checkpoint/
136
+ ├── step-500/
137
+ │ ├── .metadata
138
+ │ ├── __0_0.distcp
139
+ │ ├── __0_1.distcp
140
+ │ └── ...
141
+ └── step-1000/
142
+ └── ...
143
+ ```
144
+
145
+ ## Resume Training
146
+
147
+ Training auto-resumes from the latest checkpoint in the configured folder. To resume from a specific step:
148
+
149
+ ```toml
150
+ [checkpoint]
151
+ load_step = 500 # Resume from step 500
152
+ ```
153
+
154
+ ## Interoperability with TorchTune
155
+
156
+ Checkpoints saved with `last_save_model_only = true` can be loaded directly into [torchtune](https://github.com/pytorch/torchtune) for fine-tuning.
157
+
158
+ ## Full Configuration Example
159
+
160
+ ```toml
161
+ [checkpoint]
162
+ enable = true
163
+ folder = "checkpoint"
164
+ interval = 500
165
+ load_step = -1 # -1 = latest, or specify step number
166
+ last_save_model_only = true
167
+ export_dtype = "bfloat16"
168
+ async_mode = "async"
169
+ exclude_from_loading = []
170
+ last_save_in_hf = false
171
+ initial_load_in_hf = false
172
+ create_seed_checkpoint = false
173
+ ```
174
+
175
+ ## Best Practices
176
+
177
+ 1. **Large models**: Use `async_mode = "async"` to overlap checkpoint saves with training
178
+ 2. **Fine-tuning export**: Enable `last_save_model_only` and `export_dtype = "bfloat16"` for smaller files
179
+ 3. **Pipeline parallelism**: Always create seed checkpoint first
180
+ 4. **Debugging**: Save frequent checkpoints during development, reduce for production
181
+ 5. **HF interop**: Use conversion scripts for offline conversion, direct save/load for training workflows