@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373)
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,847 @@
1
+ # Comprehensive Hyperparameter Sweeps Guide
2
+
3
+ Complete guide to hyperparameter optimization with W&B Sweeps.
4
+
5
+ ## Table of Contents
6
+ - Sweep Configuration
7
+ - Search Strategies
8
+ - Parameter Distributions
9
+ - Early Termination
10
+ - Parallel Execution
11
+ - Advanced Patterns
12
+ - Real-World Examples
13
+
14
+ ## Sweep Configuration
15
+
16
+ ### Basic Sweep Config
17
+
18
+ ```python
19
+ sweep_config = {
20
+ 'method': 'bayes', # Search strategy
21
+ 'metric': {
22
+ 'name': 'val/accuracy',
23
+ 'goal': 'maximize' # or 'minimize'
24
+ },
25
+ 'parameters': {
26
+ 'learning_rate': {
27
+ 'distribution': 'log_uniform_values',
28
+ 'min': 1e-5,
29
+ 'max': 1e-1
30
+ },
31
+ 'batch_size': {
32
+ 'values': [16, 32, 64, 128]
33
+ }
34
+ }
35
+ }
36
+
37
+ # Initialize sweep
38
+ sweep_id = wandb.sweep(sweep_config, project="my-project")
39
+ ```
40
+
41
+ ### Complete Config Example
42
+
43
+ ```python
44
+ sweep_config = {
45
+ # Required: Search method
46
+ 'method': 'bayes',
47
+
48
+ # Required: Optimization metric
49
+ 'metric': {
50
+ 'name': 'val/f1_score',
51
+ 'goal': 'maximize'
52
+ },
53
+
54
+ # Required: Parameters to search
55
+ 'parameters': {
56
+ # Continuous parameter
57
+ 'learning_rate': {
58
+ 'distribution': 'log_uniform_values',
59
+ 'min': 1e-5,
60
+ 'max': 1e-1
61
+ },
62
+
63
+ # Discrete values
64
+ 'batch_size': {
65
+ 'values': [16, 32, 64, 128]
66
+ },
67
+
68
+ # Categorical
69
+ 'optimizer': {
70
+ 'values': ['adam', 'sgd', 'rmsprop', 'adamw']
71
+ },
72
+
73
+ # Uniform distribution
74
+ 'dropout': {
75
+ 'distribution': 'uniform',
76
+ 'min': 0.1,
77
+ 'max': 0.5
78
+ },
79
+
80
+ # Integer range
81
+ 'num_layers': {
82
+ 'distribution': 'int_uniform',
83
+ 'min': 2,
84
+ 'max': 10
85
+ },
86
+
87
+ # Fixed value (constant across runs)
88
+ 'epochs': {
89
+ 'value': 50
90
+ }
91
+ },
92
+
93
+ # Optional: Early termination
94
+ 'early_terminate': {
95
+ 'type': 'hyperband',
96
+ 'min_iter': 5,
97
+ 's': 2,
98
+ 'eta': 3,
99
+ 'max_iter': 27
100
+ }
101
+ }
102
+ ```
103
+
104
+ ## Search Strategies
105
+
106
+ ### 1. Grid Search
107
+
108
+ Exhaustively search all combinations.
109
+
110
+ ```python
111
+ sweep_config = {
112
+ 'method': 'grid',
113
+ 'parameters': {
114
+ 'learning_rate': {
115
+ 'values': [0.001, 0.01, 0.1]
116
+ },
117
+ 'batch_size': {
118
+ 'values': [16, 32, 64]
119
+ },
120
+ 'optimizer': {
121
+ 'values': ['adam', 'sgd']
122
+ }
123
+ }
124
+ }
125
+
126
+ # Total runs: 3 × 3 × 2 = 18 runs
127
+ ```
128
+
129
+ **Pros:**
130
+ - Comprehensive search
131
+ - Reproducible results
132
+ - No randomness
133
+
134
+ **Cons:**
135
+ - Exponential growth with parameters
136
+ - Inefficient for continuous parameters
137
+ - Not scalable beyond 3-4 parameters
138
+
139
+ **When to use:**
140
+ - Few parameters (< 4)
141
+ - All discrete values
142
+ - Need complete coverage
143
+
144
+ ### 2. Random Search
145
+
146
+ Randomly sample parameter combinations.
147
+
148
+ ```python
149
+ sweep_config = {
150
+ 'method': 'random',
151
+ 'parameters': {
152
+ 'learning_rate': {
153
+ 'distribution': 'log_uniform_values',
154
+ 'min': 1e-5,
155
+ 'max': 1e-1
156
+ },
157
+ 'batch_size': {
158
+ 'values': [16, 32, 64, 128, 256]
159
+ },
160
+ 'dropout': {
161
+ 'distribution': 'uniform',
162
+ 'min': 0.0,
163
+ 'max': 0.5
164
+ },
165
+ 'num_layers': {
166
+ 'distribution': 'int_uniform',
167
+ 'min': 2,
168
+ 'max': 8
169
+ }
170
+ }
171
+ }
172
+
173
+ # Run 100 random trials
174
+ wandb.agent(sweep_id, function=train, count=100)
175
+ ```
176
+
177
+ **Pros:**
178
+ - Scales to many parameters
179
+ - Can run indefinitely
180
+ - Often finds good solutions quickly
181
+
182
+ **Cons:**
183
+ - No learning from previous runs
184
+ - May miss optimal region
185
+ - Results vary with random seed
186
+
187
+ **When to use:**
188
+ - Many parameters (4 or more)
189
+ - Quick exploration
190
+ - Limited budget
191
+
192
+ ### 3. Bayesian Optimization (Recommended)
193
+
194
+ Learn from previous trials to sample promising regions.
195
+
196
+ ```python
197
+ sweep_config = {
198
+ 'method': 'bayes',
199
+ 'metric': {
200
+ 'name': 'val/loss',
201
+ 'goal': 'minimize'
202
+ },
203
+ 'parameters': {
204
+ 'learning_rate': {
205
+ 'distribution': 'log_uniform_values',
206
+ 'min': 1e-5,
207
+ 'max': 1e-1
208
+ },
209
+ 'weight_decay': {
210
+ 'distribution': 'log_uniform_values',
211
+ 'min': 1e-6,
212
+ 'max': 1e-2
213
+ },
214
+ 'dropout': {
215
+ 'distribution': 'uniform',
216
+ 'min': 0.1,
217
+ 'max': 0.5
218
+ },
219
+ 'num_layers': {
220
+ 'values': [2, 3, 4, 5, 6]
221
+ }
222
+ }
223
+ }
224
+ ```
225
+
226
+ **Pros:**
227
+ - Most sample-efficient
228
+ - Learns from past trials
229
+ - Focuses on promising regions
230
+
231
+ **Cons:**
232
+ - Initial random exploration phase
233
+ - May get stuck in local optima
234
+ - Slower per iteration
235
+
236
+ **When to use:**
237
+ - Expensive training runs
238
+ - Need best performance
239
+ - Limited compute budget
240
+
241
+ ## Parameter Distributions
242
+
243
+ ### Continuous Distributions
244
+
245
+ ```python
246
+ # Log-uniform over literal min/max values: Good for learning rates, regularization
247
+ 'learning_rate': {
248
+ 'distribution': 'log_uniform_values',
249
+ 'min': 1e-6,
250
+ 'max': 1e-1
251
+ }
252
+
253
+ # Uniform: Good for dropout, momentum
254
+ 'dropout': {
255
+ 'distribution': 'uniform',
256
+ 'min': 0.0,
257
+ 'max': 0.5
258
+ }
259
+
260
+ # Normal distribution
261
+ 'parameter': {
262
+ 'distribution': 'normal',
263
+ 'mu': 0.5,
264
+ 'sigma': 0.1
265
+ }
266
+
267
+ # Log-normal distribution
268
+ 'parameter': {
269
+ 'distribution': 'log_normal',
270
+ 'mu': 0.0,
271
+ 'sigma': 1.0
272
+ }
273
+ ```
274
+
275
+ ### Discrete Distributions
276
+
277
+ ```python
278
+ # Fixed values
279
+ 'batch_size': {
280
+ 'values': [16, 32, 64, 128, 256]
281
+ }
282
+
283
+ # Integer uniform
284
+ 'num_layers': {
285
+ 'distribution': 'int_uniform',
286
+ 'min': 2,
287
+ 'max': 10
288
+ }
289
+
290
+ # Quantized uniform (step size)
291
+ 'layer_size': {
292
+ 'distribution': 'q_uniform',
293
+ 'min': 32,
294
+ 'max': 512,
295
+ 'q': 32 # Step by 32: 32, 64, 96, 128...
296
+ }
297
+
298
+ # Quantized log-uniform
299
+ 'hidden_size': {
300
+ 'distribution': 'q_log_uniform_values',
301
+ 'min': 32,
302
+ 'max': 1024,
303
+ 'q': 32
304
+ }
305
+ ```
306
+
307
+ ### Categorical Parameters
308
+
309
+ ```python
310
+ # Optimizers
311
+ 'optimizer': {
312
+ 'values': ['adam', 'sgd', 'rmsprop', 'adamw']
313
+ }
314
+
315
+ # Model architectures
316
+ 'model': {
317
+ 'values': ['resnet18', 'resnet34', 'resnet50', 'efficientnet_b0']
318
+ }
319
+
320
+ # Activation functions
321
+ 'activation': {
322
+ 'values': ['relu', 'gelu', 'silu', 'leaky_relu']
323
+ }
324
+ ```
325
+
326
+ ## Early Termination
327
+
328
+ Stop underperforming runs early to save compute.
329
+
330
+ ### Hyperband
331
+
332
+ ```python
333
+ sweep_config = {
334
+ 'method': 'bayes',
335
+ 'metric': {'name': 'val/accuracy', 'goal': 'maximize'},
336
+ 'parameters': {...},
337
+
338
+ # Hyperband early termination
339
+ 'early_terminate': {
340
+ 'type': 'hyperband',
341
+ 'min_iter': 3, # Minimum iterations before termination
342
+ 's': 2, # Bracket count
343
+ 'eta': 3, # Downsampling rate
344
+ 'max_iter': 27 # Maximum iterations
345
+ }
346
+ }
347
+ ```
348
+
349
+ **How it works:**
350
+ - Runs trials in brackets
351
+ - Keeps top 1/eta performers each round
352
+ - Eliminates bottom performers early
353
+
354
+ ### Custom Termination
355
+
356
+ ```python
357
+ def train():
358
+ run = wandb.init()
359
+
360
+ for epoch in range(MAX_EPOCHS):
361
+ loss = train_epoch()
362
+ val_acc = validate()
363
+
364
+ wandb.log({'val/accuracy': val_acc, 'epoch': epoch})
365
+
366
+ # Custom early stopping
367
+ if epoch > 5 and val_acc < 0.5:
368
+ print("Early stop: Poor performance")
369
+ break
370
+
371
+ if epoch > 10 and val_acc > best_acc - 0.01:
372
+ print("Early stop: No improvement")
373
+ break
374
+ ```
375
+
376
+ ## Training Function
377
+
378
+ ### Basic Template
379
+
380
+ ```python
381
+ def train():
382
+ # Initialize W&B run
383
+ run = wandb.init()
384
+
385
+ # Get hyperparameters
386
+ config = wandb.config
387
+
388
+ # Build model with config
389
+ model = build_model(
390
+ hidden_size=config.hidden_size,
391
+ num_layers=config.num_layers,
392
+ dropout=config.dropout
393
+ )
394
+
395
+ # Create optimizer
396
+ optimizer = create_optimizer(
397
+ model.parameters(),
398
+ name=config.optimizer,
399
+ lr=config.learning_rate,
400
+ weight_decay=config.weight_decay
401
+ )
402
+
403
+ # Training loop
404
+ for epoch in range(config.epochs):
405
+ # Train
406
+ train_loss, train_acc = train_epoch(
407
+ model, optimizer, train_loader, config.batch_size
408
+ )
409
+
410
+ # Validate
411
+ val_loss, val_acc = validate(model, val_loader)
412
+
413
+ # Log metrics
414
+ wandb.log({
415
+ 'train/loss': train_loss,
416
+ 'train/accuracy': train_acc,
417
+ 'val/loss': val_loss,
418
+ 'val/accuracy': val_acc,
419
+ 'epoch': epoch
420
+ })
421
+
422
+ # Log final model
423
+ torch.save(model.state_dict(), 'model.pth')
424
+ wandb.save('model.pth')
425
+
426
+ # Finish run
427
+ wandb.finish()
428
+ ```
429
+
430
+ ### With PyTorch
431
+
432
+ ```python
433
+ import torch
434
+ import torch.nn as nn
435
+ from torch.utils.data import DataLoader
436
+ import wandb
437
+
438
+ def train():
439
+ run = wandb.init()
440
+ config = wandb.config
441
+
442
+ # Data
443
+ train_loader = DataLoader(
444
+ train_dataset,
445
+ batch_size=config.batch_size,
446
+ shuffle=True
447
+ )
448
+
449
+ # Model
450
+ model = ResNet(
451
+ num_classes=config.num_classes,
452
+ dropout=config.dropout
453
+ ).to(device)
454
+
455
+ # Optimizer
456
+ if config.optimizer == 'adam':
457
+ optimizer = torch.optim.Adam(
458
+ model.parameters(),
459
+ lr=config.learning_rate,
460
+ weight_decay=config.weight_decay
461
+ )
462
+ elif config.optimizer == 'sgd':
463
+ optimizer = torch.optim.SGD(
464
+ model.parameters(),
465
+ lr=config.learning_rate,
466
+ momentum=config.momentum,
467
+ weight_decay=config.weight_decay
468
+ )
469
+
470
+ # Scheduler
471
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
472
+ optimizer, T_max=config.epochs
473
+ )
474
+
475
+ # Training
476
+ for epoch in range(config.epochs):
477
+ model.train()
478
+ train_loss = 0.0
479
+
480
+ for data, target in train_loader:
481
+ data, target = data.to(device), target.to(device)
482
+
483
+ optimizer.zero_grad()
484
+ output = model(data)
485
+ loss = nn.CrossEntropyLoss()(output, target)
486
+ loss.backward()
487
+ optimizer.step()
488
+
489
+ train_loss += loss.item()
490
+
491
+ # Validation
492
+ model.eval()
493
+ val_loss, val_acc = validate(model, val_loader)
494
+
495
+ # Step scheduler
496
+ scheduler.step()
497
+
498
+ # Log
499
+ wandb.log({
500
+ 'train/loss': train_loss / len(train_loader),
501
+ 'val/loss': val_loss,
502
+ 'val/accuracy': val_acc,
503
+ 'learning_rate': scheduler.get_last_lr()[0],
504
+ 'epoch': epoch
505
+ })
506
+ ```
507
+
508
+ ## Parallel Execution
509
+
510
+ ### Multiple Agents
511
+
512
+ Run sweep agents in parallel to speed up search.
513
+
514
+ ```python
515
+ # Initialize sweep once
516
+ sweep_id = wandb.sweep(sweep_config, project="my-project")
517
+
518
+ # Run multiple agents in parallel
519
+ # Agent 1 (Terminal 1)
520
+ wandb.agent(sweep_id, function=train, count=20)
521
+
522
+ # Agent 2 (Terminal 2)
523
+ wandb.agent(sweep_id, function=train, count=20)
524
+
525
+ # Agent 3 (Terminal 3)
526
+ wandb.agent(sweep_id, function=train, count=20)
527
+
528
+ # Total: 60 runs across 3 agents
529
+ ```
530
+
531
+ ### Multi-GPU Execution
532
+
533
+ ```python
534
+ import os
535
+
536
+ def train():
537
+ # Get available GPU
538
+ gpu_id = os.environ.get('CUDA_VISIBLE_DEVICES', '0')
539
+
540
+ run = wandb.init()
541
+ config = wandb.config
542
+
543
+ # Train on the GPU assigned to this agent (CUDA_VISIBLE_DEVICES renumbers the visible GPUs starting from 0)
544
+ device = torch.device('cuda:0')
545
+ model = model.to(device)
546
+
547
+ # ... rest of training ...
548
+
549
+ # Run agents on different GPUs
550
+ # Terminal 1
551
+ # CUDA_VISIBLE_DEVICES=0 wandb agent sweep_id
552
+
553
+ # Terminal 2
554
+ # CUDA_VISIBLE_DEVICES=1 wandb agent sweep_id
555
+
556
+ # Terminal 3
557
+ # CUDA_VISIBLE_DEVICES=2 wandb agent sweep_id
558
+ ```
559
+
560
+ ## Advanced Patterns
561
+
562
+ ### Nested Parameters
563
+
564
+ ```python
565
+ sweep_config = {
566
+ 'method': 'bayes',
567
+ 'metric': {'name': 'val/accuracy', 'goal': 'maximize'},
568
+ 'parameters': {
569
+ 'model': {
570
+ 'parameters': {
571
+ 'type': {
572
+ 'values': ['resnet', 'efficientnet']
573
+ },
574
+ 'size': {
575
+ 'values': ['small', 'medium', 'large']
576
+ }
577
+ }
578
+ },
579
+ 'optimizer': {
580
+ 'parameters': {
581
+ 'type': {
582
+ 'values': ['adam', 'sgd']
583
+ },
584
+ 'lr': {
585
+ 'distribution': 'log_uniform_values',
586
+ 'min': 1e-5,
587
+ 'max': 1e-1
588
+ }
589
+ }
590
+ }
591
+ }
592
+ }
593
+
594
+ # Access nested config
595
+ def train():
596
+ run = wandb.init()
597
+ model_type = wandb.config.model['type']
598
+ model_size = wandb.config.model['size']
599
+ opt_type = wandb.config.optimizer['type']
600
+ lr = wandb.config.optimizer['lr']
601
+ ```
602
+
603
+ ### Conditional Parameters
604
+
605
+ ```python
606
+ sweep_config = {
607
+ 'method': 'bayes', 'metric': {'name': 'val/loss', 'goal': 'minimize'},  # Bayesian search requires a metric
608
+ 'parameters': {
609
+ 'optimizer': {
610
+ 'values': ['adam', 'sgd']
611
+ },
612
+ 'learning_rate': {
613
+ 'distribution': 'log_uniform_values',
614
+ 'min': 1e-5,
615
+ 'max': 1e-1
616
+ },
617
+ # Only used if optimizer == 'sgd'
618
+ 'momentum': {
619
+ 'distribution': 'uniform',
620
+ 'min': 0.5,
621
+ 'max': 0.99
622
+ }
623
+ }
624
+ }
625
+
626
+ def train():
627
+ run = wandb.init()
628
+ config = wandb.config
629
+
630
+ if config.optimizer == 'adam':
631
+ optimizer = torch.optim.Adam(
632
+ model.parameters(),
633
+ lr=config.learning_rate
634
+ )
635
+ elif config.optimizer == 'sgd':
636
+ optimizer = torch.optim.SGD(
637
+ model.parameters(),
638
+ lr=config.learning_rate,
639
+ momentum=config.momentum # Conditional parameter
640
+ )
641
+ ```
642
+
643
+ ## Real-World Examples
644
+
645
+ ### Image Classification
646
+
647
+ ```python
648
+ sweep_config = {
649
+ 'method': 'bayes',
650
+ 'metric': {
651
+ 'name': 'val/top1_accuracy',
652
+ 'goal': 'maximize'
653
+ },
654
+ 'parameters': {
655
+ # Model
656
+ 'architecture': {
657
+ 'values': ['resnet50', 'resnet101', 'efficientnet_b0', 'efficientnet_b3']
658
+ },
659
+ 'pretrained': {
660
+ 'values': [True, False]
661
+ },
662
+
663
+ # Training
664
+ 'learning_rate': {
665
+ 'distribution': 'log_uniform_values',
666
+ 'min': 1e-5,
667
+ 'max': 1e-2
668
+ },
669
+ 'batch_size': {
670
+ 'values': [16, 32, 64, 128]
671
+ },
672
+ 'optimizer': {
673
+ 'values': ['adam', 'sgd', 'adamw']
674
+ },
675
+ 'weight_decay': {
676
+ 'distribution': 'log_uniform_values',
677
+ 'min': 1e-6,
678
+ 'max': 1e-2
679
+ },
680
+
681
+ # Regularization
682
+ 'dropout': {
683
+ 'distribution': 'uniform',
684
+ 'min': 0.0,
685
+ 'max': 0.5
686
+ },
687
+ 'label_smoothing': {
688
+ 'distribution': 'uniform',
689
+ 'min': 0.0,
690
+ 'max': 0.2
691
+ },
692
+
693
+ # Data augmentation
694
+ 'mixup_alpha': {
695
+ 'distribution': 'uniform',
696
+ 'min': 0.0,
697
+ 'max': 1.0
698
+ },
699
+ 'cutmix_alpha': {
700
+ 'distribution': 'uniform',
701
+ 'min': 0.0,
702
+ 'max': 1.0
703
+ }
704
+ },
705
+ 'early_terminate': {
706
+ 'type': 'hyperband',
707
+ 'min_iter': 5
708
+ }
709
+ }
710
+ ```
711
+
712
+ ### NLP Fine-Tuning
713
+
714
+ ```python
715
+ sweep_config = {
716
+ 'method': 'bayes',
717
+ 'metric': {'name': 'eval/f1', 'goal': 'maximize'},
718
+ 'parameters': {
719
+ # Model
720
+ 'model_name': {
721
+ 'values': ['bert-base-uncased', 'roberta-base', 'distilbert-base-uncased']
722
+ },
723
+
724
+ # Training
725
+ 'learning_rate': {
726
+ 'distribution': 'log_uniform_values',
727
+ 'min': 1e-6,
728
+ 'max': 1e-4
729
+ },
730
+ 'per_device_train_batch_size': {
731
+ 'values': [8, 16, 32]
732
+ },
733
+ 'num_train_epochs': {
734
+ 'values': [3, 4, 5]
735
+ },
736
+ 'warmup_ratio': {
737
+ 'distribution': 'uniform',
738
+ 'min': 0.0,
739
+ 'max': 0.1
740
+ },
741
+ 'weight_decay': {
742
+ 'distribution': 'log_uniform_values',
743
+ 'min': 1e-4,
744
+ 'max': 1e-1
745
+ },
746
+
747
+ # Optimizer
748
+ 'adam_beta1': {
749
+ 'distribution': 'uniform',
750
+ 'min': 0.8,
751
+ 'max': 0.95
752
+ },
753
+ 'adam_beta2': {
754
+ 'distribution': 'uniform',
755
+ 'min': 0.95,
756
+ 'max': 0.999
757
+ }
758
+ }
759
+ }
760
+ ```
761
+
762
+ ## Best Practices
763
+
764
+ ### 1. Start Small
765
+
766
+ ```python
767
+ # Initial exploration: Random search, 20 runs
768
+ sweep_config_v1 = {
769
+ 'method': 'random',
770
+ 'parameters': {...}
771
+ }
772
+ wandb.agent(sweep_id_v1, train, count=20)
773
+
774
+ # Refined search: Bayes, narrow ranges
775
+ sweep_config_v2 = {
776
+ 'method': 'bayes',
777
+ 'parameters': {
778
+ 'learning_rate': {
779
+ 'min': 5e-5, # Narrowed from 1e-6 to 1e-4
780
+ 'max': 1e-4
781
+ }
782
+ }
783
+ }
784
+ ```
785
+
786
+ ### 2. Use Log Scales
787
+
788
+ ```python
789
+ # ✅ Good: Log scale for learning rate
790
+ 'learning_rate': {
791
+ 'distribution': 'log_uniform_values',
792
+ 'min': 1e-6,
793
+ 'max': 1e-2
794
+ }
795
+
796
+ # ❌ Bad: Linear scale
797
+ 'learning_rate': {
798
+ 'distribution': 'uniform',
799
+ 'min': 0.000001,
800
+ 'max': 0.01
801
+ }
802
+ ```
803
+
804
+ ### 3. Set Reasonable Ranges
805
+
806
+ ```python
807
+ # Base ranges on prior knowledge
808
+ 'learning_rate': {'min': 1e-5, 'max': 1e-3}, # Typical for Adam
809
+ 'batch_size': {'values': [16, 32, 64]}, # GPU memory limits
810
+ 'dropout': {'min': 0.1, 'max': 0.5} # Too high hurts training
811
+ ```
812
+
813
+ ### 4. Monitor Resource Usage
814
+
815
+ ```python
816
+ def train():
817
+ run = wandb.init()
818
+
819
+ # Log system metrics
820
+ wandb.log({
821
+ 'system/gpu_memory_allocated': torch.cuda.memory_allocated(),
822
+ 'system/gpu_memory_reserved': torch.cuda.memory_reserved()
823
+ })
824
+ ```
825
+
826
+ ### 5. Save Best Models
827
+
828
+ ```python
829
+ def train():
830
+ run = wandb.init()
831
+ best_acc = 0.0
832
+
833
+ for epoch in range(config.epochs):
834
+ val_acc = validate(model)
835
+
836
+ if val_acc > best_acc:
837
+ best_acc = val_acc
838
+ # Save best checkpoint
839
+ torch.save(model.state_dict(), 'best_model.pth')
840
+ wandb.save('best_model.pth')
841
+ ```
842
+
843
+ ## Resources
844
+
845
+ - **Sweeps Documentation**: https://docs.wandb.ai/guides/sweeps
846
+ - **Configuration Reference**: https://docs.wandb.ai/guides/sweeps/configuration
847
+ - **Examples**: https://github.com/wandb/examples/tree/master/examples/wandb-sweeps