@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,611 @@
1
+ # Lambda Labs Advanced Usage Guide
2
+
3
+ ## Multi-Node Distributed Training
4
+
5
+ ### PyTorch DDP across nodes
6
+
7
+ ```python
8
+ # train_multi_node.py
9
+ import os
10
+ import torch
11
+ import torch.distributed as dist
12
+ from torch.nn.parallel import DistributedDataParallel as DDP
13
+
14
+ def setup_distributed():
15
+ # Environment variables set by launcher
16
+ rank = int(os.environ["RANK"])
17
+ world_size = int(os.environ["WORLD_SIZE"])
18
+ local_rank = int(os.environ["LOCAL_RANK"])
19
+
20
+ dist.init_process_group(
21
+ backend="nccl",
22
+ rank=rank,
23
+ world_size=world_size
24
+ )
25
+
26
+ torch.cuda.set_device(local_rank)
27
+ return rank, world_size, local_rank
28
+
29
+ def main():
30
+ rank, world_size, local_rank = setup_distributed()
31
+
32
+ model = MyModel().cuda(local_rank)
33
+ model = DDP(model, device_ids=[local_rank])
34
+
35
+ # Training loop with synchronized gradients
36
+ for epoch in range(num_epochs):
37
+ train_one_epoch(model, dataloader)
38
+
39
+ # Save checkpoint on rank 0 only
40
+ if rank == 0:
41
+ torch.save(model.module.state_dict(), f"checkpoint_{epoch}.pt")
42
+
43
+ dist.destroy_process_group()
44
+
45
+ if __name__ == "__main__":
46
+ main()
47
+ ```
48
+
49
+ ### Launch on multiple instances
50
+
51
+ ```bash
52
+ # On Node 0 (master)
53
+ export MASTER_ADDR=<NODE0_PRIVATE_IP>
54
+ export MASTER_PORT=29500
55
+
56
+ torchrun \
57
+ --nnodes=2 \
58
+ --nproc_per_node=8 \
59
+ --node_rank=0 \
60
+ --master_addr=$MASTER_ADDR \
61
+ --master_port=$MASTER_PORT \
62
+ train_multi_node.py
63
+
64
+ # On Node 1
65
+ export MASTER_ADDR=<NODE0_PRIVATE_IP>
66
+ export MASTER_PORT=29500
67
+
68
+ torchrun \
69
+ --nnodes=2 \
70
+ --nproc_per_node=8 \
71
+ --node_rank=1 \
72
+ --master_addr=$MASTER_ADDR \
73
+ --master_port=$MASTER_PORT \
74
+ train_multi_node.py
75
+ ```
76
+
77
+ ### FSDP for large models
78
+
79
+ ```python
80
+ import functools
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision
81
+ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
82
+ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
83
+
84
+ # Wrap policy for transformer models
85
+ auto_wrap_policy = functools.partial(
86
+ transformer_auto_wrap_policy,
87
+ transformer_layer_cls={LlamaDecoderLayer}
88
+ )
89
+
90
+ model = FSDP(
91
+ model,
92
+ auto_wrap_policy=auto_wrap_policy,
93
+ mixed_precision=MixedPrecision(
94
+ param_dtype=torch.bfloat16,
95
+ reduce_dtype=torch.bfloat16,
96
+ buffer_dtype=torch.bfloat16,
97
+ ),
98
+ device_id=local_rank,
99
+ )
100
+ ```
101
+
102
+ ### DeepSpeed ZeRO
103
+
104
+ ```json
105
+ # ds_config.json
106
+ {
107
+ "train_batch_size": 64,
108
+ "gradient_accumulation_steps": 4,
109
+ "fp16": {"enabled": true},
110
+ "zero_optimization": {
111
+ "stage": 3,
112
+ "offload_optimizer": {"device": "cpu"},
113
+ "offload_param": {"device": "cpu"}
114
+ }
115
+ }
116
+ ```
117
+
118
+ ```bash
119
+ # Launch with DeepSpeed
120
+ deepspeed --num_nodes=2 \
121
+ --num_gpus=8 \
122
+ --hostfile=hostfile.txt \
123
+ train.py --deepspeed ds_config.json
124
+ ```
125
+
126
+ ### Hostfile for multi-node
127
+
128
+ ```text
129
+ # hostfile.txt
130
+ node0_ip slots=8
131
+ node1_ip slots=8
132
+ ```
133
+
134
+ ## API Automation
135
+
136
+ ### Auto-launch training jobs
137
+
138
+ ```python
139
+ import os
140
+ import time
141
+ import lambda_cloud_client
142
+ from lambda_cloud_client.models import LaunchInstanceRequest
143
+
144
+ class LambdaJobManager:
145
+ def __init__(self, api_key: str):
146
+ self.config = lambda_cloud_client.Configuration(
147
+ host="https://cloud.lambdalabs.com/api/v1",
148
+ access_token=api_key
149
+ )
150
+
151
+ def find_available_gpu(self, gpu_types: list[str], regions: list[str] = None):
152
+ """Find first available GPU type across regions."""
153
+ with lambda_cloud_client.ApiClient(self.config) as client:
154
+ api = lambda_cloud_client.DefaultApi(client)
155
+ types = api.instance_types()
156
+
157
+ for gpu_type in gpu_types:
158
+ if gpu_type in types.data:
159
+ info = types.data[gpu_type]
160
+ for region in info.regions_with_capacity_available:
161
+ if regions is None or region.name in regions:
162
+ return gpu_type, region.name
163
+
164
+ return None, None
165
+
166
+ def launch_and_wait(self, instance_type: str, region: str,
167
+ ssh_key: str, filesystem: str = None,
168
+ timeout: int = 900) -> dict:
169
+ """Launch instance and wait for it to be ready."""
170
+ with lambda_cloud_client.ApiClient(self.config) as client:
171
+ api = lambda_cloud_client.DefaultApi(client)
172
+
173
+ request = LaunchInstanceRequest(
174
+ region_name=region,
175
+ instance_type_name=instance_type,
176
+ ssh_key_names=[ssh_key],
177
+ file_system_names=[filesystem] if filesystem else [],
178
+ )
179
+
180
+ response = api.launch_instance(request)
181
+ instance_id = response.data.instance_ids[0]
182
+
183
+ # Poll until ready
184
+ start = time.time()
185
+ while time.time() - start < timeout:
186
+ instance = api.get_instance(instance_id)
187
+ if instance.data.status == "active":
188
+ return {
189
+ "id": instance_id,
190
+ "ip": instance.data.ip,
191
+ "status": "active"
192
+ }
193
+ time.sleep(30)
194
+
195
+ raise TimeoutError(f"Instance {instance_id} not ready after {timeout}s")
196
+
197
+ def terminate(self, instance_ids: list[str]):
198
+ """Terminate instances."""
199
+ from lambda_cloud_client.models import TerminateInstanceRequest
200
+
201
+ with lambda_cloud_client.ApiClient(self.config) as client:
202
+ api = lambda_cloud_client.DefaultApi(client)
203
+ request = TerminateInstanceRequest(instance_ids=instance_ids)
204
+ api.terminate_instance(request)
205
+
206
+
207
+ # Usage
208
+ manager = LambdaJobManager(os.environ["LAMBDA_API_KEY"])
209
+
210
+ # Find available H100 or A100
211
+ gpu_type, region = manager.find_available_gpu(
212
+ ["gpu_8x_h100_sxm5", "gpu_8x_a100_80gb_sxm4"],
213
+ regions=["us-west-1", "us-east-1"]
214
+ )
215
+
216
+ if gpu_type:
217
+ instance = manager.launch_and_wait(
218
+ gpu_type, region,
219
+ ssh_key="my-key",
220
+ filesystem="training-data"
221
+ )
222
+ print(f"Ready: ssh ubuntu@{instance['ip']}")
223
+ ```
224
+
225
+ ### Batch job submission
226
+
227
+ ```python
228
+ import subprocess
229
+ import paramiko
230
+
231
def run_remote_job(ip: str, ssh_key_path: str, commands: list[str]):
    """Execute shell commands on a remote instance over SSH.

    Commands are run one after another, each through its own
    ``exec_command`` call; stdout is printed, followed by stderr (prefixed
    with ``Error: ``) when the command wrote anything to it.

    Args:
        ip: Address of the instance; the login user is ``ubuntu``.
        ssh_key_path: Path to the private key file; a leading ``~`` is
            expanded to the local home directory.
        commands: Shell command strings to run in order.
    """
    import os

    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    try:
        client.connect(
            ip,
            username="ubuntu",
            key_filename=os.path.expanduser(ssh_key_path),
        )

        for cmd in commands:
            _stdin, stdout, stderr = client.exec_command(cmd)
            print(stdout.read().decode())
            # Read stderr exactly once: a second read() on the drained
            # stream returns nothing, which would hide the error text.
            err_output = stderr.read()
            if err_output:
                print(f"Error: {err_output.decode()}")
    finally:
        # Close the connection even when connect() or a command raises.
        client.close()
244
+
245
# Submit training job.
# Every list entry is executed in a separate SSH exec session, so a standalone
# "cd" would not change the directory for the entries after it. Chain the
# steps into one command; only the training process itself is backgrounded.
commands = [
    "cd /lambda/nfs/storage/project && "
    "git pull && "
    "pip install -r requirements.txt && "
    "(nohup torchrun --nproc_per_node=8 train.py > train.log 2>&1 &)"
]

run_remote_job(instance["ip"], "~/.ssh/lambda_key", commands)
254
+ ```
255
+
256
+ ### Monitor training progress
257
+
258
+ ```python
259
def monitor_job(ip: str, ssh_key_path: str, log_file: str = "train.log"):
    """Stream a log file from a remote instance to stdout.

    Runs ``tail -f`` on the instance and prints each line (stripped of
    surrounding whitespace) until the stream ends or the user presses
    Ctrl-C.

    Args:
        ip: Address of the instance; the login user is ``ubuntu``.
        ssh_key_path: Path to the private key file; a leading ``~`` is
            expanded to the local home directory.
        log_file: Path of the log file on the remote side, inserted into the
            ``tail`` command as-is.
    """
    import os

    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    try:
        # connect() lives inside the try so the client is closed on a failed
        # connection as well.
        client.connect(
            ip,
            username="ubuntu",
            key_filename=os.path.expanduser(ssh_key_path),
        )

        # Tail log file
        _stdin, stdout, _stderr = client.exec_command(f"tail -f {log_file}")

        for line in stdout:
            print(line.strip())
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop following the log.
        pass
    finally:
        client.close()
277
+ ```
278
+
279
+ ## 1-Click Cluster Workflows
280
+
281
+ ### Slurm job submission
282
+
283
+ ```bash
284
#!/bin/bash
#SBATCH --job-name=llm-training
#SBATCH --nodes=4
# One task per node: torchrun is the per-node launcher and spawns one worker
# per GPU itself (--nproc_per_node below). Eight tasks per node would start
# eight competing torchrun launchers on every node.
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --time=24:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err

# Set up distributed environment: the first host of the allocation serves as
# the rendezvous endpoint for all nodes.
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=29500

# Launch training
srun torchrun \
    --nnodes="$SLURM_NNODES" \
    --nproc_per_node="$SLURM_GPUS_PER_NODE" \
    --rdzv_backend=c10d \
    --rdzv_endpoint="$MASTER_ADDR:$MASTER_PORT" \
    train.py \
    --config config.yaml
305
+ ```
306
+
307
+ ### Interactive cluster session
308
+
309
+ ```bash
310
+ # Request interactive session
311
+ srun --nodes=1 --ntasks=1 --gpus=8 --time=4:00:00 --pty bash
312
+
313
+ # Now on compute node with 8 GPUs
314
+ nvidia-smi
315
+ python train.py
316
+ ```
317
+
318
+ ### Monitoring cluster jobs
319
+
320
+ ```bash
321
+ # View job queue
322
+ squeue
323
+
324
+ # View job details
325
+ scontrol show job <JOB_ID>
326
+
327
+ # Cancel job
328
+ scancel <JOB_ID>
329
+
330
+ # View node status
331
+ sinfo
332
+
333
+ # View GPU usage across cluster
334
+ srun --nodes=4 nvidia-smi --query-gpu=name,utilization.gpu --format=csv
335
+ ```
336
+
337
+ ## Advanced Filesystem Usage
338
+
339
+ ### Data staging workflow
340
+
341
+ ```bash
342
+ # Stage data from S3 to filesystem (one-time)
343
+ aws s3 sync s3://my-bucket/dataset /lambda/nfs/storage/datasets/
344
+
345
+ # Or use rclone
346
+ rclone sync s3:my-bucket/dataset /lambda/nfs/storage/datasets/
347
+ ```
348
+
349
+ ### Shared filesystem across instances
350
+
351
+ ```python
352
+ # Instance 1: Write checkpoints
353
+ checkpoint_path = "/lambda/nfs/shared/checkpoints/model_step_1000.pt"
354
+ torch.save(model.state_dict(), checkpoint_path)
355
+
356
+ # Instance 2: Read checkpoints
357
+ model.load_state_dict(torch.load(checkpoint_path))
358
+ ```
359
+
360
+ ### Filesystem best practices
361
+
362
+ ```bash
363
+ # Organize for ML workflows
364
+ /lambda/nfs/storage/
365
+ ├── datasets/
366
+ │ ├── raw/ # Original data
367
+ │ └── processed/ # Preprocessed data
368
+ ├── models/
369
+ │ ├── pretrained/ # Base models
370
+ │ └── fine-tuned/ # Your trained models
371
+ ├── checkpoints/
372
+ │ └── experiment_1/ # Per-experiment checkpoints
373
+ ├── logs/
374
+ │ └── tensorboard/ # Training logs
375
+ └── outputs/
376
+ └── inference/ # Inference results
377
+ ```
378
+
379
+ ## Environment Management
380
+
381
+ ### Custom Python environments
382
+
383
+ ```bash
384
+ # Don't modify system Python, create venv
385
+ python -m venv ~/myenv
386
+ source ~/myenv/bin/activate
387
+
388
+ # Install packages
389
+ pip install torch transformers accelerate
390
+
391
+ # Save to filesystem for reuse
392
+ cp -r ~/myenv /lambda/nfs/storage/envs/myenv
393
+ ```
394
+
395
+ ### Conda environments
396
+
397
+ ```bash
398
+ # Install miniconda (if not present)
399
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
400
+ bash Miniconda3-latest-Linux-x86_64.sh -b -p ~/miniconda3
401
+
402
+ # Create environment
403
+ ~/miniconda3/bin/conda create -n ml python=3.10 pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y
404
+
405
+ # Activate
406
+ source ~/miniconda3/bin/activate ml
407
+ ```
408
+
409
+ ### Docker containers
410
+
411
+ ```bash
412
+ # Pull and run NVIDIA container
413
+ docker run --gpus all -it --rm \
414
+ -v /lambda/nfs/storage:/data \
415
+ nvcr.io/nvidia/pytorch:24.01-py3
416
+
417
+ # Run training in container
418
+ docker run --gpus all -d \
419
+ -v /lambda/nfs/storage:/data \
420
+ -v $(pwd):/workspace \
421
+ nvcr.io/nvidia/pytorch:24.01-py3 \
422
+ python /workspace/train.py
423
+ ```
424
+
425
+ ## Monitoring and Observability
426
+
427
+ ### GPU monitoring
428
+
429
+ ```bash
430
+ # Real-time GPU stats
431
+ watch -n 1 nvidia-smi
432
+
433
+ # GPU utilization over time
434
+ nvidia-smi dmon -s u -d 1
435
+
436
+ # Detailed GPU info
437
+ nvidia-smi -q
438
+ ```
439
+
440
+ ### System monitoring
441
+
442
+ ```bash
443
+ # CPU and memory
444
+ htop
445
+
446
+ # Disk I/O
447
+ iostat -x 1
448
+
449
+ # Network
450
+ iftop
451
+
452
+ # All resources
453
+ glances
454
+ ```
455
+
456
+ ### TensorBoard integration
457
+
458
+ ```bash
459
+ # Start TensorBoard
460
+ tensorboard --logdir /lambda/nfs/storage/logs --port 6006 --bind_all
461
+
462
+ # SSH tunnel from local machine
463
+ ssh -L 6006:localhost:6006 ubuntu@<IP>
464
+
465
+ # Access at http://localhost:6006
466
+ ```
467
+
468
+ ### Weights & Biases integration
469
+
470
+ ```python
471
+ import wandb
472
+
473
+ # Initialize with API key
474
+ wandb.login(key=os.environ["WANDB_API_KEY"])
475
+
476
+ # Start run
477
+ wandb.init(
478
+ project="lambda-training",
479
+ config={"learning_rate": 1e-4, "epochs": 100}
480
+ )
481
+
482
+ # Log metrics
483
+ wandb.log({"loss": loss, "accuracy": acc})
484
+
485
+ # Save artifacts to filesystem + W&B
486
+ wandb.save("/lambda/nfs/storage/checkpoints/best_model.pt")
487
+ ```
488
+
489
+ ## Cost Optimization Strategies
490
+
491
+ ### Checkpointing for interruption recovery
492
+
493
+ ```python
494
+ import os
495
+
496
def save_checkpoint(model, optimizer, epoch, loss, path):
    """Write a training checkpoint to ``path``.

    The checkpoint is a dict with keys ``'epoch'``, ``'model_state_dict'``,
    ``'optimizer_state_dict'`` and ``'loss'``.

    The data is first written to ``<path>.tmp`` and then moved over ``path``,
    so an interruption in the middle of the write leaves the previous
    checkpoint untouched instead of a truncated file.
    """
    tmp_path = f"{path}.tmp"
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }, tmp_path)
    # Replace the old checkpoint only once the new one is fully written.
    os.replace(tmp_path, path)
503
+
504
def load_checkpoint(path, model, optimizer):
    """Restore ``model`` and ``optimizer`` from the checkpoint at ``path``.

    Returns:
        ``(epoch, loss)`` as stored in the checkpoint, or
        ``(0, float('inf'))`` when no file exists at ``path``.
    """
    # Nothing saved yet: report a fresh start.
    if not os.path.exists(path):
        return 0, float('inf')

    state = torch.load(path)
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    return state['epoch'], state['loss']
511
+
512
+ # Save every N steps to filesystem
513
+ checkpoint_path = "/lambda/nfs/storage/checkpoints/latest.pt"
514
+ if step % 1000 == 0:
515
+ save_checkpoint(model, optimizer, epoch, loss, checkpoint_path)
516
+ ```
517
+
518
+ ### Instance selection by workload
519
+
520
+ ```python
521
def recommend_instance(model_params: int, batch_size: int, task: str) -> str:
    """Pick a Lambda instance type name for a workload.

    Args:
        model_params: Number of model parameters.
        batch_size: Accepted for interface compatibility; not used in the
            decision.
        task: ``"inference"``, ``"fine-tuning"`` or ``"pretraining"``; any
            other value gets the default instance type.

    Returns:
        The instance type name as a string.
    """
    # task -> ((exclusive parameter-count limit, instance), ...), instance
    # used when no limit applies.
    sizing = {
        "inference": (
            (
                (7e9, "gpu_1x_a10"),       # $0.75/hr
                (13e9, "gpu_1x_a6000"),    # $0.80/hr
            ),
            "gpu_1x_h100_pcie",            # $2.49/hr
        ),
        "fine-tuning": (
            (
                (7e9, "gpu_1x_a100"),      # $1.29/hr
                (13e9, "gpu_4x_a100"),     # $5.16/hr
            ),
            "gpu_8x_h100_sxm5",            # $23.92/hr
        ),
        # Maximum performance regardless of model size
        "pretraining": ((), "gpu_8x_h100_sxm5"),
    }

    if task not in sizing:
        return "gpu_1x_a100"  # Default

    tiers, largest = sizing[task]
    for limit, instance_name in tiers:
        if model_params < limit:
            return instance_name
    return largest
544
+ ```
545
+
546
+ ### Auto-terminate idle instances
547
+
548
+ ```python
549
+ import time
550
+ from datetime import datetime, timedelta
551
+
552
def auto_terminate_idle(api_key: str, idle_threshold_hours: float = 2):
    """Terminate instances that were launched more than the threshold ago.

    NOTE: no activity signal is consulted. Time since launch is used as a
    stand-in for idleness, so a busy long-running instance is terminated
    too; real idle tracking has to be added separately.

    Args:
        api_key: Lambda API key used to build the ``LambdaJobManager``.
        idle_threshold_hours: Age in hours after which an instance is
            terminated.
    """
    manager = LambdaJobManager(api_key)
    threshold = timedelta(hours=idle_threshold_hours)

    with lambda_cloud_client.ApiClient(manager.config) as client:
        api = lambda_cloud_client.DefaultApi(client)
        instances = api.list_instances()

    # The listing is complete here; terminate() opens its own client, so the
    # loop does not need the listing client to stay open.
    for instance in instances.data:
        # assumes launched_at is a datetime object — TODO confirm against the
        # client model
        launch_time = instance.launched_at
        # Use "now" in the same timezone flavour as launch_time: subtracting
        # a timezone-aware value from a naive datetime.now() raises TypeError.
        now = datetime.now(tz=launch_time.tzinfo)
        if now - launch_time > threshold:
            print(f"Terminating idle instance: {instance.id}")
            manager.terminate([instance.id])
567
+ ```
568
+
569
+ ## Security Best Practices
570
+
571
+ ### SSH key rotation
572
+
573
+ ```bash
574
+ # Generate new key pair
575
+ ssh-keygen -t ed25519 -f ~/.ssh/lambda_key_new -C "lambda-$(date +%Y%m)"
576
+
577
+ # Add new key via Lambda console or API
578
+ # Update authorized_keys on running instances
579
+ ssh ubuntu@<IP> "echo '$(cat ~/.ssh/lambda_key_new.pub)' >> ~/.ssh/authorized_keys"
580
+
581
+ # Test new key
582
+ ssh -i ~/.ssh/lambda_key_new ubuntu@<IP>
583
+
584
+ # Remove old key from Lambda console
585
+ ```
586
+
587
+ ### Firewall configuration
588
+
589
+ ```bash
590
+ # Lambda console: Only open necessary ports
591
+ # Recommended:
592
+ # - 22 (SSH) - Always needed
593
+ # - 6006 (TensorBoard) - If using
594
+ # - 8888 (Jupyter) - If using
595
+ # - 29500 (PyTorch distributed) - For multi-node only
596
+ ```
597
+
598
+ ### Secrets management
599
+
600
+ ```bash
601
+ # Don't hardcode API keys in code
602
+ # Use environment variables
603
+ export HF_TOKEN="hf_..."
604
+ export WANDB_API_KEY="..."
605
+
606
+ # Or use .env file (add to .gitignore)
607
+ source .env
608
+
609
+ # On instance, store in ~/.bashrc
610
+ echo 'export HF_TOKEN="..."' >> ~/.bashrc
611
+ ```