@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,447 @@
1
+ # Troubleshooting Guide
2
+
3
+ ## Contents
4
+ - Out of memory (OOM) errors
5
+ - Performance issues
6
+ - Model loading errors
7
+ - Network and connection issues
8
+ - Quantization problems
9
+ - Distributed serving issues
10
+ - Debugging tools and commands
11
+
12
+ ## Out of memory (OOM) errors
13
+
14
+ ### Symptom: `torch.cuda.OutOfMemoryError` during model loading
15
+
16
+ **Cause**: Model + KV cache exceeds available VRAM
17
+
18
+ **Solutions (try in order)**:
19
+
20
+ 1. **Reduce GPU memory utilization**:
21
+ ```bash
22
+ vllm serve MODEL --gpu-memory-utilization 0.7 # Try 0.7, 0.75, 0.8
23
+ ```
24
+
25
+ 2. **Reduce max sequence length**:
26
+ ```bash
27
+ vllm serve MODEL --max-model-len 4096 # Instead of 8192
28
+ ```
29
+
30
+ 3. **Enable quantization**:
31
+ ```bash
32
+ vllm serve MODEL --quantization awq # 4-bit weights: ~4x smaller than FP16 (KV cache size is unchanged)
33
+ ```
34
+
35
+ 4. **Use tensor parallelism** (multiple GPUs):
36
+ ```bash
37
+ vllm serve MODEL --tensor-parallel-size 2 # Split across 2 GPUs
38
+ ```
39
+
40
+ 5. **Reduce max concurrent sequences**:
41
+ ```bash
42
+ vllm serve MODEL --max-num-seqs 128 # Default is 256
43
+ ```
44
+
45
+ ### Symptom: OOM during inference (not model loading)
46
+
47
+ **Cause**: KV cache fills up during generation
48
+
49
+ **Solutions**:
50
+
51
+ ```bash
52
+ # Reduce KV cache allocation
53
+ vllm serve MODEL --gpu-memory-utilization 0.85
54
+
55
+ # Reduce batch size
56
+ vllm serve MODEL --max-num-seqs 64
57
+
58
+ # Reduce max tokens per request
59
+ # Set in client request: max_tokens=512
60
+ ```
61
+
62
+ ### Symptom: OOM with quantized model
63
+
64
+ **Cause**: Quantization overhead or incorrect configuration
65
+
66
+ **Solution**:
67
+ ```bash
68
+ # Ensure quantization flag matches model
69
+ vllm serve TheBloke/Llama-2-70B-AWQ --quantization awq # Must specify
70
+
71
+ # Try different dtype
72
+ vllm serve MODEL --quantization awq --dtype float16
73
+ ```
74
+
75
+ ## Performance issues
76
+
77
+ ### Symptom: Low throughput (<50 req/sec when >100 req/sec is expected)
78
+
79
+ **Diagnostic steps**:
80
+
81
+ 1. **Check GPU utilization**:
82
+ ```bash
83
+ watch -n 1 nvidia-smi
84
+ # GPU utilization should be >80%
85
+ ```
86
+
87
+ If <80%, increase concurrent requests:
88
+ ```bash
89
+ vllm serve MODEL --max-num-seqs 512 # Increase from 256
90
+ ```
91
+
92
+ 2. **Check if memory-bound**:
93
+ ```bash
94
+ # If memory at 100% but GPU <80%, reduce sequence length
95
+ vllm serve MODEL --max-model-len 4096
96
+ ```
97
+
98
+ 3. **Enable optimizations**:
99
+ ```bash
100
+ vllm serve MODEL \
101
+ --enable-prefix-caching \
102
+ --enable-chunked-prefill \
103
+ --max-num-seqs 512
104
+ ```
105
+
106
+ 4. **Check tensor parallelism settings**:
107
+ ```bash
108
+ # Tensor-parallel size must evenly divide the model's attention head count (powers of 2 are the usual choice)
109
+ vllm serve MODEL --tensor-parallel-size 4 # Not 3 or 5
110
+ ```
111
+
112
+ ### Symptom: High TTFT (time to first token >1 second)
113
+
114
+ **Causes and solutions**:
115
+
116
+ **Long prompts**:
117
+ ```bash
118
+ vllm serve MODEL --enable-chunked-prefill
119
+ ```
120
+
121
+ **No prefix caching**:
122
+ ```bash
123
+ vllm serve MODEL --enable-prefix-caching # For repeated prompts
124
+ ```
125
+
126
+ **Too many concurrent requests**:
127
+ ```bash
128
+ vllm serve MODEL --max-num-seqs 64 # Reduce to prioritize latency
129
+ ```
130
+
131
+ **Model too large for single GPU**:
132
+ ```bash
133
+ vllm serve MODEL --tensor-parallel-size 2 # Parallelize prefill
134
+ ```
135
+
136
+ ### Symptom: Slow token generation (low tokens/sec)
137
+
138
+ **Diagnostic**:
139
+ ```bash
140
+ # Check if model is correct size
141
+ vllm serve MODEL # Should see model size in logs
142
+
143
+ # Check speculative decoding
144
+ vllm serve MODEL --speculative-model DRAFT_MODEL
145
+ ```
146
+
147
+ **For H100 GPUs**, enable FP8:
148
+ ```bash
149
+ vllm serve MODEL --quantization fp8
150
+ ```
151
+
152
+ ## Model loading errors
153
+
154
+ ### Symptom: `OSError: MODEL not found`
155
+
156
+ **Causes**:
157
+
158
+ 1. **Model name typo**:
159
+ ```bash
160
+ # Check exact model name on HuggingFace
161
+ vllm serve meta-llama/Llama-3-8B-Instruct # Correct capitalization
162
+ ```
163
+
164
+ 2. **Private/gated model**:
165
+ ```bash
166
+ # Login to HuggingFace first
167
+ huggingface-cli login
168
+ # Then run vLLM
169
+ vllm serve meta-llama/Llama-3-70B-Instruct
170
+ ```
171
+
172
+ 3. **Custom model needs trust flag**:
173
+ ```bash
174
+ vllm serve MODEL --trust-remote-code
175
+ ```
176
+
177
+ ### Symptom: `ValueError: Tokenizer not found`
178
+
179
+ **Solution**:
180
+ ```bash
181
+ # Download model manually first
182
+ python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('MODEL')"
183
+
184
+ # Then launch vLLM
185
+ vllm serve MODEL
186
+ ```
187
+
188
+ ### Symptom: `ImportError: No module named 'flash_attn'`
189
+
190
+ **Solution**:
191
+ ```bash
192
+ # Install flash attention
193
+ pip install flash-attn --no-build-isolation
194
+
195
+ # Or disable flash attention
196
+ vllm serve MODEL --disable-flash-attn
197
+ ```
198
+
199
+ ## Network and connection issues
200
+
201
+ ### Symptom: `Connection refused` when querying server
202
+
203
+ **Diagnostic**:
204
+
205
+ 1. **Check server is running**:
206
+ ```bash
207
+ curl http://localhost:8000/health
208
+ ```
209
+
210
+ 2. **Check port binding**:
211
+ ```bash
212
+ # Bind to all interfaces for remote access
213
+ vllm serve MODEL --host 0.0.0.0 --port 8000
214
+
215
+ # Check if port is in use
216
+ lsof -i :8000
217
+ ```
218
+
219
+ 3. **Check firewall**:
220
+ ```bash
221
+ # Allow port through firewall
222
+ sudo ufw allow 8000
223
+ ```
224
+
225
+ ### Symptom: Slow response times over network
226
+
227
+ **Solutions**:
228
+
229
+ 1. **Increase timeout**:
230
+ ```python
231
+ from openai import OpenAI
232
+
233
+ client = OpenAI(
234
+ base_url="http://localhost:8000/v1",
235
+ api_key="EMPTY",
236
+ timeout=300.0 # 5 minute timeout
237
+ )
238
+ ```
239
+
240
+ 2. **Check network latency**:
241
+ ```bash
242
+ ping SERVER_IP # Should be <10ms for local network
243
+ ```
244
+
245
+ 3. **Use connection pooling**:
246
+ ```python
247
+ import requests
248
+ from requests.adapters import HTTPAdapter
249
+ from urllib3.util.retry import Retry
250
+
251
+ session = requests.Session()
252
+ retries = Retry(total=3, backoff_factor=1)
253
+ session.mount('http://', HTTPAdapter(max_retries=retries))
254
+ ```
255
+
256
+ ## Quantization problems
257
+
258
+ ### Symptom: `RuntimeError: Quantization format not supported`
259
+
260
+ **Solution**:
261
+ ```bash
262
+ # Ensure correct quantization method
263
+ vllm serve MODEL --quantization awq # For AWQ models
264
+ vllm serve MODEL --quantization gptq # For GPTQ models
265
+
266
+ # Check model card for quantization type
267
+ ```
268
+
269
+ ### Symptom: Poor quality outputs after quantization
270
+
271
+ **Diagnostic**:
272
+
273
+ 1. **Verify model is correctly quantized**:
274
+ ```bash
275
+ # Check model config.json for quantization_config
276
+ cat ~/.cache/huggingface/hub/models--MODEL/snapshots/*/config.json
277
+ ```
278
+
279
+ 2. **Try different quantization method**:
280
+ ```bash
281
+ # If AWQ quality issues, try FP8 (H100 only)
282
+ vllm serve MODEL --quantization fp8
283
+
284
+ # Or fall back to the unquantized model
285
+ vllm serve MODEL # No quantization
286
+ ```
287
+
288
+ 3. **Increase temperature for better diversity**:
289
+ ```python
290
+ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
291
+ ```
292
+
293
+ ## Distributed serving issues
294
+
295
+ ### Symptom: `RuntimeError: Distributed init failed`
296
+
297
+ **Diagnostic**:
298
+
299
+ 1. **Check environment variables**:
300
+ ```bash
301
+ # On all nodes
302
+ echo $MASTER_ADDR # Should be same
303
+ echo $MASTER_PORT # Should be same
304
+ echo $RANK # Should be unique per node (0, 1, 2, ...)
305
+ echo $WORLD_SIZE # Should be same (total nodes)
306
+ ```
307
+
308
+ 2. **Check network connectivity**:
309
+ ```bash
310
+ # From node 1 to node 2
311
+ ping NODE2_IP
312
+ nc -zv NODE2_IP 29500 # Check port accessibility
313
+ ```
314
+
315
+ 3. **Check NCCL settings**:
316
+ ```bash
317
+ export NCCL_DEBUG=INFO
318
+ export NCCL_SOCKET_IFNAME=eth0 # Or your network interface
319
+ vllm serve MODEL --tensor-parallel-size 8
320
+ ```
321
+
322
+ ### Symptom: `NCCL error: unhandled cuda error`
323
+
324
+ **Solutions**:
325
+
326
+ ```bash
327
+ # Set NCCL to use correct network interface
328
+ export NCCL_SOCKET_IFNAME=eth0 # Replace with your interface
329
+
330
+ # Increase timeout
331
+ export NCCL_TIMEOUT=1800 # 30 minutes
332
+
333
+ # Disable P2P to rule out peer-to-peer transport problems while debugging
334
+ export NCCL_P2P_DISABLE=1
335
+ ```
336
+
337
+ ## Debugging tools and commands
338
+
339
+ ### Enable debug logging
340
+
341
+ ```bash
342
+ export VLLM_LOGGING_LEVEL=DEBUG
343
+ vllm serve MODEL
344
+ ```
345
+
346
+ ### Monitor GPU usage
347
+
348
+ ```bash
349
+ # Real-time GPU monitoring
350
+ watch -n 1 nvidia-smi
351
+
352
+ # Memory breakdown
353
+ nvidia-smi --query-gpu=memory.used,memory.free --format=csv -l 1
354
+ ```
355
+
356
+ ### Profile performance
357
+
358
+ ```bash
359
+ # Built-in benchmarking
360
+ vllm bench throughput \
361
+ --model MODEL \
362
+ --input-tokens 128 \
363
+ --output-tokens 256 \
364
+ --num-prompts 100
365
+
366
+ vllm bench latency \
367
+ --model MODEL \
368
+ --input-tokens 128 \
369
+ --output-tokens 256 \
370
+ --batch-size 8
371
+ ```
372
+
373
+ ### Check metrics
374
+
375
+ ```bash
376
+ # Prometheus metrics
377
+ curl http://localhost:9090/metrics
378
+
379
+ # Filter for specific metrics
380
+ curl http://localhost:9090/metrics | grep 'vllm:time_to_first_token'
381
+
382
+ # Key metrics to monitor (vLLM metric names use a "vllm:" prefix):
383
+ # - vllm:time_to_first_token_seconds
384
+ # - vllm:time_per_output_token_seconds
385
+ # - vllm:num_requests_running
386
+ # - vllm:gpu_cache_usage_perc
387
+ # - vllm:request_success_total
388
+ ```
389
+
390
+ ### Test server health
391
+
392
+ ```bash
393
+ # Health check
394
+ curl http://localhost:8000/health
395
+
396
+ # Model info
397
+ curl http://localhost:8000/v1/models
398
+
399
+ # Test completion
400
+ curl http://localhost:8000/v1/completions \
401
+ -H "Content-Type: application/json" \
402
+ -d '{
403
+ "model": "MODEL",
404
+ "prompt": "Hello",
405
+ "max_tokens": 10
406
+ }'
407
+ ```
408
+
409
+ ### Common environment variables
410
+
411
+ ```bash
412
+ # CUDA settings
413
+ export CUDA_VISIBLE_DEVICES=0,1,2,3 # Limit to specific GPUs
414
+
415
+ # vLLM settings
416
+ export VLLM_LOGGING_LEVEL=DEBUG
417
+ export VLLM_TRACE_FUNCTION=1 # Profile functions
418
+ export VLLM_USE_V1=1 # Use v1.0 engine (faster)
419
+
420
+ # NCCL settings (distributed)
421
+ export NCCL_DEBUG=INFO
422
+ export NCCL_SOCKET_IFNAME=eth0
423
+ export NCCL_IB_DISABLE=0 # Enable InfiniBand
424
+ ```
425
+
426
+ ### Collect diagnostic info for bug reports
427
+
428
+ ```bash
429
+ # System info
430
+ nvidia-smi
431
+ python --version
432
+ pip show vllm
433
+
434
+ # vLLM version and config
435
+ vllm --version
436
+ python -c "import vllm; print(vllm.__version__)"
437
+
438
+ # Run with debug logging
439
+ export VLLM_LOGGING_LEVEL=DEBUG
440
+ vllm serve MODEL 2>&1 | tee vllm_debug.log
441
+
442
+ # Include in bug report:
443
+ # - vllm_debug.log
444
+ # - nvidia-smi output
445
+ # - Full command used
446
+ # - Expected vs actual behavior
447
+ ```