@synsci/cli-darwin-x64 1.1.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (373) hide show
  1. package/bin/skills/accelerate/SKILL.md +332 -0
  2. package/bin/skills/accelerate/references/custom-plugins.md +453 -0
  3. package/bin/skills/accelerate/references/megatron-integration.md +489 -0
  4. package/bin/skills/accelerate/references/performance.md +525 -0
  5. package/bin/skills/audiocraft/SKILL.md +564 -0
  6. package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
  7. package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
  8. package/bin/skills/autogpt/SKILL.md +403 -0
  9. package/bin/skills/autogpt/references/advanced-usage.md +535 -0
  10. package/bin/skills/autogpt/references/troubleshooting.md +420 -0
  11. package/bin/skills/awq/SKILL.md +310 -0
  12. package/bin/skills/awq/references/advanced-usage.md +324 -0
  13. package/bin/skills/awq/references/troubleshooting.md +344 -0
  14. package/bin/skills/axolotl/SKILL.md +158 -0
  15. package/bin/skills/axolotl/references/api.md +5548 -0
  16. package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
  17. package/bin/skills/axolotl/references/index.md +15 -0
  18. package/bin/skills/axolotl/references/other.md +3563 -0
  19. package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
  20. package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
  21. package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
  22. package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
  23. package/bin/skills/bitsandbytes/SKILL.md +411 -0
  24. package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
  25. package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
  26. package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
  27. package/bin/skills/blip-2/SKILL.md +564 -0
  28. package/bin/skills/blip-2/references/advanced-usage.md +680 -0
  29. package/bin/skills/blip-2/references/troubleshooting.md +526 -0
  30. package/bin/skills/chroma/SKILL.md +406 -0
  31. package/bin/skills/chroma/references/integration.md +38 -0
  32. package/bin/skills/clip/SKILL.md +253 -0
  33. package/bin/skills/clip/references/applications.md +207 -0
  34. package/bin/skills/constitutional-ai/SKILL.md +290 -0
  35. package/bin/skills/crewai/SKILL.md +498 -0
  36. package/bin/skills/crewai/references/flows.md +438 -0
  37. package/bin/skills/crewai/references/tools.md +429 -0
  38. package/bin/skills/crewai/references/troubleshooting.md +480 -0
  39. package/bin/skills/deepspeed/SKILL.md +141 -0
  40. package/bin/skills/deepspeed/references/08.md +17 -0
  41. package/bin/skills/deepspeed/references/09.md +173 -0
  42. package/bin/skills/deepspeed/references/2020.md +378 -0
  43. package/bin/skills/deepspeed/references/2023.md +279 -0
  44. package/bin/skills/deepspeed/references/assets.md +179 -0
  45. package/bin/skills/deepspeed/references/index.md +35 -0
  46. package/bin/skills/deepspeed/references/mii.md +118 -0
  47. package/bin/skills/deepspeed/references/other.md +1191 -0
  48. package/bin/skills/deepspeed/references/tutorials.md +6554 -0
  49. package/bin/skills/dspy/SKILL.md +590 -0
  50. package/bin/skills/dspy/references/examples.md +663 -0
  51. package/bin/skills/dspy/references/modules.md +475 -0
  52. package/bin/skills/dspy/references/optimizers.md +566 -0
  53. package/bin/skills/faiss/SKILL.md +221 -0
  54. package/bin/skills/faiss/references/index_types.md +280 -0
  55. package/bin/skills/flash-attention/SKILL.md +367 -0
  56. package/bin/skills/flash-attention/references/benchmarks.md +215 -0
  57. package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
  58. package/bin/skills/gguf/SKILL.md +427 -0
  59. package/bin/skills/gguf/references/advanced-usage.md +504 -0
  60. package/bin/skills/gguf/references/troubleshooting.md +442 -0
  61. package/bin/skills/gptq/SKILL.md +450 -0
  62. package/bin/skills/gptq/references/calibration.md +337 -0
  63. package/bin/skills/gptq/references/integration.md +129 -0
  64. package/bin/skills/gptq/references/troubleshooting.md +95 -0
  65. package/bin/skills/grpo-rl-training/README.md +97 -0
  66. package/bin/skills/grpo-rl-training/SKILL.md +572 -0
  67. package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
  68. package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
  69. package/bin/skills/guidance/SKILL.md +572 -0
  70. package/bin/skills/guidance/references/backends.md +554 -0
  71. package/bin/skills/guidance/references/constraints.md +674 -0
  72. package/bin/skills/guidance/references/examples.md +767 -0
  73. package/bin/skills/hqq/SKILL.md +445 -0
  74. package/bin/skills/hqq/references/advanced-usage.md +528 -0
  75. package/bin/skills/hqq/references/troubleshooting.md +503 -0
  76. package/bin/skills/hugging-face-cli/SKILL.md +191 -0
  77. package/bin/skills/hugging-face-cli/references/commands.md +954 -0
  78. package/bin/skills/hugging-face-cli/references/examples.md +374 -0
  79. package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
  80. package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
  81. package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
  82. package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
  83. package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
  84. package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
  85. package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
  86. package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
  87. package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
  88. package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
  89. package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
  90. package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
  91. package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
  92. package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
  93. package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
  94. package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
  95. package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
  96. package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
  97. package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
  98. package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
  99. package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
  100. package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
  101. package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
  102. package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
  103. package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
  104. package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
  105. package/bin/skills/hugging-face-jobs/index.html +216 -0
  106. package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
  107. package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
  108. package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
  109. package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
  110. package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
  111. package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
  112. package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
  113. package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
  114. package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
  115. package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
  116. package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
  117. package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
  118. package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
  119. package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
  120. package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
  121. package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
  122. package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
  123. package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
  124. package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
  125. package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
  126. package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
  127. package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
  128. package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
  129. package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
  130. package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
  131. package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
  132. package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
  133. package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
  134. package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
  135. package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
  136. package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
  137. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
  138. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
  139. package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
  140. package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
  141. package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
  142. package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
  143. package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
  144. package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
  145. package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
  146. package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
  147. package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
  148. package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
  149. package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
  150. package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
  151. package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
  152. package/bin/skills/instructor/SKILL.md +740 -0
  153. package/bin/skills/instructor/references/examples.md +107 -0
  154. package/bin/skills/instructor/references/providers.md +70 -0
  155. package/bin/skills/instructor/references/validation.md +606 -0
  156. package/bin/skills/knowledge-distillation/SKILL.md +458 -0
  157. package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
  158. package/bin/skills/lambda-labs/SKILL.md +545 -0
  159. package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
  160. package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
  161. package/bin/skills/langchain/SKILL.md +480 -0
  162. package/bin/skills/langchain/references/agents.md +499 -0
  163. package/bin/skills/langchain/references/integration.md +562 -0
  164. package/bin/skills/langchain/references/rag.md +600 -0
  165. package/bin/skills/langsmith/SKILL.md +422 -0
  166. package/bin/skills/langsmith/references/advanced-usage.md +548 -0
  167. package/bin/skills/langsmith/references/troubleshooting.md +537 -0
  168. package/bin/skills/litgpt/SKILL.md +469 -0
  169. package/bin/skills/litgpt/references/custom-models.md +568 -0
  170. package/bin/skills/litgpt/references/distributed-training.md +451 -0
  171. package/bin/skills/litgpt/references/supported-models.md +336 -0
  172. package/bin/skills/litgpt/references/training-recipes.md +619 -0
  173. package/bin/skills/llama-cpp/SKILL.md +258 -0
  174. package/bin/skills/llama-cpp/references/optimization.md +89 -0
  175. package/bin/skills/llama-cpp/references/quantization.md +213 -0
  176. package/bin/skills/llama-cpp/references/server.md +125 -0
  177. package/bin/skills/llama-factory/SKILL.md +80 -0
  178. package/bin/skills/llama-factory/references/_images.md +23 -0
  179. package/bin/skills/llama-factory/references/advanced.md +1055 -0
  180. package/bin/skills/llama-factory/references/getting_started.md +349 -0
  181. package/bin/skills/llama-factory/references/index.md +19 -0
  182. package/bin/skills/llama-factory/references/other.md +31 -0
  183. package/bin/skills/llamaguard/SKILL.md +337 -0
  184. package/bin/skills/llamaindex/SKILL.md +569 -0
  185. package/bin/skills/llamaindex/references/agents.md +83 -0
  186. package/bin/skills/llamaindex/references/data_connectors.md +108 -0
  187. package/bin/skills/llamaindex/references/query_engines.md +406 -0
  188. package/bin/skills/llava/SKILL.md +304 -0
  189. package/bin/skills/llava/references/training.md +197 -0
  190. package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
  191. package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
  192. package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
  193. package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
  194. package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
  195. package/bin/skills/long-context/SKILL.md +536 -0
  196. package/bin/skills/long-context/references/extension_methods.md +468 -0
  197. package/bin/skills/long-context/references/fine_tuning.md +611 -0
  198. package/bin/skills/long-context/references/rope.md +402 -0
  199. package/bin/skills/mamba/SKILL.md +260 -0
  200. package/bin/skills/mamba/references/architecture-details.md +206 -0
  201. package/bin/skills/mamba/references/benchmarks.md +255 -0
  202. package/bin/skills/mamba/references/training-guide.md +388 -0
  203. package/bin/skills/megatron-core/SKILL.md +366 -0
  204. package/bin/skills/megatron-core/references/benchmarks.md +249 -0
  205. package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
  206. package/bin/skills/megatron-core/references/production-examples.md +473 -0
  207. package/bin/skills/megatron-core/references/training-recipes.md +547 -0
  208. package/bin/skills/miles/SKILL.md +315 -0
  209. package/bin/skills/miles/references/api-reference.md +141 -0
  210. package/bin/skills/miles/references/troubleshooting.md +352 -0
  211. package/bin/skills/mlflow/SKILL.md +704 -0
  212. package/bin/skills/mlflow/references/deployment.md +744 -0
  213. package/bin/skills/mlflow/references/model-registry.md +770 -0
  214. package/bin/skills/mlflow/references/tracking.md +680 -0
  215. package/bin/skills/modal/SKILL.md +341 -0
  216. package/bin/skills/modal/references/advanced-usage.md +503 -0
  217. package/bin/skills/modal/references/troubleshooting.md +494 -0
  218. package/bin/skills/model-merging/SKILL.md +539 -0
  219. package/bin/skills/model-merging/references/evaluation.md +462 -0
  220. package/bin/skills/model-merging/references/examples.md +428 -0
  221. package/bin/skills/model-merging/references/methods.md +352 -0
  222. package/bin/skills/model-pruning/SKILL.md +495 -0
  223. package/bin/skills/model-pruning/references/wanda.md +347 -0
  224. package/bin/skills/moe-training/SKILL.md +526 -0
  225. package/bin/skills/moe-training/references/architectures.md +432 -0
  226. package/bin/skills/moe-training/references/inference.md +348 -0
  227. package/bin/skills/moe-training/references/training.md +425 -0
  228. package/bin/skills/nanogpt/SKILL.md +290 -0
  229. package/bin/skills/nanogpt/references/architecture.md +382 -0
  230. package/bin/skills/nanogpt/references/data.md +476 -0
  231. package/bin/skills/nanogpt/references/training.md +564 -0
  232. package/bin/skills/nemo-curator/SKILL.md +383 -0
  233. package/bin/skills/nemo-curator/references/deduplication.md +87 -0
  234. package/bin/skills/nemo-curator/references/filtering.md +102 -0
  235. package/bin/skills/nemo-evaluator/SKILL.md +494 -0
  236. package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
  237. package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
  238. package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
  239. package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
  240. package/bin/skills/nemo-guardrails/SKILL.md +297 -0
  241. package/bin/skills/nnsight/SKILL.md +436 -0
  242. package/bin/skills/nnsight/references/README.md +78 -0
  243. package/bin/skills/nnsight/references/api.md +344 -0
  244. package/bin/skills/nnsight/references/tutorials.md +300 -0
  245. package/bin/skills/openrlhf/SKILL.md +249 -0
  246. package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
  247. package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
  248. package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
  249. package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
  250. package/bin/skills/outlines/SKILL.md +652 -0
  251. package/bin/skills/outlines/references/backends.md +615 -0
  252. package/bin/skills/outlines/references/examples.md +773 -0
  253. package/bin/skills/outlines/references/json_generation.md +652 -0
  254. package/bin/skills/peft/SKILL.md +431 -0
  255. package/bin/skills/peft/references/advanced-usage.md +514 -0
  256. package/bin/skills/peft/references/troubleshooting.md +480 -0
  257. package/bin/skills/phoenix/SKILL.md +475 -0
  258. package/bin/skills/phoenix/references/advanced-usage.md +619 -0
  259. package/bin/skills/phoenix/references/troubleshooting.md +538 -0
  260. package/bin/skills/pinecone/SKILL.md +358 -0
  261. package/bin/skills/pinecone/references/deployment.md +181 -0
  262. package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
  263. package/bin/skills/pytorch-fsdp/references/index.md +7 -0
  264. package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
  265. package/bin/skills/pytorch-lightning/SKILL.md +346 -0
  266. package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
  267. package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
  268. package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
  269. package/bin/skills/pyvene/SKILL.md +473 -0
  270. package/bin/skills/pyvene/references/README.md +73 -0
  271. package/bin/skills/pyvene/references/api.md +383 -0
  272. package/bin/skills/pyvene/references/tutorials.md +376 -0
  273. package/bin/skills/qdrant/SKILL.md +493 -0
  274. package/bin/skills/qdrant/references/advanced-usage.md +648 -0
  275. package/bin/skills/qdrant/references/troubleshooting.md +631 -0
  276. package/bin/skills/ray-data/SKILL.md +326 -0
  277. package/bin/skills/ray-data/references/integration.md +82 -0
  278. package/bin/skills/ray-data/references/transformations.md +83 -0
  279. package/bin/skills/ray-train/SKILL.md +406 -0
  280. package/bin/skills/ray-train/references/multi-node.md +628 -0
  281. package/bin/skills/rwkv/SKILL.md +260 -0
  282. package/bin/skills/rwkv/references/architecture-details.md +344 -0
  283. package/bin/skills/rwkv/references/rwkv7.md +386 -0
  284. package/bin/skills/rwkv/references/state-management.md +369 -0
  285. package/bin/skills/saelens/SKILL.md +386 -0
  286. package/bin/skills/saelens/references/README.md +70 -0
  287. package/bin/skills/saelens/references/api.md +333 -0
  288. package/bin/skills/saelens/references/tutorials.md +318 -0
  289. package/bin/skills/segment-anything/SKILL.md +500 -0
  290. package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
  291. package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
  292. package/bin/skills/sentence-transformers/SKILL.md +255 -0
  293. package/bin/skills/sentence-transformers/references/models.md +123 -0
  294. package/bin/skills/sentencepiece/SKILL.md +235 -0
  295. package/bin/skills/sentencepiece/references/algorithms.md +200 -0
  296. package/bin/skills/sentencepiece/references/training.md +304 -0
  297. package/bin/skills/sglang/SKILL.md +442 -0
  298. package/bin/skills/sglang/references/deployment.md +490 -0
  299. package/bin/skills/sglang/references/radix-attention.md +413 -0
  300. package/bin/skills/sglang/references/structured-generation.md +541 -0
  301. package/bin/skills/simpo/SKILL.md +219 -0
  302. package/bin/skills/simpo/references/datasets.md +478 -0
  303. package/bin/skills/simpo/references/hyperparameters.md +452 -0
  304. package/bin/skills/simpo/references/loss-functions.md +350 -0
  305. package/bin/skills/skypilot/SKILL.md +509 -0
  306. package/bin/skills/skypilot/references/advanced-usage.md +491 -0
  307. package/bin/skills/skypilot/references/troubleshooting.md +570 -0
  308. package/bin/skills/slime/SKILL.md +464 -0
  309. package/bin/skills/slime/references/api-reference.md +392 -0
  310. package/bin/skills/slime/references/troubleshooting.md +386 -0
  311. package/bin/skills/speculative-decoding/SKILL.md +467 -0
  312. package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
  313. package/bin/skills/speculative-decoding/references/medusa.md +350 -0
  314. package/bin/skills/stable-diffusion/SKILL.md +519 -0
  315. package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
  316. package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
  317. package/bin/skills/tensorboard/SKILL.md +629 -0
  318. package/bin/skills/tensorboard/references/integrations.md +638 -0
  319. package/bin/skills/tensorboard/references/profiling.md +545 -0
  320. package/bin/skills/tensorboard/references/visualization.md +620 -0
  321. package/bin/skills/tensorrt-llm/SKILL.md +187 -0
  322. package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
  323. package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
  324. package/bin/skills/tensorrt-llm/references/serving.md +470 -0
  325. package/bin/skills/tinker/SKILL.md +362 -0
  326. package/bin/skills/tinker/references/api-reference.md +168 -0
  327. package/bin/skills/tinker/references/getting-started.md +157 -0
  328. package/bin/skills/tinker/references/loss-functions.md +163 -0
  329. package/bin/skills/tinker/references/models-and-lora.md +139 -0
  330. package/bin/skills/tinker/references/recipes.md +280 -0
  331. package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
  332. package/bin/skills/tinker/references/rendering.md +243 -0
  333. package/bin/skills/tinker/references/supervised-learning.md +232 -0
  334. package/bin/skills/tinker-training-cost/SKILL.md +187 -0
  335. package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
  336. package/bin/skills/torchforge/SKILL.md +433 -0
  337. package/bin/skills/torchforge/references/api-reference.md +327 -0
  338. package/bin/skills/torchforge/references/troubleshooting.md +409 -0
  339. package/bin/skills/torchtitan/SKILL.md +358 -0
  340. package/bin/skills/torchtitan/references/checkpoint.md +181 -0
  341. package/bin/skills/torchtitan/references/custom-models.md +258 -0
  342. package/bin/skills/torchtitan/references/float8.md +133 -0
  343. package/bin/skills/torchtitan/references/fsdp.md +126 -0
  344. package/bin/skills/transformer-lens/SKILL.md +346 -0
  345. package/bin/skills/transformer-lens/references/README.md +54 -0
  346. package/bin/skills/transformer-lens/references/api.md +362 -0
  347. package/bin/skills/transformer-lens/references/tutorials.md +339 -0
  348. package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
  349. package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
  350. package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
  351. package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
  352. package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
  353. package/bin/skills/unsloth/SKILL.md +80 -0
  354. package/bin/skills/unsloth/references/index.md +7 -0
  355. package/bin/skills/unsloth/references/llms-full.md +16799 -0
  356. package/bin/skills/unsloth/references/llms-txt.md +12044 -0
  357. package/bin/skills/unsloth/references/llms.md +82 -0
  358. package/bin/skills/verl/SKILL.md +391 -0
  359. package/bin/skills/verl/references/api-reference.md +301 -0
  360. package/bin/skills/verl/references/troubleshooting.md +391 -0
  361. package/bin/skills/vllm/SKILL.md +364 -0
  362. package/bin/skills/vllm/references/optimization.md +226 -0
  363. package/bin/skills/vllm/references/quantization.md +284 -0
  364. package/bin/skills/vllm/references/server-deployment.md +255 -0
  365. package/bin/skills/vllm/references/troubleshooting.md +447 -0
  366. package/bin/skills/weights-and-biases/SKILL.md +590 -0
  367. package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
  368. package/bin/skills/weights-and-biases/references/integrations.md +700 -0
  369. package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
  370. package/bin/skills/whisper/SKILL.md +317 -0
  371. package/bin/skills/whisper/references/languages.md +189 -0
  372. package/bin/synsc +0 -0
  373. package/package.json +10 -0
@@ -0,0 +1,315 @@
1
+ # Custom Benchmark Integration
2
+
3
+ NeMo Evaluator supports adding custom benchmarks through Framework Definition Files (FDFs) and custom containers.
4
+
5
+ ## Overview
6
+
7
+ Custom benchmarks are added by:
8
+
9
+ 1. **Framework Definition Files (FDFs)**: YAML files that define evaluation tasks, commands, and output parsing
10
+ 2. **Custom Containers**: Container images that package your framework with nemo-evaluator for reproducible execution
11
+
12
+ > **Note**: NeMo Evaluator does not currently support programmatic harness APIs or custom metric implementations via Python classes. Customization is done through FDFs and containers.
13
+
14
+ ## Framework Definition Files (FDFs)
15
+
16
+ FDFs are the primary way to add custom evaluations. An FDF declares framework metadata, default commands, and evaluation tasks.
17
+
18
+ ### FDF Structure
19
+
20
+ ```yaml
21
+ # framework_def.yaml
22
+ framework:
23
+ name: my-custom-framework
24
+ package_name: my_custom_eval
25
+
26
+ defaults:
27
+ command: "python -m my_custom_eval.run --model-id {model_id} --task {task} --output-dir {output_dir}"
28
+
29
+ evaluations:
30
+ - name: custom_task_1
31
+ defaults:
32
+ temperature: 0.0
33
+ max_new_tokens: 512
34
+ extra:
35
+ custom_param: value
36
+
37
+ - name: custom_task_2
38
+ defaults:
39
+ temperature: 0.7
40
+ max_new_tokens: 1024
41
+ ```
42
+
43
+ ### Key FDF Components
44
+
45
+ **Framework section**:
46
+ - `name`: Human-readable name for your framework
47
+ - `package_name`: Python package name
48
+
49
+ **Defaults section**:
50
+ - `command`: The command template to execute your evaluation
51
+ - The placeholders `{model_id}`, `{task}`, and `{output_dir}` are substituted at runtime
52
+
53
+ **Evaluations section**:
54
+ - List of tasks with their default parameters
55
+ - Each task can override the framework defaults
56
+
57
+ ### Output Parser
58
+
59
+ When creating a custom FDF, you need an output parser function that translates your framework's results into NeMo Evaluator's standard schema:
60
+
61
+ ```python
62
+ # my_custom_eval/parser.py  (requires `import json` and `from pathlib import Path`)
63
+ def parse_output(output_dir: str) -> dict:
64
+ """
65
+ Parse evaluation results from output_dir.
66
+
67
+ Returns dict with metrics in NeMo Evaluator format.
68
+ """
69
+ # Read your framework's output files
70
+ results_file = Path(output_dir) / "results.json"
71
+ with open(results_file) as f:
72
+ raw_results = json.load(f)
73
+
74
+ # Transform to standard schema
75
+ return {
76
+ "metrics": {
77
+ "accuracy": raw_results["score"],
78
+ "total_samples": raw_results["num_samples"]
79
+ }
80
+ }
81
+ ```
82
+
83
+ ## Custom Container Creation
84
+
85
+ Package your custom framework as a container for reproducibility.
86
+
87
+ ### Dockerfile Example
88
+
89
+ ```dockerfile
90
+ # Dockerfile
91
+ FROM python:3.10-slim
92
+
93
+ # Install nemo-evaluator
94
+ RUN pip install nemo-evaluator
95
+
96
+ # Install your custom framework
97
+ COPY my_custom_eval/ /opt/my_custom_eval/
98
+ RUN pip install /opt/my_custom_eval/
99
+
100
+ # Copy framework definition
101
+ COPY framework_def.yaml /opt/framework_def.yaml
102
+
103
+ # Set working directory
104
+ WORKDIR /opt
105
+
106
+ ENTRYPOINT ["python", "-m", "nemo_evaluator"]
107
+ ```
108
+
109
+ ### Build and Push
110
+
111
+ ```bash
112
+ docker build -t my-registry/custom-eval:1.0 .
113
+ docker push my-registry/custom-eval:1.0
114
+ ```
115
+
116
+ ### Register in mapping.toml
117
+
118
+ Add your custom container to the task registry:
119
+
120
+ ```toml
121
+ # Add to mapping.toml
122
+ [my-custom-framework]
123
+ container = "my-registry/custom-eval:1.0"
124
+
125
+ [my-custom-framework.tasks.chat.custom_task_1]
126
+ required_env_vars = []
127
+
128
+ [my-custom-framework.tasks.chat.custom_task_2]
129
+ required_env_vars = ["CUSTOM_API_KEY"]
130
+ ```
131
+
132
+ ## Using Custom Datasets
133
+
134
+ ### Dataset Mounting
135
+
136
+ Mount proprietary datasets at runtime rather than baking them into containers:
137
+
138
+ ```yaml
139
+ # config.yaml
140
+ defaults:
141
+ - execution: local
142
+ - deployment: none
143
+ - _self_
144
+
145
+ execution:
146
+ output_dir: ./results
147
+
148
+ evaluation:
149
+ tasks:
150
+ - name: custom_task_1
151
+ dataset_dir: /path/to/local/data
152
+ dataset_mount_path: /data # Optional, defaults to /datasets
153
+ ```
154
+
155
+ The launcher will mount the dataset directory into the container and set the `NEMO_EVALUATOR_DATASET_DIR` environment variable.
156
+
157
+ ### Task-Specific Environment Variables
158
+
159
+ Pass environment variables to specific tasks:
160
+
161
+ ```yaml
162
+ evaluation:
163
+ tasks:
164
+ - name: gpqa_diamond
165
+ env_vars:
166
+ HF_TOKEN: HF_TOKEN # Maps to $HF_TOKEN from host
167
+
168
+ - name: custom_task
169
+ env_vars:
170
+ CUSTOM_API_KEY: MY_CUSTOM_KEY
171
+ DATA_PATH: /data/custom.jsonl
172
+ ```
173
+
174
+ ## Parameter Overrides
175
+
176
+ Override evaluation parameters at multiple levels:
177
+
178
+ ### Global Overrides
179
+
180
+ Apply to all tasks:
181
+
182
+ ```yaml
183
+ evaluation:
184
+ nemo_evaluator_config:
185
+ config:
186
+ params:
187
+ temperature: 0.0
188
+ max_new_tokens: 512
189
+ parallelism: 4
190
+ request_timeout: 300
191
+ ```
192
+
193
+ ### Task-Specific Overrides
194
+
195
+ Override for individual tasks:
196
+
197
+ ```yaml
198
+ evaluation:
199
+ tasks:
200
+ - name: humaneval
201
+ nemo_evaluator_config:
202
+ config:
203
+ params:
204
+ temperature: 0.8
205
+ max_new_tokens: 1024
206
+ n_samples: 200 # Task-specific parameter
207
+ ```
208
+
209
+ ### CLI Overrides
210
+
211
+ Override at runtime:
212
+
213
+ ```bash
214
+ nemo-evaluator-launcher run \
215
+ --config-dir . \
216
+ --config-name config \
217
+ -o +evaluation.nemo_evaluator_config.config.params.limit_samples=10
218
+ ```
219
+
220
+ ## Testing Custom Benchmarks
221
+
222
+ ### Dry Run
223
+
224
+ Validate configuration without execution:
225
+
226
+ ```bash
227
+ nemo-evaluator-launcher run \
228
+ --config-dir . \
229
+ --config-name custom_config \
230
+ --dry-run
231
+ ```
232
+
233
+ ### Limited Sample Testing
234
+
235
+ Test with a small subset first:
236
+
237
+ ```bash
238
+ nemo-evaluator-launcher run \
239
+ --config-dir . \
240
+ --config-name custom_config \
241
+ -o +evaluation.nemo_evaluator_config.config.params.limit_samples=5
242
+ ```
243
+
244
+ ### Check Results
245
+
246
+ ```bash
247
+ # View results
248
+ cat results/<invocation_id>/<task>/artifacts/results.json
249
+
250
+ # Check logs
251
+ cat results/<invocation_id>/<task>/artifacts/logs/eval.log
252
+ ```
253
+
254
+ ## Best Practices
255
+
256
+ 1. **Use FDFs**: Define custom benchmarks via Framework Definition Files
257
+ 2. **Containerize**: Package frameworks as containers for reproducibility
258
+ 3. **Mount data**: Use volume mounts for datasets instead of baking them into images
259
+ 4. **Test incrementally**: Use `limit_samples` for quick validation
260
+ 5. **Version containers**: Tag containers with semantic versions
261
+ 6. **Document parameters**: Include clear documentation in your FDF
262
+
263
+ ## Limitations
264
+
265
+ Currently **not supported**:
266
+ - Custom Python metric classes via plugin system
267
+ - Programmatic harness registration via Python API
268
+ - Runtime metric injection via configuration
269
+
270
+ Custom scoring logic must be implemented within your evaluation framework and exposed through the FDF's output parser.
271
+
272
+ ## Example: Complete Custom Setup
273
+
274
+ ```yaml
275
+ # custom_eval_config.yaml
276
+ defaults:
277
+ - execution: local
278
+ - deployment: none
279
+ - _self_
280
+
281
+ execution:
282
+ output_dir: ./custom_results
283
+
284
+ target:
285
+ api_endpoint:
286
+ model_id: my-model
287
+ url: http://localhost:8000/v1/chat/completions
288
+ api_key_name: ""
289
+
290
+ evaluation:
291
+ nemo_evaluator_config:
292
+ config:
293
+ params:
294
+ parallelism: 4
295
+ request_timeout: 300
296
+
297
+ tasks:
298
+ - name: custom_task_1
299
+ dataset_dir: /data/benchmarks
300
+ env_vars:
301
+ DATA_VERSION: v2
302
+ nemo_evaluator_config:
303
+ config:
304
+ params:
305
+ temperature: 0.0
306
+ max_new_tokens: 256
307
+ ```
308
+
309
+ Run with:
310
+
311
+ ```bash
312
+ nemo-evaluator-launcher run \
313
+ --config-dir . \
314
+ --config-name custom_eval_config
315
+ ```
@@ -0,0 +1,361 @@
1
+ # Execution Backends
2
+
3
+ NeMo Evaluator supports three execution backends: Local (Docker), Slurm (HPC), and Lepton (Cloud). Each backend implements the same interface but has different configuration requirements.
4
+
5
+ ## Backend Architecture
6
+
7
+ ```
8
+ ┌─────────────────────────────────────────────────────────────┐
9
+ │ nemo-evaluator-launcher │
10
+ │ │
11
+ │ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
12
+ │ │ LocalExecutor │ │ SlurmExecutor │ │ LeptonExecutor│ │
13
+ │ │ (Docker) │ │ (SSH+sbatch)│ │ (Cloud API) │ │
14
+ │ └──────────────┘ └──────────────┘ └──────────────┘ │
15
+ │ │ │ │ │
16
+ └───────────┼────────────────┼─────────────────┼───────────────┘
17
+ │ │ │
18
+ ▼ ▼ ▼
19
+ ┌─────────┐ ┌───────────┐ ┌────────────┐
20
+ │ Docker │ │ Slurm │ │ Lepton AI │
21
+ │ Engine │ │ Cluster │ │ Platform │
22
+ └─────────┘ └───────────┘ └────────────┘
23
+ ```
24
+
25
+ ## Local Executor (Docker)
26
+
27
+ The local executor runs evaluation containers on your local machine using Docker.
28
+
29
+ ### Prerequisites
30
+
31
+ - Docker installed and running
32
+ - `docker` command available in PATH
33
+ - GPU drivers and nvidia-container-toolkit for GPU tasks
34
+
35
+ ### Configuration
36
+
37
+ ```yaml
38
+ defaults:
39
+ - execution: local
40
+ - deployment: none
41
+ - _self_
42
+
43
+ execution:
44
+ output_dir: ./results
45
+ mode: sequential # or parallel
46
+
47
+ # Docker-specific options
48
+ docker_args:
49
+ - "--gpus=all"
50
+ - "--shm-size=16g"
51
+
52
+ # Container resource limits
53
+ memory_limit: "64g"
54
+ cpus: 8
55
+ ```
56
+
57
+ ### How It Works
58
+
59
+ 1. The launcher reads `mapping.toml` to find the container image for the task
60
+ 2. Creates run configuration and mounts volumes
61
+ 3. Executes `docker run` via subprocess
62
+ 4. Monitors stage files (`stage.pre-start`, `stage.running`, `stage.exit`)
63
+ 5. Collects results from mounted output directory
64
+
65
+ ### Example Usage
66
+
67
+ ```bash
68
+ # Simple local evaluation
69
+ nemo-evaluator-launcher run \
70
+ --config-dir . \
71
+ --config-name local_config
72
+
73
+ # With GPU allocation
74
+ nemo-evaluator-launcher run \
75
+ --config-dir . \
76
+ --config-name local_config \
77
+ -o 'execution.docker_args=["--gpus=all"]'
78
+ ```
79
+
80
+ ### Status Tracking
81
+
82
+ Status is tracked via file markers in the output directory:
83
+
84
+ | File | Meaning |
85
+ |------|---------|
86
+ | `stage.pre-start` | Container starting |
87
+ | `stage.running` | Evaluation in progress |
88
+ | `stage.exit` | Evaluation complete |
89
+
90
+ ## Slurm Executor
91
+
92
+ The Slurm executor submits evaluation jobs to HPC clusters via SSH.
93
+
94
+ ### Prerequisites
95
+
96
+ - SSH access to cluster head node
97
+ - Slurm commands available (`sbatch`, `squeue`, `sacct`)
98
+ - NGC containers accessible from compute nodes
99
+ - Shared filesystem for results
100
+
101
+ ### Configuration
102
+
103
+ ```yaml
104
+ defaults:
105
+ - execution: slurm
106
+ - deployment: vllm # or sglang, nim, none
107
+ - _self_
108
+
109
+ execution:
110
+ # SSH connection settings
111
+ hostname: cluster.example.com
112
+ username: myuser # Optional; if omitted, the user from your SSH config is used
113
+ ssh_key_path: ~/.ssh/id_rsa
114
+
115
+ # Slurm job settings
116
+ account: my_account
117
+ partition: gpu
118
+ qos: normal
119
+ nodes: 1
120
+ gpus_per_node: 8
121
+ cpus_per_task: 32
122
+ memory: "256G"
123
+ walltime: "04:00:00"
124
+
125
+ # Output settings
126
+ output_dir: /shared/nfs/results
127
+
128
+ # Container settings
129
+ container_mounts:
130
+ - "/shared/data:/data:ro"
131
+ - "/shared/models:/models:ro"
132
+ ```
133
+
134
+ ### Deployment Options
135
+
136
+ When running on Slurm, you can deploy a model alongside the evaluation. The snippets below are alternatives; use exactly one `deployment` block per configuration:
137
+
138
+ ```yaml
139
+ # vLLM deployment
140
+ deployment:
141
+ type: vllm
142
+ checkpoint_path: /models/llama-3.1-8b
143
+ tensor_parallel_size: 4
144
+ max_model_len: 8192
145
+ gpu_memory_utilization: 0.9
146
+
147
+ # SGLang deployment
148
+ deployment:
149
+ type: sglang
150
+ checkpoint_path: /models/llama-3.1-8b
151
+ tensor_parallel_size: 4
152
+
153
+ # NVIDIA NIM deployment
154
+ deployment:
155
+ type: nim
156
+ nim_model_name: meta/llama-3.1-8b-instruct
157
+ ```
158
+
159
+ ### Job Submission Flow
160
+
161
+ ```
162
+ ┌─────────────────┐
163
+ │ Launcher CLI │
164
+ └────────┬────────┘
165
+ │ SSH
166
+ ▼
167
+ ┌─────────────────┐
168
+ │ Cluster Head │
169
+ │ Node │
170
+ └────────┬────────┘
171
+ │ sbatch
172
+ ▼
173
+ ┌─────────────────┐
174
+ │ Compute Node │
175
+ │ │
176
+ │ ┌─────────────┐ │
177
+ │ │ Deployment │ │
178
+ │ │ Container │ │
179
+ │ └─────────────┘ │
180
+ │ │ │
181
+ │ ▼ │
182
+ │ ┌─────────────┐ │
183
+ │ │ Evaluation │ │
184
+ │ │ Container │ │
185
+ │ └─────────────┘ │
186
+ └─────────────────┘
187
+ ```
188
+
189
+ ### Status Queries
190
+
191
+ The Slurm executor queries job status via `sacct`:
192
+
193
+ ```bash
194
+ # The status command runs sacct to read the job's Slurm state and exit code
195
+ sacct -j <job_id> --format=JobID,State,ExitCode
196
+
197
+ # Mapped to ExecutionState:
198
+ # PENDING -> pending
199
+ # RUNNING -> running
200
+ # COMPLETED -> completed
201
+ # FAILED -> failed
202
+ # CANCELLED -> cancelled
203
+ ```
204
+
205
+ ### Long-Running Jobs
206
+
207
+ For long-running evaluations on Slurm, consider extending the walltime and enabling the caching interceptor:
208
+
209
+ ```yaml
210
+ execution:
211
+ walltime: "24:00:00" # Extended walltime
212
+ # Pair this with the caching interceptor (configured under `target` below) to resume after interruptions
213
+
214
+ target:
215
+ api_endpoint:
216
+ adapter_config:
217
+ interceptors:
218
+ - name: caching
219
+ config:
220
+ cache_dir: "/shared/cache"
221
+ reuse_cached_responses: true
222
+ ```
223
+
224
+ The caching interceptor helps resume interrupted evaluations by reusing previous API responses.
225
+
226
+ ## Lepton Executor
227
+
228
+ The Lepton executor runs evaluations on Lepton AI's cloud platform.
229
+
230
+ ### Prerequisites
231
+
232
+ - Lepton AI account
233
+ - `LEPTON_API_TOKEN` environment variable set
234
+ - `leptonai` Python package (auto-installed)
235
+
236
+ ### Configuration
237
+
238
+ ```yaml
239
+ defaults:
240
+ - execution: lepton
241
+ - deployment: none
242
+ - _self_
243
+
244
+ execution:
245
+ # Lepton job settings
246
+ resource_shape: gpu.a100-80g
247
+ num_replicas: 1
248
+
249
+ # Environment
250
+ env_vars:
251
+ NGC_API_KEY: NGC_API_KEY
252
+ HF_TOKEN: HF_TOKEN
253
+ ```
254
+
255
+ ### How It Works
256
+
257
+ 1. Launcher creates Lepton job specification
258
+ 2. Submits job via Lepton API
259
+ 3. Optionally creates endpoint for model serving
260
+ 4. Polls job status via API
261
+ 5. Retrieves results when complete
262
+
263
+ ### Endpoint Management
264
+
265
+ For evaluating Lepton-hosted models:
266
+
267
+ ```yaml
268
+ target:
269
+ api_endpoint:
270
+ type: lepton
271
+ deployment_name: my-llama-deployment
272
+ # URL auto-generated from deployment
273
+ ```
274
+
275
+ ## Backend Selection Guide
276
+
277
+ | Use Case | Recommended Backend |
278
+ |----------|-------------------|
279
+ | Quick local testing | Local |
280
+ | Large-scale batch evaluation | Slurm |
281
+ | CI/CD pipeline | Local or Lepton |
282
+ | Multi-model comparison | Slurm (parallel jobs) |
283
+ | Cloud-native workflow | Lepton |
284
+ | Self-hosted model evaluation | Local or Slurm |
285
+
286
+ ## Execution Database
287
+
288
+ All backends share the `ExecutionDB` for tracking jobs:
289
+
290
+ ```
291
+ ┌─────────────────────────────────────────────┐
292
+ │ ExecutionDB (SQLite) │
293
+ │ │
294
+ │ invocation_id │ job_id │ status │ backend │
295
+ │ ───────────────────────────────────────── │
296
+ │ inv_abc123 │ 12345 │ running │ slurm │
297
+ │ inv_def456 │ cont_1 │ done │ local │
298
+ └─────────────────────────────────────────────┘
299
+ ```
300
+
301
+ Query via CLI:
302
+
303
+ ```bash
304
+ # List all invocations
305
+ nemo-evaluator-launcher ls runs
306
+
307
+ # Get specific invocation
308
+ nemo-evaluator-launcher info <invocation_id>
309
+ ```
310
+
311
+ ## Troubleshooting
312
+
313
+ ### Local Executor
314
+
315
+ **Issue: Docker permission denied**
316
+ ```bash
317
+ sudo usermod -aG docker $USER
318
+ newgrp docker
319
+ ```
320
+
321
+ **Issue: GPU not available in container**
322
+ ```bash
323
+ # Install nvidia-container-toolkit
324
+ sudo apt-get install nvidia-container-toolkit
325
+ sudo systemctl restart docker
326
+ ```
327
+
328
+ ### Slurm Executor
329
+
330
+ **Issue: SSH connection fails**
331
+ ```bash
332
+ # Test SSH connection
333
+ ssh -v cluster.example.com
334
+
335
+ # Check SSH key permissions
336
+ chmod 600 ~/.ssh/id_rsa
337
+ ```
338
+
339
+ **Issue: Job stuck in pending**
340
+ ```bash
341
+ # Check queue status
342
+ squeue -u $USER
343
+
344
+ # Check account limits
345
+ sacctmgr show associations user=$USER
346
+ ```
347
+
348
+ ### Lepton Executor
349
+
350
+ **Issue: API token invalid**
351
+ ```bash
352
+ # Verify token
353
+ curl -H "Authorization: Bearer $LEPTON_API_TOKEN" \
354
+ https://api.lepton.ai/v1/jobs
355
+ ```
356
+
357
+ **Issue: Resource shape unavailable**
358
+ ```bash
359
+ # List available shapes
360
+ lepton shape list
361
+ ```