@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,494 @@
|
|
|
1
|
+
# Modal Troubleshooting Guide
|
|
2
|
+
|
|
3
|
+
## Installation Issues
|
|
4
|
+
|
|
5
|
+
### Authentication fails
|
|
6
|
+
|
|
7
|
+
**Error**: `modal setup` doesn't complete or token is invalid
|
|
8
|
+
|
|
9
|
+
**Solutions**:
|
|
10
|
+
```bash
|
|
11
|
+
# Re-authenticate
|
|
12
|
+
modal token new
|
|
13
|
+
|
|
14
|
+
# Check current token
|
|
15
|
+
modal config show
|
|
16
|
+
|
|
17
|
+
# Set token via environment
|
|
18
|
+
export MODAL_TOKEN_ID=ak-...
|
|
19
|
+
export MODAL_TOKEN_SECRET=as-...
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Package installation issues
|
|
23
|
+
|
|
24
|
+
**Error**: `pip install modal` fails
|
|
25
|
+
|
|
26
|
+
**Solutions**:
|
|
27
|
+
```bash
|
|
28
|
+
# Upgrade pip
|
|
29
|
+
pip install --upgrade pip
|
|
30
|
+
|
|
31
|
+
# Install with specific Python version
|
|
32
|
+
python3.11 -m pip install modal
|
|
33
|
+
|
|
34
|
+
# Prefer prebuilt wheels over building from source
|
|
35
|
+
pip install modal --prefer-binary
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Container Image Issues
|
|
39
|
+
|
|
40
|
+
### Image build fails
|
|
41
|
+
|
|
42
|
+
**Error**: `ImageBuilderError: Failed to build image`
|
|
43
|
+
|
|
44
|
+
**Solutions**:
|
|
45
|
+
```python
|
|
46
|
+
# Pin package versions to avoid conflicts
|
|
47
|
+
image = modal.Image.debian_slim().pip_install(
|
|
48
|
+
"torch==2.1.0",
|
|
49
|
+
"transformers==4.36.0", # Pin versions
|
|
50
|
+
"accelerate==0.25.0"
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Use compatible CUDA versions
|
|
54
|
+
image = modal.Image.from_registry(
|
|
55
|
+
"nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04", # Match PyTorch CUDA
|
|
56
|
+
add_python="3.11"
|
|
57
|
+
)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Dependency conflicts
|
|
61
|
+
|
|
62
|
+
**Error**: `ERROR: Cannot install package due to conflicting dependencies`
|
|
63
|
+
|
|
64
|
+
**Solutions**:
|
|
65
|
+
```python
|
|
66
|
+
# Layer dependencies separately
|
|
67
|
+
base = modal.Image.debian_slim().pip_install("torch")
|
|
68
|
+
ml = base.pip_install("transformers") # Install after torch
|
|
69
|
+
|
|
70
|
+
# Use uv for better resolution
|
|
71
|
+
image = modal.Image.debian_slim().uv_pip_install(
|
|
72
|
+
"torch", "transformers"
|
|
73
|
+
)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Large image builds time out
|
|
77
|
+
|
|
78
|
+
**Error**: Image build exceeds time limit
|
|
79
|
+
|
|
80
|
+
**Solutions**:
|
|
81
|
+
```python
|
|
82
|
+
# Split into multiple layers (better caching)
|
|
83
|
+
base = modal.Image.debian_slim().pip_install("torch") # Cached
|
|
84
|
+
ml = base.pip_install("transformers", "datasets") # Cached
|
|
85
|
+
app = ml.copy_local_dir("./src", "/app") # Rebuilds on code change
|
|
86
|
+
|
|
87
|
+
# Download models during build, not runtime
|
|
88
|
+
image = modal.Image.debian_slim().pip_install("transformers").run_commands(
|
|
89
|
+
"python -c 'from transformers import AutoModel; AutoModel.from_pretrained(\"bert-base-uncased\")'"
|
|
90
|
+
)
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## GPU Issues
|
|
94
|
+
|
|
95
|
+
### GPU not available
|
|
96
|
+
|
|
97
|
+
**Error**: `RuntimeError: CUDA not available`
|
|
98
|
+
|
|
99
|
+
**Solutions**:
|
|
100
|
+
```python
|
|
101
|
+
# Ensure GPU is specified
|
|
102
|
+
@app.function(gpu="T4") # Must specify GPU
|
|
103
|
+
def my_function():
|
|
104
|
+
import torch
|
|
105
|
+
assert torch.cuda.is_available()
|
|
106
|
+
|
|
107
|
+
# Check CUDA compatibility in image
|
|
108
|
+
image = modal.Image.from_registry(
|
|
109
|
+
"nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04",
|
|
110
|
+
add_python="3.11"
|
|
111
|
+
).pip_install(
|
|
112
|
+
"torch",
|
|
113
|
+
index_url="https://download.pytorch.org/whl/cu121" # Match CUDA
|
|
114
|
+
)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### GPU out of memory
|
|
118
|
+
|
|
119
|
+
**Error**: `torch.cuda.OutOfMemoryError: CUDA out of memory`
|
|
120
|
+
|
|
121
|
+
**Solutions**:
|
|
122
|
+
```python
|
|
123
|
+
# Use larger GPU
|
|
124
|
+
@app.function(gpu="A100-80GB") # More VRAM
|
|
125
|
+
def train():
|
|
126
|
+
pass
|
|
127
|
+
|
|
128
|
+
# Enable memory optimization
|
|
129
|
+
@app.function(gpu="A100")
|
|
130
|
+
def memory_optimized():
|
|
131
|
+
import torch
|
|
132
|
+
torch.backends.cuda.enable_flash_sdp(True)
|
|
133
|
+
|
|
134
|
+
# Use gradient checkpointing
|
|
135
|
+
model.gradient_checkpointing_enable()
|
|
136
|
+
|
|
137
|
+
# Mixed precision
|
|
138
|
+
with torch.autocast(device_type="cuda", dtype=torch.float16):
|
|
139
|
+
outputs = model(**inputs)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Wrong GPU allocated
|
|
143
|
+
|
|
144
|
+
**Error**: Got different GPU than requested
|
|
145
|
+
|
|
146
|
+
**Solutions**:
|
|
147
|
+
```python
|
|
148
|
+
# Use strict GPU selection
|
|
149
|
+
@app.function(gpu="H100!") # H100! prevents auto-upgrade to H200
|
|
150
|
+
|
|
151
|
+
# Specify exact memory variant
|
|
152
|
+
@app.function(gpu="A100-80GB") # Not just "A100"
|
|
153
|
+
|
|
154
|
+
# Check GPU at runtime
|
|
155
|
+
@app.function(gpu="A100")
|
|
156
|
+
def check_gpu():
|
|
157
|
+
import subprocess
|
|
158
|
+
result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
|
|
159
|
+
print(result.stdout)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Cold Start Issues
|
|
163
|
+
|
|
164
|
+
### Slow cold starts
|
|
165
|
+
|
|
166
|
+
**Problem**: First request takes too long
|
|
167
|
+
|
|
168
|
+
**Solutions**:
|
|
169
|
+
```python
|
|
170
|
+
# Keep containers warm
|
|
171
|
+
@app.function(
|
|
172
|
+
container_idle_timeout=600, # Keep warm 10 min
|
|
173
|
+
keep_warm=1 # Always keep 1 container ready
|
|
174
|
+
)
|
|
175
|
+
def low_latency():
|
|
176
|
+
pass
|
|
177
|
+
|
|
178
|
+
# Load model during container start
|
|
179
|
+
@app.cls(gpu="A100")
|
|
180
|
+
class Model:
|
|
181
|
+
@modal.enter()
|
|
182
|
+
def load(self):
|
|
183
|
+
# This runs once at container start, not per request
|
|
184
|
+
self.model = load_heavy_model()
|
|
185
|
+
|
|
186
|
+
# Cache model in volume
|
|
187
|
+
volume = modal.Volume.from_name("models", create_if_missing=True)
|
|
188
|
+
|
|
189
|
+
@app.function(volumes={"/cache": volume})
|
|
190
|
+
def cached_model():
|
|
191
|
+
if os.path.exists("/cache/model"):
|
|
192
|
+
model = load_from_disk("/cache/model")
|
|
193
|
+
else:
|
|
194
|
+
model = download_model()
|
|
195
|
+
save_to_disk(model, "/cache/model")
|
|
196
|
+
volume.commit()
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Container keeps restarting
|
|
200
|
+
|
|
201
|
+
**Problem**: Containers are killed and restarted frequently
|
|
202
|
+
|
|
203
|
+
**Solutions**:
|
|
204
|
+
```python
|
|
205
|
+
# Increase memory
|
|
206
|
+
@app.function(memory=32768) # 32GB RAM
|
|
207
|
+
def memory_heavy():
|
|
208
|
+
pass
|
|
209
|
+
|
|
210
|
+
# Increase timeout
|
|
211
|
+
@app.function(timeout=3600) # 1 hour
|
|
212
|
+
def long_running():
|
|
213
|
+
pass
|
|
214
|
+
|
|
215
|
+
# Handle signals gracefully
|
|
216
|
+
import signal
|
|
217
|
+
|
|
218
|
+
def handler(signum, frame):
|
|
219
|
+
cleanup()
|
|
220
|
+
exit(0)
|
|
221
|
+
|
|
222
|
+
signal.signal(signal.SIGTERM, handler)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
## Volume Issues
|
|
226
|
+
|
|
227
|
+
### Volume changes not persisting
|
|
228
|
+
|
|
229
|
+
**Error**: Data written to volume disappears
|
|
230
|
+
|
|
231
|
+
**Solutions**:
|
|
232
|
+
```python
|
|
233
|
+
volume = modal.Volume.from_name("my-volume", create_if_missing=True)
|
|
234
|
+
|
|
235
|
+
@app.function(volumes={"/data": volume})
|
|
236
|
+
def write_data():
|
|
237
|
+
with open("/data/file.txt", "w") as f:
|
|
238
|
+
f.write("data")
|
|
239
|
+
|
|
240
|
+
# CRITICAL: Commit changes!
|
|
241
|
+
volume.commit()
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### Volume read shows stale data
|
|
245
|
+
|
|
246
|
+
**Problem**: Reads from the volume return outdated data
|
|
247
|
+
|
|
248
|
+
**Solutions**:
|
|
249
|
+
```python
|
|
250
|
+
@app.function(volumes={"/data": volume})
|
|
251
|
+
def read_data():
|
|
252
|
+
# Reload to get latest
|
|
253
|
+
volume.reload()
|
|
254
|
+
|
|
255
|
+
with open("/data/file.txt", "r") as f:
|
|
256
|
+
return f.read()
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Volume mount fails
|
|
260
|
+
|
|
261
|
+
**Error**: `VolumeError: Failed to mount volume`
|
|
262
|
+
|
|
263
|
+
**Solutions**:
|
|
264
|
+
```python
|
|
265
|
+
# Ensure volume exists
|
|
266
|
+
volume = modal.Volume.from_name("my-volume", create_if_missing=True)
|
|
267
|
+
|
|
268
|
+
# Use absolute path
|
|
269
|
+
@app.function(volumes={"/data": volume}) # Not "./data"
|
|
270
|
+
def my_function():
|
|
271
|
+
pass
|
|
272
|
+
|
|
273
|
+
# Check that the volume exists (in the dashboard, or via the CLI):
|
|
274
|
+
# modal volume list
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
## Web Endpoint Issues
|
|
278
|
+
|
|
279
|
+
### Endpoint returns 502
|
|
280
|
+
|
|
281
|
+
**Error**: Bad gateway (502) or gateway timeout (504)
|
|
282
|
+
|
|
283
|
+
**Solutions**:
|
|
284
|
+
```python
|
|
285
|
+
# Increase timeout
|
|
286
|
+
@app.function(timeout=300) # 5 min
|
|
287
|
+
@modal.web_endpoint()
|
|
288
|
+
def slow_endpoint():
|
|
289
|
+
pass
|
|
290
|
+
|
|
291
|
+
# Return streaming response for long operations
|
|
292
|
+
from fastapi.responses import StreamingResponse
|
|
293
|
+
|
|
294
|
+
@app.function()
|
|
295
|
+
@modal.asgi_app()
|
|
296
|
+
def streaming_app():
|
|
297
|
+
async def generate():
|
|
298
|
+
for i in range(100):
|
|
299
|
+
yield f"data: {i}\n\n"
|
|
300
|
+
await process_chunk(i)
|
|
301
|
+
return StreamingResponse(generate(), media_type="text/event-stream")
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### Endpoint not accessible
|
|
305
|
+
|
|
306
|
+
**Error**: 404 or cannot reach endpoint
|
|
307
|
+
|
|
308
|
+
**Solutions**:
|
|
309
|
+
```bash
|
|
310
|
+
# Check deployment status
|
|
311
|
+
modal app list
|
|
312
|
+
|
|
313
|
+
# Redeploy
|
|
314
|
+
modal deploy my_app.py
|
|
315
|
+
|
|
316
|
+
# Check logs
|
|
317
|
+
modal app logs my-app
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
### CORS errors
|
|
321
|
+
|
|
322
|
+
**Error**: Cross-origin request blocked
|
|
323
|
+
|
|
324
|
+
**Solutions**:
|
|
325
|
+
```python
|
|
326
|
+
from fastapi import FastAPI
|
|
327
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
328
|
+
|
|
329
|
+
web_app = FastAPI()
|
|
330
|
+
web_app.add_middleware(
|
|
331
|
+
CORSMiddleware,
|
|
332
|
+
allow_origins=["*"],
|
|
333
|
+
allow_credentials=True,
|
|
334
|
+
allow_methods=["*"],
|
|
335
|
+
allow_headers=["*"],
|
|
336
|
+
)
|
|
337
|
+
|
|
338
|
+
@app.function()
|
|
339
|
+
@modal.asgi_app()
|
|
340
|
+
def cors_enabled():
|
|
341
|
+
return web_app
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
## Secret Issues
|
|
345
|
+
|
|
346
|
+
### Secret not found
|
|
347
|
+
|
|
348
|
+
**Error**: `SecretNotFound: Secret 'my-secret' not found`
|
|
349
|
+
|
|
350
|
+
**Solutions**:
|
|
351
|
+
```bash
|
|
352
|
+
# Create secret via CLI
|
|
353
|
+
modal secret create my-secret KEY=value
|
|
354
|
+
|
|
355
|
+
# List secrets
|
|
356
|
+
modal secret list
|
|
357
|
+
|
|
358
|
+
# Check secret name matches exactly
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
### Secret value not accessible
|
|
362
|
+
|
|
363
|
+
**Error**: Environment variable is empty
|
|
364
|
+
|
|
365
|
+
**Solutions**:
|
|
366
|
+
```python
|
|
367
|
+
# Ensure secret is attached
|
|
368
|
+
@app.function(secrets=[modal.Secret.from_name("my-secret")])
|
|
369
|
+
def use_secret():
|
|
370
|
+
import os
|
|
371
|
+
value = os.environ.get("KEY") # Use get() to handle missing
|
|
372
|
+
if not value:
|
|
373
|
+
raise ValueError("KEY not set in secret")
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
## Scheduling Issues
|
|
377
|
+
|
|
378
|
+
### Scheduled job not running
|
|
379
|
+
|
|
380
|
+
**Error**: Cron job doesn't execute
|
|
381
|
+
|
|
382
|
+
**Solutions**:
|
|
383
|
+
```python
|
|
384
|
+
# Verify cron syntax
|
|
385
|
+
@app.function(schedule=modal.Cron("0 0 * * *")) # Daily at midnight UTC
|
|
386
|
+
def daily_job():
|
|
387
|
+
pass
|
|
388
|
+
|
|
389
|
+
# Check timezone (Modal uses UTC)
|
|
390
|
+
# "0 8 * * *" = 8am UTC, not local time
|
|
391
|
+
|
|
392
|
+
# Ensure app is deployed
|
|
393
|
+
# modal deploy my_app.py
|
|
394
|
+
```
|
|
395
|
+
|
|
396
|
+
### Job runs multiple times
|
|
397
|
+
|
|
398
|
+
**Problem**: Scheduled job executes more than expected
|
|
399
|
+
|
|
400
|
+
**Solutions**:
|
|
401
|
+
```python
|
|
402
|
+
# Implement idempotency
|
|
403
|
+
@app.function(schedule=modal.Cron("0 * * * *"))
|
|
404
|
+
def hourly_job():
|
|
405
|
+
job_id = get_current_hour_id()
|
|
406
|
+
if already_processed(job_id):
|
|
407
|
+
return
|
|
408
|
+
process()
|
|
409
|
+
mark_processed(job_id)
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
## Debugging Tips
|
|
413
|
+
|
|
414
|
+
### Enable debug logging
|
|
415
|
+
|
|
416
|
+
```python
|
|
417
|
+
import logging
|
|
418
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
419
|
+
|
|
420
|
+
@app.function()
|
|
421
|
+
def debug_function():
|
|
422
|
+
logging.debug("Debug message")
|
|
423
|
+
logging.info("Info message")
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
### View container logs
|
|
427
|
+
|
|
428
|
+
```bash
|
|
429
|
+
# Stream logs
|
|
430
|
+
modal app logs my-app
|
|
431
|
+
|
|
432
|
+
# View specific function
|
|
433
|
+
modal app logs my-app --function my_function
|
|
434
|
+
|
|
435
|
+
# View historical logs
|
|
436
|
+
modal app logs my-app --since 1h
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
### Test locally
|
|
440
|
+
|
|
441
|
+
```python
|
|
442
|
+
# Run function locally without Modal
|
|
443
|
+
if __name__ == "__main__":
|
|
444
|
+
result = my_function.local() # Runs on your machine
|
|
445
|
+
print(result)
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### Inspect container
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
@app.function(gpu="T4")
|
|
452
|
+
def debug_environment():
|
|
453
|
+
import subprocess
|
|
454
|
+
import sys
|
|
455
|
+
|
|
456
|
+
# System info
|
|
457
|
+
print(f"Python: {sys.version}")
|
|
458
|
+
print(subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout)
|
|
459
|
+
print(subprocess.run(["pip", "list"], capture_output=True, text=True).stdout)
|
|
460
|
+
|
|
461
|
+
# CUDA info
|
|
462
|
+
import torch
|
|
463
|
+
print(f"CUDA available: {torch.cuda.is_available()}")
|
|
464
|
+
print(f"CUDA version: {torch.version.cuda}")
|
|
465
|
+
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
|
466
|
+
```
|
|
467
|
+
|
|
468
|
+
## Common Error Messages
|
|
469
|
+
|
|
470
|
+
| Error | Cause | Solution |
|
|
471
|
+
|-------|-------|----------|
|
|
472
|
+
| `FunctionTimeoutError` | Function exceeded timeout | Increase `timeout` parameter |
|
|
473
|
+
| `ContainerMemoryExceeded` | OOM killed | Increase `memory` parameter |
|
|
474
|
+
| `ImageBuilderError` | Build failed | Check dependencies, pin versions |
|
|
475
|
+
| `ResourceExhausted` | No GPUs available | Use GPU fallbacks, try later |
|
|
476
|
+
| `AuthenticationError` | Invalid token | Run `modal token new` |
|
|
477
|
+
| `VolumeNotFound` | Volume doesn't exist | Use `create_if_missing=True` |
|
|
478
|
+
| `SecretNotFound` | Secret doesn't exist | Create secret via CLI |
|
|
479
|
+
|
|
480
|
+
## Getting Help
|
|
481
|
+
|
|
482
|
+
1. **Documentation**: https://modal.com/docs
|
|
483
|
+
2. **Examples**: https://github.com/modal-labs/modal-examples
|
|
484
|
+
3. **Discord**: https://discord.gg/modal
|
|
485
|
+
4. **Status**: https://status.modal.com
|
|
486
|
+
|
|
487
|
+
### Reporting Issues
|
|
488
|
+
|
|
489
|
+
Include:
|
|
490
|
+
- Modal client version: `modal --version`
|
|
491
|
+
- Python version: `python --version`
|
|
492
|
+
- Full error traceback
|
|
493
|
+
- Minimal reproducible code
|
|
494
|
+
- GPU type if relevant
|