@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,704 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: mlflow
|
|
3
|
+
description: Track ML experiments, manage model registry with versioning, deploy models to production, and reproduce experiments with MLflow - framework-agnostic ML lifecycle platform
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [MLOps, MLflow, Experiment Tracking, Model Registry, ML Lifecycle, Deployment, Model Versioning, PyTorch, TensorFlow, Scikit-Learn, HuggingFace]
|
|
8
|
+
dependencies: [mlflow, sqlalchemy, boto3]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# MLflow: ML Lifecycle Management Platform
|
|
12
|
+
|
|
13
|
+
## When to Use This Skill
|
|
14
|
+
|
|
15
|
+
Use MLflow when you need to:
|
|
16
|
+
- **Track ML experiments** with parameters, metrics, and artifacts
|
|
17
|
+
- **Manage model registry** with versioning and stage transitions
|
|
18
|
+
- **Deploy models** to various platforms (local, cloud, serving)
|
|
19
|
+
- **Reproduce experiments** with project configurations
|
|
20
|
+
- **Compare model versions** and performance metrics
|
|
21
|
+
- **Collaborate** on ML projects with team workflows
|
|
22
|
+
- **Integrate** with any ML framework (framework-agnostic)
|
|
23
|
+
|
|
24
|
+
**Users**: 20,000+ organizations | **GitHub Stars**: 23k+ | **MLflow License**: Apache 2.0
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Install MLflow
|
|
30
|
+
pip install mlflow
|
|
31
|
+
|
|
32
|
+
# Install with extras
|
|
33
|
+
pip install "mlflow[extras]" # Includes SQLAlchemy, boto3, etc. (quoted so shells like zsh do not glob the brackets)
|
|
34
|
+
|
|
35
|
+
# Start MLflow UI
|
|
36
|
+
mlflow ui
|
|
37
|
+
|
|
38
|
+
# Access at http://localhost:5000
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
### Basic Tracking
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import mlflow
|
|
47
|
+
|
|
48
|
+
# Start a run
|
|
49
|
+
with mlflow.start_run():
|
|
50
|
+
# Log parameters
|
|
51
|
+
mlflow.log_param("learning_rate", 0.001)
|
|
52
|
+
mlflow.log_param("batch_size", 32)
|
|
53
|
+
|
|
54
|
+
# Your training code
|
|
55
|
+
model = train_model()
|
|
56
|
+
|
|
57
|
+
# Log metrics
|
|
58
|
+
mlflow.log_metric("train_loss", 0.15)
|
|
59
|
+
mlflow.log_metric("val_accuracy", 0.92)
|
|
60
|
+
|
|
61
|
+
# Log model
|
|
62
|
+
mlflow.sklearn.log_model(model, "model")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Autologging (Automatic Tracking)
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
import mlflow
|
|
69
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
70
|
+
|
|
71
|
+
# Enable autologging
|
|
72
|
+
mlflow.autolog()
|
|
73
|
+
|
|
74
|
+
# Train (automatically logged)
|
|
75
|
+
model = RandomForestClassifier(n_estimators=100, max_depth=5)
|
|
76
|
+
model.fit(X_train, y_train)
|
|
77
|
+
|
|
78
|
+
# Metrics, parameters, and model logged automatically!
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Core Concepts
|
|
82
|
+
|
|
83
|
+
### 1. Experiments and Runs
|
|
84
|
+
|
|
85
|
+
**Experiment**: Logical container for related runs
|
|
86
|
+
**Run**: Single execution of ML code (parameters, metrics, artifacts)
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
import mlflow
|
|
90
|
+
|
|
91
|
+
# Create/set experiment
|
|
92
|
+
mlflow.set_experiment("my-experiment")
|
|
93
|
+
|
|
94
|
+
# Start a run
|
|
95
|
+
with mlflow.start_run(run_name="baseline-model"):
|
|
96
|
+
# Log params
|
|
97
|
+
mlflow.log_param("model", "ResNet50")
|
|
98
|
+
mlflow.log_param("epochs", 10)
|
|
99
|
+
|
|
100
|
+
# Train
|
|
101
|
+
model = train()
|
|
102
|
+
|
|
103
|
+
# Log metrics
|
|
104
|
+
mlflow.log_metric("accuracy", 0.95)
|
|
105
|
+
|
|
106
|
+
# Log model
|
|
107
|
+
mlflow.pytorch.log_model(model, "model")
|
|
108
|
+
|
|
109
|
+
# Run ID is automatically generated
|
|
110
|
+
print(f"Run ID: {mlflow.active_run().info.run_id}")
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### 2. Logging Parameters
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
with mlflow.start_run():
|
|
117
|
+
# Single parameter
|
|
118
|
+
mlflow.log_param("learning_rate", 0.001)
|
|
119
|
+
|
|
120
|
+
# Multiple parameters
|
|
121
|
+
mlflow.log_params({
|
|
122
|
+
"batch_size": 32,
|
|
123
|
+
"epochs": 50,
|
|
124
|
+
"optimizer": "Adam",
|
|
125
|
+
"dropout": 0.2
|
|
126
|
+
})
|
|
127
|
+
|
|
128
|
+
# Nested parameters (as dict)
|
|
129
|
+
config = {
|
|
130
|
+
"model": {
|
|
131
|
+
"architecture": "ResNet50",
|
|
132
|
+
"pretrained": True
|
|
133
|
+
},
|
|
134
|
+
"training": {
|
|
135
|
+
"lr": 0.001,
|
|
136
|
+
"weight_decay": 1e-4
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
# Log each top-level section as one stringified param (use json.dumps for valid JSON, or flatten the dict to log individual params)
|
|
141
|
+
for key, value in config.items():
|
|
142
|
+
mlflow.log_param(key, str(value))
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 3. Logging Metrics
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
with mlflow.start_run():
|
|
149
|
+
# Training loop
|
|
150
|
+
for epoch in range(NUM_EPOCHS):
|
|
151
|
+
train_loss = train_epoch()
|
|
152
|
+
val_loss = validate()
|
|
153
|
+
|
|
154
|
+
# Log metrics at each step
|
|
155
|
+
mlflow.log_metric("train_loss", train_loss, step=epoch)
|
|
156
|
+
mlflow.log_metric("val_loss", val_loss, step=epoch)
|
|
157
|
+
|
|
158
|
+
# Log multiple metrics
|
|
159
|
+
mlflow.log_metrics({
|
|
160
|
+
"train_accuracy": train_acc,
|
|
161
|
+
"val_accuracy": val_acc
|
|
162
|
+
}, step=epoch)
|
|
163
|
+
|
|
164
|
+
# Log final metrics (no step)
|
|
165
|
+
mlflow.log_metric("final_accuracy", final_acc)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### 4. Logging Artifacts
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
with mlflow.start_run():
|
|
172
|
+
# Log file
|
|
173
|
+
model.save('model.pkl')
|
|
174
|
+
mlflow.log_artifact('model.pkl')
|
|
175
|
+
|
|
176
|
+
# Log directory
|
|
177
|
+
os.makedirs('plots', exist_ok=True)
|
|
178
|
+
plt.savefig('plots/loss_curve.png')
|
|
179
|
+
mlflow.log_artifacts('plots')
|
|
180
|
+
|
|
181
|
+
# Log text
|
|
182
|
+
with open('config.txt', 'w') as f:
|
|
183
|
+
f.write(str(config))
|
|
184
|
+
mlflow.log_artifact('config.txt')
|
|
185
|
+
|
|
186
|
+
# Log dict as JSON
|
|
187
|
+
mlflow.log_dict({'config': config}, 'config.json')
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
### 5. Logging Models
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
# PyTorch
|
|
194
|
+
import mlflow.pytorch
|
|
195
|
+
|
|
196
|
+
with mlflow.start_run():
|
|
197
|
+
model = train_pytorch_model()
|
|
198
|
+
mlflow.pytorch.log_model(model, "model")
|
|
199
|
+
|
|
200
|
+
# Scikit-learn
|
|
201
|
+
import mlflow.sklearn
|
|
202
|
+
|
|
203
|
+
with mlflow.start_run():
|
|
204
|
+
model = train_sklearn_model()
|
|
205
|
+
mlflow.sklearn.log_model(model, "model")
|
|
206
|
+
|
|
207
|
+
# Keras/TensorFlow
|
|
208
|
+
import mlflow.keras
|
|
209
|
+
|
|
210
|
+
with mlflow.start_run():
|
|
211
|
+
model = train_keras_model()
|
|
212
|
+
mlflow.keras.log_model(model, "model")
|
|
213
|
+
|
|
214
|
+
# HuggingFace Transformers
|
|
215
|
+
import mlflow.transformers
|
|
216
|
+
|
|
217
|
+
with mlflow.start_run():
|
|
218
|
+
mlflow.transformers.log_model(
|
|
219
|
+
transformers_model={
|
|
220
|
+
"model": model,
|
|
221
|
+
"tokenizer": tokenizer
|
|
222
|
+
},
|
|
223
|
+
artifact_path="model"
|
|
224
|
+
)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
## Autologging
|
|
228
|
+
|
|
229
|
+
Automatically log metrics, parameters, and models for popular frameworks.
|
|
230
|
+
|
|
231
|
+
### Enable Autologging
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
import mlflow
|
|
235
|
+
|
|
236
|
+
# Enable for all supported frameworks
|
|
237
|
+
mlflow.autolog()
|
|
238
|
+
|
|
239
|
+
# Or enable for specific framework
|
|
240
|
+
mlflow.sklearn.autolog()
|
|
241
|
+
mlflow.pytorch.autolog()
|
|
242
|
+
mlflow.keras.autolog()
|
|
243
|
+
mlflow.xgboost.autolog()
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### Autologging with Scikit-learn
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
import mlflow
|
|
250
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
251
|
+
from sklearn.model_selection import train_test_split
|
|
252
|
+
|
|
253
|
+
# Enable autologging
|
|
254
|
+
mlflow.sklearn.autolog()
|
|
255
|
+
|
|
256
|
+
# Split data
|
|
257
|
+
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
|
|
258
|
+
|
|
259
|
+
# Train (automatically logs params, metrics, model)
|
|
260
|
+
with mlflow.start_run():
|
|
261
|
+
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
|
|
262
|
+
model.fit(X_train, y_train)
|
|
263
|
+
|
|
264
|
+
# Training-set metrics (e.g. training_accuracy_score, training_f1_score) logged automatically
|
|
265
|
+
# Model logged automatically
|
|
266
|
+
# Training duration logged
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
### Autologging with PyTorch Lightning
|
|
270
|
+
|
|
271
|
+
```python
|
|
272
|
+
import mlflow
|
|
273
|
+
import pytorch_lightning as pl
|
|
274
|
+
|
|
275
|
+
# Enable autologging
|
|
276
|
+
mlflow.pytorch.autolog()
|
|
277
|
+
|
|
278
|
+
# Train
|
|
279
|
+
with mlflow.start_run():
|
|
280
|
+
trainer = pl.Trainer(max_epochs=10)
|
|
281
|
+
trainer.fit(model, datamodule=dm)
|
|
282
|
+
|
|
283
|
+
# Hyperparameters logged
|
|
284
|
+
# Training metrics logged
|
|
285
|
+
# Best model checkpoint logged
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Model Registry
|
|
289
|
+
|
|
290
|
+
Manage model lifecycle with versioning and stage transitions.
|
|
291
|
+
|
|
292
|
+
### Register Model
|
|
293
|
+
|
|
294
|
+
```python
|
|
295
|
+
import mlflow
|
|
296
|
+
|
|
297
|
+
# Log and register model
|
|
298
|
+
with mlflow.start_run():
|
|
299
|
+
model = train_model()
|
|
300
|
+
|
|
301
|
+
# Log model
|
|
302
|
+
mlflow.sklearn.log_model(
|
|
303
|
+
model,
|
|
304
|
+
"model",
|
|
305
|
+
registered_model_name="my-classifier" # Register immediately
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
# Or register later
|
|
309
|
+
run_id = "abc123"
|
|
310
|
+
model_uri = f"runs:/{run_id}/model"
|
|
311
|
+
mlflow.register_model(model_uri, "my-classifier")
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Model Stages
|
|
315
|
+
|
|
316
|
+
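Transition model versions between stages: **None** → **Staging** → **Production** → **Archived**. Note: model stages are deprecated since MLflow 2.9 in favor of model version aliases and tags, so prefer aliases for new projects.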
|
|
317
|
+
|
|
318
|
+
```python
|
|
319
|
+
from mlflow.tracking import MlflowClient
|
|
320
|
+
|
|
321
|
+
client = MlflowClient()
|
|
322
|
+
|
|
323
|
+
# Promote to staging
|
|
324
|
+
client.transition_model_version_stage(
|
|
325
|
+
name="my-classifier",
|
|
326
|
+
version=3,
|
|
327
|
+
stage="Staging"
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
# Promote to production
|
|
331
|
+
client.transition_model_version_stage(
|
|
332
|
+
name="my-classifier",
|
|
333
|
+
version=3,
|
|
334
|
+
stage="Production",
|
|
335
|
+
archive_existing_versions=True # Archive old production versions
|
|
336
|
+
)
|
|
337
|
+
|
|
338
|
+
# Archive model
|
|
339
|
+
client.transition_model_version_stage(
|
|
340
|
+
name="my-classifier",
|
|
341
|
+
version=2,
|
|
342
|
+
stage="Archived"
|
|
343
|
+
)
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
### Load Model from Registry
|
|
347
|
+
|
|
348
|
+
```python
|
|
349
|
+
import mlflow.pyfunc
|
|
350
|
+
|
|
351
|
+
# Load latest production model
|
|
352
|
+
model = mlflow.pyfunc.load_model("models:/my-classifier/Production")
|
|
353
|
+
|
|
354
|
+
# Load specific version
|
|
355
|
+
model = mlflow.pyfunc.load_model("models:/my-classifier/3")
|
|
356
|
+
|
|
357
|
+
# Load from staging
|
|
358
|
+
model = mlflow.pyfunc.load_model("models:/my-classifier/Staging")
|
|
359
|
+
|
|
360
|
+
# Use model
|
|
361
|
+
predictions = model.predict(X_test)
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
### Model Versioning
|
|
365
|
+
|
|
366
|
+
```python
|
|
367
|
+
client = MlflowClient()
|
|
368
|
+
|
|
369
|
+
# List all versions
|
|
370
|
+
versions = client.search_model_versions("name='my-classifier'")
|
|
371
|
+
|
|
372
|
+
for v in versions:
|
|
373
|
+
print(f"Version {v.version}: {v.current_stage}")
|
|
374
|
+
|
|
375
|
+
# Get latest version by stage
|
|
376
|
+
latest_prod = client.get_latest_versions("my-classifier", stages=["Production"])
|
|
377
|
+
latest_staging = client.get_latest_versions("my-classifier", stages=["Staging"])
|
|
378
|
+
|
|
379
|
+
# Get model version details
|
|
380
|
+
version_info = client.get_model_version(name="my-classifier", version="3")
|
|
381
|
+
print(f"Run ID: {version_info.run_id}")
|
|
382
|
+
print(f"Stage: {version_info.current_stage}")
|
|
383
|
+
print(f"Tags: {version_info.tags}")
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
### Model Annotations
|
|
387
|
+
|
|
388
|
+
```python
|
|
389
|
+
client = MlflowClient()
|
|
390
|
+
|
|
391
|
+
# Add description
|
|
392
|
+
client.update_model_version(
|
|
393
|
+
name="my-classifier",
|
|
394
|
+
version="3",
|
|
395
|
+
description="ResNet50 classifier trained on 1M images with 95% accuracy"
|
|
396
|
+
)
|
|
397
|
+
|
|
398
|
+
# Add tags
|
|
399
|
+
client.set_model_version_tag(
|
|
400
|
+
name="my-classifier",
|
|
401
|
+
version="3",
|
|
402
|
+
key="validation_status",
|
|
403
|
+
value="approved"
|
|
404
|
+
)
|
|
405
|
+
|
|
406
|
+
client.set_model_version_tag(
|
|
407
|
+
name="my-classifier",
|
|
408
|
+
version="3",
|
|
409
|
+
key="deployed_date",
|
|
410
|
+
value="2025-01-15"
|
|
411
|
+
)
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
## Searching Runs
|
|
415
|
+
|
|
416
|
+
Find runs programmatically.
|
|
417
|
+
|
|
418
|
+
```python
|
|
419
|
+
from mlflow.tracking import MlflowClient
|
|
420
|
+
|
|
421
|
+
client = MlflowClient()
|
|
422
|
+
|
|
423
|
+
# Search all runs in experiment
|
|
424
|
+
experiment_id = client.get_experiment_by_name("my-experiment").experiment_id
|
|
425
|
+
runs = client.search_runs(
|
|
426
|
+
experiment_ids=[experiment_id],
|
|
427
|
+
filter_string="metrics.accuracy > 0.9",
|
|
428
|
+
order_by=["metrics.accuracy DESC"],
|
|
429
|
+
max_results=10
|
|
430
|
+
)
|
|
431
|
+
|
|
432
|
+
for run in runs:
|
|
433
|
+
print(f"Run ID: {run.info.run_id}")
|
|
434
|
+
print(f"Accuracy: {run.data.metrics['accuracy']}")
|
|
435
|
+
print(f"Params: {run.data.params}")
|
|
436
|
+
|
|
437
|
+
# Search with complex filters
|
|
438
|
+
runs = client.search_runs(
|
|
439
|
+
experiment_ids=[experiment_id],
|
|
440
|
+
filter_string="""
|
|
441
|
+
metrics.accuracy > 0.9 AND
|
|
442
|
+
params.model = 'ResNet50' AND
|
|
443
|
+
tags.dataset = 'ImageNet'
|
|
444
|
+
""",
|
|
445
|
+
order_by=["metrics.f1_score DESC"]
|
|
446
|
+
)
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
## Integration Examples
|
|
450
|
+
|
|
451
|
+
### PyTorch
|
|
452
|
+
|
|
453
|
+
```python
|
|
454
|
+
import mlflow
|
|
455
|
+
import torch
|
|
456
|
+
import torch.nn as nn
|
|
457
|
+
|
|
458
|
+
# Enable autologging
|
|
459
|
+
mlflow.pytorch.autolog()
|
|
460
|
+
|
|
461
|
+
with mlflow.start_run():
|
|
462
|
+
# Log config
|
|
463
|
+
config = {
|
|
464
|
+
"lr": 0.001,
|
|
465
|
+
"epochs": 10,
|
|
466
|
+
"batch_size": 32
|
|
467
|
+
}
|
|
468
|
+
mlflow.log_params(config)
|
|
469
|
+
|
|
470
|
+
# Train
|
|
471
|
+
model = create_model()
|
|
472
|
+
optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])
|
|
473
|
+
|
|
474
|
+
for epoch in range(config["epochs"]):
|
|
475
|
+
train_loss = train_epoch(model, optimizer, train_loader)
|
|
476
|
+
val_loss, val_acc = validate(model, val_loader)
|
|
477
|
+
|
|
478
|
+
# Log metrics
|
|
479
|
+
mlflow.log_metrics({
|
|
480
|
+
"train_loss": train_loss,
|
|
481
|
+
"val_loss": val_loss,
|
|
482
|
+
"val_accuracy": val_acc
|
|
483
|
+
}, step=epoch)
|
|
484
|
+
|
|
485
|
+
# Log model
|
|
486
|
+
mlflow.pytorch.log_model(model, "model")
|
|
487
|
+
```
|
|
488
|
+
|
|
489
|
+
### HuggingFace Transformers
|
|
490
|
+
|
|
491
|
+
```python
|
|
492
|
+
import mlflow
|
|
493
|
+
from transformers import Trainer, TrainingArguments
|
|
494
|
+
|
|
495
|
+
# Enable autologging
|
|
496
|
+
mlflow.transformers.autolog()
|
|
497
|
+
|
|
498
|
+
training_args = TrainingArguments(
|
|
499
|
+
output_dir="./results",
|
|
500
|
+
num_train_epochs=3,
|
|
501
|
+
per_device_train_batch_size=16,
|
|
502
|
+
evaluation_strategy="epoch",
|
|
503
|
+
save_strategy="epoch",
|
|
504
|
+
load_best_model_at_end=True
|
|
505
|
+
)
|
|
506
|
+
|
|
507
|
+
# Start MLflow run
|
|
508
|
+
with mlflow.start_run():
|
|
509
|
+
trainer = Trainer(
|
|
510
|
+
model=model,
|
|
511
|
+
args=training_args,
|
|
512
|
+
train_dataset=train_dataset,
|
|
513
|
+
eval_dataset=eval_dataset
|
|
514
|
+
)
|
|
515
|
+
|
|
516
|
+
# Train (automatically logged)
|
|
517
|
+
trainer.train()
|
|
518
|
+
|
|
519
|
+
# Log final model to registry
|
|
520
|
+
mlflow.transformers.log_model(
|
|
521
|
+
transformers_model={
|
|
522
|
+
"model": trainer.model,
|
|
523
|
+
"tokenizer": tokenizer
|
|
524
|
+
},
|
|
525
|
+
artifact_path="model",
|
|
526
|
+
registered_model_name="hf-classifier"
|
|
527
|
+
)
|
|
528
|
+
```
|
|
529
|
+
|
|
530
|
+
### XGBoost
|
|
531
|
+
|
|
532
|
+
```python
|
|
533
|
+
import mlflow
|
|
534
|
+
import xgboost as xgb
|
|
535
|
+
|
|
536
|
+
# Enable autologging
|
|
537
|
+
mlflow.xgboost.autolog()
|
|
538
|
+
|
|
539
|
+
with mlflow.start_run():
|
|
540
|
+
dtrain = xgb.DMatrix(X_train, label=y_train)
|
|
541
|
+
dval = xgb.DMatrix(X_val, label=y_val)
|
|
542
|
+
|
|
543
|
+
params = {
|
|
544
|
+
'max_depth': 6,
|
|
545
|
+
'learning_rate': 0.1,
|
|
546
|
+
'objective': 'binary:logistic',
|
|
547
|
+
'eval_metric': ['logloss', 'auc']
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
# Train (automatically logged)
|
|
551
|
+
model = xgb.train(
|
|
552
|
+
params,
|
|
553
|
+
dtrain,
|
|
554
|
+
num_boost_round=100,
|
|
555
|
+
evals=[(dtrain, 'train'), (dval, 'val')],
|
|
556
|
+
early_stopping_rounds=10
|
|
557
|
+
)
|
|
558
|
+
|
|
559
|
+
# Model and metrics logged automatically
|
|
560
|
+
```
|
|
561
|
+
|
|
562
|
+
## Best Practices
|
|
563
|
+
|
|
564
|
+
### 1. Organize with Experiments
|
|
565
|
+
|
|
566
|
+
```python
|
|
567
|
+
# ✅ Good: Separate experiments for different tasks
|
|
568
|
+
mlflow.set_experiment("sentiment-analysis")
|
|
569
|
+
mlflow.set_experiment("image-classification")
|
|
570
|
+
mlflow.set_experiment("recommendation-system")
|
|
571
|
+
|
|
572
|
+
# ❌ Bad: Everything in one experiment
|
|
573
|
+
mlflow.set_experiment("all-models")
|
|
574
|
+
```
|
|
575
|
+
|
|
576
|
+
### 2. Use Descriptive Run Names
|
|
577
|
+
|
|
578
|
+
```python
|
|
579
|
+
# ✅ Good: Descriptive names
|
|
580
|
+
with mlflow.start_run(run_name="resnet50-imagenet-lr0.001-bs32"):
|
|
581
|
+
train()
|
|
582
|
+
|
|
583
|
+
# ❌ Bad: No run name (MLflow falls back to an auto-generated, non-descriptive name)
|
|
584
|
+
with mlflow.start_run():
|
|
585
|
+
train()
|
|
586
|
+
```
|
|
587
|
+
|
|
588
|
+
### 3. Log Comprehensive Metadata
|
|
589
|
+
|
|
590
|
+
```python
|
|
591
|
+
with mlflow.start_run():
|
|
592
|
+
# Log hyperparameters
|
|
593
|
+
mlflow.log_params({
|
|
594
|
+
"learning_rate": 0.001,
|
|
595
|
+
"batch_size": 32,
|
|
596
|
+
"epochs": 50
|
|
597
|
+
})
|
|
598
|
+
|
|
599
|
+
# Log system info
|
|
600
|
+
mlflow.set_tags({
|
|
601
|
+
"dataset": "ImageNet",
|
|
602
|
+
"framework": "PyTorch 2.0",
|
|
603
|
+
"gpu": "A100",
|
|
604
|
+
"git_commit": get_git_commit()
|
|
605
|
+
})
|
|
606
|
+
|
|
607
|
+
# Log data info
|
|
608
|
+
mlflow.log_param("train_samples", len(train_dataset))
|
|
609
|
+
mlflow.log_param("val_samples", len(val_dataset))
|
|
610
|
+
```
|
|
611
|
+
|
|
612
|
+
### 4. Track Model Lineage
|
|
613
|
+
|
|
614
|
+
```python
|
|
615
|
+
# Link runs to understand lineage
|
|
616
|
+
with mlflow.start_run(run_name="preprocessing"):
|
|
617
|
+
data = preprocess()
|
|
618
|
+
mlflow.log_artifact("data.csv")
|
|
619
|
+
preprocessing_run_id = mlflow.active_run().info.run_id
|
|
620
|
+
|
|
621
|
+
with mlflow.start_run(run_name="training"):
|
|
622
|
+
# Reference parent run
|
|
623
|
+
mlflow.set_tag("preprocessing_run_id", preprocessing_run_id)
|
|
624
|
+
model = train(data)
|
|
625
|
+
```
|
|
626
|
+
|
|
627
|
+
### 5. Use Model Registry for Deployment
|
|
628
|
+
|
|
629
|
+
```python
|
|
630
|
+
# ✅ Good: Use registry for production
|
|
631
|
+
model_uri = "models:/my-classifier/Production"
|
|
632
|
+
model = mlflow.pyfunc.load_model(model_uri)
|
|
633
|
+
|
|
634
|
+
# ❌ Bad: Hard-code run IDs
|
|
635
|
+
model_uri = "runs:/abc123/model"
|
|
636
|
+
model = mlflow.pyfunc.load_model(model_uri)
|
|
637
|
+
```
|
|
638
|
+
|
|
639
|
+
## Deployment
|
|
640
|
+
|
|
641
|
+
### Serve Model Locally
|
|
642
|
+
|
|
643
|
+
```bash
|
|
644
|
+
# Serve registered model
|
|
645
|
+
mlflow models serve -m "models:/my-classifier/Production" -p 5001
|
|
646
|
+
|
|
647
|
+
# Serve from run
|
|
648
|
+
mlflow models serve -m "runs:/<RUN_ID>/model" -p 5001
|
|
649
|
+
|
|
650
|
+
# Test endpoint
|
|
651
|
+
curl http://127.0.0.1:5001/invocations -H 'Content-Type: application/json' -d '{
|
|
652
|
+
"inputs": [[1.0, 2.0, 3.0, 4.0]]
|
|
653
|
+
}'
|
|
654
|
+
```
|
|
655
|
+
|
|
656
|
+
### Deploy to Cloud
|
|
657
|
+
|
|
658
|
+
```bash
|
|
659
|
+
# Deploy to AWS SageMaker
|
|
660
|
+
mlflow sagemaker deploy -m "models:/my-classifier/Production" --region-name us-west-2
|
|
661
|
+
|
|
662
|
+
# Deploy to Azure ML
|
|
663
|
+
mlflow azureml deploy -m "models:/my-classifier/Production"
|
|
664
|
+
```
|
|
665
|
+
|
|
666
|
+
## Configuration
|
|
667
|
+
|
|
668
|
+
### Tracking Server
|
|
669
|
+
|
|
670
|
+
```bash
|
|
671
|
+
# Start tracking server with backend store
|
|
672
|
+
mlflow server \
|
|
673
|
+
--backend-store-uri postgresql://user:password@localhost/mlflow \
|
|
674
|
+
--default-artifact-root s3://my-bucket/mlflow \
|
|
675
|
+
--host 0.0.0.0 \
|
|
676
|
+
--port 5000
|
|
677
|
+
```
|
|
678
|
+
|
|
679
|
+
### Client Configuration
|
|
680
|
+
|
|
681
|
+
```python
|
|
682
|
+
import mlflow
|
|
683
|
+
|
|
684
|
+
# Set tracking URI
|
|
685
|
+
mlflow.set_tracking_uri("http://localhost:5000")
|
|
686
|
+
|
|
687
|
+
# Or use environment variable
|
|
688
|
+
# export MLFLOW_TRACKING_URI=http://localhost:5000
|
|
689
|
+
```
|
|
690
|
+
|
|
691
|
+
## Resources
|
|
692
|
+
|
|
693
|
+
- **Documentation**: https://mlflow.org/docs/latest
|
|
694
|
+
- **GitHub**: https://github.com/mlflow/mlflow (23k+ stars)
|
|
695
|
+
- **Examples**: https://github.com/mlflow/mlflow/tree/master/examples
|
|
696
|
+
- **Community**: https://mlflow.org/community
|
|
697
|
+
|
|
698
|
+
## See Also
|
|
699
|
+
|
|
700
|
+
- `references/tracking.md` - Comprehensive tracking guide
|
|
701
|
+
- `references/model-registry.md` - Model lifecycle management
|
|
702
|
+
- `references/deployment.md` - Production deployment patterns
|
|
703
|
+
|
|
704
|
+
|