@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,770 @@
|
|
|
1
|
+
# Model Registry Guide
|
|
2
|
+
|
|
3
|
+
Complete guide to MLflow Model Registry for versioning, lifecycle management, and collaboration.
|
|
4
|
+
|
|
5
|
+
## Table of Contents
|
|
6
|
+
- What is Model Registry
|
|
7
|
+
- Registering Models
|
|
8
|
+
- Model Versions
|
|
9
|
+
- Stage Transitions
|
|
10
|
+
- Model Aliases (Modern Approach)
|
|
11
|
+
- Searching Models
|
|
12
|
+
- Model Annotations
|
|
13
|
+
- Collaborative Workflows
|
|
14
|
+
- Best Practices
|
|
15
|
+
|
|
16
|
+
## What is Model Registry
|
|
17
|
+
|
|
18
|
+
The Model Registry is a centralized model store for managing the full lifecycle of MLflow Models.
|
|
19
|
+
|
|
20
|
+
**Key Features:**
|
|
21
|
+
- **Versioning**: Automatic version increments (v1, v2, v3...)
|
|
22
|
+
- **Stages** (legacy): None, Staging, Production, Archived
|
|
23
|
+
- **Aliases**: champion, challenger, latest (modern approach)
|
|
24
|
+
- **Annotations**: Descriptions, tags, metadata
|
|
25
|
+
- **Lineage**: Track which runs produced models
|
|
26
|
+
- **Collaboration**: Team-wide model governance
|
|
27
|
+
- **Deployment**: Single source of truth for production models
|
|
28
|
+
|
|
29
|
+
**Use Cases:**
|
|
30
|
+
- Model approval workflows
|
|
31
|
+
- A/B testing (champion vs challenger)
|
|
32
|
+
- Production deployment tracking
|
|
33
|
+
- Model performance monitoring
|
|
34
|
+
- Regulatory compliance
|
|
35
|
+
|
|
36
|
+
## Registering Models
|
|
37
|
+
|
|
38
|
+
### Register During Training
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
import mlflow
|
|
42
|
+
import mlflow.sklearn
|
|
43
|
+
|
|
44
|
+
with mlflow.start_run():
|
|
45
|
+
model = train_model()
|
|
46
|
+
|
|
47
|
+
# Log and register in one step
|
|
48
|
+
mlflow.sklearn.log_model(
|
|
49
|
+
model,
|
|
50
|
+
"model",
|
|
51
|
+
registered_model_name="product-classifier" # Creates or updates
|
|
52
|
+
)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Register After Training
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from mlflow.tracking import MlflowClient
|
|
59
|
+
|
|
60
|
+
client = MlflowClient()
|
|
61
|
+
|
|
62
|
+
# Get run ID from experiment
|
|
63
|
+
run_id = "abc123"
|
|
64
|
+
|
|
65
|
+
# Register model from run
|
|
66
|
+
model_uri = f"runs:/{run_id}/model"
|
|
67
|
+
result = mlflow.register_model(
|
|
68
|
+
model_uri,
|
|
69
|
+
"product-classifier"
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
print(f"Model name: {result.name}")
|
|
73
|
+
print(f"Version: {result.version}")
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Register with Signature
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from mlflow.models.signature import infer_signature
|
|
80
|
+
|
|
81
|
+
with mlflow.start_run():
|
|
82
|
+
model = train_model()
|
|
83
|
+
|
|
84
|
+
# Infer signature
|
|
85
|
+
signature = infer_signature(X_train, model.predict(X_train))
|
|
86
|
+
|
|
87
|
+
# Register with signature
|
|
88
|
+
mlflow.sklearn.log_model(
|
|
89
|
+
model,
|
|
90
|
+
"model",
|
|
91
|
+
signature=signature,
|
|
92
|
+
registered_model_name="product-classifier"
|
|
93
|
+
)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Model Versions
|
|
97
|
+
|
|
98
|
+
### Automatic Versioning
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
# First registration: creates version 1
|
|
102
|
+
with mlflow.start_run():
|
|
103
|
+
model_v1 = train_model()
|
|
104
|
+
mlflow.sklearn.log_model(model_v1, "model", registered_model_name="my-model")
|
|
105
|
+
# Result: my-model version 1
|
|
106
|
+
|
|
107
|
+
# Second registration: creates version 2
|
|
108
|
+
with mlflow.start_run():
|
|
109
|
+
model_v2 = train_improved_model()
|
|
110
|
+
mlflow.sklearn.log_model(model_v2, "model", registered_model_name="my-model")
|
|
111
|
+
# Result: my-model version 2
|
|
112
|
+
|
|
113
|
+
# Third registration: creates version 3
|
|
114
|
+
with mlflow.start_run():
|
|
115
|
+
model_v3 = train_best_model()
|
|
116
|
+
mlflow.sklearn.log_model(model_v3, "model", registered_model_name="my-model")
|
|
117
|
+
# Result: my-model version 3
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### List Model Versions
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from mlflow.tracking import MlflowClient
|
|
124
|
+
|
|
125
|
+
client = MlflowClient()
|
|
126
|
+
|
|
127
|
+
# Get all versions
|
|
128
|
+
versions = client.search_model_versions("name='product-classifier'")
|
|
129
|
+
|
|
130
|
+
for v in versions:
|
|
131
|
+
print(f"Version {v.version}:")
|
|
132
|
+
print(f" Stage: {v.current_stage}")
|
|
133
|
+
print(f" Run ID: {v.run_id}")
|
|
134
|
+
print(f" Created: {v.creation_timestamp}")
|
|
135
|
+
print(f" Status: {v.status}")
|
|
136
|
+
print()
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Get Specific Version
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
client = MlflowClient()
|
|
143
|
+
|
|
144
|
+
# Get version details
|
|
145
|
+
version_info = client.get_model_version(
|
|
146
|
+
name="product-classifier",
|
|
147
|
+
version="3"
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
print(f"Version: {version_info.version}")
|
|
151
|
+
print(f"Stage: {version_info.current_stage}")
|
|
152
|
+
print(f"Run ID: {version_info.run_id}")
|
|
153
|
+
print(f"Description: {version_info.description}")
|
|
154
|
+
print(f"Tags: {version_info.tags}")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Get Latest Version
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
# Get latest version in Production stage
|
|
161
|
+
latest_prod = client.get_latest_versions(
|
|
162
|
+
"product-classifier",
|
|
163
|
+
stages=["Production"]
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
# Get latest version in Staging
|
|
167
|
+
latest_staging = client.get_latest_versions(
|
|
168
|
+
"product-classifier",
|
|
169
|
+
stages=["Staging"]
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# Get all latest versions (one per stage)
|
|
173
|
+
all_latest = client.get_latest_versions("product-classifier")
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
## Stage Transitions
|
|
177
|
+
|
|
178
|
+
**Note**: Stages are deprecated as of MLflow 2.9. Use aliases instead (see "Model Aliases (Modern Approach)" below).
|
|
179
|
+
|
|
180
|
+
### Available Stages
|
|
181
|
+
|
|
182
|
+
- **None**: Initial state, not yet tested
|
|
183
|
+
- **Staging**: Under testing/validation
|
|
184
|
+
- **Production**: Deployed in production
|
|
185
|
+
- **Archived**: Retired/deprecated
|
|
186
|
+
|
|
187
|
+
### Transition Model
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
from mlflow.tracking import MlflowClient
|
|
191
|
+
|
|
192
|
+
client = MlflowClient()
|
|
193
|
+
|
|
194
|
+
# Promote to Staging
|
|
195
|
+
client.transition_model_version_stage(
|
|
196
|
+
name="product-classifier",
|
|
197
|
+
version=3,
|
|
198
|
+
stage="Staging"
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
# Promote to Production (archive old production versions)
|
|
202
|
+
client.transition_model_version_stage(
|
|
203
|
+
name="product-classifier",
|
|
204
|
+
version=3,
|
|
205
|
+
stage="Production",
|
|
206
|
+
archive_existing_versions=True # Archive old production models
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
# Archive old version
|
|
210
|
+
client.transition_model_version_stage(
|
|
211
|
+
name="product-classifier",
|
|
212
|
+
version=2,
|
|
213
|
+
stage="Archived"
|
|
214
|
+
)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Load Model by Stage
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
import mlflow.pyfunc
|
|
221
|
+
|
|
222
|
+
# Load production model
|
|
223
|
+
model = mlflow.pyfunc.load_model("models:/product-classifier/Production")
|
|
224
|
+
|
|
225
|
+
# Load staging model
|
|
226
|
+
staging_model = mlflow.pyfunc.load_model("models:/product-classifier/Staging")
|
|
227
|
+
|
|
228
|
+
# Load specific version
|
|
229
|
+
model_v3 = mlflow.pyfunc.load_model("models:/product-classifier/3")
|
|
230
|
+
|
|
231
|
+
# Use model
|
|
232
|
+
predictions = model.predict(X_test)
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Model Aliases (Modern Approach)
|
|
236
|
+
|
|
237
|
+
**Introduced in MLflow 2.8** - Flexible alternative to stages.
|
|
238
|
+
|
|
239
|
+
### Set Aliases
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
from mlflow.tracking import MlflowClient
|
|
243
|
+
|
|
244
|
+
client = MlflowClient()
|
|
245
|
+
|
|
246
|
+
# Set champion alias (current production model)
|
|
247
|
+
client.set_registered_model_alias(
|
|
248
|
+
name="product-classifier",
|
|
249
|
+
alias="champion",
|
|
250
|
+
version="5"
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
# Set challenger alias (candidate for production)
|
|
254
|
+
client.set_registered_model_alias(
|
|
255
|
+
name="product-classifier",
|
|
256
|
+
alias="challenger",
|
|
257
|
+
version="6"
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
# Set latest alias
|
|
261
|
+
client.set_registered_model_alias(
|
|
262
|
+
name="product-classifier",
|
|
263
|
+
alias="latest",
|
|
264
|
+
version="7"
|
|
265
|
+
)
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### Load Model by Alias
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
import mlflow.pyfunc
|
|
272
|
+
|
|
273
|
+
# Load champion model
|
|
274
|
+
champion = mlflow.pyfunc.load_model("models:/product-classifier@champion")
|
|
275
|
+
|
|
276
|
+
# Load challenger model
|
|
277
|
+
challenger = mlflow.pyfunc.load_model("models:/product-classifier@challenger")
|
|
278
|
+
|
|
279
|
+
# Load latest model
|
|
280
|
+
latest = mlflow.pyfunc.load_model("models:/product-classifier@latest")
|
|
281
|
+
|
|
282
|
+
# Use for A/B testing
|
|
283
|
+
champion_preds = champion.predict(X_test)
|
|
284
|
+
challenger_preds = challenger.predict(X_test)
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Get Model by Alias
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
client = MlflowClient()
|
|
291
|
+
|
|
292
|
+
# Get version info by alias
|
|
293
|
+
version_info = client.get_model_version_by_alias(
|
|
294
|
+
name="product-classifier",
|
|
295
|
+
alias="champion"
|
|
296
|
+
)
|
|
297
|
+
|
|
298
|
+
print(f"Champion is version: {version_info.version}")
|
|
299
|
+
print(f"Run ID: {version_info.run_id}")
|
|
300
|
+
```
|
|
301
|
+
|
|
302
|
+
### Delete Alias
|
|
303
|
+
|
|
304
|
+
```python
|
|
305
|
+
# Remove alias
|
|
306
|
+
client.delete_registered_model_alias(
|
|
307
|
+
name="product-classifier",
|
|
308
|
+
alias="challenger"
|
|
309
|
+
)
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
## Searching Models
|
|
313
|
+
|
|
314
|
+
### Search All Models
|
|
315
|
+
|
|
316
|
+
```python
|
|
317
|
+
from mlflow.tracking import MlflowClient
|
|
318
|
+
|
|
319
|
+
client = MlflowClient()
|
|
320
|
+
|
|
321
|
+
# List all registered models
|
|
322
|
+
models = client.search_registered_models()
|
|
323
|
+
|
|
324
|
+
for model in models:
|
|
325
|
+
print(f"Name: {model.name}")
|
|
326
|
+
print(f"Description: {model.description}")
|
|
327
|
+
print(f"Latest versions: {model.latest_versions}")
|
|
328
|
+
print()
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
### Search by Name
|
|
332
|
+
|
|
333
|
+
```python
|
|
334
|
+
# Search by name pattern
|
|
335
|
+
models = client.search_registered_models(
|
|
336
|
+
filter_string="name LIKE 'product-%'"
|
|
337
|
+
)
|
|
338
|
+
|
|
339
|
+
# Search exact name
|
|
340
|
+
models = client.search_registered_models(
|
|
341
|
+
filter_string="name='product-classifier'"
|
|
342
|
+
)
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
### Search Model Versions
|
|
346
|
+
|
|
347
|
+
```python
|
|
348
|
+
# Find all versions of a model
|
|
349
|
+
versions = client.search_model_versions("name='product-classifier'")
|
|
350
|
+
|
|
351
|
+
# Find production versions
|
|
352
|
+
versions = client.search_model_versions(
|
|
353
|
+
"name='product-classifier' AND current_stage='Production'"
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
# Find versions from specific run
|
|
357
|
+
versions = client.search_model_versions(
|
|
358
|
+
f"run_id='{run_id}'"
|
|
359
|
+
)
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
## Model Annotations
|
|
363
|
+
|
|
364
|
+
### Add Description
|
|
365
|
+
|
|
366
|
+
```python
|
|
367
|
+
from mlflow.tracking import MlflowClient
|
|
368
|
+
|
|
369
|
+
client = MlflowClient()
|
|
370
|
+
|
|
371
|
+
# Update model description
|
|
372
|
+
client.update_registered_model(
|
|
373
|
+
name="product-classifier",
|
|
374
|
+
description="ResNet50 classifier for product categorization. Trained on 1M images with 95% accuracy."
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
# Update version description
|
|
378
|
+
client.update_model_version(
|
|
379
|
+
name="product-classifier",
|
|
380
|
+
version="3",
|
|
381
|
+
description="Best performing model. Validation accuracy: 95.2%. Tested on 50K images."
|
|
382
|
+
)
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
### Add Tags
|
|
386
|
+
|
|
387
|
+
```python
|
|
388
|
+
client = MlflowClient()
|
|
389
|
+
|
|
390
|
+
# Add tags to model
|
|
391
|
+
client.set_registered_model_tag(
|
|
392
|
+
name="product-classifier",
|
|
393
|
+
key="task",
|
|
394
|
+
value="classification"
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
client.set_registered_model_tag(
|
|
398
|
+
name="product-classifier",
|
|
399
|
+
key="domain",
|
|
400
|
+
value="e-commerce"
|
|
401
|
+
)
|
|
402
|
+
|
|
403
|
+
# Add tags to specific version
|
|
404
|
+
client.set_model_version_tag(
|
|
405
|
+
name="product-classifier",
|
|
406
|
+
version="3",
|
|
407
|
+
key="validation_status",
|
|
408
|
+
value="approved"
|
|
409
|
+
)
|
|
410
|
+
|
|
411
|
+
client.set_model_version_tag(
|
|
412
|
+
name="product-classifier",
|
|
413
|
+
version="3",
|
|
414
|
+
key="deployed_date",
|
|
415
|
+
value="2025-01-15"
|
|
416
|
+
)
|
|
417
|
+
|
|
418
|
+
client.set_model_version_tag(
|
|
419
|
+
name="product-classifier",
|
|
420
|
+
version="3",
|
|
421
|
+
key="approved_by",
|
|
422
|
+
value="ml-team-lead"
|
|
423
|
+
)
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
### Delete Tags
|
|
427
|
+
|
|
428
|
+
```python
|
|
429
|
+
# Delete model tag
|
|
430
|
+
client.delete_registered_model_tag(
|
|
431
|
+
name="product-classifier",
|
|
432
|
+
key="old_tag"
|
|
433
|
+
)
|
|
434
|
+
|
|
435
|
+
# Delete version tag
|
|
436
|
+
client.delete_model_version_tag(
|
|
437
|
+
name="product-classifier",
|
|
438
|
+
version="3",
|
|
439
|
+
key="old_version_tag"
|
|
440
|
+
)
|
|
441
|
+
```
|
|
442
|
+
|
|
443
|
+
## Collaborative Workflows
|
|
444
|
+
|
|
445
|
+
### Model Approval Workflow
|
|
446
|
+
|
|
447
|
+
```python
|
|
448
|
+
from mlflow.tracking import MlflowClient
|
|
449
|
+
|
|
450
|
+
client = MlflowClient()
|
|
451
|
+
|
|
452
|
+
# 1. Data scientist trains and registers model
|
|
453
|
+
with mlflow.start_run():
|
|
454
|
+
model = train_model()
|
|
455
|
+
mlflow.sklearn.log_model(
|
|
456
|
+
model,
|
|
457
|
+
"model",
|
|
458
|
+
registered_model_name="product-classifier"
|
|
459
|
+
)
|
|
460
|
+
run_id = mlflow.active_run().info.run_id
|
|
461
|
+
|
|
462
|
+
# 2. Add metadata for review
|
|
463
|
+
version = client.get_latest_versions("product-classifier")[0].version
|
|
464
|
+
client.update_model_version(
|
|
465
|
+
name="product-classifier",
|
|
466
|
+
version=version,
|
|
467
|
+
description=f"Accuracy: 95%, F1: 0.93, Run: {run_id}"
|
|
468
|
+
)
|
|
469
|
+
|
|
470
|
+
client.set_model_version_tag(
|
|
471
|
+
name="product-classifier",
|
|
472
|
+
version=version,
|
|
473
|
+
key="status",
|
|
474
|
+
value="awaiting_review"
|
|
475
|
+
)
|
|
476
|
+
|
|
477
|
+
# 3. ML engineer reviews and tests
|
|
478
|
+
test_accuracy = evaluate_model(model)
|
|
479
|
+
|
|
480
|
+
if test_accuracy > 0.9:
|
|
481
|
+
# Approve and promote to staging
|
|
482
|
+
client.set_model_version_tag(
|
|
483
|
+
name="product-classifier",
|
|
484
|
+
version=version,
|
|
485
|
+
key="status",
|
|
486
|
+
value="approved"
|
|
487
|
+
)
|
|
488
|
+
|
|
489
|
+
client.transition_model_version_stage(
|
|
490
|
+
name="product-classifier",
|
|
491
|
+
version=version,
|
|
492
|
+
stage="Staging"
|
|
493
|
+
)
|
|
494
|
+
|
|
495
|
+
# 4. After staging validation, promote to production
|
|
496
|
+
if staging_tests_pass():
|
|
497
|
+
client.transition_model_version_stage(
|
|
498
|
+
name="product-classifier",
|
|
499
|
+
version=version,
|
|
500
|
+
stage="Production",
|
|
501
|
+
archive_existing_versions=True
|
|
502
|
+
)
|
|
503
|
+
|
|
504
|
+
client.set_model_version_tag(
|
|
505
|
+
name="product-classifier",
|
|
506
|
+
version=version,
|
|
507
|
+
key="deployed_by",
|
|
508
|
+
value="ml-ops-team"
|
|
509
|
+
)
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
### A/B Testing Workflow
|
|
513
|
+
|
|
514
|
+
```python
|
|
515
|
+
# Set up champion vs challenger
|
|
516
|
+
client = MlflowClient()
|
|
517
|
+
|
|
518
|
+
# Champion: Current production model
|
|
519
|
+
client.set_registered_model_alias(
|
|
520
|
+
name="product-classifier",
|
|
521
|
+
alias="champion",
|
|
522
|
+
version="5"
|
|
523
|
+
)
|
|
524
|
+
|
|
525
|
+
# Challenger: New candidate model
|
|
526
|
+
client.set_registered_model_alias(
|
|
527
|
+
name="product-classifier",
|
|
528
|
+
alias="challenger",
|
|
529
|
+
version="6"
|
|
530
|
+
)
|
|
531
|
+
|
|
532
|
+
# In production code
|
|
533
|
+
import random
|
|
534
|
+
|
|
535
|
+
def get_model_for_request():
|
|
536
|
+
"""Route 90% to champion, 10% to challenger."""
|
|
537
|
+
if random.random() < 0.9:
|
|
538
|
+
return mlflow.pyfunc.load_model("models:/product-classifier@champion")
|
|
539
|
+
else:
|
|
540
|
+
return mlflow.pyfunc.load_model("models:/product-classifier@challenger")
|
|
541
|
+
|
|
542
|
+
# After A/B test completes
|
|
543
|
+
if challenger_performs_better():
|
|
544
|
+
# Promote challenger to champion
|
|
545
|
+
client.set_registered_model_alias(
|
|
546
|
+
name="product-classifier",
|
|
547
|
+
alias="champion",
|
|
548
|
+
version="6"
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
# Remove the challenger alias now that this version is the champion
|
|
552
|
+
client.delete_registered_model_alias(
|
|
553
|
+
name="product-classifier",
|
|
554
|
+
alias="challenger"
|
|
555
|
+
)
|
|
556
|
+
```
|
|
557
|
+
|
|
558
|
+
### Model Rollback
|
|
559
|
+
|
|
560
|
+
```python
|
|
561
|
+
client = MlflowClient()
|
|
562
|
+
|
|
563
|
+
# Emergency rollback to previous production version
|
|
564
|
+
previous_version = "4"
|
|
565
|
+
|
|
566
|
+
client.transition_model_version_stage(
|
|
567
|
+
name="product-classifier",
|
|
568
|
+
version=previous_version,
|
|
569
|
+
stage="Production",
|
|
570
|
+
archive_existing_versions=True
|
|
571
|
+
)
|
|
572
|
+
|
|
573
|
+
# Add rollback metadata
|
|
574
|
+
client.set_model_version_tag(
|
|
575
|
+
name="product-classifier",
|
|
576
|
+
version=previous_version,
|
|
577
|
+
key="rollback_reason",
|
|
578
|
+
value="Performance degradation in production"
|
|
579
|
+
)
|
|
580
|
+
|
|
581
|
+
client.set_model_version_tag(
|
|
582
|
+
name="product-classifier",
|
|
583
|
+
version=previous_version,
|
|
584
|
+
key="rollback_date",
|
|
585
|
+
value="2025-01-15"
|
|
586
|
+
)
|
|
587
|
+
```
|
|
588
|
+
|
|
589
|
+
## Best Practices
|
|
590
|
+
|
|
591
|
+
### 1. Use Descriptive Names
|
|
592
|
+
|
|
593
|
+
```python
|
|
594
|
+
# ✅ Good: Descriptive, domain-specific names
|
|
595
|
+
mlflow.sklearn.log_model(model, "model", registered_model_name="ecommerce-product-classifier")
|
|
596
|
+
mlflow.sklearn.log_model(model, "model", registered_model_name="fraud-detection-xgboost")
|
|
597
|
+
|
|
598
|
+
# ❌ Bad: Generic names
|
|
599
|
+
mlflow.sklearn.log_model(model, "model", registered_model_name="model1")
|
|
600
|
+
mlflow.sklearn.log_model(model, "model", registered_model_name="classifier")
|
|
601
|
+
```
|
|
602
|
+
|
|
603
|
+
### 2. Always Add Descriptions
|
|
604
|
+
|
|
605
|
+
```python
|
|
606
|
+
client = MlflowClient()
|
|
607
|
+
|
|
608
|
+
# Add detailed version description
|
|
609
|
+
client.update_model_version(
|
|
610
|
+
name="product-classifier",
|
|
611
|
+
version="5",
|
|
612
|
+
description="""
|
|
613
|
+
ResNet50 classifier for product categorization
|
|
614
|
+
|
|
615
|
+
Performance:
|
|
616
|
+
- Validation Accuracy: 95.2%
|
|
617
|
+
- F1 Score: 0.93
|
|
618
|
+
- Inference Time: 15ms
|
|
619
|
+
|
|
620
|
+
Training:
|
|
621
|
+
- Dataset: ImageNet subset (1.2M images)
|
|
622
|
+
- Augmentation: Random flip, crop, rotation
|
|
623
|
+
- Epochs: 50
|
|
624
|
+
- Batch Size: 32
|
|
625
|
+
|
|
626
|
+
Notes:
|
|
627
|
+
- Pretrained on ImageNet
|
|
628
|
+
- Fine-tuned last 2 layers
|
|
629
|
+
- Handles 1000 product categories
|
|
630
|
+
"""
|
|
631
|
+
)
|
|
632
|
+
```
|
|
633
|
+
|
|
634
|
+
### 3. Use Tags for Metadata
|
|
635
|
+
|
|
636
|
+
```python
|
|
637
|
+
# Add comprehensive tags
|
|
638
|
+
tags = {
|
|
639
|
+
# Performance
|
|
640
|
+
"accuracy": "0.952",
|
|
641
|
+
"f1_score": "0.93",
|
|
642
|
+
"inference_time_ms": "15",
|
|
643
|
+
|
|
644
|
+
# Training
|
|
645
|
+
"dataset": "imagenet-subset",
|
|
646
|
+
"num_samples": "1200000",
|
|
647
|
+
"epochs": "50",
|
|
648
|
+
|
|
649
|
+
# Validation
|
|
650
|
+
"validation_status": "approved",
|
|
651
|
+
"tested_by": "ml-team",
|
|
652
|
+
"test_date": "2025-01-10",
|
|
653
|
+
|
|
654
|
+
# Deployment
|
|
655
|
+
"deployed_date": "2025-01-15",
|
|
656
|
+
"deployed_by": "mlops-team",
|
|
657
|
+
"environment": "production",
|
|
658
|
+
|
|
659
|
+
# Business
|
|
660
|
+
"use_case": "product-categorization",
|
|
661
|
+
"owner": "data-science-team",
|
|
662
|
+
"stakeholder": "ecommerce-team"
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
for key, value in tags.items():
|
|
666
|
+
client.set_model_version_tag(
|
|
667
|
+
name="product-classifier",
|
|
668
|
+
version="5",
|
|
669
|
+
key=key,
|
|
670
|
+
value=value
|
|
671
|
+
)
|
|
672
|
+
```
|
|
673
|
+
|
|
674
|
+
### 4. Use Aliases Instead of Stages
|
|
675
|
+
|
|
676
|
+
```python
|
|
677
|
+
# ✅ Modern: Use aliases (MLflow 2.8+)
|
|
678
|
+
client.set_registered_model_alias(name="my-model", alias="champion", version="5")
|
|
679
|
+
client.set_registered_model_alias(name="my-model", alias="challenger", version="6")
|
|
680
|
+
model = mlflow.pyfunc.load_model("models:/my-model@champion")
|
|
681
|
+
|
|
682
|
+
# ⚠️ Legacy: Stages (deprecated in MLflow 2.9+)
|
|
683
|
+
client.transition_model_version_stage(name="my-model", version=5, stage="Production")
|
|
684
|
+
model = mlflow.pyfunc.load_model("models:/my-model/Production")
|
|
685
|
+
```
|
|
686
|
+
|
|
687
|
+
### 5. Track Model Lineage
|
|
688
|
+
|
|
689
|
+
```python
|
|
690
|
+
# Link model version to training run
|
|
691
|
+
with mlflow.start_run(run_name="product-classifier-training") as run:
|
|
692
|
+
# Log training parameters and metrics
|
|
693
|
+
mlflow.log_params(config)
|
|
694
|
+
mlflow.log_metrics(metrics)
|
|
695
|
+
|
|
696
|
+
# Register model
|
|
697
|
+
mlflow.sklearn.log_model(
|
|
698
|
+
model,
|
|
699
|
+
"model",
|
|
700
|
+
registered_model_name="product-classifier"
|
|
701
|
+
)
|
|
702
|
+
|
|
703
|
+
run_id = run.info.run_id
|
|
704
|
+
|
|
705
|
+
# Add lineage metadata
|
|
706
|
+
version = client.get_latest_versions("product-classifier")[0].version
|
|
707
|
+
client.set_model_version_tag(
|
|
708
|
+
name="product-classifier",
|
|
709
|
+
version=version,
|
|
710
|
+
key="training_run_id",
|
|
711
|
+
value=run_id
|
|
712
|
+
)
|
|
713
|
+
|
|
714
|
+
# Add data lineage
|
|
715
|
+
client.set_model_version_tag(
|
|
716
|
+
name="product-classifier",
|
|
717
|
+
version=version,
|
|
718
|
+
key="dataset_version",
|
|
719
|
+
value="imagenet-v2-2025-01"
|
|
720
|
+
)
|
|
721
|
+
```
|
|
722
|
+
|
|
723
|
+
### 6. Implement Approval Gates
|
|
724
|
+
|
|
725
|
+
```python
|
|
726
|
+
def promote_to_production(model_name, version, min_accuracy=0.9):
|
|
727
|
+
"""Promote model to production with validation checks."""
|
|
728
|
+
client = MlflowClient()
|
|
729
|
+
|
|
730
|
+
# 1. Validate performance
|
|
731
|
+
version_info = client.get_model_version(name=model_name, version=version)
|
|
732
|
+
|
|
733
|
+
# Check if approved
|
|
734
|
+
tags = version_info.tags
|
|
735
|
+
if tags.get("validation_status") != "approved":
|
|
736
|
+
raise ValueError("Model not approved for production")
|
|
737
|
+
|
|
738
|
+
# Check accuracy threshold
|
|
739
|
+
accuracy = float(tags.get("accuracy", 0))
|
|
740
|
+
if accuracy < min_accuracy:
|
|
741
|
+
raise ValueError(f"Accuracy {accuracy} below threshold {min_accuracy}")
|
|
742
|
+
|
|
743
|
+
# 2. Promote to production
|
|
744
|
+
client.transition_model_version_stage(
|
|
745
|
+
name=model_name,
|
|
746
|
+
version=version,
|
|
747
|
+
stage="Production",
|
|
748
|
+
archive_existing_versions=True
|
|
749
|
+
)
|
|
750
|
+
|
|
751
|
+
# 3. Add deployment metadata
|
|
752
|
+
from datetime import datetime
|
|
753
|
+
client.set_model_version_tag(
|
|
754
|
+
name=model_name,
|
|
755
|
+
version=version,
|
|
756
|
+
key="deployed_date",
|
|
757
|
+
value=datetime.now().isoformat()
|
|
758
|
+
)
|
|
759
|
+
|
|
760
|
+
print(f"✅ Promoted {model_name} v{version} to production")
|
|
761
|
+
|
|
762
|
+
# Use it
|
|
763
|
+
promote_to_production("product-classifier", "5", min_accuracy=0.9)
|
|
764
|
+
```
|
|
765
|
+
|
|
766
|
+
## Resources
|
|
767
|
+
|
|
768
|
+
- **Model Registry**: https://mlflow.org/docs/latest/model-registry.html
|
|
769
|
+
- **Model Aliases**: https://mlflow.org/docs/latest/model-registry.html#using-model-aliases
|
|
770
|
+
- **Python API**: https://mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient
|