@synsci/cli-darwin-x64 1.1.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/accelerate/SKILL.md +332 -0
- package/bin/skills/accelerate/references/custom-plugins.md +453 -0
- package/bin/skills/accelerate/references/megatron-integration.md +489 -0
- package/bin/skills/accelerate/references/performance.md +525 -0
- package/bin/skills/audiocraft/SKILL.md +564 -0
- package/bin/skills/audiocraft/references/advanced-usage.md +666 -0
- package/bin/skills/audiocraft/references/troubleshooting.md +504 -0
- package/bin/skills/autogpt/SKILL.md +403 -0
- package/bin/skills/autogpt/references/advanced-usage.md +535 -0
- package/bin/skills/autogpt/references/troubleshooting.md +420 -0
- package/bin/skills/awq/SKILL.md +310 -0
- package/bin/skills/awq/references/advanced-usage.md +324 -0
- package/bin/skills/awq/references/troubleshooting.md +344 -0
- package/bin/skills/axolotl/SKILL.md +158 -0
- package/bin/skills/axolotl/references/api.md +5548 -0
- package/bin/skills/axolotl/references/dataset-formats.md +1029 -0
- package/bin/skills/axolotl/references/index.md +15 -0
- package/bin/skills/axolotl/references/other.md +3563 -0
- package/bin/skills/bigcode-evaluation-harness/SKILL.md +405 -0
- package/bin/skills/bigcode-evaluation-harness/references/benchmarks.md +393 -0
- package/bin/skills/bigcode-evaluation-harness/references/custom-tasks.md +424 -0
- package/bin/skills/bigcode-evaluation-harness/references/issues.md +394 -0
- package/bin/skills/bitsandbytes/SKILL.md +411 -0
- package/bin/skills/bitsandbytes/references/memory-optimization.md +521 -0
- package/bin/skills/bitsandbytes/references/qlora-training.md +521 -0
- package/bin/skills/bitsandbytes/references/quantization-formats.md +447 -0
- package/bin/skills/blip-2/SKILL.md +564 -0
- package/bin/skills/blip-2/references/advanced-usage.md +680 -0
- package/bin/skills/blip-2/references/troubleshooting.md +526 -0
- package/bin/skills/chroma/SKILL.md +406 -0
- package/bin/skills/chroma/references/integration.md +38 -0
- package/bin/skills/clip/SKILL.md +253 -0
- package/bin/skills/clip/references/applications.md +207 -0
- package/bin/skills/constitutional-ai/SKILL.md +290 -0
- package/bin/skills/crewai/SKILL.md +498 -0
- package/bin/skills/crewai/references/flows.md +438 -0
- package/bin/skills/crewai/references/tools.md +429 -0
- package/bin/skills/crewai/references/troubleshooting.md +480 -0
- package/bin/skills/deepspeed/SKILL.md +141 -0
- package/bin/skills/deepspeed/references/08.md +17 -0
- package/bin/skills/deepspeed/references/09.md +173 -0
- package/bin/skills/deepspeed/references/2020.md +378 -0
- package/bin/skills/deepspeed/references/2023.md +279 -0
- package/bin/skills/deepspeed/references/assets.md +179 -0
- package/bin/skills/deepspeed/references/index.md +35 -0
- package/bin/skills/deepspeed/references/mii.md +118 -0
- package/bin/skills/deepspeed/references/other.md +1191 -0
- package/bin/skills/deepspeed/references/tutorials.md +6554 -0
- package/bin/skills/dspy/SKILL.md +590 -0
- package/bin/skills/dspy/references/examples.md +663 -0
- package/bin/skills/dspy/references/modules.md +475 -0
- package/bin/skills/dspy/references/optimizers.md +566 -0
- package/bin/skills/faiss/SKILL.md +221 -0
- package/bin/skills/faiss/references/index_types.md +280 -0
- package/bin/skills/flash-attention/SKILL.md +367 -0
- package/bin/skills/flash-attention/references/benchmarks.md +215 -0
- package/bin/skills/flash-attention/references/transformers-integration.md +293 -0
- package/bin/skills/gguf/SKILL.md +427 -0
- package/bin/skills/gguf/references/advanced-usage.md +504 -0
- package/bin/skills/gguf/references/troubleshooting.md +442 -0
- package/bin/skills/gptq/SKILL.md +450 -0
- package/bin/skills/gptq/references/calibration.md +337 -0
- package/bin/skills/gptq/references/integration.md +129 -0
- package/bin/skills/gptq/references/troubleshooting.md +95 -0
- package/bin/skills/grpo-rl-training/README.md +97 -0
- package/bin/skills/grpo-rl-training/SKILL.md +572 -0
- package/bin/skills/grpo-rl-training/examples/reward_functions_library.py +393 -0
- package/bin/skills/grpo-rl-training/templates/basic_grpo_training.py +228 -0
- package/bin/skills/guidance/SKILL.md +572 -0
- package/bin/skills/guidance/references/backends.md +554 -0
- package/bin/skills/guidance/references/constraints.md +674 -0
- package/bin/skills/guidance/references/examples.md +767 -0
- package/bin/skills/hqq/SKILL.md +445 -0
- package/bin/skills/hqq/references/advanced-usage.md +528 -0
- package/bin/skills/hqq/references/troubleshooting.md +503 -0
- package/bin/skills/hugging-face-cli/SKILL.md +191 -0
- package/bin/skills/hugging-face-cli/references/commands.md +954 -0
- package/bin/skills/hugging-face-cli/references/examples.md +374 -0
- package/bin/skills/hugging-face-datasets/SKILL.md +547 -0
- package/bin/skills/hugging-face-datasets/examples/diverse_training_examples.json +239 -0
- package/bin/skills/hugging-face-datasets/examples/system_prompt_template.txt +196 -0
- package/bin/skills/hugging-face-datasets/examples/training_examples.json +176 -0
- package/bin/skills/hugging-face-datasets/scripts/dataset_manager.py +522 -0
- package/bin/skills/hugging-face-datasets/scripts/sql_manager.py +844 -0
- package/bin/skills/hugging-face-datasets/templates/chat.json +55 -0
- package/bin/skills/hugging-face-datasets/templates/classification.json +62 -0
- package/bin/skills/hugging-face-datasets/templates/completion.json +51 -0
- package/bin/skills/hugging-face-datasets/templates/custom.json +75 -0
- package/bin/skills/hugging-face-datasets/templates/qa.json +54 -0
- package/bin/skills/hugging-face-datasets/templates/tabular.json +81 -0
- package/bin/skills/hugging-face-evaluation/SKILL.md +656 -0
- package/bin/skills/hugging-face-evaluation/examples/USAGE_EXAMPLES.md +382 -0
- package/bin/skills/hugging-face-evaluation/examples/artificial_analysis_to_hub.py +141 -0
- package/bin/skills/hugging-face-evaluation/examples/example_readme_tables.md +135 -0
- package/bin/skills/hugging-face-evaluation/examples/metric_mapping.json +50 -0
- package/bin/skills/hugging-face-evaluation/requirements.txt +20 -0
- package/bin/skills/hugging-face-evaluation/scripts/evaluation_manager.py +1374 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_eval_uv.py +104 -0
- package/bin/skills/hugging-face-evaluation/scripts/inspect_vllm_uv.py +317 -0
- package/bin/skills/hugging-face-evaluation/scripts/lighteval_vllm_uv.py +303 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_eval_job.py +98 -0
- package/bin/skills/hugging-face-evaluation/scripts/run_vllm_eval_job.py +331 -0
- package/bin/skills/hugging-face-evaluation/scripts/test_extraction.py +206 -0
- package/bin/skills/hugging-face-jobs/SKILL.md +1041 -0
- package/bin/skills/hugging-face-jobs/index.html +216 -0
- package/bin/skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bin/skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bin/skills/hugging-face-jobs/references/token_usage.md +546 -0
- package/bin/skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bin/skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bin/skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bin/skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bin/skills/hugging-face-model-trainer/SKILL.md +711 -0
- package/bin/skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bin/skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bin/skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bin/skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bin/skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bin/skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bin/skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bin/skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bin/skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bin/skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bin/skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bin/skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bin/skills/hugging-face-paper-publisher/SKILL.md +627 -0
- package/bin/skills/hugging-face-paper-publisher/examples/example_usage.md +327 -0
- package/bin/skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bin/skills/hugging-face-paper-publisher/scripts/paper_manager.py +508 -0
- package/bin/skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bin/skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bin/skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bin/skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bin/skills/hugging-face-tool-builder/SKILL.md +115 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.py +57 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.sh +40 -0
- package/bin/skills/hugging-face-tool-builder/references/baseline_hf_api.tsx +57 -0
- package/bin/skills/hugging-face-tool-builder/references/find_models_by_paper.sh +230 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_enrich_models.sh +96 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_card_frontmatter.sh +188 -0
- package/bin/skills/hugging-face-tool-builder/references/hf_model_papers_auth.sh +171 -0
- package/bin/skills/hugging-face-trackio/SKILL.md +65 -0
- package/bin/skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bin/skills/hugging-face-trackio/references/retrieving_metrics.md +223 -0
- package/bin/skills/huggingface-tokenizers/SKILL.md +516 -0
- package/bin/skills/huggingface-tokenizers/references/algorithms.md +653 -0
- package/bin/skills/huggingface-tokenizers/references/integration.md +637 -0
- package/bin/skills/huggingface-tokenizers/references/pipeline.md +723 -0
- package/bin/skills/huggingface-tokenizers/references/training.md +565 -0
- package/bin/skills/instructor/SKILL.md +740 -0
- package/bin/skills/instructor/references/examples.md +107 -0
- package/bin/skills/instructor/references/providers.md +70 -0
- package/bin/skills/instructor/references/validation.md +606 -0
- package/bin/skills/knowledge-distillation/SKILL.md +458 -0
- package/bin/skills/knowledge-distillation/references/minillm.md +334 -0
- package/bin/skills/lambda-labs/SKILL.md +545 -0
- package/bin/skills/lambda-labs/references/advanced-usage.md +611 -0
- package/bin/skills/lambda-labs/references/troubleshooting.md +530 -0
- package/bin/skills/langchain/SKILL.md +480 -0
- package/bin/skills/langchain/references/agents.md +499 -0
- package/bin/skills/langchain/references/integration.md +562 -0
- package/bin/skills/langchain/references/rag.md +600 -0
- package/bin/skills/langsmith/SKILL.md +422 -0
- package/bin/skills/langsmith/references/advanced-usage.md +548 -0
- package/bin/skills/langsmith/references/troubleshooting.md +537 -0
- package/bin/skills/litgpt/SKILL.md +469 -0
- package/bin/skills/litgpt/references/custom-models.md +568 -0
- package/bin/skills/litgpt/references/distributed-training.md +451 -0
- package/bin/skills/litgpt/references/supported-models.md +336 -0
- package/bin/skills/litgpt/references/training-recipes.md +619 -0
- package/bin/skills/llama-cpp/SKILL.md +258 -0
- package/bin/skills/llama-cpp/references/optimization.md +89 -0
- package/bin/skills/llama-cpp/references/quantization.md +213 -0
- package/bin/skills/llama-cpp/references/server.md +125 -0
- package/bin/skills/llama-factory/SKILL.md +80 -0
- package/bin/skills/llama-factory/references/_images.md +23 -0
- package/bin/skills/llama-factory/references/advanced.md +1055 -0
- package/bin/skills/llama-factory/references/getting_started.md +349 -0
- package/bin/skills/llama-factory/references/index.md +19 -0
- package/bin/skills/llama-factory/references/other.md +31 -0
- package/bin/skills/llamaguard/SKILL.md +337 -0
- package/bin/skills/llamaindex/SKILL.md +569 -0
- package/bin/skills/llamaindex/references/agents.md +83 -0
- package/bin/skills/llamaindex/references/data_connectors.md +108 -0
- package/bin/skills/llamaindex/references/query_engines.md +406 -0
- package/bin/skills/llava/SKILL.md +304 -0
- package/bin/skills/llava/references/training.md +197 -0
- package/bin/skills/lm-evaluation-harness/SKILL.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/api-evaluation.md +490 -0
- package/bin/skills/lm-evaluation-harness/references/benchmark-guide.md +488 -0
- package/bin/skills/lm-evaluation-harness/references/custom-tasks.md +602 -0
- package/bin/skills/lm-evaluation-harness/references/distributed-eval.md +519 -0
- package/bin/skills/long-context/SKILL.md +536 -0
- package/bin/skills/long-context/references/extension_methods.md +468 -0
- package/bin/skills/long-context/references/fine_tuning.md +611 -0
- package/bin/skills/long-context/references/rope.md +402 -0
- package/bin/skills/mamba/SKILL.md +260 -0
- package/bin/skills/mamba/references/architecture-details.md +206 -0
- package/bin/skills/mamba/references/benchmarks.md +255 -0
- package/bin/skills/mamba/references/training-guide.md +388 -0
- package/bin/skills/megatron-core/SKILL.md +366 -0
- package/bin/skills/megatron-core/references/benchmarks.md +249 -0
- package/bin/skills/megatron-core/references/parallelism-guide.md +404 -0
- package/bin/skills/megatron-core/references/production-examples.md +473 -0
- package/bin/skills/megatron-core/references/training-recipes.md +547 -0
- package/bin/skills/miles/SKILL.md +315 -0
- package/bin/skills/miles/references/api-reference.md +141 -0
- package/bin/skills/miles/references/troubleshooting.md +352 -0
- package/bin/skills/mlflow/SKILL.md +704 -0
- package/bin/skills/mlflow/references/deployment.md +744 -0
- package/bin/skills/mlflow/references/model-registry.md +770 -0
- package/bin/skills/mlflow/references/tracking.md +680 -0
- package/bin/skills/modal/SKILL.md +341 -0
- package/bin/skills/modal/references/advanced-usage.md +503 -0
- package/bin/skills/modal/references/troubleshooting.md +494 -0
- package/bin/skills/model-merging/SKILL.md +539 -0
- package/bin/skills/model-merging/references/evaluation.md +462 -0
- package/bin/skills/model-merging/references/examples.md +428 -0
- package/bin/skills/model-merging/references/methods.md +352 -0
- package/bin/skills/model-pruning/SKILL.md +495 -0
- package/bin/skills/model-pruning/references/wanda.md +347 -0
- package/bin/skills/moe-training/SKILL.md +526 -0
- package/bin/skills/moe-training/references/architectures.md +432 -0
- package/bin/skills/moe-training/references/inference.md +348 -0
- package/bin/skills/moe-training/references/training.md +425 -0
- package/bin/skills/nanogpt/SKILL.md +290 -0
- package/bin/skills/nanogpt/references/architecture.md +382 -0
- package/bin/skills/nanogpt/references/data.md +476 -0
- package/bin/skills/nanogpt/references/training.md +564 -0
- package/bin/skills/nemo-curator/SKILL.md +383 -0
- package/bin/skills/nemo-curator/references/deduplication.md +87 -0
- package/bin/skills/nemo-curator/references/filtering.md +102 -0
- package/bin/skills/nemo-evaluator/SKILL.md +494 -0
- package/bin/skills/nemo-evaluator/references/adapter-system.md +340 -0
- package/bin/skills/nemo-evaluator/references/configuration.md +447 -0
- package/bin/skills/nemo-evaluator/references/custom-benchmarks.md +315 -0
- package/bin/skills/nemo-evaluator/references/execution-backends.md +361 -0
- package/bin/skills/nemo-guardrails/SKILL.md +297 -0
- package/bin/skills/nnsight/SKILL.md +436 -0
- package/bin/skills/nnsight/references/README.md +78 -0
- package/bin/skills/nnsight/references/api.md +344 -0
- package/bin/skills/nnsight/references/tutorials.md +300 -0
- package/bin/skills/openrlhf/SKILL.md +249 -0
- package/bin/skills/openrlhf/references/algorithm-comparison.md +404 -0
- package/bin/skills/openrlhf/references/custom-rewards.md +530 -0
- package/bin/skills/openrlhf/references/hybrid-engine.md +287 -0
- package/bin/skills/openrlhf/references/multi-node-training.md +454 -0
- package/bin/skills/outlines/SKILL.md +652 -0
- package/bin/skills/outlines/references/backends.md +615 -0
- package/bin/skills/outlines/references/examples.md +773 -0
- package/bin/skills/outlines/references/json_generation.md +652 -0
- package/bin/skills/peft/SKILL.md +431 -0
- package/bin/skills/peft/references/advanced-usage.md +514 -0
- package/bin/skills/peft/references/troubleshooting.md +480 -0
- package/bin/skills/phoenix/SKILL.md +475 -0
- package/bin/skills/phoenix/references/advanced-usage.md +619 -0
- package/bin/skills/phoenix/references/troubleshooting.md +538 -0
- package/bin/skills/pinecone/SKILL.md +358 -0
- package/bin/skills/pinecone/references/deployment.md +181 -0
- package/bin/skills/pytorch-fsdp/SKILL.md +126 -0
- package/bin/skills/pytorch-fsdp/references/index.md +7 -0
- package/bin/skills/pytorch-fsdp/references/other.md +4249 -0
- package/bin/skills/pytorch-lightning/SKILL.md +346 -0
- package/bin/skills/pytorch-lightning/references/callbacks.md +436 -0
- package/bin/skills/pytorch-lightning/references/distributed.md +490 -0
- package/bin/skills/pytorch-lightning/references/hyperparameter-tuning.md +556 -0
- package/bin/skills/pyvene/SKILL.md +473 -0
- package/bin/skills/pyvene/references/README.md +73 -0
- package/bin/skills/pyvene/references/api.md +383 -0
- package/bin/skills/pyvene/references/tutorials.md +376 -0
- package/bin/skills/qdrant/SKILL.md +493 -0
- package/bin/skills/qdrant/references/advanced-usage.md +648 -0
- package/bin/skills/qdrant/references/troubleshooting.md +631 -0
- package/bin/skills/ray-data/SKILL.md +326 -0
- package/bin/skills/ray-data/references/integration.md +82 -0
- package/bin/skills/ray-data/references/transformations.md +83 -0
- package/bin/skills/ray-train/SKILL.md +406 -0
- package/bin/skills/ray-train/references/multi-node.md +628 -0
- package/bin/skills/rwkv/SKILL.md +260 -0
- package/bin/skills/rwkv/references/architecture-details.md +344 -0
- package/bin/skills/rwkv/references/rwkv7.md +386 -0
- package/bin/skills/rwkv/references/state-management.md +369 -0
- package/bin/skills/saelens/SKILL.md +386 -0
- package/bin/skills/saelens/references/README.md +70 -0
- package/bin/skills/saelens/references/api.md +333 -0
- package/bin/skills/saelens/references/tutorials.md +318 -0
- package/bin/skills/segment-anything/SKILL.md +500 -0
- package/bin/skills/segment-anything/references/advanced-usage.md +589 -0
- package/bin/skills/segment-anything/references/troubleshooting.md +484 -0
- package/bin/skills/sentence-transformers/SKILL.md +255 -0
- package/bin/skills/sentence-transformers/references/models.md +123 -0
- package/bin/skills/sentencepiece/SKILL.md +235 -0
- package/bin/skills/sentencepiece/references/algorithms.md +200 -0
- package/bin/skills/sentencepiece/references/training.md +304 -0
- package/bin/skills/sglang/SKILL.md +442 -0
- package/bin/skills/sglang/references/deployment.md +490 -0
- package/bin/skills/sglang/references/radix-attention.md +413 -0
- package/bin/skills/sglang/references/structured-generation.md +541 -0
- package/bin/skills/simpo/SKILL.md +219 -0
- package/bin/skills/simpo/references/datasets.md +478 -0
- package/bin/skills/simpo/references/hyperparameters.md +452 -0
- package/bin/skills/simpo/references/loss-functions.md +350 -0
- package/bin/skills/skypilot/SKILL.md +509 -0
- package/bin/skills/skypilot/references/advanced-usage.md +491 -0
- package/bin/skills/skypilot/references/troubleshooting.md +570 -0
- package/bin/skills/slime/SKILL.md +464 -0
- package/bin/skills/slime/references/api-reference.md +392 -0
- package/bin/skills/slime/references/troubleshooting.md +386 -0
- package/bin/skills/speculative-decoding/SKILL.md +467 -0
- package/bin/skills/speculative-decoding/references/lookahead.md +309 -0
- package/bin/skills/speculative-decoding/references/medusa.md +350 -0
- package/bin/skills/stable-diffusion/SKILL.md +519 -0
- package/bin/skills/stable-diffusion/references/advanced-usage.md +716 -0
- package/bin/skills/stable-diffusion/references/troubleshooting.md +555 -0
- package/bin/skills/tensorboard/SKILL.md +629 -0
- package/bin/skills/tensorboard/references/integrations.md +638 -0
- package/bin/skills/tensorboard/references/profiling.md +545 -0
- package/bin/skills/tensorboard/references/visualization.md +620 -0
- package/bin/skills/tensorrt-llm/SKILL.md +187 -0
- package/bin/skills/tensorrt-llm/references/multi-gpu.md +298 -0
- package/bin/skills/tensorrt-llm/references/optimization.md +242 -0
- package/bin/skills/tensorrt-llm/references/serving.md +470 -0
- package/bin/skills/tinker/SKILL.md +362 -0
- package/bin/skills/tinker/references/api-reference.md +168 -0
- package/bin/skills/tinker/references/getting-started.md +157 -0
- package/bin/skills/tinker/references/loss-functions.md +163 -0
- package/bin/skills/tinker/references/models-and-lora.md +139 -0
- package/bin/skills/tinker/references/recipes.md +280 -0
- package/bin/skills/tinker/references/reinforcement-learning.md +212 -0
- package/bin/skills/tinker/references/rendering.md +243 -0
- package/bin/skills/tinker/references/supervised-learning.md +232 -0
- package/bin/skills/tinker-training-cost/SKILL.md +187 -0
- package/bin/skills/tinker-training-cost/scripts/calculate_cost.py +123 -0
- package/bin/skills/torchforge/SKILL.md +433 -0
- package/bin/skills/torchforge/references/api-reference.md +327 -0
- package/bin/skills/torchforge/references/troubleshooting.md +409 -0
- package/bin/skills/torchtitan/SKILL.md +358 -0
- package/bin/skills/torchtitan/references/checkpoint.md +181 -0
- package/bin/skills/torchtitan/references/custom-models.md +258 -0
- package/bin/skills/torchtitan/references/float8.md +133 -0
- package/bin/skills/torchtitan/references/fsdp.md +126 -0
- package/bin/skills/transformer-lens/SKILL.md +346 -0
- package/bin/skills/transformer-lens/references/README.md +54 -0
- package/bin/skills/transformer-lens/references/api.md +362 -0
- package/bin/skills/transformer-lens/references/tutorials.md +339 -0
- package/bin/skills/trl-fine-tuning/SKILL.md +455 -0
- package/bin/skills/trl-fine-tuning/references/dpo-variants.md +227 -0
- package/bin/skills/trl-fine-tuning/references/online-rl.md +82 -0
- package/bin/skills/trl-fine-tuning/references/reward-modeling.md +122 -0
- package/bin/skills/trl-fine-tuning/references/sft-training.md +168 -0
- package/bin/skills/unsloth/SKILL.md +80 -0
- package/bin/skills/unsloth/references/index.md +7 -0
- package/bin/skills/unsloth/references/llms-full.md +16799 -0
- package/bin/skills/unsloth/references/llms-txt.md +12044 -0
- package/bin/skills/unsloth/references/llms.md +82 -0
- package/bin/skills/verl/SKILL.md +391 -0
- package/bin/skills/verl/references/api-reference.md +301 -0
- package/bin/skills/verl/references/troubleshooting.md +391 -0
- package/bin/skills/vllm/SKILL.md +364 -0
- package/bin/skills/vllm/references/optimization.md +226 -0
- package/bin/skills/vllm/references/quantization.md +284 -0
- package/bin/skills/vllm/references/server-deployment.md +255 -0
- package/bin/skills/vllm/references/troubleshooting.md +447 -0
- package/bin/skills/weights-and-biases/SKILL.md +590 -0
- package/bin/skills/weights-and-biases/references/artifacts.md +584 -0
- package/bin/skills/weights-and-biases/references/integrations.md +700 -0
- package/bin/skills/weights-and-biases/references/sweeps.md +847 -0
- package/bin/skills/whisper/SKILL.md +317 -0
- package/bin/skills/whisper/references/languages.md +189 -0
- package/bin/synsc +0 -0
- package/package.json +10 -0
|
@@ -0,0 +1,494 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: nemo-evaluator-sdk
|
|
3
|
+
description: Evaluates LLMs across 100+ benchmarks from 18+ harnesses (MMLU, HumanEval, GSM8K, safety, VLM) with multi-backend execution. Use when needing scalable evaluation on local Docker, Slurm HPC, or cloud platforms. NVIDIA's enterprise-grade platform with container-first architecture for reproducible benchmarking.
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
author: Synthetic Sciences
|
|
6
|
+
license: MIT
|
|
7
|
+
tags: [Evaluation, NeMo, NVIDIA, Benchmarking, MMLU, HumanEval, Multi-Backend, Slurm, Docker, Reproducible, Enterprise]
|
|
8
|
+
dependencies: [nemo-evaluator-launcher>=0.1.25, docker]
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# NeMo Evaluator SDK - Enterprise LLM Benchmarking
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
NeMo Evaluator SDK evaluates LLMs across 100+ benchmarks from 18+ harnesses using containerized, reproducible evaluation with multi-backend execution (local Docker, Slurm HPC, Lepton cloud).
|
|
16
|
+
|
|
17
|
+
**Installation**:
|
|
18
|
+
```bash
|
|
19
|
+
pip install nemo-evaluator-launcher
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Set API key and run evaluation**:
|
|
23
|
+
```bash
|
|
24
|
+
export NGC_API_KEY=nvapi-your-key-here
|
|
25
|
+
|
|
26
|
+
# Create minimal config
|
|
27
|
+
cat > config.yaml << 'EOF'
|
|
28
|
+
defaults:
|
|
29
|
+
- execution: local
|
|
30
|
+
- deployment: none
|
|
31
|
+
- _self_
|
|
32
|
+
|
|
33
|
+
execution:
|
|
34
|
+
output_dir: ./results
|
|
35
|
+
|
|
36
|
+
target:
|
|
37
|
+
api_endpoint:
|
|
38
|
+
model_id: meta/llama-3.1-8b-instruct
|
|
39
|
+
url: https://integrate.api.nvidia.com/v1/chat/completions
|
|
40
|
+
api_key_name: NGC_API_KEY
|
|
41
|
+
|
|
42
|
+
evaluation:
|
|
43
|
+
tasks:
|
|
44
|
+
- name: ifeval
|
|
45
|
+
EOF
|
|
46
|
+
|
|
47
|
+
# Run evaluation
|
|
48
|
+
nemo-evaluator-launcher run --config-dir . --config-name config
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**View available tasks**:
|
|
52
|
+
```bash
|
|
53
|
+
nemo-evaluator-launcher ls tasks
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Common Workflows
|
|
57
|
+
|
|
58
|
+
### Workflow 1: Evaluate Model on Standard Benchmarks
|
|
59
|
+
|
|
60
|
+
Run core academic benchmarks (MMLU, GSM8K, IFEval) on any OpenAI-compatible endpoint.
|
|
61
|
+
|
|
62
|
+
**Checklist**:
|
|
63
|
+
```
|
|
64
|
+
Standard Evaluation:
|
|
65
|
+
- [ ] Step 1: Configure API endpoint
|
|
66
|
+
- [ ] Step 2: Select benchmarks
|
|
67
|
+
- [ ] Step 3: Run evaluation
|
|
68
|
+
- [ ] Step 4: Check results
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
**Step 1: Configure API endpoint**
|
|
72
|
+
|
|
73
|
+
```yaml
|
|
74
|
+
# config.yaml
|
|
75
|
+
defaults:
|
|
76
|
+
- execution: local
|
|
77
|
+
- deployment: none
|
|
78
|
+
- _self_
|
|
79
|
+
|
|
80
|
+
execution:
|
|
81
|
+
output_dir: ./results
|
|
82
|
+
|
|
83
|
+
target:
|
|
84
|
+
api_endpoint:
|
|
85
|
+
model_id: meta/llama-3.1-8b-instruct
|
|
86
|
+
url: https://integrate.api.nvidia.com/v1/chat/completions
|
|
87
|
+
api_key_name: NGC_API_KEY
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
For self-hosted endpoints (vLLM, TRT-LLM):
|
|
91
|
+
```yaml
|
|
92
|
+
target:
|
|
93
|
+
api_endpoint:
|
|
94
|
+
model_id: my-model
|
|
95
|
+
url: http://localhost:8000/v1/chat/completions
|
|
96
|
+
api_key_name: "" # No key needed for local
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Step 2: Select benchmarks**
|
|
100
|
+
|
|
101
|
+
Add tasks to your config:
|
|
102
|
+
```yaml
|
|
103
|
+
evaluation:
|
|
104
|
+
tasks:
|
|
105
|
+
- name: ifeval # Instruction following
|
|
106
|
+
- name: gpqa_diamond # Graduate-level QA
|
|
107
|
+
env_vars:
|
|
108
|
+
HF_TOKEN: HF_TOKEN # Some tasks need HF token
|
|
109
|
+
- name: gsm8k_cot_instruct # Math reasoning
|
|
110
|
+
- name: humaneval # Code generation
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
**Step 3: Run evaluation**
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Run with config file
|
|
117
|
+
nemo-evaluator-launcher run \
|
|
118
|
+
--config-dir . \
|
|
119
|
+
--config-name config
|
|
120
|
+
|
|
121
|
+
# Override output directory
|
|
122
|
+
nemo-evaluator-launcher run \
|
|
123
|
+
--config-dir . \
|
|
124
|
+
--config-name config \
|
|
125
|
+
-o execution.output_dir=./my_results
|
|
126
|
+
|
|
127
|
+
# Limit samples for quick testing
|
|
128
|
+
nemo-evaluator-launcher run \
|
|
129
|
+
--config-dir . \
|
|
130
|
+
--config-name config \
|
|
131
|
+
-o +evaluation.nemo_evaluator_config.config.params.limit_samples=10
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**Step 4: Check results**
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# Check job status
|
|
138
|
+
nemo-evaluator-launcher status <invocation_id>
|
|
139
|
+
|
|
140
|
+
# List all runs
|
|
141
|
+
nemo-evaluator-launcher ls runs
|
|
142
|
+
|
|
143
|
+
# View results
|
|
144
|
+
cat results/<invocation_id>/<task>/artifacts/results.yml
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Workflow 2: Run Evaluation on Slurm HPC Cluster
|
|
148
|
+
|
|
149
|
+
Execute large-scale evaluation on HPC infrastructure.
|
|
150
|
+
|
|
151
|
+
**Checklist**:
|
|
152
|
+
```
|
|
153
|
+
Slurm Evaluation:
|
|
154
|
+
- [ ] Step 1: Configure Slurm settings
|
|
155
|
+
- [ ] Step 2: Set up model deployment
|
|
156
|
+
- [ ] Step 3: Launch evaluation
|
|
157
|
+
- [ ] Step 4: Monitor job status
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
**Step 1: Configure Slurm settings**
|
|
161
|
+
|
|
162
|
+
```yaml
|
|
163
|
+
# slurm_config.yaml
|
|
164
|
+
defaults:
|
|
165
|
+
- execution: slurm
|
|
166
|
+
- deployment: vllm
|
|
167
|
+
- _self_
|
|
168
|
+
|
|
169
|
+
execution:
|
|
170
|
+
  hostname: cluster.example.com
  account: my_slurm_account
  partition: gpu
  output_dir: /shared/results
  walltime: "04:00:00"
  nodes: 1
  gpus_per_node: 8
```

**Step 2: Set up model deployment**

```yaml
deployment:
  checkpoint_path: /shared/models/llama-3.1-8b
  tensor_parallel_size: 2
  data_parallel_size: 4
  max_model_len: 4096

target:
  api_endpoint:
    model_id: llama-3.1-8b
    # URL auto-generated by deployment
```

**Step 3: Launch evaluation**

```bash
nemo-evaluator-launcher run \
  --config-dir . \
  --config-name slurm_config
```

**Step 4: Monitor job status**

```bash
# Check status (queries sacct)
nemo-evaluator-launcher status <invocation_id>

# View detailed info
nemo-evaluator-launcher info <invocation_id>

# Kill if needed
nemo-evaluator-launcher kill <invocation_id>
```

### Workflow 3: Compare Multiple Models

Benchmark multiple models on the same tasks for comparison.

**Checklist**:
```
Model Comparison:
- [ ] Step 1: Create base config
- [ ] Step 2: Run evaluations with overrides
- [ ] Step 3: Export and compare results
```

**Step 1: Create base config**

```yaml
# base_eval.yaml
defaults:
  - execution: local
  - deployment: none
  - _self_

execution:
  output_dir: ./comparison_results

evaluation:
  nemo_evaluator_config:
    config:
      params:
        temperature: 0.01
        parallelism: 4
  tasks:
    - name: mmlu_pro
    - name: gsm8k_cot_instruct
    - name: ifeval
```

**Step 2: Run evaluations with model overrides**

```bash
# Evaluate Llama 3.1 8B
nemo-evaluator-launcher run \
  --config-dir . \
  --config-name base_eval \
  -o target.api_endpoint.model_id=meta/llama-3.1-8b-instruct \
  -o target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions

# Evaluate Mistral 7B
nemo-evaluator-launcher run \
  --config-dir . \
  --config-name base_eval \
  -o target.api_endpoint.model_id=mistralai/mistral-7b-instruct-v0.3 \
  -o target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions
```

**Step 3: Export and compare**

```bash
# Export to MLflow
nemo-evaluator-launcher export <invocation_id_1> --dest mlflow
nemo-evaluator-launcher export <invocation_id_2> --dest mlflow

# Export to local JSON
nemo-evaluator-launcher export <invocation_id> --dest local --format json

# Export to Weights & Biases
nemo-evaluator-launcher export <invocation_id> --dest wandb
```

### Workflow 4: Safety and Vision-Language Evaluation

Evaluate models on safety benchmarks and VLM tasks.

**Checklist**:
```
Safety/VLM Evaluation:
- [ ] Step 1: Configure safety tasks
- [ ] Step 2: Set up VLM tasks (if applicable)
- [ ] Step 3: Run evaluation
```

**Step 1: Configure safety tasks**

```yaml
evaluation:
  tasks:
    - name: aegis      # Safety harness
    - name: wildguard  # Safety classification
    - name: garak      # Security probing
```

**Step 2: Configure VLM tasks**

```yaml
# For vision-language models
target:
  api_endpoint:
    type: vlm  # Vision-language endpoint
    model_id: nvidia/llama-3.2-90b-vision-instruct
    url: https://integrate.api.nvidia.com/v1/chat/completions

evaluation:
  tasks:
    - name: ocrbench  # OCR evaluation
    - name: chartqa   # Chart understanding
    - name: mmmu      # Multimodal understanding
```

## When to Use vs Alternatives

**Use NeMo Evaluator when:**
- Need **100+ benchmarks** from 18+ harnesses in one platform
- Running evaluations on **Slurm HPC clusters** or cloud
- Requiring **reproducible** containerized evaluation
- Evaluating against **OpenAI-compatible APIs** (vLLM, TRT-LLM, NIMs)
- Need **enterprise-grade** evaluation with result export (MLflow, W&B)

**Use alternatives instead:**
- **lm-evaluation-harness**: Simpler setup for quick local evaluation
- **bigcode-evaluation-harness**: Focused only on code benchmarks
- **HELM**: Stanford's broader evaluation (fairness, efficiency)
- **Custom scripts**: Highly specialized domain evaluation

## Supported Harnesses and Tasks

| Harness | Task Count | Categories |
|---------|-----------|------------|
| `lm-evaluation-harness` | 60+ | MMLU, GSM8K, HellaSwag, ARC |
| `simple-evals` | 20+ | GPQA, MATH, AIME |
| `bigcode-evaluation-harness` | 25+ | HumanEval, MBPP, MultiPL-E |
| `safety-harness` | 3 | Aegis, WildGuard |
| `garak` | 1 | Security probing |
| `vlmevalkit` | 6+ | OCRBench, ChartQA, MMMU |
| `bfcl` | 6 | Function calling v2/v3 |
| `mtbench` | 2 | Multi-turn conversation |
| `livecodebench` | 10+ | Live coding evaluation |
| `helm` | 15 | Medical domain |
| `nemo-skills` | 8 | Math, science, agentic |

## Common Issues

**Issue: Container pull fails**

Ensure NGC credentials are configured:
```bash
docker login nvcr.io -u '$oauthtoken' -p $NGC_API_KEY
```

**Issue: Task requires environment variable**

Some tasks need HF_TOKEN or JUDGE_API_KEY:
```yaml
evaluation:
  tasks:
    - name: gpqa_diamond
      env_vars:
        HF_TOKEN: HF_TOKEN  # Maps env var name to env var
```

**Issue: Evaluation timeout**

Increase parallelism or reduce samples:
```bash
-o +evaluation.nemo_evaluator_config.config.params.parallelism=8
-o +evaluation.nemo_evaluator_config.config.params.limit_samples=100
```

**Issue: Slurm job not starting**

Check Slurm account and partition:
```yaml
execution:
  account: correct_account
  partition: gpu
  qos: normal  # May need specific QOS
```

**Issue: Different results than expected**

Verify configuration matches reported settings:
```yaml
evaluation:
  nemo_evaluator_config:
    config:
      params:
        temperature: 0.0  # Deterministic
        num_fewshot: 5    # Check paper's fewshot count
```

## CLI Reference

| Command | Description |
|---------|-------------|
| `run` | Execute evaluation with config |
| `status <id>` | Check job status |
| `info <id>` | View detailed job info |
| `ls tasks` | List available benchmarks |
| `ls runs` | List all invocations |
| `export <id>` | Export results (mlflow/wandb/local) |
| `kill <id>` | Terminate running job |

## Configuration Override Examples

```bash
# Override model endpoint
-o target.api_endpoint.model_id=my-model
-o target.api_endpoint.url=http://localhost:8000/v1/chat/completions

# Add evaluation parameters
-o +evaluation.nemo_evaluator_config.config.params.temperature=0.5
-o +evaluation.nemo_evaluator_config.config.params.parallelism=8
-o +evaluation.nemo_evaluator_config.config.params.limit_samples=50

# Change execution settings
-o execution.output_dir=/custom/path
-o execution.mode=parallel

# Dynamically set tasks
-o 'evaluation.tasks=[{name: ifeval}, {name: gsm8k}]'
```

## Python API Usage

For programmatic evaluation without the CLI:

```python
from nemo_evaluator.core.evaluate import evaluate
from nemo_evaluator.api.api_dataclasses import (
    EvaluationConfig,
    EvaluationTarget,
    ApiEndpoint,
    EndpointType,
    ConfigParams
)

# Configure evaluation
eval_config = EvaluationConfig(
    type="mmlu_pro",
    output_dir="./results",
    params=ConfigParams(
        limit_samples=10,
        temperature=0.0,
        max_new_tokens=1024,
        parallelism=4
    )
)

# Configure target endpoint
target_config = EvaluationTarget(
    api_endpoint=ApiEndpoint(
        model_id="meta/llama-3.1-8b-instruct",
        url="https://integrate.api.nvidia.com/v1/chat/completions",
        type=EndpointType.CHAT,
        api_key="nvapi-your-key-here"
    )
)

# Run evaluation
result = evaluate(eval_cfg=eval_config, target_cfg=target_config)
```

## Advanced Topics

**Multi-backend execution**: See [references/execution-backends.md](references/execution-backends.md)
**Configuration deep-dive**: See [references/configuration.md](references/configuration.md)
**Adapter and interceptor system**: See [references/adapter-system.md](references/adapter-system.md)
**Custom benchmark integration**: See [references/custom-benchmarks.md](references/custom-benchmarks.md)

## Requirements

- **Python**: 3.10-3.13
- **Docker**: Required for local execution
- **NGC API Key**: For pulling containers and using NVIDIA Build
- **HF_TOKEN**: Required for some benchmarks (GPQA, MMLU)

## Resources

- **GitHub**: https://github.com/NVIDIA-NeMo/Evaluator
- **NGC Containers**: nvcr.io/nvidia/eval-factory/
- **NVIDIA Build**: https://build.nvidia.com (free hosted models)
- **Documentation**: https://github.com/NVIDIA-NeMo/Evaluator/tree/main/docs