opencode-skills-antigravity 1.0.40 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled-skills/.antigravity-install-manifest.json +7 -1
- package/bundled-skills/docs/integrations/jetski-cortex.md +3 -3
- package/bundled-skills/docs/integrations/jetski-gemini-loader/README.md +1 -1
- package/bundled-skills/docs/maintainers/repo-growth-seo.md +3 -3
- package/bundled-skills/docs/maintainers/skills-update-guide.md +1 -1
- package/bundled-skills/docs/sources/sources.md +2 -2
- package/bundled-skills/docs/users/bundles.md +1 -1
- package/bundled-skills/docs/users/claude-code-skills.md +1 -1
- package/bundled-skills/docs/users/gemini-cli-skills.md +1 -1
- package/bundled-skills/docs/users/getting-started.md +1 -1
- package/bundled-skills/docs/users/kiro-integration.md +1 -1
- package/bundled-skills/docs/users/usage.md +4 -4
- package/bundled-skills/docs/users/visual-guide.md +4 -4
- package/bundled-skills/hugging-face-cli/SKILL.md +192 -195
- package/bundled-skills/hugging-face-community-evals/SKILL.md +213 -0
- package/bundled-skills/hugging-face-community-evals/examples/.env.example +3 -0
- package/bundled-skills/hugging-face-community-evals/examples/USAGE_EXAMPLES.md +101 -0
- package/bundled-skills/hugging-face-community-evals/scripts/inspect_eval_uv.py +104 -0
- package/bundled-skills/hugging-face-community-evals/scripts/inspect_vllm_uv.py +306 -0
- package/bundled-skills/hugging-face-community-evals/scripts/lighteval_vllm_uv.py +297 -0
- package/bundled-skills/hugging-face-dataset-viewer/SKILL.md +120 -120
- package/bundled-skills/hugging-face-gradio/SKILL.md +304 -0
- package/bundled-skills/hugging-face-gradio/examples.md +613 -0
- package/bundled-skills/hugging-face-jobs/SKILL.md +25 -18
- package/bundled-skills/hugging-face-jobs/index.html +216 -0
- package/bundled-skills/hugging-face-jobs/references/hardware_guide.md +336 -0
- package/bundled-skills/hugging-face-jobs/references/hub_saving.md +352 -0
- package/bundled-skills/hugging-face-jobs/references/token_usage.md +570 -0
- package/bundled-skills/hugging-face-jobs/references/troubleshooting.md +475 -0
- package/bundled-skills/hugging-face-jobs/scripts/cot-self-instruct.py +718 -0
- package/bundled-skills/hugging-face-jobs/scripts/finepdfs-stats.py +546 -0
- package/bundled-skills/hugging-face-jobs/scripts/generate-responses.py +587 -0
- package/bundled-skills/hugging-face-model-trainer/SKILL.md +11 -12
- package/bundled-skills/hugging-face-model-trainer/references/gguf_conversion.md +296 -0
- package/bundled-skills/hugging-face-model-trainer/references/hardware_guide.md +283 -0
- package/bundled-skills/hugging-face-model-trainer/references/hub_saving.md +364 -0
- package/bundled-skills/hugging-face-model-trainer/references/local_training_macos.md +231 -0
- package/bundled-skills/hugging-face-model-trainer/references/reliability_principles.md +371 -0
- package/bundled-skills/hugging-face-model-trainer/references/trackio_guide.md +189 -0
- package/bundled-skills/hugging-face-model-trainer/references/training_methods.md +150 -0
- package/bundled-skills/hugging-face-model-trainer/references/training_patterns.md +203 -0
- package/bundled-skills/hugging-face-model-trainer/references/troubleshooting.md +282 -0
- package/bundled-skills/hugging-face-model-trainer/references/unsloth.md +313 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/convert_to_gguf.py +424 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/dataset_inspector.py +417 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/estimate_cost.py +150 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/train_dpo_example.py +106 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/train_grpo_example.py +89 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/train_sft_example.py +122 -0
- package/bundled-skills/hugging-face-model-trainer/scripts/unsloth_sft_example.py +512 -0
- package/bundled-skills/hugging-face-paper-publisher/SKILL.md +11 -4
- package/bundled-skills/hugging-face-paper-publisher/examples/example_usage.md +326 -0
- package/bundled-skills/hugging-face-paper-publisher/references/quick_reference.md +216 -0
- package/bundled-skills/hugging-face-paper-publisher/scripts/paper_manager.py +606 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/arxiv.md +299 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/ml-report.md +358 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/modern.md +319 -0
- package/bundled-skills/hugging-face-paper-publisher/templates/standard.md +201 -0
- package/bundled-skills/hugging-face-papers/SKILL.md +241 -0
- package/bundled-skills/hugging-face-trackio/.claude-plugin/plugin.json +19 -0
- package/bundled-skills/hugging-face-trackio/SKILL.md +117 -0
- package/bundled-skills/hugging-face-trackio/references/alerts.md +196 -0
- package/bundled-skills/hugging-face-trackio/references/logging_metrics.md +206 -0
- package/bundled-skills/hugging-face-trackio/references/retrieving_metrics.md +251 -0
- package/bundled-skills/hugging-face-vision-trainer/SKILL.md +595 -0
- package/bundled-skills/hugging-face-vision-trainer/references/finetune_sam2_trainer.md +254 -0
- package/bundled-skills/hugging-face-vision-trainer/references/hub_saving.md +618 -0
- package/bundled-skills/hugging-face-vision-trainer/references/image_classification_training_notebook.md +279 -0
- package/bundled-skills/hugging-face-vision-trainer/references/object_detection_training_notebook.md +700 -0
- package/bundled-skills/hugging-face-vision-trainer/references/reliability_principles.md +310 -0
- package/bundled-skills/hugging-face-vision-trainer/references/timm_trainer.md +91 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/dataset_inspector.py +814 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/estimate_cost.py +217 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/image_classification_training.py +383 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/object_detection_training.py +710 -0
- package/bundled-skills/hugging-face-vision-trainer/scripts/sam_segmentation_training.py +382 -0
- package/bundled-skills/transformers-js/SKILL.md +639 -0
- package/bundled-skills/transformers-js/references/CACHE.md +339 -0
- package/bundled-skills/transformers-js/references/CONFIGURATION.md +390 -0
- package/bundled-skills/transformers-js/references/EXAMPLES.md +605 -0
- package/bundled-skills/transformers-js/references/MODEL_ARCHITECTURES.md +167 -0
- package/bundled-skills/transformers-js/references/PIPELINE_OPTIONS.md +545 -0
- package/bundled-skills/transformers-js/references/TEXT_GENERATION.md +315 -0
- package/package.json +1 -1
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Usage Examples
|
|
2
|
+
|
|
3
|
+
This document provides practical examples for **running evaluations locally** against Hugging Face Hub models.
|
|
4
|
+
|
|
5
|
+
## What this skill covers
|
|
6
|
+
|
|
7
|
+
- `inspect-ai` local runs
|
|
8
|
+
- `inspect-ai` with `vllm` or Transformers backends
|
|
9
|
+
- `lighteval` local runs with `vllm` or `accelerate`
|
|
10
|
+
- smoke tests and backend fallback patterns
|
|
11
|
+
|
|
12
|
+
## What this skill does NOT cover
|
|
13
|
+
|
|
14
|
+
- `model-index`
|
|
15
|
+
- `.eval_results`
|
|
16
|
+
- community eval publication workflows
|
|
17
|
+
- model-card PR creation
|
|
18
|
+
- Hugging Face Jobs orchestration
|
|
19
|
+
|
|
20
|
+
If you want to run these same scripts remotely, use the `hugging-face-jobs` skill and pass one of the scripts in `scripts/`.
|
|
21
|
+
|
|
22
|
+
## Setup
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
cd skills/hugging-face-community-evals
|
|
26
|
+
export HF_TOKEN=hf_xxx
|
|
27
|
+
uv --version
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
For local GPU runs:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
nvidia-smi
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## inspect-ai examples
|
|
37
|
+
|
|
38
|
+
### Quick smoke test
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv run scripts/inspect_eval_uv.py \
|
|
42
|
+
--model meta-llama/Llama-3.2-1B \
|
|
43
|
+
--task mmlu \
|
|
44
|
+
--limit 10
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Local GPU with vLLM
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
uv run scripts/inspect_vllm_uv.py \
|
|
51
|
+
--model meta-llama/Llama-3.1-8B-Instruct \
|
|
52
|
+
--task gsm8k \
|
|
53
|
+
--limit 20
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Transformers fallback
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
uv run scripts/inspect_vllm_uv.py \
|
|
60
|
+
--model microsoft/phi-2 \
|
|
61
|
+
--task mmlu \
|
|
62
|
+
--backend hf \
|
|
63
|
+
--trust-remote-code \
|
|
64
|
+
--limit 20
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## lighteval examples
|
|
68
|
+
|
|
69
|
+
### Single task
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
uv run scripts/lighteval_vllm_uv.py \
|
|
73
|
+
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
74
|
+
--tasks "leaderboard|mmlu|5" \
|
|
75
|
+
--max-samples 20
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Multiple tasks
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
uv run scripts/lighteval_vllm_uv.py \
|
|
82
|
+
--model meta-llama/Llama-3.2-3B-Instruct \
|
|
83
|
+
--tasks "leaderboard|mmlu|5,leaderboard|gsm8k|5" \
|
|
84
|
+
--max-samples 20 \
|
|
85
|
+
--use-chat-template
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### accelerate fallback
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
uv run scripts/lighteval_vllm_uv.py \
|
|
92
|
+
--model microsoft/phi-2 \
|
|
93
|
+
--tasks "leaderboard|mmlu|5" \
|
|
94
|
+
--backend accelerate \
|
|
95
|
+
--trust-remote-code \
|
|
96
|
+
--max-samples 20
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Hand-off to Hugging Face Jobs
|
|
100
|
+
|
|
101
|
+
When local hardware is not enough, switch to the `hugging-face-jobs` skill and run one of these scripts remotely. Keep the script path and args; move the orchestration there.
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# /// script
|
|
2
|
+
# requires-python = ">=3.10"
|
|
3
|
+
# dependencies = [
|
|
4
|
+
# "inspect-ai>=0.3.0",
|
|
5
|
+
# "inspect-evals",
|
|
6
|
+
# "openai",
|
|
7
|
+
# ]
|
|
8
|
+
# ///
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Entry point script for running inspect-ai evaluations against Hugging Face inference providers.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import os
|
|
18
|
+
import subprocess
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Optional
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _inspect_evals_tasks_root() -> Optional[Path]:
|
|
25
|
+
"""Return the installed inspect_evals package path if available."""
|
|
26
|
+
try:
|
|
27
|
+
import inspect_evals
|
|
28
|
+
|
|
29
|
+
return Path(inspect_evals.__file__).parent
|
|
30
|
+
except Exception:
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _normalize_task(task: str) -> str:
|
|
35
|
+
"""Allow lighteval-style `suite|task|shots` strings by keeping the task name."""
|
|
36
|
+
if "|" in task:
|
|
37
|
+
parts = task.split("|")
|
|
38
|
+
if len(parts) >= 2 and parts[1]:
|
|
39
|
+
return parts[1]
|
|
40
|
+
return task
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main() -> None:
    """Parse CLI arguments and launch one ``inspect eval`` run as a subprocess.

    Targets Hugging Face inference providers (``hf-inference-providers/``
    model prefix); exits by re-raising ``CalledProcessError`` on failure.
    """
    parser = argparse.ArgumentParser(description="Inspect-ai job runner")
    parser.add_argument("--model", required=True, help="Model ID on Hugging Face Hub")
    parser.add_argument("--task", required=True, help="inspect-ai task to execute")
    parser.add_argument(
        "--limit", type=int, default=None, help="Limit number of samples to evaluate"
    )
    parser.add_argument(
        "--tasks-root",
        default=None,
        help="Optional path to inspect task files. Defaults to the installed inspect_evals package.",
    )
    parser.add_argument(
        "--sandbox",
        default="local",
        help="Sandbox backend to use (default: local for HF jobs without Docker).",
    )
    args = parser.parse_args()

    # Mirror HF_TOKEN into the env var names downstream libraries look for,
    # without overwriting values the caller may have set explicitly.
    token = os.getenv("HF_TOKEN")
    if token:
        for var in ("HUGGING_FACE_HUB_TOKEN", "HF_HUB_TOKEN"):
            os.environ.setdefault(var, token)

    task = _normalize_task(args.task)
    root = Path(args.tasks_root) if args.tasks_root else _inspect_evals_tasks_root()
    if root and not root.exists():
        root = None

    cmd = ["inspect", "eval", task]
    cmd += ["--model", f"hf-inference-providers/{args.model}"]
    cmd += ["--log-level", "info"]
    # Reduce batch size to avoid OOM errors (default is 32)
    cmd += ["--max-connections", "1"]
    # Set a small positive temperature (HF doesn't allow temperature=0)
    cmd += ["--temperature", "0.001"]
    if args.sandbox:
        cmd += ["--sandbox", args.sandbox]
    if args.limit:
        cmd += ["--limit", str(args.limit)]

    try:
        # cwd is set so inspect can resolve task files from the tasks root.
        subprocess.run(cmd, check=True, cwd=root)
    except subprocess.CalledProcessError as exc:
        location = f" (cwd={root})" if root else ""
        print(f"Evaluation failed with exit code {exc.returncode}{location}", file=sys.stderr)
        raise
    print("Evaluation complete.")


if __name__ == "__main__":
    main()
|
|
104
|
+
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
# /// script
|
|
2
|
+
# requires-python = ">=3.10"
|
|
3
|
+
# dependencies = [
|
|
4
|
+
# "inspect-ai>=0.3.0",
|
|
5
|
+
# "inspect-evals",
|
|
6
|
+
# "vllm>=0.4.0",
|
|
7
|
+
# "torch>=2.0.0",
|
|
8
|
+
# "transformers>=4.40.0",
|
|
9
|
+
# ]
|
|
10
|
+
# ///
|
|
11
|
+
|
|
12
|
+
"""
|
|
13
|
+
Entry point script for running inspect-ai evaluations with vLLM or HuggingFace Transformers backend.
|
|
14
|
+
|
|
15
|
+
This script runs evaluations on custom HuggingFace models using local GPU inference,
|
|
16
|
+
separate from inference provider scripts (which use external APIs).
|
|
17
|
+
|
|
18
|
+
Usage (standalone):
|
|
19
|
+
uv run scripts/inspect_vllm_uv.py --model "meta-llama/Llama-3.2-1B" --task "mmlu"
|
|
20
|
+
|
|
21
|
+
Model backends:
|
|
22
|
+
- vllm: Fast inference with vLLM (recommended for large models)
|
|
23
|
+
- hf: HuggingFace Transformers backend (broader model compatibility)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import argparse
|
|
29
|
+
import os
|
|
30
|
+
import subprocess
|
|
31
|
+
import sys
|
|
32
|
+
from typing import Optional
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def setup_environment() -> None:
    """Propagate ``HF_TOKEN`` to the alternate env var names HF libraries read.

    Existing values of ``HUGGING_FACE_HUB_TOKEN`` / ``HF_HUB_TOKEN`` are left
    untouched; they are only filled in when absent.
    """
    token = os.getenv("HF_TOKEN")
    if not token:
        return
    for name in ("HUGGING_FACE_HUB_TOKEN", "HF_HUB_TOKEN"):
        os.environ.setdefault(name, token)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def run_inspect_vllm(
    model_id: str,
    task: str,
    limit: Optional[int] = None,
    max_connections: int = 4,
    temperature: float = 0.0,
    tensor_parallel_size: int = 1,
    gpu_memory_utilization: float = 0.8,
    dtype: str = "auto",
    trust_remote_code: bool = False,
    log_level: str = "info",
) -> None:
    """Run an inspect-ai evaluation with the vLLM backend.

    Args:
        model_id: HuggingFace model ID.
        task: inspect-ai task to execute (e.g., "mmlu", "gsm8k").
        limit: Optional cap on the number of samples evaluated.
        max_connections: Maximum concurrent connections.
        temperature: Sampling temperature (vLLM accepts 0.0).
        tensor_parallel_size: Number of GPUs for tensor parallelism.
        gpu_memory_utilization: Fraction of GPU memory vLLM may use.
        dtype: Weight data type (auto, float16, bfloat16).
        trust_remote_code: Allow executing model repository code.
        log_level: Logging verbosity passed through to inspect.
    """
    setup_environment()

    cmd = ["inspect", "eval", task]
    cmd += ["--model", f"vllm/{model_id}"]
    cmd += ["--log-level", log_level]
    cmd += ["--max-connections", str(max_connections)]
    # vLLM supports temperature=0 unlike HF inference providers
    cmd += ["--temperature", str(temperature)]

    # Older inspect-ai CLI versions do not support --model-args; rely on defaults
    # and let vLLM choose sensible settings for small models. Only emit flags
    # whose values differ from those defaults.
    optional_flags = (
        (tensor_parallel_size != 1, "--tensor-parallel-size", str(tensor_parallel_size)),
        (gpu_memory_utilization != 0.8, "--gpu-memory-utilization", str(gpu_memory_utilization)),
        (dtype != "auto", "--dtype", dtype),
    )
    for enabled, flag, value in optional_flags:
        if enabled:
            cmd += [flag, value]
    if trust_remote_code:
        cmd.append("--trust-remote-code")
    if limit:
        cmd += ["--limit", str(limit)]

    print(f"Running: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as exc:
        print(f"Evaluation failed with exit code {exc.returncode}", file=sys.stderr)
        sys.exit(exc.returncode)
    print("Evaluation complete.")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def run_inspect_hf(
    model_id: str,
    task: str,
    limit: Optional[int] = None,
    max_connections: int = 1,
    temperature: float = 0.001,
    device: str = "auto",
    dtype: str = "auto",
    trust_remote_code: bool = False,
    log_level: str = "info",
) -> None:
    """Run an inspect-ai evaluation with the HuggingFace Transformers backend.

    Use this when vLLM does not support the model architecture.

    Args:
        model_id: HuggingFace model ID.
        task: inspect-ai task to execute.
        limit: Optional cap on the number of samples evaluated.
        max_connections: Maximum concurrent connections (keep low for memory).
        temperature: Sampling temperature.
        device: Device to use (auto, cuda, cpu).
        dtype: Weight data type.
        trust_remote_code: Allow executing model repository code.
        log_level: Logging verbosity passed through to inspect.
    """
    setup_environment()

    cmd = ["inspect", "eval", task]
    cmd += ["--model", f"hf/{model_id}"]
    cmd += ["--log-level", log_level]
    cmd += ["--max-connections", str(max_connections)]
    cmd += ["--temperature", str(temperature)]
    # Only forward non-default device/dtype so inspect keeps its own defaults.
    if device != "auto":
        cmd += ["--device", device]
    if dtype != "auto":
        cmd += ["--dtype", dtype]
    if trust_remote_code:
        cmd.append("--trust-remote-code")
    if limit:
        cmd += ["--limit", str(limit)]

    print(f"Running: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as exc:
        print(f"Evaluation failed with exit code {exc.returncode}", file=sys.stderr)
        sys.exit(exc.returncode)
    print("Evaluation complete.")
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def main() -> None:
    """CLI entry point: parse arguments and dispatch to the chosen backend.

    Backend-specific defaults (connections, temperature) are resolved here so
    the two runner functions keep their own independent signatures.
    """
    parser = argparse.ArgumentParser(
        description="Run inspect-ai evaluations with vLLM or HuggingFace Transformers on custom models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run MMLU with vLLM backend
  uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-1B --task mmlu

  # Run with HuggingFace Transformers backend
  uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-1B --task mmlu --backend hf

  # Run with limited samples for testing
  uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-1B --task mmlu --limit 10

  # Run on multiple GPUs with tensor parallelism
  uv run scripts/inspect_vllm_uv.py --model meta-llama/Llama-3.2-70B --task mmlu --tensor-parallel-size 4

Available tasks (from inspect-evals):
  - mmlu: Massive Multitask Language Understanding
  - gsm8k: Grade School Math
  - hellaswag: Common sense reasoning
  - arc_challenge: AI2 Reasoning Challenge
  - truthfulqa: TruthfulQA benchmark
  - winogrande: Winograd Schema Challenge
  - humaneval: Code generation (HumanEval)

""",
    )

    parser.add_argument(
        "--model",
        required=True,
        help="HuggingFace model ID (e.g., meta-llama/Llama-3.2-1B)",
    )
    parser.add_argument(
        "--task",
        required=True,
        help="inspect-ai task to execute (e.g., mmlu, gsm8k)",
    )
    parser.add_argument(
        "--backend",
        choices=["vllm", "hf"],
        default="vllm",
        help="Model backend (default: vllm)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of samples to evaluate",
    )
    parser.add_argument(
        "--max-connections",
        type=int,
        default=None,
        help="Maximum concurrent connections (default: 4 for vllm, 1 for hf)",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=None,
        help="Sampling temperature (default: 0.0 for vllm, 0.001 for hf)",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        type=int,
        default=1,
        help="Number of GPUs for tensor parallelism (vLLM only, default: 1)",
    )
    parser.add_argument(
        "--gpu-memory-utilization",
        type=float,
        default=0.8,
        help="GPU memory fraction to use (vLLM only, default: 0.8)",
    )
    parser.add_argument(
        "--dtype",
        default="auto",
        choices=["auto", "float16", "bfloat16", "float32"],
        help="Data type for model weights (default: auto)",
    )
    parser.add_argument(
        "--device",
        default="auto",
        help="Device for HF backend (auto, cuda, cpu)",
    )
    parser.add_argument(
        "--trust-remote-code",
        action="store_true",
        help="Allow executing remote code from model repository",
    )
    parser.add_argument(
        "--log-level",
        default="info",
        choices=["debug", "info", "warning", "error"],
        help="Logging level (default: info)",
    )

    args = parser.parse_args()

    # Arguments shared by both backends.
    common = dict(
        model_id=args.model,
        task=args.task,
        limit=args.limit,
        dtype=args.dtype,
        trust_remote_code=args.trust_remote_code,
        log_level=args.log_level,
    )

    if args.backend == "vllm":
        run_inspect_vllm(
            max_connections=args.max_connections or 4,
            temperature=0.0 if args.temperature is None else args.temperature,
            tensor_parallel_size=args.tensor_parallel_size,
            gpu_memory_utilization=args.gpu_memory_utilization,
            **common,
        )
    else:
        run_inspect_hf(
            max_connections=args.max_connections or 1,
            temperature=0.001 if args.temperature is None else args.temperature,
            device=args.device,
            **common,
        )


if __name__ == "__main__":
    main()
|