@cliwatch/cli-bench 0.6.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +3 -0
- package/dist/assertions.d.ts +1 -1
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +6 -6
- package/dist/assertions.js.map +1 -1
- package/dist/client/index.d.ts +1 -1
- package/dist/client/index.d.ts.map +1 -1
- package/dist/client/types.gen.d.ts +124 -86
- package/dist/client/types.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.d.ts +57 -36
- package/dist/client/zod.gen.d.ts.map +1 -1
- package/dist/client/zod.gen.js +84 -52
- package/dist/client/zod.gen.js.map +1 -1
- package/dist/config.d.ts +2 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +9 -0
- package/dist/config.js.map +1 -1
- package/dist/exec.d.ts +2 -0
- package/dist/exec.d.ts.map +1 -1
- package/dist/exec.js +6 -2
- package/dist/exec.js.map +1 -1
- package/dist/github-comment.d.ts +16 -0
- package/dist/github-comment.d.ts.map +1 -0
- package/dist/github-comment.js +90 -0
- package/dist/github-comment.js.map +1 -0
- package/dist/index.d.ts +2 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +27 -26
- package/dist/index.js.map +1 -1
- package/dist/models.d.ts +7 -0
- package/dist/models.d.ts.map +1 -1
- package/dist/project.d.ts +11 -2
- package/dist/project.d.ts.map +1 -1
- package/dist/project.js +22 -8
- package/dist/project.js.map +1 -1
- package/dist/providers.d.ts +9 -7
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +26 -8
- package/dist/providers.js.map +1 -1
- package/dist/runner.d.ts +29 -1
- package/dist/runner.d.ts.map +1 -1
- package/dist/runner.js +73 -76
- package/dist/runner.js.map +1 -1
- package/dist/schemas.d.ts +15 -0
- package/dist/schemas.d.ts.map +1 -1
- package/dist/schemas.js +6 -0
- package/dist/schemas.js.map +1 -1
- package/dist/suite-generator.d.ts.map +1 -1
- package/dist/suite-generator.js +63 -11
- package/dist/suite-generator.js.map +1 -1
- package/package.json +2 -2
- package/task_suites/curl.yaml +0 -138
- package/task_suites/docker.yaml +0 -163
- package/task_suites/gh.yaml +0 -118
- package/task_suites/jq.yaml +0 -172
- package/task_suites/kubectl.yaml +0 -74
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cliwatch/cli-bench",
|
|
3
|
-
"version": "0.6.3",
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
|
|
5
5
|
"keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
|
|
6
6
|
"license": "MIT",
|
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
},
|
|
15
15
|
"files": [
|
|
16
16
|
"dist",
|
|
17
|
-
"task_suites",
|
|
18
17
|
"LICENSE",
|
|
19
18
|
"CHANGELOG.md"
|
|
20
19
|
],
|
|
@@ -35,6 +34,7 @@
|
|
|
35
34
|
"test": "vitest"
|
|
36
35
|
},
|
|
37
36
|
"dependencies": {
|
|
37
|
+
"@ai-sdk/google": "^3.0.0",
|
|
38
38
|
"@hey-api/client-fetch": "^0.13.1",
|
|
39
39
|
"ai": "^6.0.18",
|
|
40
40
|
"yaml": "^2.7.0",
|
package/task_suites/curl.yaml
DELETED
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
cli: curl
|
|
2
|
-
version_command: "curl --version | head -1"
|
|
3
|
-
|
|
4
|
-
tasks:
|
|
5
|
-
# -- Easy --------------------------------------------------------------------
|
|
6
|
-
- id: simple-get
|
|
7
|
-
intent: "Fetch the contents of https://httpbin.org/get"
|
|
8
|
-
difficulty: easy
|
|
9
|
-
category: query
|
|
10
|
-
max_turns: 2
|
|
11
|
-
assert:
|
|
12
|
-
- ran: "curl.*httpbin.org/get"
|
|
13
|
-
- exit_code: 0
|
|
14
|
-
- output_contains: "origin"
|
|
15
|
-
|
|
16
|
-
- id: head-request
|
|
17
|
-
intent: "Send a HEAD request to https://httpbin.org/get and show the response headers"
|
|
18
|
-
difficulty: easy
|
|
19
|
-
category: query
|
|
20
|
-
max_turns: 3
|
|
21
|
-
assert:
|
|
22
|
-
- ran: "curl"
|
|
23
|
-
- ran: "-I|--head"
|
|
24
|
-
- output_contains: "HTTP"
|
|
25
|
-
|
|
26
|
-
- id: download-file
|
|
27
|
-
intent: "Download https://httpbin.org/robots.txt and save it as robots.txt"
|
|
28
|
-
difficulty: easy
|
|
29
|
-
category: crud
|
|
30
|
-
max_turns: 3
|
|
31
|
-
assert:
|
|
32
|
-
- ran: "curl"
|
|
33
|
-
- ran: "-o|--output"
|
|
34
|
-
- file_exists: "robots.txt"
|
|
35
|
-
|
|
36
|
-
- id: follow-redirects
|
|
37
|
-
intent: "Fetch https://httpbin.org/redirect/2 and follow all redirects"
|
|
38
|
-
difficulty: easy
|
|
39
|
-
category: query
|
|
40
|
-
max_turns: 3
|
|
41
|
-
assert:
|
|
42
|
-
- ran: "curl"
|
|
43
|
-
- ran: "-L|--location"
|
|
44
|
-
- exit_code: 0
|
|
45
|
-
|
|
46
|
-
# -- Medium ------------------------------------------------------------------
|
|
47
|
-
- id: post-json
|
|
48
|
-
intent: "Send a POST request to https://httpbin.org/post with JSON body {\"name\": \"bench\", \"version\": 1} and set the Content-Type header to application/json"
|
|
49
|
-
difficulty: medium
|
|
50
|
-
category: crud
|
|
51
|
-
max_turns: 5
|
|
52
|
-
assert:
|
|
53
|
-
- ran: "curl"
|
|
54
|
-
- ran: "-X POST|--request POST|-d|--data"
|
|
55
|
-
- ran: "Content-Type.*application/json"
|
|
56
|
-
- output_contains: "bench"
|
|
57
|
-
|
|
58
|
-
- id: custom-headers
|
|
59
|
-
intent: "Send a GET request to https://httpbin.org/headers with custom headers X-Request-ID: abc123 and Accept: application/xml"
|
|
60
|
-
difficulty: medium
|
|
61
|
-
category: query
|
|
62
|
-
max_turns: 5
|
|
63
|
-
assert:
|
|
64
|
-
- ran: "curl"
|
|
65
|
-
- ran: "-H|--header"
|
|
66
|
-
- ran: "X-Request-ID"
|
|
67
|
-
- output_contains: "abc123"
|
|
68
|
-
|
|
69
|
-
- id: basic-auth
|
|
70
|
-
intent: "Send a GET request to https://httpbin.org/basic-auth/user/passwd using basic authentication with username 'user' and password 'passwd'"
|
|
71
|
-
difficulty: medium
|
|
72
|
-
category: auth
|
|
73
|
-
max_turns: 5
|
|
74
|
-
assert:
|
|
75
|
-
- ran: "curl"
|
|
76
|
-
- ran: "-u|--user|user:passwd"
|
|
77
|
-
- output_contains: "authenticated"
|
|
78
|
-
|
|
79
|
-
- id: verbose-timing
|
|
80
|
-
intent: "Fetch https://httpbin.org/get and show the total time taken for the request using curl's write-out feature"
|
|
81
|
-
difficulty: medium
|
|
82
|
-
category: output
|
|
83
|
-
max_turns: 5
|
|
84
|
-
assert:
|
|
85
|
-
- ran: "curl"
|
|
86
|
-
- ran: "-w|--write-out|time_total"
|
|
87
|
-
|
|
88
|
-
# -- Hard --------------------------------------------------------------------
|
|
89
|
-
- id: put-with-file
|
|
90
|
-
intent: "Upload the file /tmp/bench-workspace/data.json to https://httpbin.org/put using a PUT request with Content-Type application/json"
|
|
91
|
-
difficulty: hard
|
|
92
|
-
category: crud
|
|
93
|
-
setup:
|
|
94
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
95
|
-
- "echo '{\"key\": \"value\"}' > /tmp/bench-workspace/data.json"
|
|
96
|
-
max_turns: 7
|
|
97
|
-
assert:
|
|
98
|
-
- ran: "curl"
|
|
99
|
-
- ran: "-X PUT|--request PUT|-T"
|
|
100
|
-
- ran: "data.json"
|
|
101
|
-
- output_contains: "key"
|
|
102
|
-
|
|
103
|
-
- id: retry-with-timeout
|
|
104
|
-
intent: "Fetch https://httpbin.org/delay/1 with a 5 second timeout, retry 3 times on failure, and save the response to response.json"
|
|
105
|
-
difficulty: hard
|
|
106
|
-
category: crud
|
|
107
|
-
max_turns: 7
|
|
108
|
-
assert:
|
|
109
|
-
- ran: "curl"
|
|
110
|
-
- ran: "--retry.*3"
|
|
111
|
-
- ran: "--max-time|--connect-timeout|-m"
|
|
112
|
-
- ran: "-o|--output"
|
|
113
|
-
|
|
114
|
-
- id: multipart-upload
|
|
115
|
-
intent: "Send a multipart/form-data POST to https://httpbin.org/post with a field 'username' set to 'admin' and a file field 'config' uploading /tmp/bench-workspace/app.conf"
|
|
116
|
-
difficulty: hard
|
|
117
|
-
category: crud
|
|
118
|
-
setup:
|
|
119
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
120
|
-
- "echo 'port=8080' > /tmp/bench-workspace/app.conf"
|
|
121
|
-
max_turns: 7
|
|
122
|
-
assert:
|
|
123
|
-
- ran: "curl"
|
|
124
|
-
- ran: "-F|--form"
|
|
125
|
-
- ran: "username.*admin"
|
|
126
|
-
- ran: "config.*@.*app.conf"
|
|
127
|
-
- output_contains: "admin"
|
|
128
|
-
|
|
129
|
-
- id: conditional-request
|
|
130
|
-
intent: "Fetch https://httpbin.org/cache and use conditional headers: set If-None-Match to '12345' and If-Modified-Since to 'Thu, 01 Jan 2025 00:00:00 GMT'. Show the response status code."
|
|
131
|
-
difficulty: hard
|
|
132
|
-
category: query
|
|
133
|
-
max_turns: 7
|
|
134
|
-
assert:
|
|
135
|
-
- ran: "curl"
|
|
136
|
-
- ran: "If-None-Match"
|
|
137
|
-
- ran: "If-Modified-Since"
|
|
138
|
-
- ran: "-w|--write-out|-I|--head|-v|--verbose"
|
package/task_suites/docker.yaml
DELETED
|
@@ -1,163 +0,0 @@
|
|
|
1
|
-
cli: docker
|
|
2
|
-
|
|
3
|
-
tasks:
|
|
4
|
-
# -- Easy --------------------------------------------------------------------
|
|
5
|
-
- id: list-containers
|
|
6
|
-
intent: "List all running containers"
|
|
7
|
-
difficulty: easy
|
|
8
|
-
category: query
|
|
9
|
-
max_turns: 3
|
|
10
|
-
assert:
|
|
11
|
-
- ran: "docker ps"
|
|
12
|
-
- exit_code: 0
|
|
13
|
-
|
|
14
|
-
- id: list-images
|
|
15
|
-
intent: "List all local Docker images"
|
|
16
|
-
difficulty: easy
|
|
17
|
-
category: query
|
|
18
|
-
max_turns: 3
|
|
19
|
-
assert:
|
|
20
|
-
- ran: "docker image"
|
|
21
|
-
- exit_code: 0
|
|
22
|
-
|
|
23
|
-
- id: pull-image
|
|
24
|
-
intent: "Pull the latest nginx image from Docker Hub"
|
|
25
|
-
difficulty: easy
|
|
26
|
-
category: crud
|
|
27
|
-
max_turns: 3
|
|
28
|
-
assert:
|
|
29
|
-
- ran: "docker pull.*nginx"
|
|
30
|
-
- verify:
|
|
31
|
-
run: "docker images nginx --format '{{.Repository}}'"
|
|
32
|
-
output_contains: "nginx"
|
|
33
|
-
|
|
34
|
-
- id: view-logs
|
|
35
|
-
intent: "Show the last 100 lines of logs from the container 'api' and follow new output"
|
|
36
|
-
difficulty: easy
|
|
37
|
-
category: query
|
|
38
|
-
setup:
|
|
39
|
-
- "docker run -d --name api alpine sh -c 'for i in $(seq 1 200); do echo line-$i; done; sleep 3600'"
|
|
40
|
-
max_turns: 3
|
|
41
|
-
assert:
|
|
42
|
-
- ran: "docker logs"
|
|
43
|
-
- ran: "--tail"
|
|
44
|
-
|
|
45
|
-
- id: stop-container
|
|
46
|
-
intent: "Stop the container named 'web-server'"
|
|
47
|
-
difficulty: easy
|
|
48
|
-
category: crud
|
|
49
|
-
setup:
|
|
50
|
-
- "docker run -d --name web-server alpine sleep 3600"
|
|
51
|
-
max_turns: 3
|
|
52
|
-
assert:
|
|
53
|
-
- ran: "docker stop.*web-server"
|
|
54
|
-
- verify:
|
|
55
|
-
run: "docker ps --filter name=web-server --format '{{.Names}}'"
|
|
56
|
-
output_equals: ""
|
|
57
|
-
|
|
58
|
-
# -- Medium ------------------------------------------------------------------
|
|
59
|
-
- id: run-detached
|
|
60
|
-
intent: "Run an nginx container in the background, mapping host port 8080 to container port 80, named 'web'"
|
|
61
|
-
difficulty: medium
|
|
62
|
-
category: crud
|
|
63
|
-
max_turns: 5
|
|
64
|
-
assert:
|
|
65
|
-
- ran: "docker run"
|
|
66
|
-
- ran: "-d"
|
|
67
|
-
- ran: "--name.*web"
|
|
68
|
-
- verify:
|
|
69
|
-
run: "docker ps --filter name=web --format '{{.Names}}'"
|
|
70
|
-
output_contains: "web"
|
|
71
|
-
|
|
72
|
-
- id: build-with-tag
|
|
73
|
-
intent: "Build a Docker image from ./Dockerfile and tag it as myapp:v2"
|
|
74
|
-
difficulty: medium
|
|
75
|
-
category: crud
|
|
76
|
-
setup:
|
|
77
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
78
|
-
- "printf 'FROM alpine:latest\nRUN echo hello' > /tmp/bench-workspace/Dockerfile"
|
|
79
|
-
max_turns: 5
|
|
80
|
-
assert:
|
|
81
|
-
- ran: "docker build"
|
|
82
|
-
- ran: "myapp:v2"
|
|
83
|
-
- verify:
|
|
84
|
-
run: "docker images myapp:v2 --format '{{.Repository}}:{{.Tag}}'"
|
|
85
|
-
output_contains: "myapp:v2"
|
|
86
|
-
|
|
87
|
-
- id: exec-into-container
|
|
88
|
-
intent: "Run the command 'cat /etc/os-release' inside the running container 'web'"
|
|
89
|
-
difficulty: medium
|
|
90
|
-
category: crud
|
|
91
|
-
setup:
|
|
92
|
-
- "docker run -d --name web alpine sleep 3600"
|
|
93
|
-
max_turns: 5
|
|
94
|
-
assert:
|
|
95
|
-
- ran: "docker exec.*web.*cat /etc/os-release"
|
|
96
|
-
- exit_code: 0
|
|
97
|
-
|
|
98
|
-
- id: inspect-json
|
|
99
|
-
intent: "Get the IP address of the container 'web' using docker inspect with a format template"
|
|
100
|
-
difficulty: medium
|
|
101
|
-
category: output
|
|
102
|
-
setup:
|
|
103
|
-
- "docker run -d --name web alpine sleep 3600"
|
|
104
|
-
max_turns: 5
|
|
105
|
-
assert:
|
|
106
|
-
- ran: "docker inspect"
|
|
107
|
-
- ran: "--format"
|
|
108
|
-
|
|
109
|
-
- id: prune-all
|
|
110
|
-
intent: "Remove all stopped containers, unused networks, dangling images, and build cache without prompting for confirmation"
|
|
111
|
-
difficulty: medium
|
|
112
|
-
category: crud
|
|
113
|
-
max_turns: 5
|
|
114
|
-
assert:
|
|
115
|
-
- ran: "docker system prune"
|
|
116
|
-
- ran: "--force|-f"
|
|
117
|
-
|
|
118
|
-
# -- Hard --------------------------------------------------------------------
|
|
119
|
-
- id: run-complex
|
|
120
|
-
intent: "Run a postgres:16 container named 'db' in the background with environment variables POSTGRES_USER=admin and POSTGRES_PASSWORD=secret, mount a volume 'pgdata' to /var/lib/postgresql/data, and connect it to the network 'backend'"
|
|
121
|
-
difficulty: hard
|
|
122
|
-
category: crud
|
|
123
|
-
setup:
|
|
124
|
-
- "docker network create backend || true"
|
|
125
|
-
max_turns: 7
|
|
126
|
-
assert:
|
|
127
|
-
- ran: "docker run"
|
|
128
|
-
- ran: "--name.*db"
|
|
129
|
-
- ran: "POSTGRES_USER=admin"
|
|
130
|
-
- ran: "POSTGRES_PASSWORD=secret"
|
|
131
|
-
- verify:
|
|
132
|
-
run: "docker ps --filter name=db --format '{{.Names}}'"
|
|
133
|
-
output_contains: "db"
|
|
134
|
-
|
|
135
|
-
- id: compose-up
|
|
136
|
-
intent: "Start all services defined in docker-compose.yml in detached mode and rebuild any changed images"
|
|
137
|
-
difficulty: hard
|
|
138
|
-
category: workflow
|
|
139
|
-
setup:
|
|
140
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
141
|
-
- "printf 'services:\n web:\n image: alpine\n command: sleep 3600\n' > /tmp/bench-workspace/docker-compose.yml"
|
|
142
|
-
max_turns: 7
|
|
143
|
-
assert:
|
|
144
|
-
- ran: "docker compose.*up"
|
|
145
|
-
- ran: "-d|--detach"
|
|
146
|
-
- ran: "--build"
|
|
147
|
-
|
|
148
|
-
- id: multi-stage-debug
|
|
149
|
-
intent: "Build only the 'builder' stage from /tmp/bench-workspace/Dockerfile, tag it as 'myapp:debug', and don't use cache"
|
|
150
|
-
difficulty: hard
|
|
151
|
-
category: crud
|
|
152
|
-
setup:
|
|
153
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
154
|
-
- "printf 'FROM alpine:latest AS builder\nRUN echo building\nFROM alpine:latest\nCOPY --from=builder / /\n' > /tmp/bench-workspace/Dockerfile"
|
|
155
|
-
max_turns: 7
|
|
156
|
-
assert:
|
|
157
|
-
- ran: "docker build"
|
|
158
|
-
- ran: "--target.*builder"
|
|
159
|
-
- ran: "myapp:debug"
|
|
160
|
-
- ran: "--no-cache"
|
|
161
|
-
- verify:
|
|
162
|
-
run: "docker images myapp:debug --format '{{.Repository}}:{{.Tag}}'"
|
|
163
|
-
output_contains: "myapp:debug"
|
package/task_suites/gh.yaml
DELETED
|
@@ -1,118 +0,0 @@
|
|
|
1
|
-
cli: gh
|
|
2
|
-
|
|
3
|
-
tasks:
|
|
4
|
-
# -- Easy --------------------------------------------------------------------
|
|
5
|
-
- id: list-repos
|
|
6
|
-
intent: "List all public repositories for the GitHub user 'octocat'"
|
|
7
|
-
difficulty: easy
|
|
8
|
-
category: query
|
|
9
|
-
max_turns: 3
|
|
10
|
-
assert:
|
|
11
|
-
- ran: "gh repo list.*octocat"
|
|
12
|
-
- ran: "--visibility"
|
|
13
|
-
|
|
14
|
-
- id: view-issue
|
|
15
|
-
intent: "View the details of issue #42 in the repository cli/cli"
|
|
16
|
-
difficulty: easy
|
|
17
|
-
category: crud
|
|
18
|
-
max_turns: 3
|
|
19
|
-
assert:
|
|
20
|
-
- ran: "gh issue view.*42"
|
|
21
|
-
- ran: "--repo.*cli/cli"
|
|
22
|
-
|
|
23
|
-
- id: list-prs-json
|
|
24
|
-
intent: "List open pull requests in the current repo and output as JSON with fields number, title, and author"
|
|
25
|
-
difficulty: easy
|
|
26
|
-
category: output
|
|
27
|
-
max_turns: 3
|
|
28
|
-
assert:
|
|
29
|
-
- ran: "gh pr list"
|
|
30
|
-
- ran: "--json"
|
|
31
|
-
|
|
32
|
-
- id: whoami
|
|
33
|
-
intent: "Show which GitHub account you're currently authenticated as"
|
|
34
|
-
difficulty: easy
|
|
35
|
-
category: auth
|
|
36
|
-
max_turns: 3
|
|
37
|
-
assert:
|
|
38
|
-
- ran: "gh auth status"
|
|
39
|
-
|
|
40
|
-
- id: set-config
|
|
41
|
-
intent: "Set the default git protocol to SSH for GitHub CLI"
|
|
42
|
-
difficulty: easy
|
|
43
|
-
category: config
|
|
44
|
-
max_turns: 3
|
|
45
|
-
assert:
|
|
46
|
-
- ran: "gh config set.*git_protocol.*ssh"
|
|
47
|
-
|
|
48
|
-
# -- Medium ------------------------------------------------------------------
|
|
49
|
-
- id: create-issue-with-labels
|
|
50
|
-
intent: "Create a new issue titled 'Login page broken' with labels 'bug' and 'urgent' in the repository myorg/myapp"
|
|
51
|
-
difficulty: medium
|
|
52
|
-
category: crud
|
|
53
|
-
max_turns: 5
|
|
54
|
-
assert:
|
|
55
|
-
- ran: "gh issue create"
|
|
56
|
-
- ran: "--title"
|
|
57
|
-
- ran: "--label"
|
|
58
|
-
- ran: "--repo.*myorg/myapp"
|
|
59
|
-
|
|
60
|
-
- id: create-release
|
|
61
|
-
intent: "Create a new release tagged v2.0.0 with title 'Major Release' and auto-generate the release notes"
|
|
62
|
-
difficulty: medium
|
|
63
|
-
category: crud
|
|
64
|
-
max_turns: 5
|
|
65
|
-
assert:
|
|
66
|
-
- ran: "gh release create.*v2.0.0"
|
|
67
|
-
- ran: "--title"
|
|
68
|
-
- ran: "--generate-notes"
|
|
69
|
-
|
|
70
|
-
- id: search-issues
|
|
71
|
-
intent: "Search for open issues labeled 'good first issue' in the rust-lang/rust repository"
|
|
72
|
-
difficulty: medium
|
|
73
|
-
category: query
|
|
74
|
-
max_turns: 5
|
|
75
|
-
assert:
|
|
76
|
-
- ran: "gh search issues"
|
|
77
|
-
- ran: "--repo.*rust-lang/rust"
|
|
78
|
-
- ran: "--label"
|
|
79
|
-
|
|
80
|
-
- id: pr-diff
|
|
81
|
-
intent: "Show the diff for pull request #500 in the current repository"
|
|
82
|
-
difficulty: medium
|
|
83
|
-
category: query
|
|
84
|
-
max_turns: 5
|
|
85
|
-
assert:
|
|
86
|
-
- ran: "gh pr diff.*500"
|
|
87
|
-
|
|
88
|
-
# -- Hard --------------------------------------------------------------------
|
|
89
|
-
- id: review-pr-workflow
|
|
90
|
-
intent: "Check out pull request #123, then approve it with the comment 'Looks good, tests pass'"
|
|
91
|
-
difficulty: hard
|
|
92
|
-
category: workflow
|
|
93
|
-
max_turns: 7
|
|
94
|
-
assert:
|
|
95
|
-
- ran: "gh pr checkout.*123"
|
|
96
|
-
- ran: "gh pr review.*123.*--approve"
|
|
97
|
-
|
|
98
|
-
- id: api-graphql
|
|
99
|
-
intent: "Use the GitHub API to fetch the description of the repository cli/cli using a GraphQL query"
|
|
100
|
-
difficulty: hard
|
|
101
|
-
category: query
|
|
102
|
-
max_turns: 7
|
|
103
|
-
assert:
|
|
104
|
-
- ran: "gh api graphql"
|
|
105
|
-
- ran: "repository"
|
|
106
|
-
|
|
107
|
-
- id: create-repo-full
|
|
108
|
-
intent: "Create a new private repository called 'my-project' with a README, a .gitignore for Python, and the MIT license"
|
|
109
|
-
difficulty: hard
|
|
110
|
-
category: crud
|
|
111
|
-
max_turns: 7
|
|
112
|
-
assert:
|
|
113
|
-
- ran: "gh repo create.*my-project"
|
|
114
|
-
- ran: "--private"
|
|
115
|
-
- ran: "--add-readme"
|
|
116
|
-
- ran: "--gitignore"
|
|
117
|
-
- ran: "--license"
|
|
118
|
-
- not_ran: "--public"
|
package/task_suites/jq.yaml
DELETED
|
@@ -1,172 +0,0 @@
|
|
|
1
|
-
cli: jq
|
|
2
|
-
version_command: "jq --version"
|
|
3
|
-
|
|
4
|
-
tasks:
|
|
5
|
-
# -- Easy --------------------------------------------------------------------
|
|
6
|
-
- id: pretty-print
|
|
7
|
-
intent: "Pretty-print the JSON file /tmp/bench-workspace/data.json"
|
|
8
|
-
difficulty: easy
|
|
9
|
-
category: query
|
|
10
|
-
setup:
|
|
11
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
12
|
-
- "echo '{\"name\":\"alice\",\"age\":30,\"active\":true}' > /tmp/bench-workspace/data.json"
|
|
13
|
-
max_turns: 2
|
|
14
|
-
assert:
|
|
15
|
-
- ran: "jq.*data.json"
|
|
16
|
-
- exit_code: 0
|
|
17
|
-
- output_contains: "alice"
|
|
18
|
-
|
|
19
|
-
- id: extract-field
|
|
20
|
-
intent: "Extract the 'name' field from /tmp/bench-workspace/data.json"
|
|
21
|
-
difficulty: easy
|
|
22
|
-
category: query
|
|
23
|
-
setup:
|
|
24
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
25
|
-
- "echo '{\"name\":\"alice\",\"age\":30}' > /tmp/bench-workspace/data.json"
|
|
26
|
-
max_turns: 3
|
|
27
|
-
assert:
|
|
28
|
-
- ran: "jq.*\\.name.*data.json"
|
|
29
|
-
- output_contains: "alice"
|
|
30
|
-
|
|
31
|
-
- id: raw-output
|
|
32
|
-
intent: "Extract the 'name' field from /tmp/bench-workspace/data.json as a raw string without quotes"
|
|
33
|
-
difficulty: easy
|
|
34
|
-
category: output
|
|
35
|
-
setup:
|
|
36
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
37
|
-
- "echo '{\"name\":\"alice\"}' > /tmp/bench-workspace/data.json"
|
|
38
|
-
max_turns: 3
|
|
39
|
-
assert:
|
|
40
|
-
- ran: "jq.*-r|--raw-output"
|
|
41
|
-
- ran: "\\.name"
|
|
42
|
-
|
|
43
|
-
- id: array-length
|
|
44
|
-
intent: "Count the number of items in the JSON array in /tmp/bench-workspace/list.json"
|
|
45
|
-
difficulty: easy
|
|
46
|
-
category: query
|
|
47
|
-
setup:
|
|
48
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
49
|
-
- "echo '[1,2,3,4,5]' > /tmp/bench-workspace/list.json"
|
|
50
|
-
max_turns: 3
|
|
51
|
-
assert:
|
|
52
|
-
- ran: "jq.*length.*list.json"
|
|
53
|
-
- output_contains: "5"
|
|
54
|
-
|
|
55
|
-
# -- Medium ------------------------------------------------------------------
|
|
56
|
-
- id: filter-array
|
|
57
|
-
intent: "From /tmp/bench-workspace/users.json, select only users where age is greater than 25"
|
|
58
|
-
difficulty: medium
|
|
59
|
-
category: query
|
|
60
|
-
setup:
|
|
61
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
62
|
-
- "echo '[{\"name\":\"alice\",\"age\":30},{\"name\":\"bob\",\"age\":20},{\"name\":\"carol\",\"age\":35}]' > /tmp/bench-workspace/users.json"
|
|
63
|
-
max_turns: 5
|
|
64
|
-
assert:
|
|
65
|
-
- ran: "jq.*select.*age.*25.*users.json"
|
|
66
|
-
- output_contains: "alice"
|
|
67
|
-
- output_contains: "carol"
|
|
68
|
-
|
|
69
|
-
- id: map-transform
|
|
70
|
-
intent: "From /tmp/bench-workspace/users.json, extract just the names of all users as a JSON array"
|
|
71
|
-
difficulty: medium
|
|
72
|
-
category: output
|
|
73
|
-
setup:
|
|
74
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
75
|
-
- "echo '[{\"name\":\"alice\",\"age\":30},{\"name\":\"bob\",\"age\":20}]' > /tmp/bench-workspace/users.json"
|
|
76
|
-
max_turns: 5
|
|
77
|
-
assert:
|
|
78
|
-
- ran: "jq.*\\.name.*users.json|jq.*map.*users.json"
|
|
79
|
-
- output_contains: "alice"
|
|
80
|
-
- output_contains: "bob"
|
|
81
|
-
|
|
82
|
-
- id: nested-access
|
|
83
|
-
intent: "From /tmp/bench-workspace/config.json, extract the host field from inside the database object"
|
|
84
|
-
difficulty: medium
|
|
85
|
-
category: query
|
|
86
|
-
setup:
|
|
87
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
88
|
-
- "echo '{\"database\":{\"host\":\"localhost\",\"port\":5432},\"cache\":{\"host\":\"redis\"}}' > /tmp/bench-workspace/config.json"
|
|
89
|
-
max_turns: 5
|
|
90
|
-
assert:
|
|
91
|
-
- ran: "jq.*\\.database\\.host.*config.json"
|
|
92
|
-
- output_contains: "localhost"
|
|
93
|
-
|
|
94
|
-
# -- Hard --------------------------------------------------------------------
|
|
95
|
-
- id: group-and-count
|
|
96
|
-
intent: "From /tmp/bench-workspace/events.json, group the events by their 'type' field and count how many of each type there are"
|
|
97
|
-
difficulty: hard
|
|
98
|
-
category: query
|
|
99
|
-
setup:
|
|
100
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
101
|
-
- "echo '[{\"type\":\"click\",\"page\":\"/home\"},{\"type\":\"view\",\"page\":\"/about\"},{\"type\":\"click\",\"page\":\"/pricing\"},{\"type\":\"view\",\"page\":\"/home\"},{\"type\":\"click\",\"page\":\"/home\"}]' > /tmp/bench-workspace/events.json"
|
|
102
|
-
max_turns: 7
|
|
103
|
-
assert:
|
|
104
|
-
- ran: "jq.*group_by.*events.json"
|
|
105
|
-
- output_contains: "click"
|
|
106
|
-
- output_contains: "view"
|
|
107
|
-
|
|
108
|
-
- id: merge-objects
|
|
109
|
-
intent: "Merge /tmp/bench-workspace/defaults.json and /tmp/bench-workspace/overrides.json, where overrides take precedence"
|
|
110
|
-
difficulty: hard
|
|
111
|
-
category: crud
|
|
112
|
-
setup:
|
|
113
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
114
|
-
- "echo '{\"color\":\"red\",\"size\":10,\"debug\":false}' > /tmp/bench-workspace/defaults.json"
|
|
115
|
-
- "echo '{\"color\":\"blue\",\"verbose\":true}' > /tmp/bench-workspace/overrides.json"
|
|
116
|
-
max_turns: 7
|
|
117
|
-
assert:
|
|
118
|
-
- ran: "jq"
|
|
119
|
-
- output_contains: "blue"
|
|
120
|
-
- output_contains: "verbose"
|
|
121
|
-
|
|
122
|
-
- id: csv-output
|
|
123
|
-
intent: "Convert /tmp/bench-workspace/users.json into CSV format with columns name and age"
|
|
124
|
-
difficulty: hard
|
|
125
|
-
category: output
|
|
126
|
-
setup:
|
|
127
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
128
|
-
- "echo '[{\"name\":\"alice\",\"age\":30},{\"name\":\"bob\",\"age\":20}]' > /tmp/bench-workspace/users.json"
|
|
129
|
-
max_turns: 7
|
|
130
|
-
assert:
|
|
131
|
-
- ran: "jq.*-r|--raw-output"
|
|
132
|
-
- ran: "@csv|@tsv|join"
|
|
133
|
-
- output_contains: "alice"
|
|
134
|
-
- output_contains: "bob"
|
|
135
|
-
|
|
136
|
-
- id: recursive-descent
|
|
137
|
-
intent: "From /tmp/bench-workspace/deep.json, find ALL values of any key named 'id' anywhere in the nested structure, no matter how deep"
|
|
138
|
-
difficulty: hard
|
|
139
|
-
category: query
|
|
140
|
-
setup:
|
|
141
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
142
|
-
- "echo '{\"id\":1,\"children\":[{\"id\":2,\"meta\":{\"id\":3}},{\"id\":4}]}' > /tmp/bench-workspace/deep.json"
|
|
143
|
-
max_turns: 7
|
|
144
|
-
assert:
|
|
145
|
-
- ran: "jq.*\\.\\..*id|jq.*recurse.*id"
|
|
146
|
-
- output_contains: "1"
|
|
147
|
-
- output_contains: "3"
|
|
148
|
-
|
|
149
|
-
- id: conditional-update
|
|
150
|
-
intent: "In /tmp/bench-workspace/items.json, update the price to 0 for all items where the stock is 0, and output the modified JSON"
|
|
151
|
-
difficulty: hard
|
|
152
|
-
category: crud
|
|
153
|
-
setup:
|
|
154
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
155
|
-
- "echo '[{\"name\":\"A\",\"price\":10,\"stock\":5},{\"name\":\"B\",\"price\":20,\"stock\":0},{\"name\":\"C\",\"price\":15,\"stock\":0}]' > /tmp/bench-workspace/items.json"
|
|
156
|
-
max_turns: 7
|
|
157
|
-
assert:
|
|
158
|
-
- ran: "jq.*if.*stock.*then|jq.*select.*stock|jq.*map.*stock"
|
|
159
|
-
- output_contains: "\"price\":0"
|
|
160
|
-
|
|
161
|
-
- id: pivot-table
|
|
162
|
-
intent: "From /tmp/bench-workspace/sales.json, create an object where each key is a unique region and each value is the total revenue for that region"
|
|
163
|
-
difficulty: hard
|
|
164
|
-
category: query
|
|
165
|
-
setup:
|
|
166
|
-
- "mkdir -p /tmp/bench-workspace"
|
|
167
|
-
- "echo '[{\"region\":\"US\",\"revenue\":100},{\"region\":\"EU\",\"revenue\":200},{\"region\":\"US\",\"revenue\":150},{\"region\":\"EU\",\"revenue\":50}]' > /tmp/bench-workspace/sales.json"
|
|
168
|
-
max_turns: 7
|
|
169
|
-
assert:
|
|
170
|
-
- ran: "jq.*group_by.*region"
|
|
171
|
-
- output_contains: "US"
|
|
172
|
-
- output_contains: "250"
|
package/task_suites/kubectl.yaml
DELETED
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
cli: kubectl
|
|
2
|
-
version_command: "kubectl version --client --short 2>/dev/null || kubectl version --client -o yaml | head -5"
|
|
3
|
-
|
|
4
|
-
tasks:
|
|
5
|
-
- id: get-help
|
|
6
|
-
intent: "Show help information for kubectl"
|
|
7
|
-
max_turns: 2
|
|
8
|
-
assert:
|
|
9
|
-
- ran: "kubectl.*--help"
|
|
10
|
-
- output_contains: "kubectl"
|
|
11
|
-
|
|
12
|
-
- id: create-namespace-dry-run
|
|
13
|
-
intent: "Create a namespace called 'test-ns' but only show what would be created, don't actually create it"
|
|
14
|
-
assert:
|
|
15
|
-
- output_contains: "test-ns"
|
|
16
|
-
- ran: "kubectl create"
|
|
17
|
-
- ran: "dry-run"
|
|
18
|
-
|
|
19
|
-
- id: create-deployment-dry-run
|
|
20
|
-
intent: "Create a deployment named 'nginx-app' using the nginx image, show the YAML output without creating it"
|
|
21
|
-
assert:
|
|
22
|
-
- output_contains: "nginx-app"
|
|
23
|
-
- output_contains: "nginx"
|
|
24
|
-
- ran: "kubectl create deploy"
|
|
25
|
-
|
|
26
|
-
- id: create-configmap-literal
|
|
27
|
-
intent: "Create a configmap named 'app-config' with database_url=localhost:5432 and debug=true, just show what it would look like"
|
|
28
|
-
assert:
|
|
29
|
-
- output_contains: "app-config"
|
|
30
|
-
- output_contains: "database_url"
|
|
31
|
-
- output_contains: "debug"
|
|
32
|
-
|
|
33
|
-
- id: create-job-with-command
|
|
34
|
-
intent: "Create a job named 'backup-job' using busybox that runs 'echo backup completed', show the YAML"
|
|
35
|
-
assert:
|
|
36
|
-
- output_contains: "backup-job"
|
|
37
|
-
- output_contains: "busybox"
|
|
38
|
-
- ran: "kubectl create job"
|
|
39
|
-
|
|
40
|
-
- id: create-cronjob-schedule
|
|
41
|
-
intent: "Create a cronjob named 'daily-cleanup' that runs every day at 2 AM using busybox with command 'rm -rf /tmp/*'"
|
|
42
|
-
assert:
|
|
43
|
-
- output_contains: "daily-cleanup"
|
|
44
|
-
- output_contains: "busybox"
|
|
45
|
-
- ran: "kubectl create cronjob"
|
|
46
|
-
|
|
47
|
-
- id: create-deployment-replicas
|
|
48
|
-
intent: "Create a deployment named 'web-app' using nginx image with 3 replicas"
|
|
49
|
-
assert:
|
|
50
|
-
- output_contains: "web-app"
|
|
51
|
-
- output_contains: "replicas"
|
|
52
|
-
- ran: "kubectl create deploy"
|
|
53
|
-
|
|
54
|
-
- id: create-configmap-from-file
|
|
55
|
-
intent: "Create a configmap named 'nginx-config' from the file at /tmp/bench-workspace/nginx.conf"
|
|
56
|
-
setup:
|
|
57
|
-
- "mkdir -p /tmp/bench-workspace && echo 'server { listen 80; }' > /tmp/bench-workspace/nginx.conf"
|
|
58
|
-
assert:
|
|
59
|
-
- output_contains: "nginx-config"
|
|
60
|
-
- ran: "kubectl create configmap"
|
|
61
|
-
|
|
62
|
-
- id: create-clusterrole-permissions
|
|
63
|
-
intent: "Create a cluster role named 'pod-reader' that allows get, list, and watch on pods"
|
|
64
|
-
assert:
|
|
65
|
-
- output_contains: "pod-reader"
|
|
66
|
-
- ran: "kubectl create clusterrole"
|
|
67
|
-
|
|
68
|
-
- id: create-ingress-complex
|
|
69
|
-
intent: "Create an ingress named 'web-ingress' routing foo.com/api to api-service:8080 and foo.com/web to web-service:80"
|
|
70
|
-
difficulty: hard
|
|
71
|
-
assert:
|
|
72
|
-
- output_contains: "web-ingress"
|
|
73
|
-
- output_contains: "foo.com"
|
|
74
|
-
- ran: "kubectl create ingress"
|