@cliwatch/cli-bench 0.6.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/CHANGELOG.md +8 -0
  2. package/README.md +3 -0
  3. package/dist/assertions.d.ts +1 -1
  4. package/dist/assertions.d.ts.map +1 -1
  5. package/dist/assertions.js +6 -6
  6. package/dist/assertions.js.map +1 -1
  7. package/dist/client/index.d.ts +1 -1
  8. package/dist/client/index.d.ts.map +1 -1
  9. package/dist/client/types.gen.d.ts +124 -86
  10. package/dist/client/types.gen.d.ts.map +1 -1
  11. package/dist/client/zod.gen.d.ts +57 -36
  12. package/dist/client/zod.gen.d.ts.map +1 -1
  13. package/dist/client/zod.gen.js +84 -52
  14. package/dist/client/zod.gen.js.map +1 -1
  15. package/dist/config.d.ts +2 -0
  16. package/dist/config.d.ts.map +1 -1
  17. package/dist/config.js +9 -0
  18. package/dist/config.js.map +1 -1
  19. package/dist/exec.d.ts +2 -0
  20. package/dist/exec.d.ts.map +1 -1
  21. package/dist/exec.js +6 -2
  22. package/dist/exec.js.map +1 -1
  23. package/dist/github-comment.d.ts +16 -0
  24. package/dist/github-comment.d.ts.map +1 -0
  25. package/dist/github-comment.js +90 -0
  26. package/dist/github-comment.js.map +1 -0
  27. package/dist/index.d.ts +2 -3
  28. package/dist/index.d.ts.map +1 -1
  29. package/dist/index.js +27 -26
  30. package/dist/index.js.map +1 -1
  31. package/dist/models.d.ts +7 -0
  32. package/dist/models.d.ts.map +1 -1
  33. package/dist/project.d.ts +11 -2
  34. package/dist/project.d.ts.map +1 -1
  35. package/dist/project.js +22 -8
  36. package/dist/project.js.map +1 -1
  37. package/dist/providers.d.ts +9 -7
  38. package/dist/providers.d.ts.map +1 -1
  39. package/dist/providers.js +26 -8
  40. package/dist/providers.js.map +1 -1
  41. package/dist/runner.d.ts +29 -1
  42. package/dist/runner.d.ts.map +1 -1
  43. package/dist/runner.js +73 -76
  44. package/dist/runner.js.map +1 -1
  45. package/dist/schemas.d.ts +15 -0
  46. package/dist/schemas.d.ts.map +1 -1
  47. package/dist/schemas.js +6 -0
  48. package/dist/schemas.js.map +1 -1
  49. package/dist/suite-generator.d.ts.map +1 -1
  50. package/dist/suite-generator.js +63 -11
  51. package/dist/suite-generator.js.map +1 -1
  52. package/package.json +2 -2
  53. package/task_suites/curl.yaml +0 -138
  54. package/task_suites/docker.yaml +0 -163
  55. package/task_suites/gh.yaml +0 -118
  56. package/task_suites/jq.yaml +0 -172
  57. package/task_suites/kubectl.yaml +0 -74
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cliwatch/cli-bench",
3
- "version": "0.6.3",
3
+ "version": "0.7.0",
4
4
  "description": "LLM CLI agent testing framework — benchmark how well AI models use your CLI tool",
5
5
  "keywords": ["cli", "benchmark", "llm", "testing", "ai-agent", "cliwatch", "evaluation"],
6
6
  "license": "MIT",
@@ -14,7 +14,6 @@
14
14
  },
15
15
  "files": [
16
16
  "dist",
17
- "task_suites",
18
17
  "LICENSE",
19
18
  "CHANGELOG.md"
20
19
  ],
@@ -35,6 +34,7 @@
35
34
  "test": "vitest"
36
35
  },
37
36
  "dependencies": {
37
+ "@ai-sdk/google": "^3.0.0",
38
38
  "@hey-api/client-fetch": "^0.13.1",
39
39
  "ai": "^6.0.18",
40
40
  "yaml": "^2.7.0",
package/task_suites/curl.yaml DELETED
@@ -1,138 +0,0 @@
1
- cli: curl
2
- version_command: "curl --version | head -1"
3
-
4
- tasks:
5
- # -- Easy --------------------------------------------------------------------
6
- - id: simple-get
7
- intent: "Fetch the contents of https://httpbin.org/get"
8
- difficulty: easy
9
- category: query
10
- max_turns: 2
11
- assert:
12
- - ran: "curl.*httpbin.org/get"
13
- - exit_code: 0
14
- - output_contains: "origin"
15
-
16
- - id: head-request
17
- intent: "Send a HEAD request to https://httpbin.org/get and show the response headers"
18
- difficulty: easy
19
- category: query
20
- max_turns: 3
21
- assert:
22
- - ran: "curl"
23
- - ran: "-I|--head"
24
- - output_contains: "HTTP"
25
-
26
- - id: download-file
27
- intent: "Download https://httpbin.org/robots.txt and save it as robots.txt"
28
- difficulty: easy
29
- category: crud
30
- max_turns: 3
31
- assert:
32
- - ran: "curl"
33
- - ran: "-o|--output"
34
- - file_exists: "robots.txt"
35
-
36
- - id: follow-redirects
37
- intent: "Fetch https://httpbin.org/redirect/2 and follow all redirects"
38
- difficulty: easy
39
- category: query
40
- max_turns: 3
41
- assert:
42
- - ran: "curl"
43
- - ran: "-L|--location"
44
- - exit_code: 0
45
-
46
- # -- Medium ------------------------------------------------------------------
47
- - id: post-json
48
- intent: "Send a POST request to https://httpbin.org/post with JSON body {\"name\": \"bench\", \"version\": 1} and set the Content-Type header to application/json"
49
- difficulty: medium
50
- category: crud
51
- max_turns: 5
52
- assert:
53
- - ran: "curl"
54
- - ran: "-X POST|--request POST|-d|--data"
55
- - ran: "Content-Type.*application/json"
56
- - output_contains: "bench"
57
-
58
- - id: custom-headers
59
- intent: "Send a GET request to https://httpbin.org/headers with custom headers X-Request-ID: abc123 and Accept: application/xml"
60
- difficulty: medium
61
- category: query
62
- max_turns: 5
63
- assert:
64
- - ran: "curl"
65
- - ran: "-H|--header"
66
- - ran: "X-Request-ID"
67
- - output_contains: "abc123"
68
-
69
- - id: basic-auth
70
- intent: "Send a GET request to https://httpbin.org/basic-auth/user/passwd using basic authentication with username 'user' and password 'passwd'"
71
- difficulty: medium
72
- category: auth
73
- max_turns: 5
74
- assert:
75
- - ran: "curl"
76
- - ran: "-u|--user|user:passwd"
77
- - output_contains: "authenticated"
78
-
79
- - id: verbose-timing
80
- intent: "Fetch https://httpbin.org/get and show the total time taken for the request using curl's write-out feature"
81
- difficulty: medium
82
- category: output
83
- max_turns: 5
84
- assert:
85
- - ran: "curl"
86
- - ran: "-w|--write-out|time_total"
87
-
88
- # -- Hard --------------------------------------------------------------------
89
- - id: put-with-file
90
- intent: "Upload the file /tmp/bench-workspace/data.json to https://httpbin.org/put using a PUT request with Content-Type application/json"
91
- difficulty: hard
92
- category: crud
93
- setup:
94
- - "mkdir -p /tmp/bench-workspace"
95
- - "echo '{\"key\": \"value\"}' > /tmp/bench-workspace/data.json"
96
- max_turns: 7
97
- assert:
98
- - ran: "curl"
99
- - ran: "-X PUT|--request PUT|-T"
100
- - ran: "data.json"
101
- - output_contains: "key"
102
-
103
- - id: retry-with-timeout
104
- intent: "Fetch https://httpbin.org/delay/1 with a 5 second timeout, retry 3 times on failure, and save the response to response.json"
105
- difficulty: hard
106
- category: crud
107
- max_turns: 7
108
- assert:
109
- - ran: "curl"
110
- - ran: "--retry.*3"
111
- - ran: "--max-time|--connect-timeout|-m"
112
- - ran: "-o|--output"
113
-
114
- - id: multipart-upload
115
- intent: "Send a multipart/form-data POST to https://httpbin.org/post with a field 'username' set to 'admin' and a file field 'config' uploading /tmp/bench-workspace/app.conf"
116
- difficulty: hard
117
- category: crud
118
- setup:
119
- - "mkdir -p /tmp/bench-workspace"
120
- - "echo 'port=8080' > /tmp/bench-workspace/app.conf"
121
- max_turns: 7
122
- assert:
123
- - ran: "curl"
124
- - ran: "-F|--form"
125
- - ran: "username.*admin"
126
- - ran: "config.*@.*app.conf"
127
- - output_contains: "admin"
128
-
129
- - id: conditional-request
130
- intent: "Fetch https://httpbin.org/cache and use conditional headers: set If-None-Match to '12345' and If-Modified-Since to 'Thu, 01 Jan 2025 00:00:00 GMT'. Show the response status code."
131
- difficulty: hard
132
- category: query
133
- max_turns: 7
134
- assert:
135
- - ran: "curl"
136
- - ran: "If-None-Match"
137
- - ran: "If-Modified-Since"
138
- - ran: "-w|--write-out|-I|--head|-v|--verbose"
package/task_suites/docker.yaml DELETED
@@ -1,163 +0,0 @@
1
- cli: docker
2
-
3
- tasks:
4
- # -- Easy --------------------------------------------------------------------
5
- - id: list-containers
6
- intent: "List all running containers"
7
- difficulty: easy
8
- category: query
9
- max_turns: 3
10
- assert:
11
- - ran: "docker ps"
12
- - exit_code: 0
13
-
14
- - id: list-images
15
- intent: "List all local Docker images"
16
- difficulty: easy
17
- category: query
18
- max_turns: 3
19
- assert:
20
- - ran: "docker image"
21
- - exit_code: 0
22
-
23
- - id: pull-image
24
- intent: "Pull the latest nginx image from Docker Hub"
25
- difficulty: easy
26
- category: crud
27
- max_turns: 3
28
- assert:
29
- - ran: "docker pull.*nginx"
30
- - verify:
31
- run: "docker images nginx --format '{{.Repository}}'"
32
- output_contains: "nginx"
33
-
34
- - id: view-logs
35
- intent: "Show the last 100 lines of logs from the container 'api' and follow new output"
36
- difficulty: easy
37
- category: query
38
- setup:
39
- - "docker run -d --name api alpine sh -c 'for i in $(seq 1 200); do echo line-$i; done; sleep 3600'"
40
- max_turns: 3
41
- assert:
42
- - ran: "docker logs"
43
- - ran: "--tail"
44
-
45
- - id: stop-container
46
- intent: "Stop the container named 'web-server'"
47
- difficulty: easy
48
- category: crud
49
- setup:
50
- - "docker run -d --name web-server alpine sleep 3600"
51
- max_turns: 3
52
- assert:
53
- - ran: "docker stop.*web-server"
54
- - verify:
55
- run: "docker ps --filter name=web-server --format '{{.Names}}'"
56
- output_equals: ""
57
-
58
- # -- Medium ------------------------------------------------------------------
59
- - id: run-detached
60
- intent: "Run an nginx container in the background, mapping host port 8080 to container port 80, named 'web'"
61
- difficulty: medium
62
- category: crud
63
- max_turns: 5
64
- assert:
65
- - ran: "docker run"
66
- - ran: "-d"
67
- - ran: "--name.*web"
68
- - verify:
69
- run: "docker ps --filter name=web --format '{{.Names}}'"
70
- output_contains: "web"
71
-
72
- - id: build-with-tag
73
- intent: "Build a Docker image from ./Dockerfile and tag it as myapp:v2"
74
- difficulty: medium
75
- category: crud
76
- setup:
77
- - "mkdir -p /tmp/bench-workspace"
78
- - "printf 'FROM alpine:latest\nRUN echo hello' > /tmp/bench-workspace/Dockerfile"
79
- max_turns: 5
80
- assert:
81
- - ran: "docker build"
82
- - ran: "myapp:v2"
83
- - verify:
84
- run: "docker images myapp:v2 --format '{{.Repository}}:{{.Tag}}'"
85
- output_contains: "myapp:v2"
86
-
87
- - id: exec-into-container
88
- intent: "Run the command 'cat /etc/os-release' inside the running container 'web'"
89
- difficulty: medium
90
- category: crud
91
- setup:
92
- - "docker run -d --name web alpine sleep 3600"
93
- max_turns: 5
94
- assert:
95
- - ran: "docker exec.*web.*cat /etc/os-release"
96
- - exit_code: 0
97
-
98
- - id: inspect-json
99
- intent: "Get the IP address of the container 'web' using docker inspect with a format template"
100
- difficulty: medium
101
- category: output
102
- setup:
103
- - "docker run -d --name web alpine sleep 3600"
104
- max_turns: 5
105
- assert:
106
- - ran: "docker inspect"
107
- - ran: "--format"
108
-
109
- - id: prune-all
110
- intent: "Remove all stopped containers, unused networks, dangling images, and build cache without prompting for confirmation"
111
- difficulty: medium
112
- category: crud
113
- max_turns: 5
114
- assert:
115
- - ran: "docker system prune"
116
- - ran: "--force|-f"
117
-
118
- # -- Hard --------------------------------------------------------------------
119
- - id: run-complex
120
- intent: "Run a postgres:16 container named 'db' in the background with environment variables POSTGRES_USER=admin and POSTGRES_PASSWORD=secret, mount a volume 'pgdata' to /var/lib/postgresql/data, and connect it to the network 'backend'"
121
- difficulty: hard
122
- category: crud
123
- setup:
124
- - "docker network create backend || true"
125
- max_turns: 7
126
- assert:
127
- - ran: "docker run"
128
- - ran: "--name.*db"
129
- - ran: "POSTGRES_USER=admin"
130
- - ran: "POSTGRES_PASSWORD=secret"
131
- - verify:
132
- run: "docker ps --filter name=db --format '{{.Names}}'"
133
- output_contains: "db"
134
-
135
- - id: compose-up
136
- intent: "Start all services defined in docker-compose.yml in detached mode and rebuild any changed images"
137
- difficulty: hard
138
- category: workflow
139
- setup:
140
- - "mkdir -p /tmp/bench-workspace"
141
- - "printf 'services:\n web:\n image: alpine\n command: sleep 3600\n' > /tmp/bench-workspace/docker-compose.yml"
142
- max_turns: 7
143
- assert:
144
- - ran: "docker compose.*up"
145
- - ran: "-d|--detach"
146
- - ran: "--build"
147
-
148
- - id: multi-stage-debug
149
- intent: "Build only the 'builder' stage from /tmp/bench-workspace/Dockerfile, tag it as 'myapp:debug', and don't use cache"
150
- difficulty: hard
151
- category: crud
152
- setup:
153
- - "mkdir -p /tmp/bench-workspace"
154
- - "printf 'FROM alpine:latest AS builder\nRUN echo building\nFROM alpine:latest\nCOPY --from=builder / /\n' > /tmp/bench-workspace/Dockerfile"
155
- max_turns: 7
156
- assert:
157
- - ran: "docker build"
158
- - ran: "--target.*builder"
159
- - ran: "myapp:debug"
160
- - ran: "--no-cache"
161
- - verify:
162
- run: "docker images myapp:debug --format '{{.Repository}}:{{.Tag}}'"
163
- output_contains: "myapp:debug"
package/task_suites/gh.yaml DELETED
@@ -1,118 +0,0 @@
1
- cli: gh
2
-
3
- tasks:
4
- # -- Easy --------------------------------------------------------------------
5
- - id: list-repos
6
- intent: "List all public repositories for the GitHub user 'octocat'"
7
- difficulty: easy
8
- category: query
9
- max_turns: 3
10
- assert:
11
- - ran: "gh repo list.*octocat"
12
- - ran: "--visibility"
13
-
14
- - id: view-issue
15
- intent: "View the details of issue #42 in the repository cli/cli"
16
- difficulty: easy
17
- category: crud
18
- max_turns: 3
19
- assert:
20
- - ran: "gh issue view.*42"
21
- - ran: "--repo.*cli/cli"
22
-
23
- - id: list-prs-json
24
- intent: "List open pull requests in the current repo and output as JSON with fields number, title, and author"
25
- difficulty: easy
26
- category: output
27
- max_turns: 3
28
- assert:
29
- - ran: "gh pr list"
30
- - ran: "--json"
31
-
32
- - id: whoami
33
- intent: "Show which GitHub account you're currently authenticated as"
34
- difficulty: easy
35
- category: auth
36
- max_turns: 3
37
- assert:
38
- - ran: "gh auth status"
39
-
40
- - id: set-config
41
- intent: "Set the default git protocol to SSH for GitHub CLI"
42
- difficulty: easy
43
- category: config
44
- max_turns: 3
45
- assert:
46
- - ran: "gh config set.*git_protocol.*ssh"
47
-
48
- # -- Medium ------------------------------------------------------------------
49
- - id: create-issue-with-labels
50
- intent: "Create a new issue titled 'Login page broken' with labels 'bug' and 'urgent' in the repository myorg/myapp"
51
- difficulty: medium
52
- category: crud
53
- max_turns: 5
54
- assert:
55
- - ran: "gh issue create"
56
- - ran: "--title"
57
- - ran: "--label"
58
- - ran: "--repo.*myorg/myapp"
59
-
60
- - id: create-release
61
- intent: "Create a new release tagged v2.0.0 with title 'Major Release' and auto-generate the release notes"
62
- difficulty: medium
63
- category: crud
64
- max_turns: 5
65
- assert:
66
- - ran: "gh release create.*v2.0.0"
67
- - ran: "--title"
68
- - ran: "--generate-notes"
69
-
70
- - id: search-issues
71
- intent: "Search for open issues labeled 'good first issue' in the rust-lang/rust repository"
72
- difficulty: medium
73
- category: query
74
- max_turns: 5
75
- assert:
76
- - ran: "gh search issues"
77
- - ran: "--repo.*rust-lang/rust"
78
- - ran: "--label"
79
-
80
- - id: pr-diff
81
- intent: "Show the diff for pull request #500 in the current repository"
82
- difficulty: medium
83
- category: query
84
- max_turns: 5
85
- assert:
86
- - ran: "gh pr diff.*500"
87
-
88
- # -- Hard --------------------------------------------------------------------
89
- - id: review-pr-workflow
90
- intent: "Check out pull request #123, then approve it with the comment 'Looks good, tests pass'"
91
- difficulty: hard
92
- category: workflow
93
- max_turns: 7
94
- assert:
95
- - ran: "gh pr checkout.*123"
96
- - ran: "gh pr review.*123.*--approve"
97
-
98
- - id: api-graphql
99
- intent: "Use the GitHub API to fetch the description of the repository cli/cli using a GraphQL query"
100
- difficulty: hard
101
- category: query
102
- max_turns: 7
103
- assert:
104
- - ran: "gh api graphql"
105
- - ran: "repository"
106
-
107
- - id: create-repo-full
108
- intent: "Create a new private repository called 'my-project' with a README, a .gitignore for Python, and the MIT license"
109
- difficulty: hard
110
- category: crud
111
- max_turns: 7
112
- assert:
113
- - ran: "gh repo create.*my-project"
114
- - ran: "--private"
115
- - ran: "--add-readme"
116
- - ran: "--gitignore"
117
- - ran: "--license"
118
- - not_ran: "--public"
package/task_suites/jq.yaml DELETED
@@ -1,172 +0,0 @@
1
- cli: jq
2
- version_command: "jq --version"
3
-
4
- tasks:
5
- # -- Easy --------------------------------------------------------------------
6
- - id: pretty-print
7
- intent: "Pretty-print the JSON file /tmp/bench-workspace/data.json"
8
- difficulty: easy
9
- category: query
10
- setup:
11
- - "mkdir -p /tmp/bench-workspace"
12
- - "echo '{\"name\":\"alice\",\"age\":30,\"active\":true}' > /tmp/bench-workspace/data.json"
13
- max_turns: 2
14
- assert:
15
- - ran: "jq.*data.json"
16
- - exit_code: 0
17
- - output_contains: "alice"
18
-
19
- - id: extract-field
20
- intent: "Extract the 'name' field from /tmp/bench-workspace/data.json"
21
- difficulty: easy
22
- category: query
23
- setup:
24
- - "mkdir -p /tmp/bench-workspace"
25
- - "echo '{\"name\":\"alice\",\"age\":30}' > /tmp/bench-workspace/data.json"
26
- max_turns: 3
27
- assert:
28
- - ran: "jq.*\\.name.*data.json"
29
- - output_contains: "alice"
30
-
31
- - id: raw-output
32
- intent: "Extract the 'name' field from /tmp/bench-workspace/data.json as a raw string without quotes"
33
- difficulty: easy
34
- category: output
35
- setup:
36
- - "mkdir -p /tmp/bench-workspace"
37
- - "echo '{\"name\":\"alice\"}' > /tmp/bench-workspace/data.json"
38
- max_turns: 3
39
- assert:
40
- - ran: "jq.*-r|--raw-output"
41
- - ran: "\\.name"
42
-
43
- - id: array-length
44
- intent: "Count the number of items in the JSON array in /tmp/bench-workspace/list.json"
45
- difficulty: easy
46
- category: query
47
- setup:
48
- - "mkdir -p /tmp/bench-workspace"
49
- - "echo '[1,2,3,4,5]' > /tmp/bench-workspace/list.json"
50
- max_turns: 3
51
- assert:
52
- - ran: "jq.*length.*list.json"
53
- - output_contains: "5"
54
-
55
- # -- Medium ------------------------------------------------------------------
56
- - id: filter-array
57
- intent: "From /tmp/bench-workspace/users.json, select only users where age is greater than 25"
58
- difficulty: medium
59
- category: query
60
- setup:
61
- - "mkdir -p /tmp/bench-workspace"
62
- - "echo '[{\"name\":\"alice\",\"age\":30},{\"name\":\"bob\",\"age\":20},{\"name\":\"carol\",\"age\":35}]' > /tmp/bench-workspace/users.json"
63
- max_turns: 5
64
- assert:
65
- - ran: "jq.*select.*age.*25.*users.json"
66
- - output_contains: "alice"
67
- - output_contains: "carol"
68
-
69
- - id: map-transform
70
- intent: "From /tmp/bench-workspace/users.json, extract just the names of all users as a JSON array"
71
- difficulty: medium
72
- category: output
73
- setup:
74
- - "mkdir -p /tmp/bench-workspace"
75
- - "echo '[{\"name\":\"alice\",\"age\":30},{\"name\":\"bob\",\"age\":20}]' > /tmp/bench-workspace/users.json"
76
- max_turns: 5
77
- assert:
78
- - ran: "jq.*\\.name.*users.json|jq.*map.*users.json"
79
- - output_contains: "alice"
80
- - output_contains: "bob"
81
-
82
- - id: nested-access
83
- intent: "From /tmp/bench-workspace/config.json, extract the host field from inside the database object"
84
- difficulty: medium
85
- category: query
86
- setup:
87
- - "mkdir -p /tmp/bench-workspace"
88
- - "echo '{\"database\":{\"host\":\"localhost\",\"port\":5432},\"cache\":{\"host\":\"redis\"}}' > /tmp/bench-workspace/config.json"
89
- max_turns: 5
90
- assert:
91
- - ran: "jq.*\\.database\\.host.*config.json"
92
- - output_contains: "localhost"
93
-
94
- # -- Hard --------------------------------------------------------------------
95
- - id: group-and-count
96
- intent: "From /tmp/bench-workspace/events.json, group the events by their 'type' field and count how many of each type there are"
97
- difficulty: hard
98
- category: query
99
- setup:
100
- - "mkdir -p /tmp/bench-workspace"
101
- - "echo '[{\"type\":\"click\",\"page\":\"/home\"},{\"type\":\"view\",\"page\":\"/about\"},{\"type\":\"click\",\"page\":\"/pricing\"},{\"type\":\"view\",\"page\":\"/home\"},{\"type\":\"click\",\"page\":\"/home\"}]' > /tmp/bench-workspace/events.json"
102
- max_turns: 7
103
- assert:
104
- - ran: "jq.*group_by.*events.json"
105
- - output_contains: "click"
106
- - output_contains: "view"
107
-
108
- - id: merge-objects
109
- intent: "Merge /tmp/bench-workspace/defaults.json and /tmp/bench-workspace/overrides.json, where overrides take precedence"
110
- difficulty: hard
111
- category: crud
112
- setup:
113
- - "mkdir -p /tmp/bench-workspace"
114
- - "echo '{\"color\":\"red\",\"size\":10,\"debug\":false}' > /tmp/bench-workspace/defaults.json"
115
- - "echo '{\"color\":\"blue\",\"verbose\":true}' > /tmp/bench-workspace/overrides.json"
116
- max_turns: 7
117
- assert:
118
- - ran: "jq"
119
- - output_contains: "blue"
120
- - output_contains: "verbose"
121
-
122
- - id: csv-output
123
- intent: "Convert /tmp/bench-workspace/users.json into CSV format with columns name and age"
124
- difficulty: hard
125
- category: output
126
- setup:
127
- - "mkdir -p /tmp/bench-workspace"
128
- - "echo '[{\"name\":\"alice\",\"age\":30},{\"name\":\"bob\",\"age\":20}]' > /tmp/bench-workspace/users.json"
129
- max_turns: 7
130
- assert:
131
- - ran: "jq.*-r|--raw-output"
132
- - ran: "@csv|@tsv|join"
133
- - output_contains: "alice"
134
- - output_contains: "bob"
135
-
136
- - id: recursive-descent
137
- intent: "From /tmp/bench-workspace/deep.json, find ALL values of any key named 'id' anywhere in the nested structure, no matter how deep"
138
- difficulty: hard
139
- category: query
140
- setup:
141
- - "mkdir -p /tmp/bench-workspace"
142
- - "echo '{\"id\":1,\"children\":[{\"id\":2,\"meta\":{\"id\":3}},{\"id\":4}]}' > /tmp/bench-workspace/deep.json"
143
- max_turns: 7
144
- assert:
145
- - ran: "jq.*\\.\\..*id|jq.*recurse.*id"
146
- - output_contains: "1"
147
- - output_contains: "3"
148
-
149
- - id: conditional-update
150
- intent: "In /tmp/bench-workspace/items.json, update the price to 0 for all items where the stock is 0, and output the modified JSON"
151
- difficulty: hard
152
- category: crud
153
- setup:
154
- - "mkdir -p /tmp/bench-workspace"
155
- - "echo '[{\"name\":\"A\",\"price\":10,\"stock\":5},{\"name\":\"B\",\"price\":20,\"stock\":0},{\"name\":\"C\",\"price\":15,\"stock\":0}]' > /tmp/bench-workspace/items.json"
156
- max_turns: 7
157
- assert:
158
- - ran: "jq.*if.*stock.*then|jq.*select.*stock|jq.*map.*stock"
159
- - output_contains: "\"price\":0"
160
-
161
- - id: pivot-table
162
- intent: "From /tmp/bench-workspace/sales.json, create an object where each key is a unique region and each value is the total revenue for that region"
163
- difficulty: hard
164
- category: query
165
- setup:
166
- - "mkdir -p /tmp/bench-workspace"
167
- - "echo '[{\"region\":\"US\",\"revenue\":100},{\"region\":\"EU\",\"revenue\":200},{\"region\":\"US\",\"revenue\":150},{\"region\":\"EU\",\"revenue\":50}]' > /tmp/bench-workspace/sales.json"
168
- max_turns: 7
169
- assert:
170
- - ran: "jq.*group_by.*region"
171
- - output_contains: "US"
172
- - output_contains: "250"
package/task_suites/kubectl.yaml DELETED
@@ -1,74 +0,0 @@
1
- cli: kubectl
2
- version_command: "kubectl version --client --short 2>/dev/null || kubectl version --client -o yaml | head -5"
3
-
4
- tasks:
5
- - id: get-help
6
- intent: "Show help information for kubectl"
7
- max_turns: 2
8
- assert:
9
- - ran: "kubectl.*--help"
10
- - output_contains: "kubectl"
11
-
12
- - id: create-namespace-dry-run
13
- intent: "Create a namespace called 'test-ns' but only show what would be created, don't actually create it"
14
- assert:
15
- - output_contains: "test-ns"
16
- - ran: "kubectl create"
17
- - ran: "dry-run"
18
-
19
- - id: create-deployment-dry-run
20
- intent: "Create a deployment named 'nginx-app' using the nginx image, show the YAML output without creating it"
21
- assert:
22
- - output_contains: "nginx-app"
23
- - output_contains: "nginx"
24
- - ran: "kubectl create deploy"
25
-
26
- - id: create-configmap-literal
27
- intent: "Create a configmap named 'app-config' with database_url=localhost:5432 and debug=true, just show what it would look like"
28
- assert:
29
- - output_contains: "app-config"
30
- - output_contains: "database_url"
31
- - output_contains: "debug"
32
-
33
- - id: create-job-with-command
34
- intent: "Create a job named 'backup-job' using busybox that runs 'echo backup completed', show the YAML"
35
- assert:
36
- - output_contains: "backup-job"
37
- - output_contains: "busybox"
38
- - ran: "kubectl create job"
39
-
40
- - id: create-cronjob-schedule
41
- intent: "Create a cronjob named 'daily-cleanup' that runs every day at 2 AM using busybox with command 'rm -rf /tmp/*'"
42
- assert:
43
- - output_contains: "daily-cleanup"
44
- - output_contains: "busybox"
45
- - ran: "kubectl create cronjob"
46
-
47
- - id: create-deployment-replicas
48
- intent: "Create a deployment named 'web-app' using nginx image with 3 replicas"
49
- assert:
50
- - output_contains: "web-app"
51
- - output_contains: "replicas"
52
- - ran: "kubectl create deploy"
53
-
54
- - id: create-configmap-from-file
55
- intent: "Create a configmap named 'nginx-config' from the file at /tmp/bench-workspace/nginx.conf"
56
- setup:
57
- - "mkdir -p /tmp/bench-workspace && echo 'server { listen 80; }' > /tmp/bench-workspace/nginx.conf"
58
- assert:
59
- - output_contains: "nginx-config"
60
- - ran: "kubectl create configmap"
61
-
62
- - id: create-clusterrole-permissions
63
- intent: "Create a cluster role named 'pod-reader' that allows get, list, and watch on pods"
64
- assert:
65
- - output_contains: "pod-reader"
66
- - ran: "kubectl create clusterrole"
67
-
68
- - id: create-ingress-complex
69
- intent: "Create an ingress named 'web-ingress' routing foo.com/api to api-service:8080 and foo.com/web to web-service:80"
70
- difficulty: hard
71
- assert:
72
- - output_contains: "web-ingress"
73
- - output_contains: "foo.com"
74
- - ran: "kubectl create ingress"