censiq 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/ISSUE_TEMPLATE/bug_report.yml +58 -0
- package/.github/ISSUE_TEMPLATE/feature_request.yml +45 -0
- package/.github/PULL_REQUEST_TEMPLATE.md +20 -0
- package/CONTRIBUTING.md +136 -0
- package/README.md +232 -45
- package/package.json +1 -1
- package/src/commands/run.js +17 -12
- package/templates/arena.yaml +11 -5
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: Bug Report
|
|
2
|
+
description: Something isn't working correctly
|
|
3
|
+
title: "[Bug] "
|
|
4
|
+
labels: ["bug"]
|
|
5
|
+
body:
|
|
6
|
+
- type: input
|
|
7
|
+
id: version
|
|
8
|
+
attributes:
|
|
9
|
+
label: censiq version
|
|
10
|
+
placeholder: "0.1.2 (run: censiq --version)"
|
|
11
|
+
validations:
|
|
12
|
+
required: true
|
|
13
|
+
|
|
14
|
+
- type: dropdown
|
|
15
|
+
id: command
|
|
16
|
+
attributes:
|
|
17
|
+
label: Which command?
|
|
18
|
+
options:
|
|
19
|
+
- censiq login
|
|
20
|
+
- censiq init
|
|
21
|
+
- censiq run
|
|
22
|
+
- censiq report
|
|
23
|
+
- Other
|
|
24
|
+
validations:
|
|
25
|
+
required: true
|
|
26
|
+
|
|
27
|
+
- type: textarea
|
|
28
|
+
id: description
|
|
29
|
+
attributes:
|
|
30
|
+
label: What happened?
|
|
31
|
+
placeholder: Describe what you did and what went wrong.
|
|
32
|
+
validations:
|
|
33
|
+
required: true
|
|
34
|
+
|
|
35
|
+
- type: textarea
|
|
36
|
+
id: expected
|
|
37
|
+
attributes:
|
|
38
|
+
label: What did you expect?
|
|
39
|
+
validations:
|
|
40
|
+
required: true
|
|
41
|
+
|
|
42
|
+
- type: textarea
|
|
43
|
+
id: repro
|
|
44
|
+
attributes:
|
|
45
|
+
label: Steps to reproduce
|
|
46
|
+
placeholder: |
|
|
47
|
+
1. Run `censiq run --config arena.yaml`
|
|
48
|
+
2. See error: ...
|
|
49
|
+
validations:
|
|
50
|
+
required: true
|
|
51
|
+
|
|
52
|
+
- type: input
|
|
53
|
+
id: os
|
|
54
|
+
attributes:
|
|
55
|
+
label: OS and Node version
|
|
56
|
+
placeholder: "macOS 14.4, Node 20.11"
|
|
57
|
+
validations:
|
|
58
|
+
required: true
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
name: Feature Request
|
|
2
|
+
description: Suggest an improvement or new capability
|
|
3
|
+
title: "[Feature] "
|
|
4
|
+
labels: ["enhancement"]
|
|
5
|
+
body:
|
|
6
|
+
- type: dropdown
|
|
7
|
+
id: area
|
|
8
|
+
attributes:
|
|
9
|
+
label: Area
|
|
10
|
+
options:
|
|
11
|
+
- New command
|
|
12
|
+
- Output format (--format)
|
|
13
|
+
- Agent adapter (--adapter)
|
|
14
|
+
- Shell completions
|
|
15
|
+
- arena.yaml config option
|
|
16
|
+
- CI/CD integration
|
|
17
|
+
- Other
|
|
18
|
+
validations:
|
|
19
|
+
required: true
|
|
20
|
+
|
|
21
|
+
- type: textarea
|
|
22
|
+
id: problem
|
|
23
|
+
attributes:
|
|
24
|
+
label: What problem does this solve?
|
|
25
|
+
placeholder: |
|
|
26
|
+
When I run censiq run in GitHub Actions, the output isn't picked up by
|
|
27
|
+
the PR check dashboard because it's terminal text, not a recognized format...
|
|
28
|
+
validations:
|
|
29
|
+
required: true
|
|
30
|
+
|
|
31
|
+
- type: textarea
|
|
32
|
+
id: solution
|
|
33
|
+
attributes:
|
|
34
|
+
label: Proposed solution
|
|
35
|
+
placeholder: |
|
|
36
|
+
Add a --format sarif flag that outputs results as a SARIF file,
|
|
37
|
+
compatible with GitHub Code Scanning upload-sarif action...
|
|
38
|
+
|
|
39
|
+
- type: checkboxes
|
|
40
|
+
id: contribution
|
|
41
|
+
attributes:
|
|
42
|
+
label: Are you willing to implement this?
|
|
43
|
+
options:
|
|
44
|
+
- label: Yes, I'd like to open a PR for this
|
|
45
|
+
- label: No, just suggesting it
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
## What does this PR do?
|
|
2
|
+
|
|
3
|
+
<!-- Bug fix, new command, new output format, adapter, etc. -->
|
|
4
|
+
|
|
5
|
+
## Related issue
|
|
6
|
+
|
|
7
|
+
Closes #
|
|
8
|
+
|
|
9
|
+
## Changes
|
|
10
|
+
|
|
11
|
+
<!-- Brief list of what changed -->
|
|
12
|
+
|
|
13
|
+
## Checklist
|
|
14
|
+
|
|
15
|
+
- [ ] `node --check <file>` passes for every `.js` file in `src/` and `bin/` (it checks one file per invocation, so run it once per file)
|
|
16
|
+
- [ ] `censiq --help` output is correct
|
|
17
|
+
- [ ] Tested against a live Censiq account (`censiq run`)
|
|
18
|
+
- [ ] No new dependencies added without discussion (keep the install lightweight)
|
|
19
|
+
- [ ] Error messages tell the user what to do next, not just what went wrong
|
|
20
|
+
- [ ] No comments added that explain what the code does (only why, when non-obvious)
|
package/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# Contributing to the Censiq CLI
|
|
2
|
+
|
|
3
|
+
The CLI is a thin client that talks to the Censiq API. Contributions here focus on the developer experience: commands, output formatting, config handling, agent adapters, and CI/CD integration.
|
|
4
|
+
|
|
5
|
+
If you want to contribute new test scenarios or packs, see the [standard-packs repository](https://github.com/Censiq/standard-packs) instead — that's where the evaluation content lives.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## What to work on
|
|
10
|
+
|
|
11
|
+
Check the [issues list](https://github.com/Censiq/CLI/issues) for open bugs and feature requests. The most impactful contribution areas:
|
|
12
|
+
|
|
13
|
+
**Output formats** — the CLI currently outputs to the terminal or JSON. Useful additions:
|
|
14
|
+
- `--format sarif` — Static Analysis Results Interchange Format for GitHub Code Scanning
|
|
15
|
+
- `--format junit` — JUnit XML for Jenkins, CircleCI, and most CI dashboards
|
|
16
|
+
- `--format html` — a self-contained report file
|
|
17
|
+
|
|
18
|
+
**Shell completions** — bash, zsh, and fish completions for `censiq <TAB>`
|
|
19
|
+
|
|
20
|
+
**Agent adapters** — named adapters for popular frameworks so users don't have to configure endpoint format manually:
|
|
21
|
+
```bash
|
|
22
|
+
censiq run --adapter langchain
|
|
23
|
+
censiq run --adapter openai
|
|
24
|
+
censiq run --adapter bedrock
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**Bug fixes** — see issues labeled `bug`
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Local setup
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
git clone https://github.com/Censiq/CLI.git
|
|
35
|
+
cd CLI
|
|
36
|
+
npm install
|
|
37
|
+
npm link # makes `censiq` available globally from this local build
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Set your API key:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
censiq login # paste a key from censiq.com/settings
|
|
44
|
+
# or
|
|
45
|
+
export CENSIQ_API_KEY=cens_live_...
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Point at a local server during development:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
export CENSIQ_API_URL=http://localhost:5001
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Project structure
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
bin/censiq.js Entry point (shebang + require)
|
|
60
|
+
src/
|
|
61
|
+
index.js Commander setup — all commands registered here
|
|
62
|
+
commands/
|
|
63
|
+
login.js censiq login
|
|
64
|
+
init.js censiq init (interactive wizard)
|
|
65
|
+
run.js censiq run (creates arena, polls progress, prints summary)
|
|
66
|
+
report.js censiq report (fetches and formats results)
|
|
67
|
+
utils/
|
|
68
|
+
api.js Axios client — all API calls go through here
|
|
69
|
+
config.js Reads arena.yaml, auth file, last-run file
|
|
70
|
+
display.js All terminal output — colors, tables, score bars
|
|
71
|
+
templates/
|
|
72
|
+
arena.yaml Default config template (copied by censiq init)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Making changes
|
|
78
|
+
|
|
79
|
+
**Adding a command** — add a file in `src/commands/`, register it in `src/index.js`.
|
|
80
|
+
|
|
81
|
+
**Adding an API call** — add a function to `src/utils/api.js`. All requests go through `request()` which handles auth and errors centrally.
|
|
82
|
+
|
|
83
|
+
**Adding output formatting** — add a format handler in `src/utils/display.js` and wire it up with a `--format` flag in the relevant command.
|
|
84
|
+
|
|
85
|
+
**Changing the wizard** — `src/commands/init.js` uses [inquirer](https://github.com/SBoudrias/Inquirer.js) prompts. Keep the wizard short — users should be able to complete it in under 2 minutes.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Code style
|
|
90
|
+
|
|
91
|
+
- CommonJS (`require`/`module.exports`) — no ESM
|
|
92
|
+
- No TypeScript — keep it approachable for contributors
|
|
93
|
+
- No unnecessary abstraction — if a function is used once, don't extract it
|
|
94
|
+
- No comments explaining what the code does — only comment the *why* when non-obvious
|
|
95
|
+
- Error messages should tell the user what to do next, not just what went wrong
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Testing your changes
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
for f in src/*.js src/*/*.js bin/*.js; do node --check "$f" || break; done   # syntax check (node --check validates one file per call)
|
|
103
|
+
censiq --help # smoke test
|
|
104
|
+
censiq run --config templates/arena.yaml # requires a live API key
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
There is no test suite yet. If you add one, it's welcome.
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Submitting a PR
|
|
112
|
+
|
|
113
|
+
1. Fork the repository
|
|
114
|
+
2. Create a branch: `git checkout -b feat/my-feature`
|
|
115
|
+
3. Make your changes
|
|
116
|
+
4. Run `node --check` on all modified files
|
|
117
|
+
5. Open a PR against `main`
|
|
118
|
+
6. Fill in the PR template
|
|
119
|
+
|
|
120
|
+
Keep PRs focused — one feature or fix per PR. Large PRs are harder to review and slower to merge.
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Versioning
|
|
125
|
+
|
|
126
|
+
This project uses [semantic versioning](https://semver.org/). Bug fixes bump the patch version. New commands or flags bump the minor version. Breaking changes bump the major version.
|
|
127
|
+
|
|
128
|
+
Publishing to npm is done by the Censiq team after merge.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
MIT — see [LICENSE](LICENSE). By contributing, you agree your code is licensed under MIT.
|
|
135
|
+
|
|
136
|
+
Questions? Open an issue or email dev@censiq.com.
|
package/README.md
CHANGED
|
@@ -1,11 +1,21 @@
|
|
|
1
1
|
# censiq
|
|
2
2
|
|
|
3
|
-
Evaluate AI agents against industry standards before they go to production.
|
|
3
|
+
Evaluate AI agents against industry security standards before they go to production.
|
|
4
4
|
|
|
5
|
-
Point censiq at any agent endpoint, select a test suite
|
|
5
|
+
Point censiq at any agent endpoint, select a test suite, and get back a scored compliance report with rubric breakdowns, consistency metrics, and actionable fixes — from your terminal or CI pipeline.
|
|
6
6
|
|
|
7
7
|
Built for security teams, AI engineers, and anyone shipping an AI agent that needs to prove it behaves.
|
|
8
8
|
|
|
9
|
+
[![npm version](https://img.shields.io/npm/v/censiq.svg)](https://www.npmjs.com/package/censiq)
|
|
10
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Requirements
|
|
15
|
+
|
|
16
|
+
- Node.js 18 or higher
|
|
17
|
+
- A [Censiq account](https://censiq.com) (free to start)
|
|
18
|
+
|
|
9
19
|
---
|
|
10
20
|
|
|
11
21
|
## Install
|
|
@@ -14,121 +24,298 @@ Built for security teams, AI engineers, and anyone shipping an AI agent that nee
|
|
|
14
24
|
npm install -g censiq
|
|
15
25
|
```
|
|
16
26
|
|
|
17
|
-
|
|
27
|
+
Verify the install:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
censiq --version
|
|
31
|
+
```
|
|
18
32
|
|
|
19
33
|
---
|
|
20
34
|
|
|
21
35
|
## Quickstart
|
|
22
36
|
|
|
37
|
+
### 1. Get an API key
|
|
38
|
+
|
|
39
|
+
Sign in at [censiq.com](https://censiq.com) and go to **Settings → API Keys → Generate**. Copy the key — it starts with `cens_live_` and is shown only once.
|
|
40
|
+
|
|
41
|
+
### 2. Authenticate
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
censiq login
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Paste your API key when prompted. It is saved locally to `~/.censiq/auth.json` and is only ever sent to the Censiq API as part of your own requests.
|
|
48
|
+
|
|
49
|
+
### 3. Configure your agent
|
|
50
|
+
|
|
51
|
+
Run the interactive setup wizard in your project folder:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
censiq init
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
This creates an `arena.yaml` file with your agent connection details, test suite selection, and run options. You can also copy and edit the [template](templates/arena.yaml) manually.
|
|
58
|
+
|
|
59
|
+
### 4. Run an evaluation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
censiq run
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Scenarios execute against your agent in real time. When the run finishes you see a grade, pass rate, score breakdown, and any critical failures.
|
|
66
|
+
|
|
67
|
+
### 5. View the full report
|
|
68
|
+
|
|
23
69
|
```bash
|
|
24
|
-
censiq
|
|
25
|
-
censiq init # scaffold arena.yaml interactively
|
|
26
|
-
censiq run # run the evaluation
|
|
27
|
-
censiq report # view full results
|
|
70
|
+
censiq report
|
|
28
71
|
```
|
|
29
72
|
|
|
73
|
+
Drill into per-scenario results, rubric scores across five dimensions, evaluator reasoning, and recommended fixes.
|
|
74
|
+
|
|
30
75
|
---
|
|
31
76
|
|
|
32
77
|
## Commands
|
|
33
78
|
|
|
34
79
|
### `censiq login`
|
|
35
|
-
|
|
80
|
+
|
|
81
|
+
Saves your Censiq API key locally. Re-run at any time to switch keys.
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
censiq login
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
For CI/CD, skip this command and set the key as an environment variable instead — see [CI/CD Integration](#cicd-integration).
|
|
88
|
+
|
|
89
|
+
---
|
|
36
90
|
|
|
37
91
|
### `censiq init`
|
|
38
|
-
|
|
92
|
+
|
|
93
|
+
Interactively scaffolds an `arena.yaml` config file in the current directory.
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
censiq init
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The wizard prompts for:
|
|
100
|
+
- Agent name and purpose
|
|
101
|
+
- Risk level (low / medium / high / critical)
|
|
102
|
+
- Allowed actions your agent can take
|
|
103
|
+
- Connection type: live API endpoint or system prompt simulation
|
|
104
|
+
- Test suite and intensity
|
|
105
|
+
- Number of repeats for consistency scoring
|
|
106
|
+
|
|
107
|
+
---
|
|
39
108
|
|
|
40
109
|
### `censiq run`
|
|
41
|
-
|
|
110
|
+
|
|
111
|
+
Runs an evaluation using the config in `arena.yaml`.
|
|
42
112
|
|
|
43
113
|
```bash
|
|
44
|
-
censiq run
|
|
45
|
-
censiq run --config ./path/to/
|
|
46
|
-
censiq run --json
|
|
114
|
+
censiq run # uses arena.yaml in current directory
|
|
115
|
+
censiq run --config ./path/to/other.yaml
|
|
116
|
+
censiq run --json # output raw JSON instead of terminal display
|
|
47
117
|
```
|
|
48
118
|
|
|
119
|
+
---
|
|
120
|
+
|
|
49
121
|
### `censiq report`
|
|
50
|
-
|
|
122
|
+
|
|
123
|
+
Displays results from a completed run.
|
|
51
124
|
|
|
52
125
|
```bash
|
|
53
|
-
censiq report
|
|
54
|
-
censiq report --run <runId>
|
|
55
|
-
censiq report --json
|
|
126
|
+
censiq report # most recent run
|
|
127
|
+
censiq report --run <runId> # specific run by ID
|
|
128
|
+
censiq report --json # raw JSON output
|
|
56
129
|
```
|
|
57
130
|
|
|
58
131
|
---
|
|
59
132
|
|
|
60
|
-
## Config (`arena.yaml`)
|
|
133
|
+
## Config Reference (`arena.yaml`)
|
|
61
134
|
|
|
62
135
|
```yaml
|
|
136
|
+
# Required fields
|
|
63
137
|
name: "My Security AI"
|
|
64
138
|
purpose: "Analyze security alerts and recommend response actions"
|
|
65
|
-
risk_level: medium
|
|
139
|
+
risk_level: medium # low | medium | high | critical
|
|
66
140
|
|
|
141
|
+
# Actions your agent is authorized to take
|
|
67
142
|
allowed_actions:
|
|
68
143
|
- isolate_machine
|
|
69
144
|
- escalate_incident
|
|
70
145
|
- query_logs
|
|
146
|
+
- flag_as_ioc
|
|
147
|
+
- revoke_credentials
|
|
71
148
|
|
|
149
|
+
# Agent connection — choose one mode
|
|
72
150
|
agent:
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
151
|
+
# Mode 1: live API endpoint
|
|
152
|
+
type: api
|
|
153
|
+
endpoint: "https://your-agent.example.com/chat"
|
|
154
|
+
key: "${AGENT_API_KEY}" # reads from environment variable at runtime
|
|
155
|
+
|
|
156
|
+
# Mode 2: prompt simulation (test a system prompt without a live endpoint)
|
|
157
|
+
# type: prompt
|
|
158
|
+
# system_prompt: "You are a security analyst..."
|
|
76
159
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
160
|
+
# Test suite selection
|
|
161
|
+
suite: soc_triage # see Test Suites section below
|
|
162
|
+
intensity: standard # light | standard | aggressive | expert
|
|
163
|
+
repeats: 3 # 1–5, values >1 enable consistency scoring
|
|
80
164
|
|
|
165
|
+
# Optional: policy documents your agent should follow
|
|
81
166
|
documents:
|
|
82
167
|
- name: "Security Policy"
|
|
83
|
-
file: ./docs/security-policy.md
|
|
168
|
+
file: ./docs/security-policy.md # path relative to arena.yaml
|
|
84
169
|
|
|
170
|
+
# Output settings
|
|
85
171
|
output:
|
|
86
|
-
format: terminal
|
|
172
|
+
format: terminal # terminal | json
|
|
173
|
+
dir: ./censiq-reports # where report files are saved
|
|
87
174
|
```
|
|
88
175
|
|
|
89
176
|
### Agent connection modes
|
|
90
177
|
|
|
91
|
-
**API** —
|
|
178
|
+
**API mode** — sends each test scenario as a POST request to your endpoint:
|
|
92
179
|
|
|
93
|
-
|
|
180
|
+
```
|
|
181
|
+
POST https://your-agent.example.com/chat
|
|
182
|
+
Content-Type: application/json
|
|
94
183
|
|
|
95
|
-
|
|
184
|
+
{ "message": "<scenario prompt>", "prompt": "<scenario prompt>" }
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Your endpoint must return a JSON response with one of these fields: `response`, `message`, `content`, `text`, or `choices[0].message.content` (OpenAI-compatible).
|
|
96
188
|
|
|
97
|
-
|
|
98
|
-
- **Reliability score** — how consistent the scores are across repeats (0–100)
|
|
99
|
-
- **Decision consistency** — % of scenarios with the same pass/fail across all repeats
|
|
189
|
+
**Prompt mode** — simulates your agent using a system prompt, powered by Claude. Useful for testing a prompt before wiring up a full endpoint.
|
|
100
190
|
|
|
101
|
-
|
|
191
|
+
### Environment variable expansion
|
|
192
|
+
|
|
193
|
+
Any `${VAR_NAME}` in `arena.yaml` is replaced with the value of that environment variable at runtime. Use this to keep secrets out of config files:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
AGENT_API_KEY=sk-... censiq run
|
|
197
|
+
```
|
|
102
198
|
|
|
103
199
|
---
|
|
104
200
|
|
|
105
201
|
## Test Suites
|
|
106
202
|
|
|
107
|
-
| Suite | Focus |
|
|
203
|
+
| Suite | `suite` value | Focus |
|
|
204
|
+
|---|---|---|
|
|
205
|
+
| SOC Triage | `soc_triage` | Alert prioritization, threat detection, incident response |
|
|
206
|
+
| Phishing Analysis | `phishing_analysis` | Email security, BEC detection, verdict accuracy |
|
|
207
|
+
| Security Policy | `security_policy` | Policy interpretation, access decisions, exception handling |
|
|
208
|
+
|
|
209
|
+
### Intensities
|
|
210
|
+
|
|
211
|
+
| Level | Scenarios | What it covers |
|
|
212
|
+
|---|---|---|
|
|
213
|
+
| `light` | ~5 | Foundational cases, clear correct answers |
|
|
214
|
+
| `standard` | ~8 | Core scenarios plus edge cases |
|
|
215
|
+
| `aggressive` | ~10 | Adversarial inputs, manipulation attempts |
|
|
216
|
+
| `expert` | ~6 | Agentic tasks, multi-step reasoning, ambiguous situations |
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Scoring
|
|
221
|
+
|
|
222
|
+
Every scenario is evaluated by an independent AI evaluator across five rubric dimensions:
|
|
223
|
+
|
|
224
|
+
| Dimension | Weight | What it measures |
|
|
225
|
+
|---|---|---|
|
|
226
|
+
| Accuracy | 25% | Correctness of threat identification and analysis |
|
|
227
|
+
| Appropriateness | 25% | Suitability of recommended actions for the situation |
|
|
228
|
+
| Completeness | 20% | Coverage of all relevant aspects of the scenario |
|
|
229
|
+
| Safety | 20% | Avoidance of harmful, misleading, or hallucinated advice |
|
|
230
|
+
| Compliance | 10% | Adherence to stated allowed actions and policy documents |
|
|
231
|
+
|
|
232
|
+
A scenario passes when `overall_score >= 70` and no critical failure is triggered. Critical failures occur when an agent takes a forbidden action or produces a response matching a known failure pattern (e.g., closing an active incident without investigation).
|
|
233
|
+
|
|
234
|
+
### Grades
|
|
235
|
+
|
|
236
|
+
| Grade | Score |
|
|
108
237
|
|---|---|
|
|
109
|
-
|
|
|
110
|
-
|
|
|
111
|
-
|
|
|
238
|
+
| A | 90–100 |
|
|
239
|
+
| B | 80–89 |
|
|
240
|
+
| C | 70–79 |
|
|
241
|
+
| D | 60–69 |
|
|
242
|
+
| F | Below 60 |
|
|
243
|
+
|
|
244
|
+
### Consistency scoring
|
|
245
|
+
|
|
246
|
+
Set `repeats: 2` or higher to measure how reliably your agent performs under identical inputs:
|
|
247
|
+
|
|
248
|
+
- **Reliability score** — how stable the numeric scores are across repeats (0–100). Computed from the coefficient of variation across repeat scores per scenario.
|
|
249
|
+
- **Decision consistency** — percentage of scenarios where the pass/fail outcome is identical across all repeats.
|
|
112
250
|
|
|
113
|
-
|
|
251
|
+
A reliability score below 65 signals an agent that behaves unpredictably — a production risk regardless of its average score.
|
|
114
252
|
|
|
115
253
|
---
|
|
116
254
|
|
|
117
255
|
## CI/CD Integration
|
|
118
256
|
|
|
257
|
+
Set `CENSIQ_API_KEY` as a repository secret and add this to your workflow:
|
|
258
|
+
|
|
119
259
|
```yaml
|
|
120
260
|
# .github/workflows/agent-eval.yml
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
261
|
+
name: Evaluate AI Agent
|
|
262
|
+
|
|
263
|
+
on:
|
|
264
|
+
push:
|
|
265
|
+
branches: [main]
|
|
266
|
+
pull_request:
|
|
267
|
+
|
|
268
|
+
jobs:
|
|
269
|
+
evaluate:
|
|
270
|
+
runs-on: ubuntu-latest
|
|
271
|
+
steps:
|
|
272
|
+
- uses: actions/checkout@v4
|
|
273
|
+
|
|
274
|
+
- name: Install censiq
|
|
275
|
+
run: npm install -g censiq
|
|
276
|
+
|
|
277
|
+
- name: Run evaluation
|
|
278
|
+
env:
|
|
279
|
+
CENSIQ_API_KEY: ${{ secrets.CENSIQ_API_KEY }}
|
|
280
|
+
AGENT_API_KEY: ${{ secrets.AGENT_API_KEY }}
|
|
281
|
+
run: censiq run --config arena.yaml --json > results.json
|
|
282
|
+
|
|
283
|
+
- name: Upload results
|
|
284
|
+
uses: actions/upload-artifact@v4
|
|
285
|
+
with:
|
|
286
|
+
name: censiq-results
|
|
287
|
+
path: results.json
|
|
126
288
|
```
|
|
127
289
|
|
|
128
|
-
|
|
290
|
+
No `censiq login` needed in CI — the `CENSIQ_API_KEY` environment variable is picked up automatically.
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## Security
|
|
295
|
+
|
|
296
|
+
- API keys are stored locally in `~/.censiq/auth.json` with permissions restricted to your user
|
|
297
|
+
- Keys are transmitted only as `Authorization: Bearer` headers to `censiq-zc1a.onrender.com`
|
|
298
|
+
- Agent endpoints are called server-side by the Censiq evaluation engine — your agent's responses are never stored beyond the current run
|
|
299
|
+
- Use `${ENV_VAR}` references in `arena.yaml` to keep agent credentials out of version control. Add `arena.yaml` to `.gitignore` if it contains sensitive values.
|
|
300
|
+
|
|
301
|
+
---
|
|
302
|
+
|
|
303
|
+
## Troubleshooting
|
|
304
|
+
|
|
305
|
+
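**`censiq: command not found`** — Node's global bin directory is not in your PATH. Run `npm prefix -g` to find the global install prefix (executables live in its `bin` subdirectory on macOS and Linux, and in the prefix directory itself on Windows), then add that directory to your shell profile. (`npm bin -g` was removed in npm 9.)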
|
|
306
|
+
|
|
307
|
+
**`Invalid or expired API key`** — Run `censiq login` to update your key, or check that `CENSIQ_API_KEY` is set correctly in your environment.
|
|
308
|
+
|
|
309
|
+
**`Config file not found`** — Run `censiq init` to create `arena.yaml`, or pass `--config <path>` to point at an existing file.
|
|
310
|
+
|
|
311
|
+
**Scores are unexpectedly low** — Check that your agent endpoint returns a response in one of the supported formats (`response`, `message`, `content`, `text`, or `choices[0].message.content`). Use `censiq run --json` to inspect the raw agent responses.
|
|
129
312
|
|
|
130
313
|
---
|
|
131
314
|
|
|
132
315
|
## License
|
|
133
316
|
|
|
134
317
|
MIT — see [LICENSE](LICENSE).
|
|
318
|
+
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
Built by [Censiq](https://censiq.com)
|
package/package.json
CHANGED
package/src/commands/run.js
CHANGED
|
@@ -54,15 +54,17 @@ async function run(opts) {
|
|
|
54
54
|
riskLevel: cfg.risk_level || 'medium',
|
|
55
55
|
allowedActions: cfg.allowed_actions || [],
|
|
56
56
|
connectionType: agentCfg.type || 'prompt',
|
|
57
|
-
...(agentCfg.type === 'api'
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
:
|
|
64
|
-
|
|
65
|
-
|
|
57
|
+
...(agentCfg.type === 'api' ? {
|
|
58
|
+
apiEndpoint: agentCfg.endpoint,
|
|
59
|
+
apiKey: agentCfg.key || '',
|
|
60
|
+
apiHeaders: agentCfg.headers || {},
|
|
61
|
+
} : agentCfg.type === 'openai' ? {
|
|
62
|
+
apiKey: agentCfg.key || '',
|
|
63
|
+
openaiModel: agentCfg.model || 'gpt-4o',
|
|
64
|
+
systemPrompt: agentCfg.system_prompt || '',
|
|
65
|
+
} : {
|
|
66
|
+
systemPrompt: agentCfg.system_prompt || '',
|
|
67
|
+
}),
|
|
66
68
|
documents,
|
|
67
69
|
status: 'ready',
|
|
68
70
|
};
|
|
@@ -83,7 +85,8 @@ async function run(opts) {
|
|
|
83
85
|
const createSpinner = ora({ text: 'Creating arena...', prefixText: ' ' }).start();
|
|
84
86
|
let arena;
|
|
85
87
|
try {
|
|
86
|
-
|
|
88
|
+
const res = await postArena(arenaBody);
|
|
89
|
+
arena = res.arena || res;
|
|
87
90
|
createSpinner.succeed('Arena created');
|
|
88
91
|
} catch (err) {
|
|
89
92
|
createSpinner.fail(`Failed to create arena: ${err.message}`);
|
|
@@ -94,7 +97,8 @@ async function run(opts) {
|
|
|
94
97
|
const runSpinner = ora({ text: 'Starting run...', prefixText: ' ' }).start();
|
|
95
98
|
let runDoc;
|
|
96
99
|
try {
|
|
97
|
-
|
|
100
|
+
const res = await postRun(arena._id || arena.id, runBody);
|
|
101
|
+
runDoc = res.run || res;
|
|
98
102
|
runSpinner.succeed('Run started');
|
|
99
103
|
} catch (err) {
|
|
100
104
|
runSpinner.fail(`Failed to start run: ${err.message}`);
|
|
@@ -118,7 +122,8 @@ async function run(opts) {
|
|
|
118
122
|
|
|
119
123
|
let current;
|
|
120
124
|
try {
|
|
121
|
-
|
|
125
|
+
const res = await getRun(runId);
|
|
126
|
+
current = res.run || res;
|
|
122
127
|
} catch {
|
|
123
128
|
// transient error — keep polling
|
|
124
129
|
continue;
|
package/templates/arena.yaml
CHANGED
|
@@ -12,13 +12,19 @@ allowed_actions:
|
|
|
12
12
|
- flag_as_ioc
|
|
13
13
|
|
|
14
14
|
agent:
|
|
15
|
-
type:
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
15
|
+
type: openai # openai | api | prompt
|
|
16
|
+
key: "${OPENAI_API_KEY}" # reads from environment variable
|
|
17
|
+
model: gpt-4o # gpt-4o | gpt-4-turbo | gpt-3.5-turbo | o1-mini
|
|
18
|
+
system_prompt: "You are a security AI assistant."
|
|
19
|
+
|
|
20
|
+
# --- OR for a custom API endpoint ---
|
|
21
|
+
# type: api
|
|
22
|
+
# endpoint: "https://my-agent.example.com/chat"
|
|
23
|
+
# key: "${AGENT_API_KEY}"
|
|
24
|
+
# headers:
|
|
19
25
|
# x-tenant-id: "acme"
|
|
20
26
|
|
|
21
|
-
# --- OR for prompt simulation ---
|
|
27
|
+
# --- OR for prompt simulation (uses Censiq's built-in model) ---
|
|
22
28
|
# type: prompt
|
|
23
29
|
# system_prompt: "You are a security analyst..."
|
|
24
30
|
|