npm - censiq - Versions diffs - 0.1.1 → 0.1.3 - Mend

censiq 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/.github/ISSUE_TEMPLATE/bug_report.yml +58 -0
package/.github/ISSUE_TEMPLATE/feature_request.yml +45 -0
package/.github/PULL_REQUEST_TEMPLATE.md +20 -0
package/CONTRIBUTING.md +136 -0
package/README.md +232 -45
package/package.json +1 -1
package/src/commands/run.js +17 -12
package/templates/arena.yaml +11 -5

package/.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

@@ -0,0 +1,58 @@
+name: Bug Report
+description: Something isn't working correctly
+title: "[Bug] "
+labels: ["bug"]
+body:
+  - type: input
+    id: version
+    attributes:
+      label: censiq version
+      placeholder: "0.1.2 (run: censiq --version)"
+    validations:
+      required: true
+  - type: dropdown
+    id: command
+    attributes:
+      label: Which command?
+      options:
+        - censiq login
+        - censiq init
+        - censiq run
+        - censiq report
+        - Other
+    validations:
+      required: true
+  - type: textarea
+    id: description
+    attributes:
+      label: What happened?
+      placeholder: Describe what you did and what went wrong.
+    validations:
+      required: true
+  - type: textarea
+    id: expected
+    attributes:
+      label: What did you expect?
+    validations:
+      required: true
+  - type: textarea
+    id: repro
+    attributes:
+      label: Steps to reproduce
+      placeholder: |
+        1. Run `censiq run --config arena.yaml`
+        2. See error: ...
+    validations:
+      required: true
+  - type: input
+    id: os
+    attributes:
+      label: OS and Node version
+      placeholder: "macOS 14.4, Node 20.11"
+    validations:
+      required: true

package/.github/ISSUE_TEMPLATE/feature_request.yml ADDED Viewed

@@ -0,0 +1,45 @@
+name: Feature Request
+description: Suggest an improvement or new capability
+title: "[Feature] "
+labels: ["enhancement"]
+body:
+  - type: dropdown
+    id: area
+    attributes:
+      label: Area
+      options:
+        - New command
+        - Output format (--format)
+        - Agent adapter (--adapter)
+        - Shell completions
+        - arena.yaml config option
+        - CI/CD integration
+        - Other
+    validations:
+      required: true
+  - type: textarea
+    id: problem
+    attributes:
+      label: What problem does this solve?
+      placeholder: |
+        When I run censiq run in GitHub Actions, the output isn't picked up by
+        the PR check dashboard because it's terminal text, not a recognized format...
+    validations:
+      required: true
+  - type: textarea
+    id: solution
+    attributes:
+      label: Proposed solution
+      placeholder: |
+        Add a --format sarif flag that outputs results as a SARIF file,
+        compatible with GitHub Code Scanning upload-sarif action...
+  - type: checkboxes
+    id: contribution
+    attributes:
+      label: Are you willing to implement this?
+      options:
+        - label: Yes, I'd like to open a PR for this
+        - label: No, just suggesting it

package/.github/PULL_REQUEST_TEMPLATE.md ADDED Viewed

@@ -0,0 +1,20 @@
+## What does this PR do?
+<!-- Bug fix, new command, new output format, adapter, etc. -->
+## Related issue
+Closes #
+## Changes
+<!-- Brief list of what changed -->
+## Checklist
+- [ ] `node --check src/**/*.js src/*.js bin/*.js` passes
+- [ ] `censiq --help` output is correct
+- [ ] Tested against a live Censiq account (`censiq run`)
+- [ ] No new dependencies added without discussion (keep the install lightweight)
+- [ ] Error messages tell the user what to do next, not just what went wrong
+- [ ] No comments added that explain what the code does (only why, when non-obvious)

package/CONTRIBUTING.md ADDED Viewed

@@ -0,0 +1,136 @@
+# Contributing to the Censiq CLI
+The CLI is a thin client that talks to the Censiq API. Contributions here focus on the developer experience: commands, output formatting, config handling, agent adapters, and CI/CD integration.
+If you want to contribute new test scenarios or packs, see the [standard-packs repository](https://github.com/Censiq/standard-packs) instead — that's where the evaluation content lives.
+---
+## What to work on
+Check the [issues list](https://github.com/Censiq/CLI/issues) for open bugs and feature requests. The most impactful contribution areas:
+**Output formats** — the CLI currently outputs to the terminal or JSON. Useful additions:
+- `--format sarif` — Static Analysis Results Interchange Format for GitHub Code Scanning
+- `--format junit` — JUnit XML for Jenkins, CircleCI, and most CI dashboards
+- `--format html` — a self-contained report file
+**Shell completions** — bash, zsh, and fish completions for `censiq <TAB>`
+**Agent adapters** — named adapters for popular frameworks so users don't have to configure endpoint format manually:
+```bash
+censiq run --adapter langchain
+censiq run --adapter openai
+censiq run --adapter bedrock
+```
+**Bug fixes** — see issues labeled `bug`
+---
+## Local setup
+```bash
+git clone https://github.com/Censiq/CLI.git
+cd CLI
+npm install
+npm link           # makes `censiq` available globally from this local build
+```
+Set your API key:
+```bash
+censiq login       # paste a key from censiq.com/settings
+# or
+export CENSIQ_API_KEY=cens_live_...
+```
+Point at a local server during development:
+```bash
+export CENSIQ_API_URL=http://localhost:5001
+```
+---
+## Project structure
+```
+bin/censiq.js          Entry point (shebang + require)
+src/
+  index.js             Commander setup — all commands registered here
+  commands/
+    login.js           censiq login
+    init.js            censiq init (interactive wizard)
+    run.js             censiq run (creates arena, polls progress, prints summary)
+    report.js          censiq report (fetches and formats results)
+  utils/
+    api.js             Axios client — all API calls go through here
+    config.js          Reads arena.yaml, auth file, last-run file
+    display.js         All terminal output — colors, tables, score bars
+templates/
+  arena.yaml           Default config template (copied by censiq init)
+```
+---
+## Making changes
+**Adding a command** — add a file in `src/commands/`, register it in `src/index.js`.
+**Adding an API call** — add a function to `src/utils/api.js`. All requests go through `request()` which handles auth and errors centrally.
+**Adding output formatting** — add a format handler in `src/utils/display.js` and wire it up with a `--format` flag in the relevant command.
+**Changing the wizard** — `src/commands/init.js` uses [inquirer](https://github.com/SBoudrias/Inquirer.js) prompts. Keep the wizard short — users should be able to complete it in under 2 minutes.
+---
+## Code style
+- CommonJS (`require`/`module.exports`) — no ESM
+- No TypeScript — keep it approachable for contributors
+- No unnecessary abstraction — if a function is used once, don't extract it
+- No comments explaining what the code does — only comment the *why* when non-obvious
+- Error messages should tell the user what to do next, not just what went wrong
+---
+## Testing your changes
+```bash
+node --check src/**/*.js src/*.js bin/*.js   # syntax check
+censiq --help                                 # smoke test
+censiq run --config templates/arena.yaml      # requires a live API key
+```
+There is no test suite yet. If you add one, it's welcome.
+---
+## Submitting a PR
+1. Fork the repository
+2. Create a branch: `git checkout -b feat/my-feature`
+3. Make your changes
+4. Run `node --check` on all modified files
+5. Open a PR against `main`
+6. Fill in the PR template
+Keep PRs focused — one feature or fix per PR. Large PRs are harder to review and slower to merge.
+---
+## Versioning
+This project uses [semantic versioning](https://semver.org/). Bug fixes bump the patch version. New commands or flags bump the minor version. Breaking changes bump the major version.
+Publishing to npm is done by the Censiq team after merge.
+---
+## License
+MIT — see [LICENSE](LICENSE). By contributing, you agree your code is licensed under MIT.
+Questions? Open an issue or email dev@censiq.com.

package/README.md CHANGED Viewed

@@ -1,11 +1,21 @@
 # censiq
-Evaluate AI agents against industry standards before they go to production.
+Evaluate AI agents against industry security standards before they go to production.
-Point censiq at any agent endpoint, select a test suite (SOC triage, phishing analysis, security policy, and more), and get back a scored compliance report with rubric breakdowns, consistency metrics, and actionable fixes.
+Point censiq at any agent endpoint, select a test suite, and get back a scored compliance report with rubric breakdowns, consistency metrics, and actionable fixes — from your terminal or CI pipeline.
 Built for security teams, AI engineers, and anyone shipping an AI agent that needs to prove it behaves.
+[![npm](https://img.shields.io/npm/v/censiq)](https://www.npmjs.com/package/censiq)
+[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
+---
+## Requirements
+- Node.js 18 or higher
+- A [Censiq account](https://censiq.com) (free to start)
 ---
 ## Install
@@ -14,121 +24,298 @@ Built for security teams, AI engineers, and anyone shipping an AI agent that nee
 npm install -g censiq
 ```
-Requires Node 18+.
+Verify the install:
+```bash
+censiq --version
+```
 ---
 ## Quickstart
+### 1. Get an API key
+Sign in at [censiq.com](https://censiq.com) and go to **Settings → API Keys → Generate**. Copy the key — it starts with `cens_live_` and is shown only once.
+### 2. Authenticate
+```bash
+censiq login
+```
+Paste your API key when prompted. It is saved locally to `~/.censiq/auth.json` and is never sent anywhere except in your requests to the Censiq API.
+### 3. Configure your agent
+Run the interactive setup wizard in your project folder:
+```bash
+censiq init
+```
+This creates an `arena.yaml` file with your agent connection details, test suite selection, and run options. You can also copy and edit the [template](templates/arena.yaml) manually.
+### 4. Run an evaluation
+```bash
+censiq run
+```
+Scenarios execute against your agent in real time. When the run finishes you see a grade, pass rate, score breakdown, and any critical failures.
+### 5. View the full report
 ```bash
-censiq login          # authenticate with your Censiq account
-censiq init           # scaffold arena.yaml interactively
-censiq run            # run the evaluation
-censiq report         # view full results
+censiq report
 ```
+Drill into per-scenario results, rubric scores across five dimensions, evaluator reasoning, and recommended fixes.
 ---
 ## Commands
 ### `censiq login`
-Authenticate with your Censiq account. Saves a session token to `~/.censiq/auth.json`.
+Saves your Censiq API key locally. Re-run at any time to switch keys.
+```bash
+censiq login
+```
+For CI/CD, skip this command and set the key as an environment variable instead — see [CI/CD Integration](#cicd-integration).
+---
 ### `censiq init`
-Interactively scaffold an `arena.yaml` config file in the current directory. Prompts for agent connection, test suite, intensity, and repeat count.
+Interactively scaffolds an `arena.yaml` config file in the current directory.
+```bash
+censiq init
+```
+The wizard prompts for:
+- Agent name and purpose
+- Risk level (low / medium / high / critical)
+- Allowed actions your agent can take
+- Connection type: live API endpoint or system prompt simulation
+- Test suite and intensity
+- Number of repeats for consistency scoring
+---
 ### `censiq run`
-Run an evaluation from `arena.yaml`.
+Runs an evaluation using the config in `arena.yaml`.
 ```bash
-censiq run                        # uses arena.yaml in current directory
-censiq run --config ./path/to/arena.yaml
-censiq run --json                 # output raw JSON
+censiq run                              # uses arena.yaml in current directory
+censiq run --config ./path/to/other.yaml
+censiq run --json                       # output raw JSON instead of terminal display
 ```
+---
 ### `censiq report`
-Display results from a completed run.
+Displays results from a completed run.
 ```bash
-censiq report                     # last run
-censiq report --run <runId>       # specific run
-censiq report --json              # raw JSON
+censiq report                   # most recent run
+censiq report --run <runId>     # specific run by ID
+censiq report --json            # raw JSON output
 ```
 ---
-## Config (`arena.yaml`)
+## Config Reference (`arena.yaml`)
 ```yaml
+# Required fields
 name: "My Security AI"
 purpose: "Analyze security alerts and recommend response actions"
-risk_level: medium            # low | medium | high | critical
+risk_level: medium              # low | medium | high | critical
+# Actions your agent is authorized to take
 allowed_actions:
   - isolate_machine
   - escalate_incident
   - query_logs
+  - flag_as_ioc
+  - revoke_credentials
+# Agent connection — choose one mode
 agent:
-  type: api                   # api | prompt
-  endpoint: "https://my-agent.example.com/chat"
-  key: "${AGENT_API_KEY}"     # reads from env
+  # Mode 1: live API endpoint
+  type: api
+  endpoint: "https://your-agent.example.com/chat"
+  key: "${AGENT_API_KEY}"       # reads from environment variable at runtime
+  # Mode 2: prompt simulation (test a system prompt without a live endpoint)
+  # type: prompt
+  # system_prompt: "You are a security analyst..."
-suite: soc_triage             # soc_triage | phishing_analysis | security_policy
-intensity: standard           # light | standard | aggressive | expert
-repeats: 3                    # >1 enables consistency scoring
+# Test suite selection
+suite: soc_triage               # see Test Suites section below
+intensity: standard             # light | standard | aggressive | expert
+repeats: 3                      # 1–5, values >1 enable consistency scoring
+# Optional: policy documents your agent should follow
 documents:
   - name: "Security Policy"
-    file: ./docs/security-policy.md
+    file: ./docs/security-policy.md   # path relative to arena.yaml
+# Output settings
 output:
-  format: terminal            # terminal | json
+  format: terminal              # terminal | json
+  dir: ./censiq-reports         # where report files are saved
 ```
 ### Agent connection modes
-**API** — calls your live agent over HTTP. Expects a POST endpoint that accepts `{ message, prompt }` and returns a response field.
+**API mode** — sends each test scenario as a POST request to your endpoint:
-**Prompt** — simulates an agent using a system prompt. Useful for testing prompt behavior before wiring up a full endpoint.
+```
+POST https://your-agent.example.com/chat
+Content-Type: application/json
-### Consistency scoring
+{ "message": "<scenario prompt>", "prompt": "<scenario prompt>" }
+```
+Your endpoint must return a JSON response with one of these fields: `response`, `message`, `content`, `text`, or `choices[0].message.content` (OpenAI-compatible).
-Set `repeats: 3` (or higher) to run each scenario multiple times and measure:
-- **Reliability score** — how consistent the scores are across repeats (0–100)
-- **Decision consistency** — % of scenarios with the same pass/fail across all repeats
+**Prompt mode** — simulates your agent using a system prompt, powered by Claude. Useful for testing a prompt before wiring up a full endpoint.
-Low reliability signals an agent that behaves unpredictably under identical inputs.
+### Environment variable expansion
+Any `${VAR_NAME}` in `arena.yaml` is replaced with the value of that environment variable at runtime. Use this to keep secrets out of config files:
+```bash
+AGENT_API_KEY=sk-... censiq run
+```
 ---
 ## Test Suites
-| Suite | Focus |
+| Suite | `suite` value | Focus |
+|---|---|---|
+| SOC Triage | `soc_triage` | Alert prioritization, threat detection, incident response |
+| Phishing Analysis | `phishing_analysis` | Email security, BEC detection, verdict accuracy |
+| Security Policy | `security_policy` | Policy interpretation, access decisions, exception handling |
+### Intensities
+| Level | Scenarios | What it covers |
+|---|---|---|
+| `light` | ~5 | Foundational cases, clear correct answers |
+| `standard` | ~8 | Core scenarios plus edge cases |
+| `aggressive` | ~10 | Adversarial inputs, manipulation attempts |
+| `expert` | ~6 | Agentic tasks, multi-step reasoning, ambiguous situations |
+---
+## Scoring
+Every scenario is evaluated by an independent AI evaluator across five rubric dimensions:
+| Dimension | Weight | What it measures |
+|---|---|---|
+| Accuracy | 25% | Correctness of threat identification and analysis |
+| Appropriateness | 25% | Suitability of recommended actions for the situation |
+| Completeness | 20% | Coverage of all relevant aspects of the scenario |
+| Safety | 20% | Avoidance of harmful, misleading, or hallucinated advice |
+| Compliance | 10% | Adherence to stated allowed actions and policy documents |
+A scenario passes when `overall_score >= 70` and no critical failure is triggered. Critical failures occur when an agent takes a forbidden action or produces a response matching a known failure pattern (e.g., closing an active incident without investigation).
+### Grades
+| Grade | Score |
 |---|---|
-| `soc_triage` | SOC analyst alerting, threat detection, incident response |
-| `phishing_analysis` | Email security, phishing identification, user guidance |
-| `security_policy` | Policy interpretation, access decisions, compliance reasoning |
+| A | 90–100 |
+| B | 80–89 |
+| C | 70–79 |
+| D | 60–69 |
+| F | Below 60 |
+### Consistency scoring
+Set `repeats: 2` or higher to measure how reliably your agent performs under identical inputs:
+- **Reliability score** — how stable the numeric scores are across repeats (0–100). Computed from the coefficient of variation across repeat scores per scenario.
+- **Decision consistency** — percentage of scenarios where the pass/fail outcome is identical across all repeats.
-Each suite runs at four intensities: `light`, `standard`, `aggressive`, `expert`.
+A reliability score below 65 signals an agent that behaves unpredictably — a production risk regardless of its average score.
 ---
 ## CI/CD Integration
+Set `CENSIQ_API_KEY` as a repository secret and add this to your workflow:
 ```yaml
 # .github/workflows/agent-eval.yml
-- name: Evaluate agent
-  run: |
-    npm install -g censiq
-    censiq login  # use CENSIQ_EMAIL + CENSIQ_PASSWORD secrets
-    censiq run --config arena.yaml --json > results.json
+name: Evaluate AI Agent
+on:
+  push:
+    branches: [main]
+  pull_request:
+jobs:
+  evaluate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install censiq
+        run: npm install -g censiq
+      - name: Run evaluation
+        env:
+          CENSIQ_API_KEY: ${{ secrets.CENSIQ_API_KEY }}
+          AGENT_API_KEY: ${{ secrets.AGENT_API_KEY }}
+        run: censiq run --config arena.yaml --json > results.json
+      - name: Upload results
+        uses: actions/upload-artifact@v4
+        with:
+          name: censiq-results
+          path: results.json
 ```
-Set `CENSIQ_EMAIL` and `CENSIQ_PASSWORD` as repository secrets, or use `CENSIQ_API_URL` to point at a self-hosted instance.
+No `censiq login` needed in CI — the `CENSIQ_API_KEY` environment variable is picked up automatically.
+---
+## Security
+- API keys are stored locally in `~/.censiq/auth.json` with permissions restricted to your user
+- Keys are transmitted only as `Authorization: Bearer` headers to `censiq-zc1a.onrender.com`
+- Agent endpoints are called server-side by the Censiq evaluation engine — your agent's responses are never stored beyond the current run
+- Use `${ENV_VAR}` references in `arena.yaml` to keep agent credentials out of version control. Add `arena.yaml` to `.gitignore` if it contains sensitive values.
+---
+## Troubleshooting
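+**`censiq: command not found`** — Node global bin directory is not in your PATH. Run `npm prefix -g` to find the global install prefix (on macOS/Linux the bin directory is `<prefix>/bin`; on Windows it is the prefix itself) and add it to your shell profile.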
+**`Invalid or expired API key`** — Run `censiq login` to update your key, or check that `CENSIQ_API_KEY` is set correctly in your environment.
+**`Config file not found`** — Run `censiq init` to create `arena.yaml`, or pass `--config <path>` to point at an existing file.
+**Scores are unexpectedly low** — Check that your agent endpoint returns a response in one of the supported formats (`response`, `message`, `content`, `text`, or `choices[0].message.content`). Use `censiq run --json` to inspect the raw agent responses.
 ---
 ## License
 MIT — see [LICENSE](LICENSE).
+---
+Built by [Censiq](https://censiq.com)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "censiq",
-  "version": "0.1.1",
+  "version": "0.1.3",
   "description": "Test AI agents against industry security standards from your terminal or CI pipeline",
   "main": "src/index.js",
   "bin": "bin/censiq.js",

package/src/commands/run.js CHANGED Viewed

@@ -54,15 +54,17 @@ async function run(opts) {
     riskLevel:      cfg.risk_level || 'medium',
     allowedActions: cfg.allowed_actions || [],
     connectionType: agentCfg.type || 'prompt',
-    ...(agentCfg.type === 'api'
-      ? {
-          apiEndpoint: agentCfg.endpoint,
-          apiKey:      agentCfg.key || '',
-          apiHeaders:  agentCfg.headers || {},
-        }
-      : {
-          systemPrompt: agentCfg.system_prompt || '',
-        }),
+    ...(agentCfg.type === 'api' ? {
+      apiEndpoint: agentCfg.endpoint,
+      apiKey:      agentCfg.key || '',
+      apiHeaders:  agentCfg.headers || {},
+    } : agentCfg.type === 'openai' ? {
+      apiKey:      agentCfg.key || '',
+      openaiModel: agentCfg.model || 'gpt-4o',
+      systemPrompt: agentCfg.system_prompt || '',
+    } : {
+      systemPrompt: agentCfg.system_prompt || '',
+    }),
     documents,
     status: 'ready',
   };
@@ -83,7 +85,8 @@ async function run(opts) {
   const createSpinner = ora({ text: 'Creating arena...', prefixText: ' ' }).start();
   let arena;
   try {
-    arena = await postArena(arenaBody);
+    const res = await postArena(arenaBody);
+    arena = res.arena || res;
     createSpinner.succeed('Arena created');
   } catch (err) {
     createSpinner.fail(`Failed to create arena: ${err.message}`);
@@ -94,7 +97,8 @@ async function run(opts) {
   const runSpinner = ora({ text: 'Starting run...', prefixText: ' ' }).start();
   let runDoc;
   try {
-    runDoc = await postRun(arena._id || arena.id, runBody);
+    const res = await postRun(arena._id || arena.id, runBody);
+    runDoc = res.run || res;
     runSpinner.succeed('Run started');
   } catch (err) {
     runSpinner.fail(`Failed to start run: ${err.message}`);
@@ -118,7 +122,8 @@ async function run(opts) {
     let current;
     try {
-      current = await getRun(runId);
+      const res = await getRun(runId);
+      current = res.run || res;
     } catch {
       // transient error — keep polling
       continue;

package/templates/arena.yaml CHANGED Viewed

@@ -12,13 +12,19 @@ allowed_actions:
   - flag_as_ioc
 agent:
-  type: api                   # api | prompt
-  endpoint: "https://my-agent.example.com/chat"
-  key: "${AGENT_API_KEY}"     # reads from environment variable
-  # headers:                  # optional custom headers
+  type: openai                # openai | api | prompt
+  key: "${OPENAI_API_KEY}"    # reads from environment variable
+  model: gpt-4o               # gpt-4o | gpt-4-turbo | gpt-3.5-turbo | o1-mini
+  system_prompt: "You are a security AI assistant."
+  # --- OR for a custom API endpoint ---
+  # type: api
+  # endpoint: "https://my-agent.example.com/chat"
+  # key: "${AGENT_API_KEY}"
+  # headers:
   #   x-tenant-id: "acme"
-  # --- OR for prompt simulation ---
+  # --- OR for prompt simulation (uses Censiq's built-in model) ---
   # type: prompt
   # system_prompt: "You are a security analyst..."