@roleplay-sh/cli 0.1.7 → 0.1.8

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -15,7 +15,7 @@ ROLEPLAY_TARGET_COMMAND=
15
15
  # Provider choices: openai, anthropic, google, openai-compatible.
16
16
  ROLEPLAY_LLM_PROVIDER=<provider>
17
17
  ROLEPLAY_LLM_MODEL=
18
- ROLEPLAY_JUDGE_MODE=semantic
18
+ ROLEPLAY_JUDGE_MODE=hybrid
19
19
  ROLEPLAY_JUDGE_PROVIDER=<provider>
20
20
  ROLEPLAY_JUDGE_MODEL=
21
21
  ROLEPLAY_ATTACKER_PROVIDER=
package/CHANGELOG.md CHANGED
@@ -4,7 +4,13 @@ All notable changes to roleplay.sh will be documented in this file.
4
4
 
5
5
  This project follows semantic versioning after the public `0.1.0` release.
6
6
 
7
- ## 0.1.7 - Unreleased
7
+ ## 0.1.8 - Unreleased
8
+
9
+ ### Changed
10
+
11
+ - Changed `roleplay setup` default judge mode to `hybrid`.
12
+
13
+ ## 0.1.7 - 2026-06-14
8
14
 
9
15
  ### Added
10
16
 
package/README.md CHANGED
@@ -37,7 +37,7 @@ HTTP target:
37
37
  roleplay run social-engineering-core \
38
38
  --target http://localhost:3000/agent \
39
39
  --provider <provider> \
40
- --judge semantic \
40
+ --judge hybrid \
41
41
  --project <project-id> \
42
42
  --api-key <project-api-key> \
43
43
  --fail-on critical
@@ -59,7 +59,7 @@ roleplay run social-engineering-core \
59
59
  ## Judge Choices
60
60
 
61
61
  - `--judge rules`: deterministic local rule judge. Best for smoke tests and offline checks.
62
- - `--judge semantic`: provider-backed security judge. Recommended for real agent tests.
62
+ - `--judge semantic`: provider-backed security judge for real agent tests.
63
63
  - `--judge hybrid`: semantic judge plus deterministic guardrails. Recommended for CI once your provider is configured.
64
64
 
65
65
  Rules-only judging can be used against real targets only with `--allow-rules-only`, so it is never mistaken for full semantic evaluation.
package/RELEASE.md CHANGED
@@ -29,8 +29,8 @@ The publish workflow uses GitHub OIDC and intentionally does not require an npm
29
29
  Create a GitHub release or push a version tag:
30
30
 
31
31
  ```bash
32
- git tag v0.1.7
33
- git push origin v0.1.7
32
+ git tag v0.1.8
33
+ git push origin v0.1.8
34
34
  ```
35
35
 
36
36
  The publish workflow runs checks and then publishes with:
@@ -57,10 +57,10 @@ For real provider-backed verification:
57
57
  export ROLEPLAY_PROJECT_ID=<project-id>
58
58
  export ROLEPLAY_API_KEY=<project-api-key>
59
59
  export ROLEPLAY_LLM_PROVIDER=<provider>
60
- export ROLEPLAY_JUDGE_MODE=semantic
60
+ export ROLEPLAY_JUDGE_MODE=hybrid
61
61
  export ROLEPLAY_JUDGE_PROVIDER=<provider>
62
62
  export ROLEPLAY_<PROVIDER>_API_KEY=<provider-key>
63
- roleplay run social-engineering-core --target http://localhost:3000/agent --provider <provider> --judge semantic --max-turns 1 --fail-on critical
63
+ roleplay run social-engineering-core --target http://localhost:3000/agent --provider <provider> --judge hybrid --max-turns 1 --fail-on critical
64
64
  ```
65
65
 
66
66
  For workbench upload verification, start a Builder or Team trial, create a project API key at `https://app.roleplay.sh`, and run:
package/dist/cli.js CHANGED
@@ -199,7 +199,7 @@ function fromFlags(flags) {
199
199
  cloudUrl: flags["cloud-url"],
200
200
  project: flags.project ?? process.env.ROLEPLAY_PROJECT_ID ?? "",
201
201
  provider: flags.provider ?? process.env.ROLEPLAY_LLM_PROVIDER ?? "",
202
- judge: flags.judge ?? process.env.ROLEPLAY_JUDGE_MODE ?? "semantic",
202
+ judge: flags.judge ?? process.env.ROLEPLAY_JUDGE_MODE ?? "hybrid",
203
203
  judgeProvider: flags["judge-provider"] ?? process.env.ROLEPLAY_JUDGE_PROVIDER ?? flags.provider ?? process.env.ROLEPLAY_LLM_PROVIDER ?? "",
204
204
  target: flags.target ?? process.env.ROLEPLAY_TARGET_URL ?? "",
205
205
  targetCommand: flags["target-command"] ?? process.env.ROLEPLAY_TARGET_COMMAND ?? ""
@@ -211,7 +211,7 @@ async function promptForSetup(defaults) {
211
211
  const cloudUrl = await ask(rl, "Workbench URL", defaults.cloudUrl);
212
212
  const project = await ask(rl, "Project ID", defaults.project);
213
213
  const provider = await ask(rl, "Attacker provider (openai, anthropic, google, openai-compatible)", defaults.provider);
214
- const judge = await ask(rl, "Judge mode (rules, semantic, hybrid)", defaults.judge || "semantic");
214
+ const judge = await ask(rl, "Judge mode (rules, semantic, hybrid)", defaults.judge || "hybrid");
215
215
  const judgeProvider = await ask(rl, "Judge provider for semantic/hybrid mode", defaults.judgeProvider || provider);
216
216
  const target = await ask(rl, "HTTP target URL (leave blank if using a CLI target)", defaults.target);
217
217
  const targetCommand = target ? "" : await ask(rl, "CLI target command (optional)", defaults.targetCommand);
@@ -244,7 +244,7 @@ ROLEPLAY_TARGET_COMMAND=${input2.targetCommand}
244
244
  # Provider choices: openai, anthropic, google, openai-compatible.
245
245
  ROLEPLAY_LLM_PROVIDER=${input2.provider || "<provider>"}
246
246
  ROLEPLAY_LLM_MODEL=
247
- ROLEPLAY_JUDGE_MODE=${input2.judge || "semantic"}
247
+ ROLEPLAY_JUDGE_MODE=${input2.judge || "hybrid"}
248
248
  ROLEPLAY_JUDGE_PROVIDER=${input2.judgeProvider || "<provider>"}
249
249
  ROLEPLAY_JUDGE_MODEL=
250
250
  ROLEPLAY_ATTACKER_PROVIDER=
@@ -312,7 +312,7 @@ var init_setup = __esm({
312
312
  this.log("\nNext steps:");
313
313
  this.log(" 1. Copy .env.example to .env and fill in secrets locally or in CI.");
314
314
  this.log(" 2. Smoke test: roleplay run social-engineering-core --target mock --provider mock --judge rules");
315
- this.log(" 3. Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge semantic");
315
+ this.log(" 3. Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge hybrid");
316
316
  this.log(" 4. Upload proof: roleplay upload all --mode sanitized_findings");
317
317
  }
318
318
  };
@@ -1173,7 +1173,7 @@ ROLEPLAY_TARGET_COMMAND=
1173
1173
  # Provider choices: openai, anthropic, google, openai-compatible.
1174
1174
  ROLEPLAY_LLM_PROVIDER=<provider>
1175
1175
  ROLEPLAY_LLM_MODEL=
1176
- ROLEPLAY_JUDGE_MODE=semantic
1176
+ ROLEPLAY_JUDGE_MODE=hybrid
1177
1177
  ROLEPLAY_JUDGE_PROVIDER=<provider>
1178
1178
  ROLEPLAY_JUDGE_MODEL=
1179
1179
  ROLEPLAY_ATTACKER_PROVIDER=
@@ -1219,7 +1219,7 @@ ROLEPLAY_LLM_BASE_URL=
1219
1219
  this.log(" Start a 7-day Builder or Team trial: https://app.roleplay.sh/auth/create-workspace");
1220
1220
  this.log(" Add ROLEPLAY_PROJECT_ID, ROLEPLAY_API_KEY, provider, and judge settings to .env");
1221
1221
  this.log(" Smoke test install: roleplay run social-engineering-core --target mock --provider mock --judge rules");
1222
- this.log(" Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge semantic");
1222
+ this.log(" Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge hybrid");
1223
1223
  }
1224
1224
  };
1225
1225
  }
@@ -4334,7 +4334,7 @@ Usage:
4334
4334
  roleplay setup
4335
4335
  roleplay init
4336
4336
  roleplay run social-engineering-core --target mock --provider mock --judge rules
4337
- roleplay run social-engineering-core --target <url> --provider <provider> --judge semantic --project <projectId>
4337
+ roleplay run social-engineering-core --target <url> --provider <provider> --judge hybrid --project <projectId>
4338
4338
  roleplay report latest|<runId> [--out .roleplay/runs]
4339
4339
  roleplay replay latest|<runId> [--out .roleplay/runs]
4340
4340
  roleplay upload latest|all --project <projectId>
@@ -4344,7 +4344,7 @@ Usage:
4344
4344
 
4345
4345
  Jobs:
4346
4346
  Setup roleplay setup
4347
- Run tests roleplay run social-engineering-core --target <url> --provider <provider> --judge semantic
4347
+ Run tests roleplay run social-engineering-core --target <url> --provider <provider> --judge hybrid
4348
4348
  Review evidence roleplay report latest && roleplay replay latest
4349
4349
  Upload proof roleplay upload all --mode sanitized_findings
4350
4350
  Diagnose roleplay doctor --cloud
@@ -4357,7 +4357,7 @@ Smoke test:
4357
4357
  roleplay run social-engineering-core --target mock --provider mock --judge rules --fail-on critical
4358
4358
 
4359
4359
  Real HTTP target:
4360
- roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge semantic --project <projectId> --api-key <projectApiKey>
4360
+ roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge hybrid --project <projectId> --api-key <projectApiKey>
4361
4361
 
4362
4362
  Real CLI target:
4363
4363
  roleplay run social-engineering-core --target-command "node ./agent.js" --provider <provider> --judge hybrid --project <projectId> --api-key <projectApiKey> --yes
@@ -4375,7 +4375,7 @@ Useful flags:
4375
4375
 
4376
4376
  Usage:
4377
4377
  roleplay doctor
4378
- roleplay doctor --cloud --provider <provider> --judge semantic
4378
+ roleplay doctor --cloud --provider <provider> --judge hybrid
4379
4379
  roleplay doctor --cloud --project <projectId> --api-key <projectApiKey> --json
4380
4380
 
4381
4381
  Checks:
@@ -4388,7 +4388,7 @@ Checks:
4388
4388
 
4389
4389
  Usage:
4390
4390
  roleplay setup
4391
- roleplay setup --project <projectId> --provider <provider> --judge semantic --target http://localhost:3000/agent
4391
+ roleplay setup --project <projectId> --provider <provider> --judge hybrid --target http://localhost:3000/agent
4392
4392
 
4393
4393
  The setup command writes safe placeholders to .env.example and never stores raw API keys by default.`
4394
4394
  };