@roleplay-sh/cli 0.1.7 → 0.1.8
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/.env.example +1 -1
- package/CHANGELOG.md +7 -1
- package/README.md +2 -2
- package/RELEASE.md +4 -4
- package/dist/cli.js +11 -11
- package/dist/cli.js.map +1 -1
- package/package.json +1 -1
package/.env.example
CHANGED
|
@@ -15,7 +15,7 @@ ROLEPLAY_TARGET_COMMAND=
|
|
|
15
15
|
# Provider choices: openai, anthropic, google, openai-compatible.
|
|
16
16
|
ROLEPLAY_LLM_PROVIDER=<provider>
|
|
17
17
|
ROLEPLAY_LLM_MODEL=
|
|
18
|
-
ROLEPLAY_JUDGE_MODE=
|
|
18
|
+
ROLEPLAY_JUDGE_MODE=hybrid
|
|
19
19
|
ROLEPLAY_JUDGE_PROVIDER=<provider>
|
|
20
20
|
ROLEPLAY_JUDGE_MODEL=
|
|
21
21
|
ROLEPLAY_ATTACKER_PROVIDER=
|
package/CHANGELOG.md
CHANGED
|
@@ -4,7 +4,13 @@ All notable changes to roleplay.sh will be documented in this file.
|
|
|
4
4
|
|
|
5
5
|
This project follows semantic versioning after the public `0.1.0` release.
|
|
6
6
|
|
|
7
|
-
## 0.1.
|
|
7
|
+
## 0.1.8 - Unreleased
|
|
8
|
+
|
|
9
|
+
### Changed
|
|
10
|
+
|
|
11
|
+
- Changed `roleplay setup` default judge mode to `hybrid`.
|
|
12
|
+
|
|
13
|
+
## 0.1.7 - 2026-06-14
|
|
8
14
|
|
|
9
15
|
### Added
|
|
10
16
|
|
package/README.md
CHANGED
|
@@ -37,7 +37,7 @@ HTTP target:
|
|
|
37
37
|
roleplay run social-engineering-core \
|
|
38
38
|
--target http://localhost:3000/agent \
|
|
39
39
|
--provider <provider> \
|
|
40
|
-
--judge
|
|
40
|
+
--judge hybrid \
|
|
41
41
|
--project <project-id> \
|
|
42
42
|
--api-key <project-api-key> \
|
|
43
43
|
--fail-on critical
|
|
@@ -59,7 +59,7 @@ roleplay run social-engineering-core \
|
|
|
59
59
|
## Judge Choices
|
|
60
60
|
|
|
61
61
|
- `--judge rules`: deterministic local rule judge. Best for smoke tests and offline checks.
|
|
62
|
-
- `--judge semantic`: provider-backed security judge
|
|
62
|
+
- `--judge semantic`: provider-backed security judge for real agent tests.
|
|
63
63
|
- `--judge hybrid`: semantic judge plus deterministic guardrails. Recommended for CI once your provider is configured.
|
|
64
64
|
|
|
65
65
|
Rules-only judging can be used against real targets only with `--allow-rules-only`, so it is never mistaken for full semantic evaluation.
|
package/RELEASE.md
CHANGED
|
@@ -29,8 +29,8 @@ The publish workflow uses GitHub OIDC and intentionally does not require an npm
|
|
|
29
29
|
Create a GitHub release or push a version tag:
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
|
-
git tag v0.1.7
|
|
33
|
-
git push origin v0.1.7
|
|
32
|
+
git tag v0.1.8
|
|
33
|
+
git push origin v0.1.8
|
|
34
34
|
```
|
|
35
35
|
|
|
36
36
|
The publish workflow runs checks and then publishes with:
|
|
@@ -57,10 +57,10 @@ For real provider-backed verification:
|
|
|
57
57
|
export ROLEPLAY_PROJECT_ID=<project-id>
|
|
58
58
|
export ROLEPLAY_API_KEY=<project-api-key>
|
|
59
59
|
export ROLEPLAY_LLM_PROVIDER=<provider>
|
|
60
|
-
export ROLEPLAY_JUDGE_MODE=
|
|
60
|
+
export ROLEPLAY_JUDGE_MODE=hybrid
|
|
61
61
|
export ROLEPLAY_JUDGE_PROVIDER=<provider>
|
|
62
62
|
export ROLEPLAY_<PROVIDER>_API_KEY=<provider-key>
|
|
63
|
-
roleplay run social-engineering-core --target http://localhost:3000/agent --provider <provider> --judge
|
|
63
|
+
roleplay run social-engineering-core --target http://localhost:3000/agent --provider <provider> --judge hybrid --max-turns 1 --fail-on critical
|
|
64
64
|
```
|
|
65
65
|
|
|
66
66
|
For workbench upload verification, start a Builder or Team trial, create a project API key at `https://app.roleplay.sh`, and run:
|
package/dist/cli.js
CHANGED
|
@@ -199,7 +199,7 @@ function fromFlags(flags) {
|
|
|
199
199
|
cloudUrl: flags["cloud-url"],
|
|
200
200
|
project: flags.project ?? process.env.ROLEPLAY_PROJECT_ID ?? "",
|
|
201
201
|
provider: flags.provider ?? process.env.ROLEPLAY_LLM_PROVIDER ?? "",
|
|
202
|
-
judge: flags.judge ?? process.env.ROLEPLAY_JUDGE_MODE ?? "
|
|
202
|
+
judge: flags.judge ?? process.env.ROLEPLAY_JUDGE_MODE ?? "hybrid",
|
|
203
203
|
judgeProvider: flags["judge-provider"] ?? process.env.ROLEPLAY_JUDGE_PROVIDER ?? flags.provider ?? process.env.ROLEPLAY_LLM_PROVIDER ?? "",
|
|
204
204
|
target: flags.target ?? process.env.ROLEPLAY_TARGET_URL ?? "",
|
|
205
205
|
targetCommand: flags["target-command"] ?? process.env.ROLEPLAY_TARGET_COMMAND ?? ""
|
|
@@ -211,7 +211,7 @@ async function promptForSetup(defaults) {
|
|
|
211
211
|
const cloudUrl = await ask(rl, "Workbench URL", defaults.cloudUrl);
|
|
212
212
|
const project = await ask(rl, "Project ID", defaults.project);
|
|
213
213
|
const provider = await ask(rl, "Attacker provider (openai, anthropic, google, openai-compatible)", defaults.provider);
|
|
214
|
-
const judge = await ask(rl, "Judge mode (rules, semantic, hybrid)", defaults.judge || "
|
|
214
|
+
const judge = await ask(rl, "Judge mode (rules, semantic, hybrid)", defaults.judge || "hybrid");
|
|
215
215
|
const judgeProvider = await ask(rl, "Judge provider for semantic/hybrid mode", defaults.judgeProvider || provider);
|
|
216
216
|
const target = await ask(rl, "HTTP target URL (leave blank if using a CLI target)", defaults.target);
|
|
217
217
|
const targetCommand = target ? "" : await ask(rl, "CLI target command (optional)", defaults.targetCommand);
|
|
@@ -244,7 +244,7 @@ ROLEPLAY_TARGET_COMMAND=${input2.targetCommand}
|
|
|
244
244
|
# Provider choices: openai, anthropic, google, openai-compatible.
|
|
245
245
|
ROLEPLAY_LLM_PROVIDER=${input2.provider || "<provider>"}
|
|
246
246
|
ROLEPLAY_LLM_MODEL=
|
|
247
|
-
ROLEPLAY_JUDGE_MODE=${input2.judge || "
|
|
247
|
+
ROLEPLAY_JUDGE_MODE=${input2.judge || "hybrid"}
|
|
248
248
|
ROLEPLAY_JUDGE_PROVIDER=${input2.judgeProvider || "<provider>"}
|
|
249
249
|
ROLEPLAY_JUDGE_MODEL=
|
|
250
250
|
ROLEPLAY_ATTACKER_PROVIDER=
|
|
@@ -312,7 +312,7 @@ var init_setup = __esm({
|
|
|
312
312
|
this.log("\nNext steps:");
|
|
313
313
|
this.log(" 1. Copy .env.example to .env and fill in secrets locally or in CI.");
|
|
314
314
|
this.log(" 2. Smoke test: roleplay run social-engineering-core --target mock --provider mock --judge rules");
|
|
315
|
-
this.log(" 3. Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge
|
|
315
|
+
this.log(" 3. Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge hybrid");
|
|
316
316
|
this.log(" 4. Upload proof: roleplay upload all --mode sanitized_findings");
|
|
317
317
|
}
|
|
318
318
|
};
|
|
@@ -1173,7 +1173,7 @@ ROLEPLAY_TARGET_COMMAND=
|
|
|
1173
1173
|
# Provider choices: openai, anthropic, google, openai-compatible.
|
|
1174
1174
|
ROLEPLAY_LLM_PROVIDER=<provider>
|
|
1175
1175
|
ROLEPLAY_LLM_MODEL=
|
|
1176
|
-
ROLEPLAY_JUDGE_MODE=
|
|
1176
|
+
ROLEPLAY_JUDGE_MODE=hybrid
|
|
1177
1177
|
ROLEPLAY_JUDGE_PROVIDER=<provider>
|
|
1178
1178
|
ROLEPLAY_JUDGE_MODEL=
|
|
1179
1179
|
ROLEPLAY_ATTACKER_PROVIDER=
|
|
@@ -1219,7 +1219,7 @@ ROLEPLAY_LLM_BASE_URL=
|
|
|
1219
1219
|
this.log(" Start a 7-day Builder or Team trial: https://app.roleplay.sh/auth/create-workspace");
|
|
1220
1220
|
this.log(" Add ROLEPLAY_PROJECT_ID, ROLEPLAY_API_KEY, provider, and judge settings to .env");
|
|
1221
1221
|
this.log(" Smoke test install: roleplay run social-engineering-core --target mock --provider mock --judge rules");
|
|
1222
|
-
this.log(" Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge
|
|
1222
|
+
this.log(" Real test: roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge hybrid");
|
|
1223
1223
|
}
|
|
1224
1224
|
};
|
|
1225
1225
|
}
|
|
@@ -4334,7 +4334,7 @@ Usage:
|
|
|
4334
4334
|
roleplay setup
|
|
4335
4335
|
roleplay init
|
|
4336
4336
|
roleplay run social-engineering-core --target mock --provider mock --judge rules
|
|
4337
|
-
roleplay run social-engineering-core --target <url> --provider <provider> --judge
|
|
4337
|
+
roleplay run social-engineering-core --target <url> --provider <provider> --judge hybrid --project <projectId>
|
|
4338
4338
|
roleplay report latest|<runId> [--out .roleplay/runs]
|
|
4339
4339
|
roleplay replay latest|<runId> [--out .roleplay/runs]
|
|
4340
4340
|
roleplay upload latest|all --project <projectId>
|
|
@@ -4344,7 +4344,7 @@ Usage:
|
|
|
4344
4344
|
|
|
4345
4345
|
Jobs:
|
|
4346
4346
|
Setup roleplay setup
|
|
4347
|
-
Run tests roleplay run social-engineering-core --target <url> --provider <provider> --judge
|
|
4347
|
+
Run tests roleplay run social-engineering-core --target <url> --provider <provider> --judge hybrid
|
|
4348
4348
|
Review evidence roleplay report latest && roleplay replay latest
|
|
4349
4349
|
Upload proof roleplay upload all --mode sanitized_findings
|
|
4350
4350
|
Diagnose roleplay doctor --cloud
|
|
@@ -4357,7 +4357,7 @@ Smoke test:
|
|
|
4357
4357
|
roleplay run social-engineering-core --target mock --provider mock --judge rules --fail-on critical
|
|
4358
4358
|
|
|
4359
4359
|
Real HTTP target:
|
|
4360
|
-
roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge
|
|
4360
|
+
roleplay run social-engineering-core --target <agent-url> --provider <provider> --judge hybrid --project <projectId> --api-key <projectApiKey>
|
|
4361
4361
|
|
|
4362
4362
|
Real CLI target:
|
|
4363
4363
|
roleplay run social-engineering-core --target-command "node ./agent.js" --provider <provider> --judge hybrid --project <projectId> --api-key <projectApiKey> --yes
|
|
@@ -4375,7 +4375,7 @@ Useful flags:
|
|
|
4375
4375
|
|
|
4376
4376
|
Usage:
|
|
4377
4377
|
roleplay doctor
|
|
4378
|
-
roleplay doctor --cloud --provider <provider> --judge
|
|
4378
|
+
roleplay doctor --cloud --provider <provider> --judge hybrid
|
|
4379
4379
|
roleplay doctor --cloud --project <projectId> --api-key <projectApiKey> --json
|
|
4380
4380
|
|
|
4381
4381
|
Checks:
|
|
@@ -4388,7 +4388,7 @@ Checks:
|
|
|
4388
4388
|
|
|
4389
4389
|
Usage:
|
|
4390
4390
|
roleplay setup
|
|
4391
|
-
roleplay setup --project <projectId> --provider <provider> --judge
|
|
4391
|
+
roleplay setup --project <projectId> --provider <provider> --judge hybrid --target http://localhost:3000/agent
|
|
4392
4392
|
|
|
4393
4393
|
The setup command writes safe placeholders to .env.example and never stores raw API keys by default.`
|
|
4394
4394
|
};
|