@candor.sh/cli 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.cjs +414 -5
  2. package/package.json +1 -1
package/dist/index.cjs CHANGED
@@ -4484,7 +4484,7 @@ var import_os = require("os");
4484
4484
  var CANDOR_DIR = (0, import_path.join)((0, import_os.homedir)(), ".candor");
4485
4485
  var CONFIG_PATH = (0, import_path.join)(CANDOR_DIR, "config.json");
4486
4486
  var DEFAULT_CONFIG = {
4487
- apiUrl: "https://api.candor.sh"
4487
+ apiUrl: "https://candor.sh"
4488
4488
  };
4489
4489
  function ensureCandorDir() {
4490
4490
  if (!(0, import_fs.existsSync)(CANDOR_DIR)) {
@@ -4513,7 +4513,7 @@ function getConfigPath() {
4513
4513
  }
4514
4514
 
4515
4515
  // src/skill-content.ts
4516
- var SKILL_CONTENT = '# Candor \u2014 AI-Moderated User Studies\n\nCandor lets you create and manage AI-moderated user studies directly from the terminal. An AI voice agent interviews real participants while they use your product, then delivers prioritized findings.\n\n## Available Commands\n\nRun these via the Bash tool. Always use `--json` for machine-readable output.\n\n- `candor study create --url <url> --goal "<goal>" [--participants N] --json` \u2014 Create a study\n- `candor study list --json` \u2014 List all studies (or `candor study --json`)\n- `candor study status <study-id> --json` \u2014 Check participant progress\n- `candor study findings <study-id> --json` \u2014 Get prioritized feedback (P0\u2013P3)\n- `candor study approve <study-id> --json` \u2014 Approve recruitment (human-in-the-loop gate)\n\n## Workflow\n\n### Creating a Study\n\n1. **Infer context** \u2014 If the user doesn\'t provide a URL, check:\n - `package.json` `homepage` field\n - README for a deployed URL\n - Ask the user if nothing is found\n2. **Run `candor study create`** with the product URL and goal\n3. **Show the study summary** including:\n - Drafted study script (sections, questions, tasks)\n - Participant count and demographic filters\n - Estimated cost per session\n4. **Wait for explicit approval** \u2014 Do NOT run `candor study approve` without the user saying "yes", "approve", "go ahead", or similar\n5. **On approval**, run `candor study approve <study-id>` to begin recruiting\n\n### Checking Results\n\n- Use `candor study status <id> --json` to show completion progress\n- Use `candor study findings <id> --json` to show prioritized findings\n- Offer to create GitHub issues for P0/P1 items if the user wants\n\n## Important Rules\n\n- NEVER approve a study without explicit human confirmation\n- ALWAYS show cost estimate before approval\n- Studies are billed per participant session (~$14.50/session)\n- If Candor is not initialized, tell the user to run `candor init` first\n';
4516
+ var SKILL_CONTENT = '# Candor \u2014 AI-Moderated User Studies & Human Evaluation\n\nCandor lets you create and manage AI-moderated user studies and human evaluation jobs directly from the terminal. Use `candor study` for product research (AI voice interviews with real users) and `candor eval` for quick human labeling, ranking, and evaluation tasks via Mechanical Turk.\n\n## Available Commands\n\nRun these via the Bash tool. Always use `--json` for machine-readable output.\n\n### Studies (product research)\n\n- `candor study create --url <url> --goal "<goal>" [--participants N] --json` \u2014 Create a study\n- `candor study list --json` \u2014 List all studies (or `candor study --json`)\n- `candor study status <study-id> --json` \u2014 Check participant progress\n- `candor study findings <study-id> --json` \u2014 Get prioritized feedback (P0\u2013P3)\n- `candor study approve <study-id> --json` \u2014 Approve recruitment (human-in-the-loop gate)\n\n### Evals (quick human evaluation / labeling)\n\n- `candor eval create --goal "<goal>" --items "<item1,item2,...>" [--type pairwise_comparison] [--workers N] [--reward <cents>] --json` \u2014 Create an eval\n- `candor eval list --json` \u2014 List all evals\n- `candor eval approve <eval-id> --json` \u2014 Launch eval on MTurk\n- `candor eval status <eval-id> --json` \u2014 Check eval progress\n- `candor eval results <eval-id> --json` \u2014 View results and rankings\n\n## Workflow\n\n### Creating a Study\n\n1. **Infer context** \u2014 If the user doesn\'t provide a URL, check:\n - `package.json` `homepage` field\n - README for a deployed URL\n - Ask the user if nothing is found\n2. **Run `candor study create`** with the product URL and goal\n3. **Show the study summary** including:\n - Drafted study script (sections, questions, tasks)\n - Participant count and demographic filters\n - Estimated cost per session\n4. 
**Wait for explicit approval** \u2014 Do NOT run `candor study approve` without the user saying "yes", "approve", "go ahead", or similar\n5. **On approval**, run `candor study approve <study-id>` to begin recruiting\n\n### Creating an Eval\n\nEvals are for quick human evaluation tasks like labeling, ranking, rating, or comparing items. They run on Mechanical Turk and return results in minutes.\n\n**When to use eval vs study:**\n- Use `candor eval` when the user wants to label, rank, rate, compare, or categorize items (audio files, images, text, etc.)\n- Use `candor study` when the user wants qualitative product feedback from real users\n\n**Choosing the right task type:**\n- `pairwise_comparison` \u2014 Best for ranking. Workers compare pairs and pick a winner. Produces a stack ranking with win rates. Use when the user says "rank", "compare", "which is better", "stack rank".\n- `categorical_label` \u2014 Workers assign a label from a set of categories. Use when the user says "label", "categorize", "classify", "tag".\n- `rating_scale` \u2014 Workers rate items on a numeric scale. Use when the user says "rate", "score", "quality score".\n- `free_text` \u2014 Workers provide open-ended text feedback.\n\n**Workflow:**\n1. **Identify the items** \u2014 Look for files the user mentions (audio, images, etc.). Extract labels from filenames.\n2. **Infer the task type** \u2014 Based on the user\'s goal, pick the most appropriate type. If vague (e.g. "evaluate these"), default to `pairwise_comparison` for ranking or `rating_scale` for quality assessment.\n3. **Design the eval** \u2014 The API auto-generates the experiment design (randomized pairs, worker counts, etc.) from the goal and items. If the user is prescriptive about methodology, pass their preferences.\n4. **Create the eval** \u2014 Run `candor eval create` with goal and items.\n5. **Show the plan** \u2014 Display task type, item count, pair count, estimated cost and time.\n6. 
**Wait for approval** \u2014 Do NOT launch without explicit confirmation.\n7. **Launch** \u2014 Run `candor eval approve <id>` to create HITs on MTurk.\n8. **Poll and display results** \u2014 Use `candor eval status <id>` and `candor eval results <id>` to show progress and rankings.\n\n**Example natural language \u2192 eval mapping:**\n\n| User says | Task type | Items |\n|-----------|-----------|-------|\n| "rank these audio samples by TTS quality" | pairwise_comparison | Audio file names |\n| "label these images as cat or dog" | categorical_label | Image file names |\n| "rate these UI mockups on a 1-5 scale" | rating_scale | Mockup file names |\n| "which of these headlines is more engaging" | pairwise_comparison | Headline text |\n| "evaluate these samples" (vague) | pairwise_comparison | File names |\n\n### Checking Results\n\n- Use `candor study status <id> --json` to show completion progress\n- Use `candor study findings <id> --json` to show prioritized findings\n- Use `candor eval status <id> --json` to show eval progress\n- Use `candor eval results <id> --json` to show rankings and agreement metrics\n- Offer to create GitHub issues for P0/P1 items if the user wants\n\n## Important Rules\n\n- NEVER approve a study or launch an eval without explicit human confirmation\n- ALWAYS show cost estimate before launching\n- Studies are billed per participant session (~$14.50/session)\n- Evals are billed per task assignment (~$0.05-0.10/task + 20-40% MTurk fee)\n- If Candor is not initialized, tell the user to run `candor init` first\n';
4517
4517
 
4518
4518
  // src/commands/init.ts
4519
4519
  var CLAUDE_SKILLS_DIR = (0, import_path2.join)((0, import_os2.homedir)(), ".claude", "skills", "candor");
@@ -4970,28 +4970,437 @@ async function updateCommand() {
4970
4970
  }
4971
4971
  }
4972
4972
  function getCurrentVersion() {
4973
+ if ("0.3.0") {
4974
+ return "0.3.0";
4975
+ }
4973
4976
  try {
4974
4977
  const pkgPath = (0, import_path4.join)((0, import_path4.dirname)(new URL(import_meta2.url).pathname), "..", "package.json");
4975
4978
  const pkg = JSON.parse((0, import_fs4.readFileSync)(pkgPath, "utf-8"));
4976
4979
  return pkg.version;
4977
4980
  } catch {
4978
- return "0.1.0";
4981
+ return "unknown";
4982
+ }
4983
+ }
4984
+
4985
+ // src/commands/eval-create.ts
4986
/**
 * Handler for `candor eval create`.
 *
 * Builds a request body from the CLI flags, POSTs it to `/api/evals`, and prints
 * either the raw response (with `--json`) or a formatted summary.
 *
 * All validation failures are thrown as errors and reported by the single catch
 * block, so `--json` callers always receive a JSON `{ error }` object instead of
 * coloured text.
 *
 * @param {object} options - Parsed CLI flags.
 * @param {string} options.goal - Evaluation goal, forwarded as `goal`.
 * @param {string} options.items - Comma-separated item labels; at least two are required.
 * @param {string} [options.type] - Task type, forwarded as `taskType`.
 * @param {string} [options.labels] - Comma-separated labels, forwarded as `design.labels`.
 * @param {string} [options.workers] - Workers per task (integer >= 1).
 * @param {string} [options.reward] - Reward per task in cents (integer >= 0).
 * @param {string} [options.batchSize] - Tasks per assignment (integer >= 1).
 * @param {boolean} [options.json] - Print JSON instead of formatted text.
 * @returns {Promise<void>} Resolves once output is printed; on failure the error is
 *   printed and the process exits with code 1.
 */
async function evalCreateCommand(options) {
  const chalk2 = (await Promise.resolve().then(() => (init_source(), source_exports))).default;

  // Splits a comma-separated flag value into trimmed, non-empty entries.
  const splitList = (raw) => raw.split(",").map((s) => s.trim()).filter(Boolean);

  // Parses a numeric flag strictly: parseInt would accept "5abc" and turn "abc"
  // into NaN, which JSON.stringify then sends to the API as null.
  const parseIntFlag = (flag, raw, min) => {
    const value = Number(raw);
    if (!Number.isInteger(value) || value < min) {
      throw new Error(`${flag} must be an integer >= ${min} (got "${raw}")`);
    }
    return value;
  };

  try {
    const itemLabels = splitList(options.items);
    if (itemLabels.length < 2) {
      throw new Error("At least 2 items are required (comma-separated)");
    }
    const body = {
      goal: options.goal,
      items: itemLabels.map((label) => ({ label }))
    };
    if (options.type) body.taskType = options.type;
    if (options.labels) {
      body.design = { labels: splitList(options.labels) };
    }
    if (options.workers) body.workersPerTask = parseIntFlag("--workers", options.workers, 1);
    if (options.reward) body.rewardCentsPerTask = parseIntFlag("--reward", options.reward, 0);
    if (options.batchSize) body.batchSize = parseIntFlag("--batch-size", options.batchSize, 1);

    const data = await apiRequest("/api/evals", { method: "POST", body });
    if (options.json) {
      console.log(JSON.stringify(data, null, 2));
      return;
    }
    const result = data;
    console.log();
    console.log(chalk2.green(` + Eval created: ${result.eval.id}`));
    console.log(chalk2.bold(` ${result.eval.name}`));
    console.log(
      chalk2.dim(
        ` Type: ${result.eval.taskType} | Workers/task: ${result.eval.workersPerTask}`
      )
    );
    console.log(
      chalk2.dim(` Items: ${result.items.length} | Tasks: ${result.estimate.totalTasks}`)
    );
    console.log(
      chalk2.dim(
        ` Est. cost: $${(result.estimate.estimatedCostCents / 100).toFixed(2)} | Est. time: ~${result.estimate.estimatedMinutes} min`
      )
    );
    console.log(chalk2.dim(` Status: ${result.eval.status}`));
    console.log();
    console.log(
      chalk2.dim(` To approve: candor eval approve ${result.eval.id}`)
    );
    console.log();
  } catch (err) {
    if (options.json) {
      console.log(
        JSON.stringify({ error: err instanceof Error ? err.message : String(err) })
      );
      process.exit(1);
    }
    console.log(chalk2.red(`Error: ${err instanceof Error ? err.message : err}`));
    process.exit(1);
  }
}
5045
+
5046
+ // src/commands/eval-launch.ts
5047
/**
 * Handler for `candor eval approve <id>`.
 *
 * POSTs to `/api/evals/<id>/launch` and prints either the raw response (with
 * `--json`) or a short confirmation with follow-up commands.
 *
 * @param {string} id - Identifier of the eval to launch.
 * @param {object} options - Parsed CLI flags.
 * @param {boolean} [options.json] - Print JSON instead of formatted text.
 * @returns {Promise<void>} Resolves once output is printed; on failure the error is
 *   printed and the process exits with code 1.
 */
async function evalLaunchCommand(id, options) {
  const chalk2 = (await Promise.resolve().then(() => (init_source(), source_exports))).default;
  try {
    const launched = await apiRequest(`/api/evals/${id}/launch`, { method: "POST" });

    if (options.json) {
      console.log(JSON.stringify(launched, null, 2));
      return;
    }

    const { dim } = { dim: (text) => chalk2.dim(text) };
    console.log();
    console.log(chalk2.green(` \u26A1 Eval launched!`));
    console.log(dim(` ${launched.totalTasks} tasks being sent to workers`));
    console.log(dim(` Results will start streaming in within minutes.`));
    console.log();
    console.log(dim(` Check progress: candor eval status ${id}`));
    console.log(dim(` View results: candor eval results ${id}`));
    console.log();
  } catch (err) {
    const isError = err instanceof Error;
    if (options.json) {
      const payload = { error: isError ? err.message : String(err) };
      console.log(JSON.stringify(payload));
      process.exit(1);
    }
    console.log(chalk2.red(`Error: ${isError ? err.message : err}`));
    process.exit(1);
  }
}
5075
+
5076
+ // src/commands/eval-status.ts
5077
/**
 * Formats a timestamp as a locale time-of-day string with two-digit hour,
 * minute, and second fields.
 *
 * @param {string} iso - Value passed to the `Date` constructor.
 * @returns {string} The time formatted in the runtime's default locale.
 */
function formatTime(iso) {
  const timeFields = { hour: "2-digit", minute: "2-digit", second: "2-digit" };
  return new Date(iso).toLocaleTimeString([], timeFields);
}
5081
/**
 * Builds the multi-line status view for an eval: header, progress bar,
 * response counts, optional activity log, and a trailing hint.
 *
 * Differences from the previous version:
 *  - The percentage is clamped to 0..100. More responses than
 *    `totalTasks * workersPerTask` used to produce a negative count for
 *    `String.prototype.repeat`, which throws a RangeError.
 *  - A missing `result.activity` is treated as an empty list instead of
 *    throwing when `opts.showActivity` is true.
 *
 * @param {object} result - Status payload; reads `result.eval.{name,taskType,status}`,
 *   `result.progress.{totalTasks,workersPerTask,totalResponses,uniqueWorkers}` and
 *   the optional `result.activity` list of `{ at, message }` entries.
 * @param {string} id - Eval identifier used in the follow-up command hints.
 * @param {object} chalk2 - Colour helper providing `bold`, `dim`, `green`, `white`.
 * @param {object} [opts]
 * @param {boolean} [opts.showActivity] - Include the activity section.
 * @param {number} [opts.seenActivityCount] - Entries at or beyond this index are
 *   highlighted as new.
 * @returns {string} The rendered view, lines joined with "\n".
 */
function renderStatus(result, id, chalk2, opts = {}) {
  const p = result.progress;
  const expectedResponses = p.totalTasks * p.workersPerTask;
  const rawPct = expectedResponses > 0 ? Math.round(p.totalResponses / expectedResponses * 100) : 0;
  // Clamp so the bar arithmetic below can never go negative or overflow the bar.
  const responsePct = Number.isFinite(rawPct) ? Math.min(100, Math.max(0, rawPct)) : 0;
  const barLen = 20;
  const filled = Math.round(responsePct / 100 * barLen);
  const bar = "\u2588".repeat(filled) + "\u2591".repeat(barLen - filled);
  const lines = [];
  lines.push("");
  lines.push(chalk2.bold(` ${result.eval.name}`));
  lines.push(
    chalk2.dim(` ${result.eval.taskType} | ${result.eval.status}`)
  );
  lines.push("");
  lines.push(` [${bar}] ${responsePct}%`);
  lines.push(
    chalk2.dim(
      ` ${p.totalResponses}/${expectedResponses} responses (${p.uniqueWorkers}/${p.workersPerTask} workers \xD7 ${p.totalTasks} tasks)`
    )
  );
  // The payload may omit `activity`; callers already read it with optional chaining.
  const entries = result.activity ?? [];
  if (opts.showActivity && entries.length > 0) {
    lines.push("");
    lines.push(chalk2.bold(" Activity"));
    const seenCount = opts.seenActivityCount ?? 0;
    for (let i = 0; i < entries.length; i++) {
      const a = entries[i];
      const time = chalk2.dim(formatTime(a.at));
      const isNew = i >= seenCount;
      const msg = isNew ? chalk2.green(a.message) : chalk2.white(a.message);
      lines.push(` ${time} ${msg}`);
    }
  }
  lines.push("");
  if (result.eval.status === "completed") {
    lines.push(chalk2.green(" \u2713 Eval complete!"));
    lines.push(chalk2.dim(` View results: candor eval results ${id}`));
    lines.push("");
  } else if (!opts.showActivity && result.eval.status === "active") {
    lines.push(chalk2.dim(` Preview results: candor eval results ${id}`));
    lines.push("");
  }
  return lines.join("\n");
}
5125
/**
 * Handler for `candor eval status <id>`.
 *
 * Without `--live` it fetches `/api/evals/<id>` once and prints it (JSON or the
 * rendered view). With `--live` it re-fetches every 5 seconds until the eval
 * reports `completed` or the user presses Ctrl+C.
 *
 * Differences from the previous version:
 *  - A failure during live polling is reported through the normal error path
 *    (message plus exit code 1) instead of ending the command silently.
 *  - Polls are chained with `setTimeout`, so a slow request can no longer
 *    overlap with the next tick.
 *  - The SIGINT listener is removed once polling stops.
 *
 * @param {string} id - Identifier of the eval to inspect.
 * @param {object} options - Parsed CLI flags.
 * @param {boolean} [options.json] - Print JSON; in live mode a line is printed only
 *   when the response count changes.
 * @param {boolean} [options.live] - Keep polling until the eval completes.
 * @returns {Promise<void>} Resolves when output is finished; on failure the error
 *   is printed and the process exits with code 1.
 */
async function evalStatusCommand(id, options) {
  const chalk2 = (await Promise.resolve().then(() => (init_source(), source_exports))).default;
  const POLL_INTERVAL_MS = 5e3;
  const fetchStatus = async () => {
    const data = await apiRequest(`/api/evals/${id}`);
    return data;
  };
  try {
    if (!options.live) {
      const result = await fetchStatus();
      if (options.json) {
        console.log(JSON.stringify(result, null, 2));
        return;
      }
      console.log(renderStatus(result, id, chalk2, { showActivity: result.activity?.length > 0 }));
      return;
    }

    let lastResponseCount = -1;
    let seenActivityCount = 0;

    // Fetches and prints one snapshot; resolves to true once the eval is completed.
    const poll = async () => {
      const result = await fetchStatus();
      if (options.json) {
        if (result.progress.totalResponses !== lastResponseCount) {
          console.log(JSON.stringify(result));
          lastResponseCount = result.progress.totalResponses;
        }
      } else {
        // Move the cursor home and clear the screen before redrawing.
        process.stdout.write("\x1B[H\x1B[J");
        const prevSeen = seenActivityCount;
        seenActivityCount = result.activity?.length ?? 0;
        console.log(
          renderStatus(result, id, chalk2, {
            showActivity: true,
            seenActivityCount: prevSeen
          })
        );
        lastResponseCount = result.progress.totalResponses;
        if (result.eval.status !== "completed") {
          console.log(chalk2.dim(" Polling every 5s... Ctrl+C to stop"));
        }
      }
      return result.eval.status === "completed";
    };

    if (await poll()) return;

    await new Promise((resolve, reject) => {
      let timer = null;
      let stopped = false;

      const onSigint = () => {
        stop();
        console.log("");
        resolve();
      };
      const stop = () => {
        stopped = true;
        clearTimeout(timer);
        process.removeListener("SIGINT", onSigint);
      };
      const tick = async () => {
        try {
          const finished = await poll();
          // Ctrl+C may have arrived while the request was in flight.
          if (stopped) return;
          if (finished) {
            stop();
            resolve();
            return;
          }
          timer = setTimeout(tick, POLL_INTERVAL_MS);
        } catch (err) {
          if (stopped) return;
          stop();
          reject(err);
        }
      };

      process.on("SIGINT", onSigint);
      timer = setTimeout(tick, POLL_INTERVAL_MS);
    });
  } catch (err) {
    if (options.json) {
      console.log(
        JSON.stringify({
          error: err instanceof Error ? err.message : String(err)
        })
      );
      process.exit(1);
    }
    console.log(
      chalk2.red(`Error: ${err instanceof Error ? err.message : err}`)
    );
    process.exit(1);
  }
}
5204
+
5205
+ // src/commands/eval-results.ts
5206
/**
 * Handler for `candor eval results <id>`.
 *
 * Fetches `/api/evals/<id>/results` and prints the raw response (with `--json`)
 * or a formatted view: rankings plus agreement metrics when the results contain
 * `rankings`, or per-item labels when they contain `items`.
 *
 * Differences from the previous version:
 *  - The ranking bar is clamped, so a win rate outside 0..1 no longer makes
 *    `String.prototype.repeat` throw.
 *  - A non-numeric agreement alpha is shown as "n/a" instead of crashing.
 *  - Results with neither `rankings` nor `items` print a pointer to `--json`
 *    instead of only blank lines.
 *
 * @param {string} id - Identifier of the eval whose results are shown.
 * @param {object} options - Parsed CLI flags.
 * @param {boolean} [options.json] - Print JSON instead of formatted text.
 * @returns {Promise<void>} Resolves once output is printed; on failure the error is
 *   printed and the process exits with code 1.
 */
async function evalResultsCommand(id, options) {
  const chalk2 = (await Promise.resolve().then(() => (init_source(), source_exports))).default;

  // Draws a fixed-width bar for a 0..1 fraction, clamping out-of-range input.
  const fractionBar = (fraction, width) => {
    const filled = Math.min(width, Math.max(0, Math.round(fraction * width) || 0));
    return "\u2588".repeat(filled) + "\u2591".repeat(width - filled);
  };

  // Prints the ranking table and agreement metrics of a pairwise result.
  const printPairwise = (pairwise) => {
    console.log(chalk2.bold(" Rankings"));
    console.log(chalk2.dim(" \u2500".repeat(30)));
    for (const r of pairwise.rankings) {
      const winPct = (r.winRate * 100).toFixed(1);
      const bar = fractionBar(r.winRate, 15);
      console.log(
        ` ${chalk2.bold(`#${r.rank}`)} ${r.label.padEnd(25)} [${bar}] ${winPct}% (${r.totalWins}/${r.totalComparisons} wins)`
      );
    }
    console.log();
    console.log(chalk2.bold(" Agreement Metrics"));
    console.log(chalk2.dim(" \u2500".repeat(30)));
    const agr = pairwise.agreement;
    console.log(
      ` Pairwise agreement: ${(agr.pairwiseAgreementRate * 100).toFixed(1)}%`
    );
    // `krippendorphAlpha` is the field name used by the response payload.
    const alpha = agr.krippendorphAlpha;
    if (typeof alpha === "number") {
      console.log(` Krippendorff's \u03B1: ${alpha.toFixed(3)}`);
      if (alpha >= 0.8) {
        console.log(chalk2.green(" \u2192 Strong agreement"));
      } else if (alpha >= 0.667) {
        console.log(chalk2.yellow(" \u2192 Moderate agreement"));
      } else {
        console.log(chalk2.red(" \u2192 Low agreement \u2014 consider more workers per task"));
      }
    } else {
      console.log(` Krippendorff's \u03B1: n/a`);
    }
    if (agr.disagreedPairs?.length > 0) {
      console.log();
      console.log(
        chalk2.dim(
          ` ${agr.disagreedPairs.length} pairs with disagreement`
        )
      );
    }
    console.log();
    console.log(
      chalk2.dim(
        ` ${pairwise.totalResponses} responses across ${pairwise.totalPairs} pairs`
      )
    );
  };

  // Prints the assigned label, confidence, and vote distribution of each item.
  const printCategorical = (categorical) => {
    console.log(chalk2.bold(" Labels"));
    console.log(chalk2.dim(" \u2500".repeat(30)));
    for (const item of categorical.items) {
      const conf = (item.confidence * 100).toFixed(0);
      const distStr = Object.entries(item.labelDistribution).map(([k, v]) => `${k}:${v}`).join(" ");
      console.log(
        ` ${item.label.padEnd(25)} \u2192 ${chalk2.bold(item.assignedLabel)} (${conf}% confidence) [${distStr}]`
      );
    }
  };

  try {
    const data = await apiRequest(`/api/evals/${id}/results`);
    if (options.json) {
      console.log(JSON.stringify(data, null, 2));
      return;
    }
    const result = data;
    if (!result.results) {
      console.log();
      console.log(chalk2.yellow(` No results yet.`));
      if (result.message) console.log(chalk2.dim(` ${result.message}`));
      console.log();
      return;
    }
    console.log();
    if ("rankings" in result.results) {
      printPairwise(result.results);
    } else if ("items" in result.results) {
      printCategorical(result.results);
    } else {
      // No formatter matches this result shape; do not leave the user with blank output.
      console.log(
        chalk2.dim(` No formatted view for this result type \u2014 rerun with --json to see the raw results`)
      );
    }
    console.log();
    if (result.progress) {
      const p = result.progress;
      if (result.status === "completed") {
        console.log(
          chalk2.dim(
            ` ${p.totalResponses} responses from ${p.uniqueWorkers || "?"} workers`
          )
        );
      } else {
        const workers = p.uniqueWorkers != null ? `${p.uniqueWorkers}/${p.workersPerTask || "?"}` : "?";
        console.log(
          chalk2.dim(
            ` ${p.totalResponses} responses (${workers} workers) \u2014 results are partial`
          )
        );
      }
      console.log();
    }
  } catch (err) {
    if (options.json) {
      console.log(
        JSON.stringify({ error: err instanceof Error ? err.message : String(err) })
      );
      process.exit(1);
    }
    console.log(chalk2.red(`Error: ${err instanceof Error ? err.message : err}`));
    process.exit(1);
  }
}
5308
+
5309
+ // src/commands/eval-list.ts
5310
/**
 * Handler for `candor eval` and `candor eval list`.
 *
 * Fetches `/api/evals` and prints the raw response (with `--json`) or one
 * two-line entry per eval; an empty list prints a creation hint instead.
 *
 * @param {object} options - Parsed CLI flags.
 * @param {boolean} [options.json] - Print JSON instead of formatted text.
 * @returns {Promise<void>} Resolves once output is printed; on failure the error is
 *   printed and the process exits with code 1.
 */
async function evalListCommand(options) {
  const chalk2 = (await Promise.resolve().then(() => (init_source(), source_exports))).default;
  try {
    const listing = await apiRequest("/api/evals");

    if (options.json) {
      console.log(JSON.stringify(listing, null, 2));
      return;
    }

    if (listing.evals.length === 0) {
      console.log();
      console.log(chalk2.dim(" No evals yet. Create one with:"));
      console.log(chalk2.dim(' candor eval create --goal "rank by quality" --items "a,b,c"'));
      console.log();
      return;
    }

    console.log();
    for (const entry of listing.evals) {
      // Completed evals are green, active ones yellow, everything else dimmed.
      let statusColor;
      if (entry.status === "completed") {
        statusColor = chalk2.green;
      } else if (entry.status === "active") {
        statusColor = chalk2.yellow;
      } else {
        statusColor = chalk2.dim;
      }

      // Progress is shown only when the eval reports a non-zero task total.
      let progress = "";
      if (entry.totalTasks) {
        progress = ` (${entry.completedTasks ?? 0}/${entry.totalTasks} done)`;
      }

      console.log(` ${chalk2.bold(entry.id)} ${entry.name}`);
      console.log(
        chalk2.dim(` ${entry.taskType} | ${statusColor(entry.status)}${progress}`)
      );
      console.log();
    }
  } catch (err) {
    const isError = err instanceof Error;
    if (options.json) {
      const payload = { error: isError ? err.message : String(err) };
      console.log(JSON.stringify(payload));
      process.exit(1);
    }
    console.log(chalk2.red(`Error: ${isError ? err.message : err}`));
    process.exit(1);
  }
}
5347
+
5348
+ // src/commands/eval-cancel.ts
5349
/**
 * Handler for `candor eval cancel <id>`.
 *
 * POSTs to `/api/evals/<id>/cancel` and prints the raw response (with `--json`)
 * or a confirmation, followed by any warnings the response carries and a hint
 * for viewing the responses collected so far.
 *
 * @param {string} id - Identifier of the eval to cancel.
 * @param {object} options - Parsed CLI flags.
 * @param {boolean} [options.json] - Print JSON instead of formatted text.
 * @returns {Promise<void>} Resolves once output is printed; on failure the error is
 *   printed and the process exits with code 1.
 */
async function evalCancelCommand(id, options) {
  const chalk2 = (await Promise.resolve().then(() => (init_source(), source_exports))).default;
  try {
    const response = await apiRequest(`/api/evals/${id}/cancel`, { method: "POST" });

    if (options.json) {
      console.log(JSON.stringify(response, null, 2));
      return;
    }

    console.log();
    console.log(chalk2.yellow(` Eval ${id} cancelled`));
    console.log(chalk2.dim(` Stopped recruiting \u2014 no new workers will be assigned`));

    const hasWarnings = Boolean(response.warnings?.length);
    if (hasWarnings) {
      console.log();
      for (const warning of response.warnings) {
        console.log(chalk2.red(` Warning: ${warning}`));
      }
    }

    console.log();
    console.log(chalk2.dim(` Any responses already collected are still available:`));
    console.log(chalk2.dim(` candor eval results ${id}`));
    console.log();
  } catch (err) {
    const isError = err instanceof Error;
    if (options.json) {
      const payload = { error: isError ? err.message : String(err) };
      console.log(JSON.stringify(payload));
      process.exit(1);
    }
    console.log(chalk2.red(`Error: ${isError ? err.message : err}`));
    process.exit(1);
  }
}
4981
5383
 
4982
5384
  // src/index.ts
4983
5385
  var program2 = new Command();
4984
- program2.name("candor").description("AI-moderated user studies, wired into your dev workflow").version("0.2.0");
5386
+ program2.name("candor").description("AI-moderated user studies, wired into your dev workflow").version("0.3.0").enablePositionalOptions();
4985
5387
  program2.command("init").description("Set up Candor: authenticate and configure Claude Code integration").option("--skill-only", "Only reinstall the skill file").action(initCommand);
4986
5388
  program2.command("login").description("Re-authenticate with Candor").action(loginCommand);
4987
5389
  program2.command("logout").description("Log out and clear stored credentials").action(logoutCommand);
4988
- var study = program2.command("study").description("Create, view, and manage user studies").option("--json", "Output as JSON").action(studiesCommand);
5390
+ var study = program2.command("study").description("Create, view, and manage user studies").passThroughOptions().option("--json", "Output as JSON").action(studiesCommand);
4989
5391
  study.command("list").description("List all studies").option("--json", "Output as JSON").action(studiesCommand);
4990
5392
  study.command("show <id>").description("View study details and findings").option("--json", "Output as JSON").action(studyCommand);
4991
5393
  study.command("create").description("Create a new user study").requiredOption("--url <url>", "Product URL to test").requiredOption("--goal <goal>", "What you want to learn").option("--participants <n>", "Number of participants", "5").option("--json", "Output as JSON").action(createStudyCommand);
4992
5394
  study.command("status <id>").description("Check participant progress").option("--json", "Output as JSON").action(statusCommand);
4993
5395
  study.command("findings <id>").description("Get prioritized findings").option("--json", "Output as JSON").action(findingsCommand);
4994
5396
  study.command("approve <id>").description("Approve to begin recruiting").option("--json", "Output as JSON").action(approveCommand);
5397
+ var evalCmd = program2.command("eval").description("Create, approve, and view human evaluation jobs").passThroughOptions().option("--json", "Output as JSON").action(evalListCommand);
5398
+ evalCmd.command("list").description("List all eval jobs").option("--json", "Output as JSON").action(evalListCommand);
5399
+ evalCmd.command("create").description("Create a new eval job").requiredOption("--goal <goal>", "What you want to evaluate").requiredOption("--items <items>", "Comma-separated item labels").option("--type <type>", "Task type: pairwise_comparison, categorical_label, rating_scale").option("--labels <labels>", "Comma-separated labels for categorical_label type").option("--workers <n>", "Workers per task", "3").option("--reward <cents>", "Reward in cents per task", "5").option("--batch-size <n>", "Tasks per assignment (auto if not set)").option("--json", "Output as JSON").action(evalCreateCommand);
5400
+ evalCmd.command("approve <id>").description("Approve and launch an eval").option("--json", "Output as JSON").action(evalLaunchCommand);
5401
+ evalCmd.command("status <id>").description("Check eval progress").option("--json", "Output as JSON").option("--live", "Poll for updates until complete").action(evalStatusCommand);
5402
+ evalCmd.command("results <id>").description("View eval results and rankings").option("--json", "Output as JSON").action(evalResultsCommand);
5403
+ evalCmd.command("cancel <id>").description("Cancel an active eval and stop recruiting").option("--json", "Output as JSON").action(evalCancelCommand);
4995
5404
  program2.command("update").description("Update the Candor CLI to the latest version").action(updateCommand);
4996
5405
  program2.command("doctor", { hidden: true }).description("Run diagnostic checks on your Candor installation").action(doctorCommand);
4997
5406
  program2.parse();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@candor.sh/cli",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "bin": {