clementine-agent 1.18.76 → 1.18.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ /**
2
+ * Goal evaluation — PRD Phase 1.
3
+ *
4
+ * Two evaluators run at the END of a successful cron run, when the Task
5
+ * defines `successSchema` (JSON Schema validated against the agent's output)
6
+ * and/or `successCriteriaText` (free-text criterion graded by an evaluator
7
+ * sub-agent). The verdicts merge into a single `goalCheck` object that
8
+ * gets stamped on the run's CronRunEntry.
9
+ *
10
+ * Design constraints:
11
+ * - Never block run completion. Any thrown error becomes status='error' on
12
+ * goalCheck and the rest of the run logs unchanged.
13
+ * - Bounded budgets — schema validation is sub-millisecond; evaluator agent
14
+ * gets max_turns=1, ~30s wall clock, Haiku-class model.
15
+ * - No new top-level deps — ajv is a transitive install; we import it lazily
16
+ * inside the function so test fixtures that don't need it never load it.
17
+ */
18
+ import type { CronJobDefinition, CronRunEntry } from '../types.js';
19
/** Outcome of validating the agent's output against a JSON Schema. */
interface SchemaResult {
    /** True when the extracted JSON satisfied the schema. */
    pass: boolean;
    /** Human-readable validation problems; empty when `pass` is true. */
    errors: string[];
    /** False when no JSON could be extracted, so validation never ran. */
    tried: boolean;
}
24
/** Verdict produced by the free-text criterion evaluator. */
interface EvaluatorResult {
    /** True when the evaluator answered PASS. */
    pass: boolean;
    /** Short reason accompanying the verdict. */
    reason: string;
}
28
/**
 * Checks the agent's response against a JSON Schema.
 *
 * @param responseText Raw text produced by the agent.
 * @param schema JSON Schema the extracted JSON must satisfy.
 * @returns `tried: false` when no JSON could be pulled out of the response;
 *   otherwise `tried: true` together with `pass` and any `errors`.
 * @throws When the schema fails to compile or the validator library cannot
 *   be loaded — the caller is expected to catch.
 */
export declare function validateAgainstSchema(
    responseText: string,
    schema: Record<string, unknown>,
): Promise<SchemaResult>;
35
/**
 * Asks a small evaluator sub-agent whether the run satisfied the free-text
 * `successCriteriaText` criterion.
 *
 * The evaluator is deliberately minimal: a single turn, no tools, a focused
 * system prompt and a bounded wall-clock budget. It grades text only.
 *
 * @param responseText Output of the run being graded.
 * @param criterion Plain-English success criterion.
 * @param opts Optional overrides: `model` (defaults to a Haiku-class model)
 *   and `timeoutMs` (defaults to 30 000 ms).
 * @returns The verdict, or `null` when no parseable verdict was produced
 *   (the caller maps `null` to `goalCheck.status = 'error'`).
 */
export declare function evaluateAgainstCriterion(
    responseText: string,
    criterion: string,
    opts?: {
        model?: string;
        timeoutMs?: number;
    },
): Promise<EvaluatorResult | null>;
47
/**
 * Orchestrator for goal evaluation. Runs every evaluator configured on the
 * Task and folds the verdicts into one `goalCheck` record.
 *
 * @param responseText Output of the finished run.
 * @param job Task definition carrying `successSchema` and/or
 *   `successCriteriaText`.
 * @returns The merged record, or `undefined` when the Task has no goal
 *   configured, in which case the field stays absent on the run entry.
 */
export declare function runGoalCheck(
    responseText: string,
    job: CronJobDefinition,
): Promise<CronRunEntry['goalCheck']>;
53
+ export {};
54
+ //# sourceMappingURL=goal-evaluator.d.ts.map
@@ -0,0 +1,235 @@
1
+ /**
2
+ * Goal evaluation — PRD Phase 1.
3
+ *
4
+ * Two evaluators run at the END of a successful cron run, when the Task
5
+ * defines `successSchema` (JSON Schema validated against the agent's output)
6
+ * and/or `successCriteriaText` (free-text criterion graded by an evaluator
7
+ * sub-agent). The verdicts merge into a single `goalCheck` object that
8
+ * gets stamped on the run's CronRunEntry.
9
+ *
10
+ * Design constraints:
11
+ * - Never block run completion. Any thrown error becomes status='error' on
12
+ * goalCheck and the rest of the run logs unchanged.
13
+ * - Bounded budgets — schema validation is sub-millisecond; evaluator agent
14
+ * gets max_turns=1, ~30s wall clock, Haiku-class model.
15
+ * - No new top-level deps — ajv is a transitive install; we import it lazily
16
+ * inside the function so test fixtures that don't need it never load it.
17
+ */
18
/**
 * Locate the index of the `}` that closes the `{` at `start`, honouring JSON
 * string literals so braces inside quoted text are not counted.
 * Returns -1 when the braces never balance.
 */
function findBalancedObjectEnd(text, start) {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
        const ch = text[i];
        if (inString) {
            if (escaped)
                escaped = false;
            else if (ch === '\\')
                escaped = true;
            else if (ch === '"')
                inString = false;
            continue;
        }
        if (ch === '"') {
            inString = true;
        }
        else if (ch === '{') {
            depth++;
        }
        else if (ch === '}') {
            depth--;
            if (depth === 0)
                return i;
        }
    }
    return -1;
}
/**
 * Best-effort extraction of a JSON value from the agent's response.
 *
 * Strategies are tried in order and the first successful parse wins:
 *   1. The whole text.
 *   2. Every fenced ``` / ```json block, in document order (previously only
 *      the first fence was tried, so an earlier ```bash block hid the JSON).
 *   3. Each balanced `{...}` span, scanning left to right, so an object
 *      surrounded by prose or followed by stray braces is still found.
 *   4. The legacy slice from the first `{` to the last `}`.
 *
 * @param responseText Raw agent output. Non-strings and empty strings yield null.
 * @returns The parsed value, or null when nothing parseable was found. A
 *   literal JSON `null` is indistinguishable from "nothing found".
 */
function extractJson(responseText) {
    if (!responseText || typeof responseText !== 'string')
        return null;
    // Wrap JSON.parse so a legitimately parsed falsy value (0, false, null)
    // is not confused with a parse failure.
    const tryParse = (text) => {
        try {
            return { ok: true, value: JSON.parse(text) };
        }
        catch {
            return { ok: false, value: null };
        }
    };
    // 1. Whole-text parse first.
    const whole = tryParse(responseText);
    if (whole.ok)
        return whole.value;
    // 2. Fenced blocks — try each one, not just the first.
    const fencePattern = /```(?:json|JSON)?\s*([\s\S]*?)```/g;
    for (const fence of responseText.matchAll(fencePattern)) {
        const inner = (fence[1] || '').trim();
        if (!inner)
            continue;
        const parsed = tryParse(inner);
        if (parsed.ok)
            return parsed.value;
    }
    // 3. Balanced {...} spans. The candidate count is capped so brace-heavy
    //    prose cannot turn this into quadratic work.
    const maxCandidates = 25;
    let open = responseText.indexOf('{');
    for (let tries = 0; open >= 0 && tries < maxCandidates; tries++) {
        const close = findBalancedObjectEnd(responseText, open);
        if (close > open) {
            const parsed = tryParse(responseText.slice(open, close + 1));
            if (parsed.ok)
                return parsed.value;
        }
        open = responseText.indexOf('{', open + 1);
    }
    // 4. Legacy fallback: first '{' through last '}'. Kept so every input the
    //    previous implementation accepted is still accepted.
    const first = responseText.indexOf('{');
    const last = responseText.lastIndexOf('}');
    if (first >= 0 && last > first) {
        const parsed = tryParse(responseText.slice(first, last + 1));
        if (parsed.ok)
            return parsed.value;
    }
    return null;
}
50
/**
 * Checks the agent's response against a JSON Schema.
 *
 * Outcomes:
 *   - `tried: false` — nothing parseable as JSON was found in the response.
 *   - `tried: true`  — validation ran; `pass` and `errors` carry the result.
 *
 * Throws when ajv cannot be loaded or the schema does not compile; the
 * caller is expected to catch.
 */
export async function validateAgainstSchema(responseText, schema) {
    const parsed = extractJson(responseText);
    if (parsed === null) {
        return { tried: false, pass: false, errors: ['No JSON object found in agent response'] };
    }
    // ajv is loaded on demand so Tasks without a schema never pay for it.
    let ajvModule;
    try {
        ajvModule = await import('ajv');
    }
    catch {
        ajvModule = null;
    }
    if (!ajvModule) {
        throw new Error('ajv not available — cannot validate success_schema');
    }
    // CJS/ESM interop: the constructor may sit on `.default` or on the
    // module namespace itself.
    const Ajv = ajvModule.default ?? ajvModule;
    const instance = new Ajv({ allErrors: true, strict: false });
    const validate = instance.compile(schema);
    if (validate(parsed)) {
        return { tried: true, pass: true, errors: [] };
    }
    // Errors normally live on the compiled validator; fall back to the
    // instance for versions that record them there.
    const issues = validate.errors ?? instance.errors ?? [];
    const messages = [];
    for (const issue of issues.slice(0, 5)) {
        const where = issue.instancePath || '';
        const what = issue.message || 'invalid';
        messages.push(where ? `${where} ${what}` : what);
    }
    return { tried: true, pass: false, errors: messages.length ? messages : ['validation failed'] };
}
85
/**
 * Pull a PASS/FAIL verdict and its reason out of the evaluator's reply.
 *
 * A line that *starts* with the verdict (optionally prefixed by "Verdict:"
 * or wrapped in markdown bold) is preferred, so prose such as
 * "the run did not pass the check" ahead of a "FAIL — ..." line is not
 * misread. If no such line exists, fall back to the first PASS/FAIL word
 * anywhere. A bare verdict with no reason is accepted and given a default
 * reason. Returns null when no verdict is present.
 */
function parseEvaluatorVerdict(text) {
    const anchored = text.match(/^[ \t]*(?:verdict[ \t]*[:—–-]?[ \t]*)?\**(PASS|FAIL)\b\**[ \t]*[—–:-]?[ \t]*(.*)$/im);
    const match = anchored ?? text.match(/\b(PASS|FAIL)\b\s*[—\-:]?\s*(.*)/i);
    if (!match)
        return null;
    const pass = match[1].toUpperCase() === 'PASS';
    // Keep only the first line of the reason and cap its length.
    const reason = (match[2] || '').replace(/[\r\n].*$/s, '').trim().slice(0, 280);
    return { pass, reason: reason || (pass ? 'Pass' : 'Fail') };
}
/**
 * Ask a small evaluator sub-agent whether the run accomplished the
 * `successCriteriaText` criterion.
 *
 * The evaluator is intentionally minimal — Haiku, max_turns=1, focused
 * system prompt, ~30s budget. We're grading text, not running tools.
 *
 * @param responseText Output of the run; truncated to 8000 characters.
 * @param criterion Plain-English goal; truncated to 2000 characters.
 * @param opts Optional `model` and `timeoutMs` overrides.
 * @returns `{ pass, reason }`, or null when the criterion is empty, the SDK
 *   is unavailable, the query fails or times out, or no verdict can be
 *   parsed (caller treats null as goalCheck.status='error'). Never throws
 *   for evaluator failures.
 */
export async function evaluateAgainstCriterion(responseText, criterion, opts = {}) {
    const trimmedResponse = (responseText || '').slice(0, 8000);
    const trimmedCriterion = (criterion || '').slice(0, 2000);
    if (!trimmedCriterion)
        return null;
    const sdk = await import('@anthropic-ai/claude-agent-sdk').catch(() => null);
    if (!sdk || typeof sdk.query !== 'function') {
        return null;
    }
    const systemPrompt = 'You are a strict evaluator. Grade whether a scheduled task accomplished its stated goal.\n' +
        'Reply with EXACTLY one line in this format:\n' +
        'PASS — <one-sentence reason> | FAIL — <one-sentence reason>\n' +
        'Be honest. If the run did not achieve the goal, say FAIL even if the agent claimed success.';
    const userPrompt = `GOAL:\n${trimmedCriterion}\n\nRUN OUTPUT:\n${trimmedResponse}\n\nVerdict:`;
    const timeoutMs = opts.timeoutMs ?? 30_000;
    const model = opts.model ?? 'claude-haiku-4-5-20251001';
    // Lets the timeout below cancel the in-flight query instead of leaving
    // it running in the background after we have given up on it.
    const abortController = new AbortController();
    const queryPromise = (async () => {
        let assistantText = '';
        let resultText = '';
        try {
            const iter = sdk.query({
                prompt: userPrompt,
                options: {
                    systemPrompt,
                    model,
                    maxTurns: 1,
                    permissionMode: 'default',
                    allowedTools: [],
                    settingSources: [],
                    abortController,
                    // No tools, no network beyond model — purely text-in / text-out.
                },
            });
            for await (const message of iter) {
                if (message.type === 'assistant') {
                    // The SDK nests the API message under `.message`; a
                    // top-level `.content` is still accepted for safety.
                    const content = message.message?.content ?? message.content;
                    if (Array.isArray(content)) {
                        for (const block of content) {
                            if (block && block.type === 'text' && typeof block.text === 'string')
                                assistantText += block.text;
                        }
                    }
                }
                else if (message.type === 'result' && typeof message.result === 'string') {
                    resultText += message.result;
                }
            }
        }
        catch {
            // Includes the abort raised by the timeout path.
            return null;
        }
        // The result message repeats the final assistant text, so prefer it
        // and use assistant text only as a fallback — never both, which
        // would duplicate the verdict.
        return resultText || assistantText;
    })();
    // Race the SDK query against a hard timeout so a hung evaluator never
    // blocks run logging.
    let timer;
    const timeoutPromise = new Promise((resolve) => {
        timer = setTimeout(() => {
            abortController.abort();
            resolve(null);
        }, timeoutMs);
    });
    let collected;
    try {
        collected = await Promise.race([queryPromise, timeoutPromise]);
    }
    finally {
        // Without this the pending timer keeps the event loop (and the
        // standalone CLI process) alive for the full timeout.
        clearTimeout(timer);
    }
    if (!collected || typeof collected !== 'string')
        return null;
    return parseEvaluatorVerdict(collected);
}
159
/**
 * Orchestrator: runs whichever evaluators are configured on the Task and
 * merges their verdicts into a single goalCheck record.
 *
 * Status rules, in priority order:
 *   - 'fail'  — at least one evaluator ran cleanly and reported a failure
 *               (a schema result with no extractable JSON counts as a failure).
 *   - 'error' — nothing failed, but at least one configured evaluator threw
 *               or produced no verdict, so success cannot be confirmed.
 *   - 'pass'  — every configured evaluator ran cleanly and passed.
 *
 * A definitive failure wins over an error: previously, in mode 'both', a
 * clean failure from one evaluator was reported as 'error' whenever the
 * other evaluator errored.
 *
 * @param responseText Output of the finished run.
 * @param job Task definition; reads `successSchema` and `successCriteriaText`.
 * @returns The goalCheck record, or undefined when no goal is configured —
 *   the field then stays absent on the run entry. Evaluator exceptions are
 *   caught and reported through the record rather than thrown.
 */
export async function runGoalCheck(responseText, job) {
    const hasSchema = !!(job.successSchema && Object.keys(job.successSchema).length > 0);
    const hasCriterion = !!(job.successCriteriaText && job.successCriteriaText.trim());
    if (!hasSchema && !hasCriterion)
        return undefined;
    let schemaResult = null;
    let evaluatorResult = null;
    // Errors are tracked per evaluator so neither message overwrites the other.
    let schemaError = '';
    let evaluatorError = '';
    if (hasSchema) {
        try {
            schemaResult = await validateAgainstSchema(responseText, job.successSchema);
        }
        catch (err) {
            schemaError = `schema validator threw: ${String(err).slice(0, 200)}`;
        }
    }
    if (hasCriterion) {
        try {
            evaluatorResult = await evaluateAgainstCriterion(responseText, job.successCriteriaText);
            if (evaluatorResult === null) {
                // Treat unparseable evaluator output as 'error' rather than 'fail' — we
                // don't want a flaky evaluator to mark a healthy run as failed.
                evaluatorError = 'evaluator did not return a parseable PASS/FAIL verdict';
            }
        }
        catch (err) {
            evaluatorError = `evaluator threw: ${String(err).slice(0, 200)}`;
        }
    }
    const mode = hasSchema && hasCriterion ? 'both' : hasSchema ? 'schema' : 'evaluator';
    const schemaPassed = !!schemaResult && schemaResult.pass && schemaResult.tried;
    const schemaFailed = !!schemaResult && !schemaPassed;
    const evaluatorFailed = !!evaluatorResult && !evaluatorResult.pass;
    const errored = Boolean(schemaError || evaluatorError);
    let status;
    if (schemaFailed || evaluatorFailed)
        status = 'fail';
    else if (errored)
        status = 'error';
    else
        status = 'pass';
    const out = { status, mode };
    if (schemaResult) {
        out.schemaPass = schemaPassed;
        if (!schemaPassed) {
            out.schemaErrors = schemaResult.errors.slice(0, 5);
        }
    }
    if (evaluatorResult) {
        out.evaluatorPass = evaluatorResult.pass;
        out.evaluatorReason = evaluatorResult.reason;
    }
    if (errored) {
        if (!out.evaluatorReason) {
            // Stash the error in evaluatorReason if we don't already have one — the
            // dashboard surfaces this string in the tooltip.
            out.evaluatorReason = [schemaError, evaluatorError].filter(Boolean).join('; ');
        }
        else if (schemaError) {
            // The evaluator supplied its own reason, so keep the schema
            // validator's exception here rather than dropping it.
            out.schemaErrors = [schemaError];
        }
    }
    return out;
}
235
+ //# sourceMappingURL=goal-evaluator.js.map
package/dist/cli/cron.js CHANGED
@@ -140,7 +140,7 @@ export async function cmdCronRun(jobName) {
140
140
  try {
141
141
  const response = await gateway.handleCronJob(job.name, job.prompt, job.tier, job.maxTurns, job.model, job.workDir, job.mode, job.maxHours);
142
142
  const finishedAt = new Date();
143
- runLog.append({
143
+ const entry = {
144
144
  jobName: job.name,
145
145
  startedAt: startedAt.toISOString(),
146
146
  finishedAt: finishedAt.toISOString(),
@@ -148,7 +148,21 @@ export async function cmdCronRun(jobName) {
148
148
  durationMs: finishedAt.getTime() - startedAt.getTime(),
149
149
  attempt: 1,
150
150
  outputPreview: response ? response.slice(0, 200) : undefined,
151
- });
151
+ };
152
+ // PRD Phase 1.1: goal-orientation evaluator (mirrors the daemon path).
153
+ if (job.successSchema || (job.successCriteriaText && job.successCriteriaText.trim())) {
154
+ try {
155
+ const { runGoalCheck } = await import('../agent/goal-evaluator.js');
156
+ const goalCheck = await runGoalCheck(response ?? '', job);
157
+ if (goalCheck)
158
+ entry.goalCheck = goalCheck;
159
+ }
160
+ catch (err) {
161
+ // Never block logging on evaluator failure.
162
+ entry.goalCheck = { status: 'error', mode: 'evaluator', evaluatorReason: `evaluator orchestrator threw: ${String(err).slice(0, 200)}` };
163
+ }
164
+ }
165
+ runLog.append(entry);
152
166
  console.log(response || '(no output)');
153
167
  if (response && response !== '__NOTHING__') {
154
168
  console.log('\n(Note: Standalone runner — output not delivered to channels. Use the daemon for channel delivery.)');
@@ -6368,7 +6368,9 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
6368
6368
  // ── CRON CRUD routes (continued) ──────────────────────────────
6369
6369
  app.post('/api/cron', (req, res) => {
6370
6370
  try {
6371
- const { name, schedule, prompt, tier, enabled, work_dir, mode, max_hours, max_retries, after, agent, context, skills, allowedTools, allowedMcpServers, tags, category, predictable, } = req.body;
6371
+ const { name, schedule, prompt, tier, enabled, work_dir, mode, max_hours, max_retries, after, agent, context, skills, allowedTools, allowedMcpServers, tags, category, predictable,
6372
+ // PRD Phase 1 fields (camelCase from API; written as snake_case YAML).
6373
+ successCriteriaText, successSchema, addDirs, } = req.body;
6372
6374
  if (!name || !schedule || !prompt) {
6373
6375
  res.status(400).json({ error: 'name, schedule, and prompt are required' });
6374
6376
  return;
@@ -6427,6 +6429,16 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
6427
6429
  // Predictable mode — default to true (contract execution) for new
6428
6430
  // tricks created via the dashboard. Mirror the MCP tool default.
6429
6431
  job.predictable = (predictable === false) ? false : true;
6432
+ // PRD Phase 1: goal-orientation fields (camelCase from API → snake_case YAML).
6433
+ if (typeof successCriteriaText === 'string' && successCriteriaText.trim()) {
6434
+ job.success_criteria_text = successCriteriaText.trim();
6435
+ }
6436
+ if (successSchema && typeof successSchema === 'object' && !Array.isArray(successSchema) && Object.keys(successSchema).length > 0) {
6437
+ job.success_schema = successSchema;
6438
+ }
6439
+ if (Array.isArray(addDirs) && addDirs.length) {
6440
+ job.add_dirs = addDirs.map(String).map((s) => s.trim()).filter(Boolean);
6441
+ }
6430
6442
  jobs.push(job);
6431
6443
  writeCronFileAt(cronFile, parsed, jobs);
6432
6444
  res.json({ ok: true, message: `Created cron job: ${name}` });
@@ -6568,6 +6580,36 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
6568
6580
  if (updates.predictable !== undefined) {
6569
6581
  jobs[idx].predictable = Boolean(updates.predictable);
6570
6582
  }
6583
+ // PRD Phase 1 goal fields. set-when-non-empty / delete-when-cleared,
6584
+ // matching the existing trick-capability pattern above. The deprecated
6585
+ // success_criteria array is dropped on the first save through this path.
6586
+ if (updates.successCriteriaText !== undefined) {
6587
+ const v = typeof updates.successCriteriaText === 'string' ? updates.successCriteriaText.trim() : '';
6588
+ if (v) {
6589
+ jobs[idx].success_criteria_text = v;
6590
+ delete jobs[idx].success_criteria; // sunset the deprecated alias on first save
6591
+ }
6592
+ else {
6593
+ delete jobs[idx].success_criteria_text;
6594
+ }
6595
+ }
6596
+ if (updates.successSchema !== undefined) {
6597
+ const s = updates.successSchema;
6598
+ if (s && typeof s === 'object' && !Array.isArray(s) && Object.keys(s).length > 0) {
6599
+ jobs[idx].success_schema = s;
6600
+ }
6601
+ else {
6602
+ delete jobs[idx].success_schema;
6603
+ }
6604
+ }
6605
+ if (updates.addDirs !== undefined) {
6606
+ if (Array.isArray(updates.addDirs) && updates.addDirs.length) {
6607
+ jobs[idx].add_dirs = updates.addDirs.map(String).map((s) => s.trim()).filter(Boolean);
6608
+ }
6609
+ else {
6610
+ delete jobs[idx].add_dirs;
6611
+ }
6612
+ }
6571
6613
  if (updates.name !== undefined && updates.name !== bareJobName) {
6572
6614
  // Rename — check for duplicates
6573
6615
  const dup = jobs.find((j, i) => i !== idx && String(j.name ?? '').toLowerCase() === String(updates.name).toLowerCase());
@@ -19998,6 +20040,32 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
19998
20040
  </div>
19999
20041
  </div>
20000
20042
 
20043
+ <!-- ── PRD Phase 1: Goal (success_criteria_text + success_schema) ──
20044
+ The single most important new field set. Without one of these, a
20045
+ run "finished"; with one, a run "accomplished what it was meant
20046
+ to". Banner warns (does not block) when neither is set. -->
20047
+ <div class="cron-section-card">
20048
+ <h4>Goal <span style="color:var(--text-muted);font-weight:normal;font-size:12px">— how do you know this task succeeded?</span></h4>
20049
+ <p class="cron-section-desc">Optional but strongly recommended. Use plain English (an evaluator agent grades the run) or a JSON Schema (validated against the agent's structured output).</p>
20050
+ <div id="cron-goal-warning" style="display:none;margin-bottom:12px;padding:10px 12px;border-radius:6px;background:rgba(245,158,11,0.10);border:1px solid rgba(245,158,11,0.30);color:var(--yellow);font-size:12px">
20051
+ ⚠ No goal set — runs will be marked "finished" but not "accomplished". Add a success criterion below or a JSON Schema.
20052
+ </div>
20053
+ <div class="form-group">
20054
+ <label class="form-label">Success criterion <span style="color:var(--text-muted);font-weight:normal">(plain English)</span></label>
20055
+ <textarea id="cron-success-criteria-text" rows="3" placeholder="e.g. 'A daily briefing email was sent to nathan@example.com containing the top 3 overnight items.'" oninput="updateGoalWarning()"></textarea>
20056
+ <div class="form-hint">An evaluator sub-agent reads the run's output and this criterion, then emits pass/fail with reasoning.</div>
20057
+ </div>
20058
+ <div class="form-group" style="margin-bottom:0">
20059
+ <details>
20060
+ <summary style="cursor:pointer;font-size:12px;color:var(--text-secondary);font-weight:500;padding:6px 0">▾ Success schema (JSON Schema, advanced)</summary>
20061
+ <div style="margin-top:8px">
20062
+ <textarea id="cron-success-schema" rows="6" placeholder='{ "type": "object", "required": ["sent"], "properties": { "sent": { "type": "boolean" } } }' style="font-family:'JetBrains Mono',monospace;font-size:11px" oninput="updateGoalWarning()"></textarea>
20063
+ <div class="form-hint">JSON Schema validated against the agent's <code>structured_output</code>. Mechanically successful = parses + validates.</div>
20064
+ </div>
20065
+ </details>
20066
+ </div>
20067
+ </div>
20068
+
20001
20069
  <!-- Skills & tools: pinned skills + MCP + tools + tags -->
20002
20070
  <div class="cron-section-card">
20003
20071
  <h4>Skills &amp; tools</h4>
@@ -20074,6 +20142,14 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
20074
20142
  <div class="form-hint">Run inside a project directory. Agent gets that project's CLAUDE.md.</div>
20075
20143
  </div>
20076
20144
  </div>
20145
+ <!-- PRD Phase 1: read scope beyond cwd. One absolute path per line. -->
20146
+ <div class="form-row">
20147
+ <div class="form-group" style="flex:1">
20148
+ <label class="form-label">Additional read directories <span style="color:var(--text-muted);font-weight:normal">(optional)</span></label>
20149
+ <textarea id="cron-add-dirs" rows="2" placeholder="/Users/me/notes&#10;/Users/me/clients/acme" style="font-family:'JetBrains Mono',monospace;font-size:11px"></textarea>
20150
+ <div class="form-hint">One absolute path per line. The agent gets read access to these in addition to the Project Context cwd.</div>
20151
+ </div>
20152
+ </div>
20077
20153
  <div class="form-row">
20078
20154
  <div class="form-group">
20079
20155
  <label class="form-label">Mode</label>
@@ -23198,6 +23274,22 @@ function renderScheduledTaskCard(task) {
23198
23274
  var ok = lr.status === 'ok';
23199
23275
  var statusIcon = ok ? '<span style="color:var(--green)">&#10003;</span>' : '<span style="color:var(--red)">&#10007;</span>';
23200
23276
  lastRunHtml = statusIcon + ' ' + esc(lr.status || 'unknown') + ' · ' + esc(timeAgo(lr.finishedAt || lr.startedAt || ''));
23277
+ // PRD Phase 1.1: goal pill. Orthogonal to status — a run can be status='ok'
23278
+ // but goalCheck.status='fail' (the agent finished cleanly without
23279
+ // accomplishing the stated goal). That's exactly the failure mode the
23280
+ // PRD's goal-orientation feature is designed to surface.
23281
+ if (lr.goalCheck) {
23282
+ var gc = lr.goalCheck;
23283
+ var gIcon = gc.status === 'pass' ? '🎯' : gc.status === 'fail' ? '✗' : gc.status === 'error' ? '⚠' : '';
23284
+ var gColor = gc.status === 'pass' ? 'var(--green)' : gc.status === 'fail' ? 'var(--red)' : 'var(--yellow)';
23285
+ var gLabel = gc.status === 'pass' ? 'goal met' : gc.status === 'fail' ? 'goal not met' : gc.status === 'error' ? 'goal eval failed' : '';
23286
+ var gTip = '';
23287
+ if (gc.evaluatorReason) gTip = gc.evaluatorReason;
23288
+ else if (Array.isArray(gc.schemaErrors) && gc.schemaErrors.length) gTip = 'Schema errors: ' + gc.schemaErrors.join('; ');
23289
+ if (gIcon && gLabel) {
23290
+ lastRunHtml += ' <span style="color:' + gColor + ';font-size:11px;font-weight:500" title="' + esc(gTip || gLabel) + '">· ' + gIcon + ' ' + esc(gLabel) + '</span>';
23291
+ }
23292
+ }
23201
23293
  // "ran with: …" — surface the skills + MCP that were live for this run.
23202
23294
  var ranWith = [];
23203
23295
  if (Array.isArray(lr.skillsApplied) && lr.skillsApplied.length > 0) {
@@ -23304,8 +23396,23 @@ function renderRecentHistoryList(runs) {
23304
23396
  var preview = String(entry.outputPreview).slice(0, 140);
23305
23397
  errorPreview = '<div style="font-size:11px;color:var(--text-muted);margin-top:2px;word-break:break-word">' + esc(preview) + '</div>';
23306
23398
  }
23307
- rowsHtml += '<div class="history-row" data-trace-job="' + esc(jobName) + '" style="display:grid;grid-template-columns:24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;align-items:start;padding:8px 14px;border-bottom:1px solid var(--border);cursor:pointer">'
23399
+ // PRD Phase 1.1: goal cell. Empty cell when no goal configured (status='skipped'
23400
+ // returned by runGoalCheck means "no goal" — but we omit goalCheck entirely
23401
+ // in that case, so missing field == no goal). The cell stays present in the
23402
+ // grid for column alignment.
23403
+ var goalCellHtml = '<div></div>';
23404
+ if (entry.goalCheck) {
23405
+ var gc2 = entry.goalCheck;
23406
+ var gIcon2 = gc2.status === 'pass' ? '🎯' : gc2.status === 'fail' ? '✗' : gc2.status === 'error' ? '⚠' : '';
23407
+ var gColor2 = gc2.status === 'pass' ? 'var(--green)' : gc2.status === 'fail' ? 'var(--red)' : 'var(--yellow)';
23408
+ var gTip2 = gc2.evaluatorReason
23409
+ ? gc2.evaluatorReason
23410
+ : (Array.isArray(gc2.schemaErrors) && gc2.schemaErrors.length ? 'Schema errors: ' + gc2.schemaErrors.join('; ') : gc2.status);
23411
+ goalCellHtml = '<div style="color:' + gColor2 + ';font-size:13px;line-height:18px;text-align:center" title="' + esc(gTip2) + '">' + gIcon2 + '</div>';
23412
+ }
23413
+ rowsHtml += '<div class="history-row" data-trace-job="' + esc(jobName) + '" style="display:grid;grid-template-columns:24px 24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;align-items:start;padding:8px 14px;border-bottom:1px solid var(--border);cursor:pointer">'
23308
23414
  + '<div style="color:' + statusColor + ';font-size:14px;line-height:18px;text-align:center" title="' + esc(status) + '">' + statusIcon + '</div>'
23415
+ + goalCellHtml
23309
23416
  + '<div style="min-width:0">'
23310
23417
  + '<div style="font-weight:500;color:var(--text-primary);font-size:13px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap" title="' + esc(jobName) + '">' + esc(jobName) + attemptLabel + '</div>'
23311
23418
  + errorPreview
@@ -23316,8 +23423,10 @@ function renderRecentHistoryList(runs) {
23316
23423
  + '</div>';
23317
23424
  }
23318
23425
  return '<div class="history-list" style="background:var(--bg-secondary);border:1px solid var(--border);border-radius:var(--radius)">'
23319
- + '<div style="display:grid;grid-template-columns:24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;padding:8px 14px;border-bottom:1px solid var(--border);font-size:11px;color:var(--text-muted);text-transform:uppercase;letter-spacing:0.04em;font-weight:500">'
23320
- + '<div></div><div>Task</div><div>Started</div><div>Duration</div><div></div>'
23426
+ + '<div style="display:grid;grid-template-columns:24px 24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;padding:8px 14px;border-bottom:1px solid var(--border);font-size:11px;color:var(--text-muted);text-transform:uppercase;letter-spacing:0.04em;font-weight:500">'
23427
+ + '<div title="Run status (ok / error / etc.)"></div>'
23428
+ + '<div title="Goal check result — orthogonal to run status">Goal</div>'
23429
+ + '<div>Task</div><div>Started</div><div>Duration</div><div></div>'
23321
23430
  + '</div>'
23322
23431
  + rowsHtml
23323
23432
  + '</div>';
@@ -24719,6 +24828,18 @@ function renderCronLegacyBanner(job) {
24719
24828
  + '</div>';
24720
24829
  }
24721
24830
 
24831
// PRD Phase 1: toggles the non-blocking banner under the Goal section
// header. The banner is visible only while both the success criterion and
// the success schema inputs are blank (whitespace counts as blank), since
// the "run accomplished its goal" check needs at least one of them.
function updateGoalWarning() {
  var banner = document.getElementById('cron-goal-warning');
  if (!banner) return;
  // Trimmed value of a form field, or '' when the field is missing/empty.
  function fieldText(id) {
    var el = document.getElementById(id);
    return ((el ? el.value : '') || '').trim();
  }
  var hasCriterion = fieldText('cron-success-criteria-text') !== '';
  var hasSchema = fieldText('cron-success-schema') !== '';
  if (hasCriterion || hasSchema) {
    banner.style.display = 'none';
  } else {
    banner.style.display = '';
  }
}
24842
+
24722
24843
  // One-click migration: flip predictable=true AND save immediately so the
24723
24844
  // user doesn't have to remember to also click Save Changes.
24724
24845
  async function enablePredictableFromBanner() {
@@ -24760,6 +24881,11 @@ function openCreateCronModal(agentSlug) {
24760
24881
  toggleUnleashedOptions();
24761
24882
  document.getElementById('cron-prompt').value = '';
24762
24883
  document.getElementById('cron-context').value = '';
24884
+ // PRD Phase 1 goal fields — empty by default. Warning banner will show.
24885
+ var sct = document.getElementById('cron-success-criteria-text'); if (sct) sct.value = '';
24886
+ var ssc = document.getElementById('cron-success-schema'); if (ssc) ssc.value = '';
24887
+ var addDirsEl = document.getElementById('cron-add-dirs'); if (addDirsEl) addDirsEl.value = '';
24888
+ if (typeof updateGoalWarning === 'function') updateGoalWarning();
24763
24889
  document.getElementById('cron-training-section').style.display = 'none';
24764
24890
  document.getElementById('cron-train-btn').style.display = '';
24765
24891
  resetCronTrainingChat();
@@ -24806,6 +24932,30 @@ function openEditCronModal(jobName) {
24806
24932
  toggleUnleashedOptions();
24807
24933
  document.getElementById('cron-prompt').value = job.prompt || '';
24808
24934
  document.getElementById('cron-context').value = job.context || '';
24935
+ // PRD Phase 1: load goal fields. Accept either casing — old YAML may have
24936
+ // success_criteria as a list (legacy); the parser already coalesces those
24937
+ // into successCriteriaText on read, but defend here too in case the API
24938
+ // shape differs from what the parser produces.
24939
+ var sctE = document.getElementById('cron-success-criteria-text');
24940
+ if (sctE) {
24941
+ var sctVal = job.successCriteriaText || job.success_criteria_text || '';
24942
+ if (!sctVal && Array.isArray(job.successCriteria || job.success_criteria)) {
24943
+ sctVal = (job.successCriteria || job.success_criteria || []).join('\\n');
24944
+ }
24945
+ sctE.value = sctVal;
24946
+ }
24947
+ var sscE = document.getElementById('cron-success-schema');
24948
+ if (sscE) {
24949
+ var sscObj = job.successSchema || job.success_schema;
24950
+ sscE.value = (sscObj && typeof sscObj === 'object') ? JSON.stringify(sscObj, null, 2) : '';
24951
+ }
24952
+ var addDirsE = document.getElementById('cron-add-dirs');
24953
+ if (addDirsE) {
24954
+ var addDirsArr = Array.isArray(job.addDirs) ? job.addDirs
24955
+ : (Array.isArray(job.add_dirs) ? job.add_dirs : []);
24956
+ addDirsE.value = addDirsArr.join('\\n');
24957
+ }
24958
+ if (typeof updateGoalWarning === 'function') updateGoalWarning();
24809
24959
  document.getElementById('cron-training-section').style.display = 'none';
24810
24960
  document.getElementById('cron-train-btn').style.display = '';
24811
24961
  resetCronTrainingChat();
@@ -25001,6 +25151,11 @@ function captureCronModalSnapshot() {
25001
25151
  v('cron-workdir'),
25002
25152
  v('cron-allowed-tools'),
25003
25153
  v('cron-category'),
25154
+ // PRD Phase 1 goal fields — included in dirty check so leaving the
25155
+ // modal with an unsaved success_schema or success_criteria_text prompts.
25156
+ v('cron-success-criteria-text'),
25157
+ v('cron-success-schema'),
25158
+ v('cron-add-dirs'),
25004
25159
  (document.getElementById('cron-predictable') || {}).checked ? '1' : '0',
25005
25160
  JSON.stringify(_cronSelectedSkills || []),
25006
25161
  JSON.stringify(_cronSelectedMcp || []),
@@ -25025,6 +25180,9 @@ function isCronModalDirty() {
25025
25180
  v('cron-workdir'),
25026
25181
  v('cron-allowed-tools'),
25027
25182
  v('cron-category'),
25183
+ v('cron-success-criteria-text'),
25184
+ v('cron-success-schema'),
25185
+ v('cron-add-dirs'),
25028
25186
  (document.getElementById('cron-predictable') || {}).checked ? '1' : '0',
25029
25187
  JSON.stringify(_cronSelectedSkills || []),
25030
25188
  JSON.stringify(_cronSelectedMcp || []),
@@ -25199,6 +25357,34 @@ async function saveCronJob() {
25199
25357
  }
25200
25358
  if (!prompt) { toast('Prompt is required — tell the agent what to do', 'error'); document.getElementById('cron-prompt').focus(); return; }
25201
25359
 
25360
+ // PRD Phase 1 goal fields. successCriteriaText is freeform; successSchema
25361
+ // is parsed JSON. Validate JSON early so the user gets a clean error before
25362
+ // the round-trip. Empty schema is fine — we send null (edit) or undefined (create).
25363
+ var successCriteriaText = (document.getElementById('cron-success-criteria-text')?.value || '').trim();
25364
+ var successSchemaRaw = (document.getElementById('cron-success-schema')?.value || '').trim();
25365
+ var successSchema;
25366
+ if (successSchemaRaw) {
25367
+ try {
25368
+ successSchema = JSON.parse(successSchemaRaw);
25369
+ if (!successSchema || typeof successSchema !== 'object' || Array.isArray(successSchema)) {
25370
+ toast('Success schema must be a JSON object', 'error');
25371
+ document.getElementById('cron-success-schema').focus();
25372
+ return;
25373
+ }
25374
+ } catch (e) {
25375
+ toast('Success schema is not valid JSON: ' + (e.message || String(e)), 'error');
25376
+ document.getElementById('cron-success-schema').focus();
25377
+ return;
25378
+ }
25379
+ }
25380
+ // add_dirs: one absolute path per line. Trim and drop blanks (entries are not deduped here).
25381
+ var addDirsRaw = (document.getElementById('cron-add-dirs')?.value || '').split(/\\r?\\n/);
25382
+ var addDirs = addDirsRaw.map(function(s){ return s.trim(); }).filter(Boolean);
25383
+ // Quick sanity — warn but don't block on relative paths.
25384
+ if (addDirs.some(function(p){ return !p.startsWith('/') && !p.startsWith('~'); })) {
25385
+ toast('Heads up: add_dirs entries should be absolute paths.', 'info');
25386
+ }
25387
+
25202
25388
  const body = {
25203
25389
  name, schedule, tier, prompt, enabled: true,
25204
25390
  work_dir: work_dir || undefined, mode, max_hours, max_retries, after, context,
@@ -25216,6 +25402,10 @@ async function saveCronJob() {
25216
25402
  tags: editingCronJob ? _cronTags : (_cronTags.length ? _cronTags : undefined),
25217
25403
  category: editingCronJob ? (category || '') : category,
25218
25404
  predictable,
25405
+ // PRD Phase 1 goal-orientation. PUT delete-on-empty pattern: when editing, empty values are sent explicitly ('' / null / []); on create they are left undefined.
25406
+ successCriteriaText: editingCronJob ? successCriteriaText : (successCriteriaText || undefined),
25407
+ successSchema: editingCronJob ? (successSchema || null) : (successSchema || undefined),
25408
+ addDirs: editingCronJob ? addDirs : (addDirs.length ? addDirs : undefined),
25219
25409
  };
25220
25410
 
25221
25411
  var wasEditing = !!editingCronJob;
@@ -113,6 +113,24 @@ export function parseCronJobs() {
113
113
  const successCriteria = Array.isArray(job.success_criteria)
114
114
  ? job.success_criteria.map(c => String(c))
115
115
  : undefined;
116
+ // PRD Phase 1: prefer success_criteria_text (free-form). On read, fall
117
+ // back to joining the legacy success_criteria string[] so legacy YAML
118
+ // keeps rendering in the new editor surface. Writes go to the new field.
119
+ let successCriteriaText = typeof job.success_criteria_text === 'string'
120
+ ? String(job.success_criteria_text)
121
+ : (typeof job.successCriteriaText === 'string' ? String(job.successCriteriaText) : undefined);
122
+ if (!successCriteriaText && Array.isArray(successCriteria) && successCriteria.length > 0) {
123
+ successCriteriaText = successCriteria.join('\n');
124
+ }
125
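+ // PRD Phase 1: JSON Schema for the run's output. NOTE(review): the scheduler passes the agent's response text to runGoalCheck, not ResultMessage.structured_output — confirm which is intended.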
126
+ // Accept either snake_case (success_schema) or camelCase from API. Stored
127
+ // as a plain object; ajv is loaded lazily at validation time.
128
+ const successSchemaRaw = job.success_schema ?? job.successSchema;
129
+ const successSchema = (successSchemaRaw && typeof successSchemaRaw === 'object' && !Array.isArray(successSchemaRaw))
130
+ ? successSchemaRaw
131
+ : undefined;
132
+ // PRD Phase 1: read scope beyond cwd. Accept either casing.
133
+ const addDirs = normalizeStringArray(job.add_dirs ?? job.addDirs);
116
134
  const alwaysDeliver = job.always_deliver === true ? true : undefined;
117
135
  const context = job.context != null ? String(job.context) : undefined;
118
136
  const preCheck = job.pre_check != null ? String(job.pre_check) : undefined;
@@ -140,7 +158,8 @@ export function parseCronJobs() {
140
158
  }
141
159
  jobs.push({
142
160
  name, schedule, prompt, enabled, tier, maxTurns, model, workDir, mode,
143
- maxHours, maxRetries, after, successCriteria, alwaysDeliver, context, preCheck, agentSlug,
161
+ maxHours, maxRetries, after, successCriteria, successCriteriaText, successSchema, addDirs,
162
+ alwaysDeliver, context, preCheck, agentSlug,
144
163
  skills, allowedTools, allowedMcpServers, tags, category, predictable,
145
164
  });
146
165
  }
@@ -187,6 +206,18 @@ export function parseAgentCronJobs(agentsDir) {
187
206
  const successCriteria = Array.isArray(job.success_criteria)
188
207
  ? job.success_criteria.map(c => String(c))
189
208
  : undefined;
209
+ // PRD Phase 1 fields — symmetric with global parser above.
210
+ let successCriteriaText = typeof job.success_criteria_text === 'string'
211
+ ? String(job.success_criteria_text)
212
+ : (typeof job.successCriteriaText === 'string' ? String(job.successCriteriaText) : undefined);
213
+ if (!successCriteriaText && Array.isArray(successCriteria) && successCriteria.length > 0) {
214
+ successCriteriaText = successCriteria.join('\n');
215
+ }
216
+ const successSchemaRaw = job.success_schema ?? job.successSchema;
217
+ const successSchema = (successSchemaRaw && typeof successSchemaRaw === 'object' && !Array.isArray(successSchemaRaw))
218
+ ? successSchemaRaw
219
+ : undefined;
220
+ const addDirs = normalizeStringArray(job.add_dirs ?? job.addDirs);
190
221
  const context = job.context != null ? String(job.context) : undefined;
191
222
  const preCheck = job.pre_check != null ? String(job.pre_check) : undefined;
192
223
  // ── Trick capabilities — symmetric with global parser ─────────
@@ -210,7 +241,9 @@ export function parseAgentCronJobs(agentsDir) {
210
241
  allJobs.push({
211
242
  name: `${slug}:${name}`,
212
243
  schedule, prompt, enabled, tier, maxTurns, model, workDir,
213
- mode, maxHours, maxRetries, after, successCriteria, context, preCheck,
244
+ mode, maxHours, maxRetries, after,
245
+ successCriteria, successCriteriaText, successSchema, addDirs,
246
+ context, preCheck,
214
247
  agentSlug: slug,
215
248
  skills, allowedTools, allowedMcpServers, tags, category, predictable,
216
249
  });
@@ -1203,6 +1236,23 @@ export class CronScheduler {
1203
1236
  this.gateway.injectContext(`discord:user:${DISCORD_OWNER_ID}`, `[Scheduled cron: ${job.name}]`, response);
1204
1237
  }
1205
1238
  }
1239
+ // PRD Phase 1.1: goal-orientation. If the Task has successSchema or
1240
+ // successCriteriaText, run the evaluator now (before logging) so the
1241
+ // entry carries the goalCheck verdict. Errors here NEVER block
1242
+ // logging — runGoalCheck catches its own throws and emits
1243
+ // status='error' on the goalCheck instead.
1244
+ if (job.successSchema || (job.successCriteriaText && job.successCriteriaText.trim())) {
1245
+ try {
1246
+ const { runGoalCheck } = await import('../agent/goal-evaluator.js');
1247
+ const goalCheck = await runGoalCheck(response ?? '', job);
1248
+ if (goalCheck)
1249
+ entry.goalCheck = goalCheck;
1250
+ }
1251
+ catch (err) {
1252
+ logger.warn({ err, job: job.name }, 'Goal evaluator failed — proceeding without goalCheck');
1253
+ entry.goalCheck = { status: 'error', mode: 'evaluator', evaluatorReason: `evaluator orchestrator threw: ${String(err).slice(0, 200)}` };
1254
+ }
1255
+ }
1206
1256
  this._logRun(entry);
1207
1257
  this.logAutonomy('completed', job, { durationMs: entry.durationMs, deliveryFailed: entry.deliveryFailed, advisorApplied: !!advisorApplied });
1208
1258
  // Fire-and-forget: extract procedural skill from successful long-running cron jobs
package/dist/types.d.ts CHANGED
@@ -328,7 +328,22 @@ export interface CronJobDefinition {
328
328
  maxRetries?: number;
329
329
  after?: string;
330
330
  agentSlug?: string;
331
+ /** @deprecated Use successCriteriaText (free-text) or successSchema (JSON Schema)
332
+ * per PRD Phase 1. successCriteria is kept readable for one release; on read,
333
+ * parseCronJobs coalesces it into successCriteriaText. */
331
334
  successCriteria?: string[];
335
+ /** PRD Phase 1: free-text "this task is done when…". An evaluator sub-agent reads
336
+ * the run's final state and the criterion and emits a pass/fail with reasoning.
337
+ * Stored as RunEvaluation on the Run. Optional but recommended. */
338
+ successCriteriaText?: string;
339
+ /** PRD Phase 1: JSON Schema validated against ResultMessage.structured_output.
340
+ * If it parses, the run is mechanically successful. The Task editor shows a
341
+ * non-blocking "Goal not set" warning when neither this nor successCriteriaText
342
+ * is present. */
343
+ successSchema?: Record<string, unknown>;
344
+ /** PRD Phase 1: read scope beyond the cwd (workDir). Surfaced as a chip list
345
+ * in the editor's Scope tab. The runner passes these to the SDK as add_dirs. */
346
+ addDirs?: string[];
332
347
  alwaysDeliver?: boolean;
333
348
  context?: string;
334
349
  preCheck?: string;
@@ -432,6 +447,24 @@ export interface CronRunEntry {
432
447
  allowedToolsApplied?: string[];
433
448
  /** MCP servers live for this run (post profile + trick allowlist intersection). */
434
449
  mcpServersApplied?: string[];
450
+ /** PRD Phase 1: did the run accomplish what it was supposed to?
451
+ * Computed at run-end when the Task has successSchema or successCriteriaText.
452
+ * - status='pass' both configured checks passed (or the only one configured did)
453
+ * - status='fail' a configured check failed
454
+ * - status='skipped' no goal configured on the Task (don't show the pill)
455
+ * - status='error' evaluator/validator threw; does NOT mark the run failed
456
+ * This is orthogonal to CronRunEntry.status — a run can be status='ok' with
457
+ * goalCheck.status='fail' (the agent finished cleanly but didn't accomplish
458
+ * the stated goal), and that's the failure mode the PRD is designed to surface. */
459
+ goalCheck?: {
460
+ status: 'pass' | 'fail' | 'skipped' | 'error';
461
+ /** Which evaluators ran. 'both' means the schema check and the evaluator both ran. */
462
+ mode: 'schema' | 'evaluator' | 'both';
463
+ schemaPass?: boolean;
464
+ schemaErrors?: string[];
465
+ evaluatorPass?: boolean;
466
+ evaluatorReason?: string;
467
+ };
435
468
  }
436
469
  export interface Models {
437
470
  haiku: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.18.76",
3
+ "version": "1.18.78",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",