npm - clementine-agent - Versions diffs - 1.18.76 → 1.18.78 - Mend

clementine-agent 1.18.76 → 1.18.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/agent/goal-evaluator.d.ts +54 -0
package/dist/agent/goal-evaluator.js +235 -0
package/dist/cli/cron.js +16 -2
package/dist/cli/dashboard.js +194 -4
package/dist/gateway/cron-scheduler.js +52 -2
package/dist/types.d.ts +33 -0
package/package.json +1 -1

package/dist/agent/goal-evaluator.d.ts ADDED Viewed

@@ -0,0 +1,54 @@
+/**
+ * Goal evaluation — PRD Phase 1.
+ *
+ * Two evaluators run at the END of a successful cron run, when the Task
+ * defines `successSchema` (JSON Schema validated against the agent's output)
+ * and/or `successCriteriaText` (free-text criterion graded by an evaluator
+ * sub-agent). The verdicts merge into a single `goalCheck` object that
+ * gets stamped on the run's CronRunEntry.
+ *
+ * Design constraints:
+ * - Never block run completion. Any thrown error becomes status='error' on
+ *   goalCheck and the rest of the run logs unchanged.
+ * - Bounded budgets — schema validation is sub-millisecond; evaluator agent
+ *   gets max_turns=1, ~30s wall clock, Haiku-class model.
+ * - No new top-level deps — ajv is a transitive install; we import it lazily
+ *   inside the function so test fixtures that don't need it never load it.
+ */
+import type { CronJobDefinition, CronRunEntry } from '../types.js';
+type SchemaResult = { // outcome of validating the agent's response against the Task's JSON Schema
+    pass: boolean; // true only when JSON was extracted and it satisfied the schema
+    errors: string[]; // human-readable validation messages; empty on pass
+    tried: boolean; // false when no JSON could be extracted from the response
+};
+type EvaluatorResult = { // verdict parsed from the evaluator sub-agent's reply
+    pass: boolean; // true for a PASS verdict, false for FAIL
+    reason: string; // short one-line explanation taken from the verdict line
+};
+/**
+ * Validate the agent's response against a JSON Schema. Returns:
+ *  - tried=false if no JSON could be extracted from the response
+ *  - tried=true with pass + errors otherwise
+ * Schema-compile errors throw — caller catches.
+ *
+ * @param responseText - Raw text of the agent's response; JSON is extracted from it.
+ * @param schema - JSON Schema the extracted JSON must satisfy.
+ * @returns The validation outcome; `errors` carries at most five messages.
+ * @throws When `ajv` cannot be loaded or the schema fails to compile.
+ */
+export declare function validateAgainstSchema(responseText: string, schema: Record<string, unknown>): Promise<SchemaResult>;
+/**
+ * Ask a small evaluator sub-agent whether the run accomplished the
+ * `successCriteriaText` criterion. Returns null if the evaluator failed
+ * to produce a parseable verdict (caller treats null as goalCheck.status='error').
+ *
+ * The evaluator is intentionally minimal — Haiku, max_turns=1, focused
+ * system prompt, ~30s budget. We're grading text, not running tools.
+ *
+ * @param responseText - The run's output; only the first 8000 characters are sent.
+ * @param criterion - The free-text goal; only the first 2000 characters are sent.
+ *   An empty criterion yields null without calling the evaluator.
+ * @param opts - Optional overrides: `model` (defaults to
+ *   'claude-haiku-4-5-20251001') and `timeoutMs` (defaults to 30000).
+ * @returns The parsed verdict, or null when the agent SDK cannot be loaded,
+ *   the query fails or times out, or no PASS/FAIL verdict can be parsed.
+ */
+export declare function evaluateAgainstCriterion(responseText: string, criterion: string, opts?: {
+    model?: string;
+    timeoutMs?: number;
+}): Promise<EvaluatorResult | null>;
+/**
+ * Orchestrator: runs whichever evaluators are configured on the Task and
+ * merges their verdicts into a single goalCheck record. Returns undefined
+ * when no goal is configured — the field then stays absent on the run entry.
+ *
+ * @param responseText - The agent's response text for the finished run.
+ * @param job - The Task definition; `successSchema` and `successCriteriaText`
+ *   decide which evaluators run.
+ * @returns A record whose `status` is 'pass', 'fail' or 'error' and whose
+ *   `mode` is 'schema', 'evaluator' or 'both'; undefined when the Task has
+ *   neither a non-empty schema nor a non-blank criterion.
+ */
+export declare function runGoalCheck(responseText: string, job: CronJobDefinition): Promise<CronRunEntry['goalCheck']>;
+export {};
+//# sourceMappingURL=goal-evaluator.d.ts.map

package/dist/agent/goal-evaluator.js ADDED Viewed

@@ -0,0 +1,235 @@
+/**
+ * Goal evaluation — PRD Phase 1.
+ *
+ * Two evaluators run at the END of a successful cron run, when the Task
+ * defines `successSchema` (JSON Schema validated against the agent's output)
+ * and/or `successCriteriaText` (free-text criterion graded by an evaluator
+ * sub-agent). The verdicts merge into a single `goalCheck` object that
+ * gets stamped on the run's CronRunEntry.
+ *
+ * Design constraints:
+ * - Never block run completion. Any thrown error becomes status='error' on
+ *   goalCheck and the rest of the run logs unchanged.
+ * - Bounded budgets — schema validation is sub-millisecond; evaluator agent
+ *   gets max_turns=1, ~30s wall clock, Haiku-class model.
+ * - No new top-level deps — ajv is a transitive install; we import it lazily
+ *   inside the function so test fixtures that don't need it never load it.
+ */
+/**
+ * Try to extract a JSON value from the agent's response.
+ *
+ * Candidates are tried in this order and the first one that parses wins:
+ *  1. the whole text;
+ *  2. every fenced ``` / ```json block, in document order (the common Claude
+ *     output shape) — not just the first one, so a JSON block that follows
+ *     e.g. a shell snippet is still found;
+ *  3. the substring from the first `{` to the last `}` as a last resort.
+ *
+ * Returns the parsed value, or null when the input is empty / not a string
+ * or no candidate parses. A candidate that parses to a literal JSON `null`
+ * is returned as null, which callers treat the same as "nothing found".
+ */
+function extractJson(responseText) {
+    if (!responseText || typeof responseText !== 'string')
+        return null;
+    // JSON.parse never yields undefined, so undefined is a safe "did not parse" marker.
+    const tryParse = (text) => {
+        try {
+            return JSON.parse(text);
+        }
+        catch {
+            return undefined;
+        }
+    };
+    // Whole-text parse first.
+    const whole = tryParse(responseText);
+    if (whole !== undefined)
+        return whole;
+    // Fenced ``` ... ``` blocks, each tried in turn.
+    for (const fence of responseText.matchAll(/```(?:json|JSON)?\s*([\s\S]*?)```/g)) {
+        if (!fence[1])
+            continue;
+        const parsed = tryParse(fence[1].trim());
+        if (parsed !== undefined)
+            return parsed;
+    }
+    // First {...} substring (greedy through last brace).
+    const first = responseText.indexOf('{');
+    const last = responseText.lastIndexOf('}');
+    if (first >= 0 && last > first) {
+        const parsed = tryParse(responseText.slice(first, last + 1));
+        if (parsed !== undefined)
+            return parsed;
+    }
+    return null;
+}
+/**
+ * Check the JSON embedded in an agent response against a JSON Schema.
+ *
+ * Outcomes:
+ *  - no JSON could be extracted  -> { tried: false, pass: false, errors: [...] }
+ *  - JSON extracted and valid    -> { tried: true,  pass: true,  errors: [] }
+ *  - JSON extracted but invalid  -> { tried: true,  pass: false, errors: up to 5 messages }
+ *
+ * Throws when ajv cannot be loaded or the schema does not compile; the
+ * caller is expected to catch.
+ */
+export async function validateAgainstSchema(responseText, schema) {
+    const parsed = extractJson(responseText);
+    if (parsed === null) {
+        return { tried: false, pass: false, errors: ['No JSON object found in agent response'] };
+    }
+    // ajv is loaded on demand so Tasks without a schema never pay for it.
+    let ajvModule;
+    try {
+        ajvModule = await import('ajv');
+    }
+    catch {
+        ajvModule = null;
+    }
+    if (!ajvModule) {
+        throw new Error('ajv not available — cannot validate success_schema');
+    }
+    // CJS/ESM interop: the constructor may sit on `.default` or on the
+    // module namespace itself.
+    const Ajv = ajvModule.default ?? ajvModule;
+    const instance = new Ajv({ allErrors: true, strict: false });
+    const validate = instance.compile(schema);
+    if (validate(parsed)) {
+        return { tried: true, pass: true, errors: [] };
+    }
+    // Errors normally live on the compiled validator; fall back to the
+    // instance for versions that report them there.
+    const issues = validate.errors ?? instance.errors ?? [];
+    const messages = [];
+    for (const issue of issues.slice(0, 5)) {
+        const where = issue.instancePath || '';
+        const what = issue.message || 'invalid';
+        messages.push(where ? `${where} ${what}` : what);
+    }
+    return { tried: true, pass: false, errors: messages.length ? messages : ['validation failed'] };
+}
+/**
+ * Ask a small evaluator sub-agent whether the run accomplished the
+ * `successCriteriaText` criterion.
+ *
+ * The evaluator is intentionally minimal — Haiku, max_turns=1, focused
+ * system prompt, ~30s budget. We're grading text, not running tools.
+ *
+ * @param responseText The run's output; truncated to 8000 characters.
+ * @param criterion The free-text goal; truncated to 2000 characters.
+ * @param opts Optional `model` (default 'claude-haiku-4-5-20251001') and
+ *   `timeoutMs` (default 30000).
+ * @returns `{ pass, reason }`, or null when the criterion is empty, the SDK
+ *   cannot be loaded, the query fails or times out, or no PASS/FAIL verdict
+ *   can be parsed (caller treats null as goalCheck.status='error').
+ */
+export async function evaluateAgainstCriterion(responseText, criterion, opts = {}) {
+    const trimmedResponse = (responseText || '').slice(0, 8000);
+    const trimmedCriterion = (criterion || '').slice(0, 2000);
+    if (!trimmedCriterion)
+        return null;
+    const sdk = await import('@anthropic-ai/claude-agent-sdk').catch(() => null);
+    if (!sdk || typeof sdk.query !== 'function') {
+        return null;
+    }
+    const systemPrompt = 'You are a strict evaluator. Grade whether a scheduled task accomplished its stated goal.\n' +
+        'Reply with EXACTLY one line in this format:\n' +
+        'PASS — <one-sentence reason> | FAIL — <one-sentence reason>\n' +
+        'Be honest. If the run did not achieve the goal, say FAIL even if the agent claimed success.';
+    const userPrompt = `GOAL:\n${trimmedCriterion}\n\nRUN OUTPUT:\n${trimmedResponse}\n\nVerdict:`;
+    const timeoutMs = opts.timeoutMs ?? 30_000;
+    const model = opts.model ?? 'claude-haiku-4-5-20251001';
+    // Lets the timeout cancel the in-flight query instead of leaving it
+    // running in the background after we have given up on it.
+    const abortController = new AbortController();
+    // Race the SDK query against a hard timeout so a hung evaluator never
+    // blocks run logging. This promise never rejects: failures resolve to null.
+    const queryPromise = (async () => {
+        let assistantText = '';
+        let resultText = '';
+        try {
+            const queryFn = sdk.query;
+            const iter = queryFn({
+                prompt: userPrompt,
+                options: {
+                    systemPrompt,
+                    model,
+                    maxTurns: 1,
+                    permissionMode: 'default',
+                    allowedTools: [],
+                    settingSources: [],
+                    abortController,
+                    // No tools, no network beyond model — purely text-in / text-out.
+                },
+            });
+            for await (const message of iter) {
+                const m = message;
+                if (m.type === 'assistant') {
+                    // Accept both a flat `content` array and the nested
+                    // `message.content` shape used by SDK assistant messages.
+                    const blocks = Array.isArray(m.content) ? m.content : m.message?.content;
+                    if (Array.isArray(blocks)) {
+                        for (const b of blocks) {
+                            if (b && b.type === 'text' && typeof b.text === 'string')
+                                assistantText += b.text;
+                        }
+                    }
+                }
+                else if (m.type === 'result' && typeof m.result === 'string') {
+                    resultText += m.result;
+                }
+            }
+        }
+        catch {
+            return null;
+        }
+        // The result message repeats the assistant's final text, so prefer it
+        // and only fall back to the streamed text; concatenating both would
+        // duplicate the verdict line.
+        return resultText || assistantText;
+    })();
+    let timer;
+    const timeoutPromise = new Promise((resolve) => {
+        timer = setTimeout(() => {
+            abortController.abort();
+            resolve(null);
+        }, timeoutMs);
+    });
+    let collected;
+    try {
+        collected = await Promise.race([queryPromise, timeoutPromise]);
+    }
+    finally {
+        // Without this the pending timer keeps the process alive for up to
+        // timeoutMs after the verdict has already arrived.
+        clearTimeout(timer);
+    }
+    if (!collected || typeof collected !== 'string')
+        return null;
+    // Parse the strict verdict line. Accept variants: "PASS — reason", "FAIL: reason",
+    // "Verdict: PASS — reason", and a bare "PASS" / "FAIL" with no reason.
+    // An upper-case token (what the prompt asks for) is tried first so a
+    // lower-case "pass" in surrounding prose cannot decide the verdict; the
+    // case-insensitive form is only a fallback.
+    const match = collected.match(/\b(PASS|FAIL)\b\s*[—\-:]?\s*(.*)/) ??
+        collected.match(/\b(PASS|FAIL)\b\s*[—\-:]?\s*(.*)/i);
+    if (!match)
+        return null;
+    const verdict = match[1].toUpperCase() === 'PASS';
+    // Keep only the first line of the reason and cap its length for display.
+    const reason = (match[2] || '').replace(/[\r\n].*$/s, '').trim().slice(0, 280);
+    return { pass: verdict, reason: reason || (verdict ? 'Pass' : 'Fail') };
+}
+/**
+ * Orchestrator: runs whichever evaluators are configured on the Task and
+ * merges their verdicts into a single goalCheck record.
+ *
+ * @param responseText The agent's response text for the finished run.
+ * @param job The Task definition; a non-empty `successSchema` enables the
+ *   schema check and a non-blank `successCriteriaText` enables the evaluator.
+ * @returns undefined when no goal is configured (the field then stays absent
+ *   on the run entry); otherwise `{ status, mode, ... }` where
+ *   - status is 'error' if any configured evaluator threw or produced no
+ *     verdict, else 'fail' if any verdict failed, else 'pass';
+ *   - mode is 'schema', 'evaluator' or 'both';
+ *   - schemaPass / schemaErrors and evaluatorPass / evaluatorReason are set
+ *     for the evaluators that produced a result.
+ * Never throws for evaluator failures — they are folded into status='error'.
+ */
+export async function runGoalCheck(responseText, job) {
+    const hasSchema = !!(job.successSchema && Object.keys(job.successSchema).length > 0);
+    const hasCriterion = !!(job.successCriteriaText && job.successCriteriaText.trim());
+    if (!hasSchema && !hasCriterion)
+        return undefined;
+    let schemaResult = null;
+    let evaluatorResult = null;
+    // Every failure is kept, so a later one cannot overwrite an earlier one.
+    const errorMessages = [];
+    if (hasSchema) {
+        try {
+            schemaResult = await validateAgainstSchema(responseText, job.successSchema);
+        }
+        catch (err) {
+            errorMessages.push(`schema validator threw: ${String(err).slice(0, 200)}`);
+        }
+    }
+    if (hasCriterion) {
+        try {
+            evaluatorResult = await evaluateAgainstCriterion(responseText, job.successCriteriaText);
+            if (evaluatorResult === null) {
+                // Treat unparseable evaluator output as 'error' rather than 'fail' — we
+                // don't want a flaky evaluator to mark a healthy run as failed.
+                errorMessages.push('evaluator did not return a parseable PASS/FAIL verdict');
+            }
+        }
+        catch (err) {
+            errorMessages.push(`evaluator threw: ${String(err).slice(0, 200)}`);
+        }
+    }
+    // Decide overall status. Any configured evaluator that errored = error
+    // (an error always leaves that evaluator without a result). Otherwise
+    // any failed verdict = fail, and everything else = pass. A schema check
+    // that found no JSON to validate (tried=false) counts as a failure.
+    const mode = hasSchema && hasCriterion ? 'both' : hasSchema ? 'schema' : 'evaluator';
+    const schemaFailed = schemaResult ? !schemaResult.pass || !schemaResult.tried : false;
+    const evaluatorFailed = evaluatorResult ? !evaluatorResult.pass : false;
+    let status;
+    if (errorMessages.length > 0)
+        status = 'error';
+    else if (schemaFailed || evaluatorFailed)
+        status = 'fail';
+    else
+        status = 'pass';
+    const out = { status, mode };
+    if (schemaResult) {
+        out.schemaPass = schemaResult.pass && schemaResult.tried;
+        if (!schemaResult.pass || !schemaResult.tried) {
+            out.schemaErrors = schemaResult.errors.slice(0, 5);
+        }
+    }
+    if (evaluatorResult) {
+        out.evaluatorPass = evaluatorResult.pass;
+        out.evaluatorReason = evaluatorResult.reason;
+    }
+    if (errorMessages.length > 0) {
+        // The dashboard surfaces evaluatorReason in the tooltip, so the error
+        // text goes there. When the evaluator did produce a reason, the error
+        // is appended rather than dropped — otherwise a status='error' pill
+        // would show only the evaluator's (possibly passing) explanation.
+        const errorText = errorMessages.join('; ');
+        out.evaluatorReason = out.evaluatorReason
+            ? `${out.evaluatorReason} (${errorText})`
+            : errorText;
+    }
+    return out;
+}
+//# sourceMappingURL=goal-evaluator.js.map

package/dist/cli/cron.js CHANGED Viewed

@@ -140,7 +140,7 @@ export async function cmdCronRun(jobName) {
     try {
         const response = await gateway.handleCronJob(job.name, job.prompt, job.tier, job.maxTurns, job.model, job.workDir, job.mode, job.maxHours);
         const finishedAt = new Date();
-        runLog.append({
+        const entry = {
             jobName: job.name,
             startedAt: startedAt.toISOString(),
             finishedAt: finishedAt.toISOString(),
@@ -148,7 +148,21 @@ export async function cmdCronRun(jobName) {
             durationMs: finishedAt.getTime() - startedAt.getTime(),
             attempt: 1,
             outputPreview: response ? response.slice(0, 200) : undefined,
-        });
+        };
+        // PRD Phase 1.1: goal-orientation evaluator (mirrors the daemon path).
+        if (job.successSchema || (job.successCriteriaText && job.successCriteriaText.trim())) {
+            try {
+                const { runGoalCheck } = await import('../agent/goal-evaluator.js');
+                const goalCheck = await runGoalCheck(response ?? '', job);
+                if (goalCheck)
+                    entry.goalCheck = goalCheck;
+            }
+            catch (err) {
+                // Never block logging on evaluator failure.
+                entry.goalCheck = { status: 'error', mode: 'evaluator', evaluatorReason: `evaluator orchestrator threw: ${String(err).slice(0, 200)}` };
+            }
+        }
+        runLog.append(entry);
         console.log(response || '(no output)');
         if (response && response !== '__NOTHING__') {
             console.log('\n(Note: Standalone runner — output not delivered to channels. Use the daemon for channel delivery.)');

package/dist/cli/dashboard.js CHANGED Viewed

@@ -6368,7 +6368,9 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
     // ── CRON CRUD routes (continued) ──────────────────────────────
     app.post('/api/cron', (req, res) => {
         try {
-            const { name, schedule, prompt, tier, enabled, work_dir, mode, max_hours, max_retries, after, agent, context, skills, allowedTools, allowedMcpServers, tags, category, predictable, } = req.body;
+            const { name, schedule, prompt, tier, enabled, work_dir, mode, max_hours, max_retries, after, agent, context, skills, allowedTools, allowedMcpServers, tags, category, predictable,
+            // PRD Phase 1 fields (camelCase from API; written as snake_case YAML).
+            successCriteriaText, successSchema, addDirs, } = req.body;
             if (!name || !schedule || !prompt) {
                 res.status(400).json({ error: 'name, schedule, and prompt are required' });
                 return;
@@ -6427,6 +6429,16 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
             // Predictable mode — default to true (contract execution) for new
             // tricks created via the dashboard. Mirror the MCP tool default.
             job.predictable = (predictable === false) ? false : true;
+            // PRD Phase 1: goal-orientation fields (camelCase from API → snake_case YAML).
+            if (typeof successCriteriaText === 'string' && successCriteriaText.trim()) {
+                job.success_criteria_text = successCriteriaText.trim();
+            }
+            if (successSchema && typeof successSchema === 'object' && !Array.isArray(successSchema) && Object.keys(successSchema).length > 0) {
+                job.success_schema = successSchema;
+            }
+            if (Array.isArray(addDirs) && addDirs.length) {
+                job.add_dirs = addDirs.map(String).map((s) => s.trim()).filter(Boolean);
+            }
             jobs.push(job);
             writeCronFileAt(cronFile, parsed, jobs);
             res.json({ ok: true, message: `Created cron job: ${name}` });
@@ -6568,6 +6580,36 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
             if (updates.predictable !== undefined) {
                 jobs[idx].predictable = Boolean(updates.predictable);
             }
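+            // PRD Phase 1 goal fields. set-when-non-empty / delete-when-cleared,
+            // matching the existing trick-capability pattern above. The deprecated
+            // success_criteria array is dropped the first time a non-empty
+            // successCriteriaText is saved through this path; clearing the field
+            // removes success_criteria_text only and leaves success_criteria as-is.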
+            if (updates.successCriteriaText !== undefined) {
+                const v = typeof updates.successCriteriaText === 'string' ? updates.successCriteriaText.trim() : '';
+                if (v) {
+                    jobs[idx].success_criteria_text = v;
+                    delete jobs[idx].success_criteria; // sunset the deprecated alias on first save
+                }
+                else {
+                    delete jobs[idx].success_criteria_text;
+                }
+            }
+            if (updates.successSchema !== undefined) {
+                const s = updates.successSchema;
+                if (s && typeof s === 'object' && !Array.isArray(s) && Object.keys(s).length > 0) {
+                    jobs[idx].success_schema = s;
+                }
+                else {
+                    delete jobs[idx].success_schema;
+                }
+            }
+            if (updates.addDirs !== undefined) {
+                if (Array.isArray(updates.addDirs) && updates.addDirs.length) {
+                    jobs[idx].add_dirs = updates.addDirs.map(String).map((s) => s.trim()).filter(Boolean);
+                }
+                else {
+                    delete jobs[idx].add_dirs;
+                }
+            }
             if (updates.name !== undefined && updates.name !== bareJobName) {
                 // Rename — check for duplicates
                 const dup = jobs.find((j, i) => i !== idx && String(j.name ?? '').toLowerCase() === String(updates.name).toLowerCase());
@@ -19998,6 +20040,32 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
           </div>
         </div>
+        <!-- ── PRD Phase 1: Goal (success_criteria_text + success_schema) ──
+             The single most important new field set. Without one of these, a
+             run "finished"; with one, a run "accomplished what it was meant
+             to". Banner warns (does not block) when neither is set. -->
+        <div class="cron-section-card">
+          <h4>Goal <span style="color:var(--text-muted);font-weight:normal;font-size:12px">— how do you know this task succeeded?</span></h4>
+          <p class="cron-section-desc">Optional but strongly recommended. Use plain English (an evaluator agent grades the run) or a JSON Schema (validated against the agent's structured output).</p>
+          <div id="cron-goal-warning" style="display:none;margin-bottom:12px;padding:10px 12px;border-radius:6px;background:rgba(245,158,11,0.10);border:1px solid rgba(245,158,11,0.30);color:var(--yellow);font-size:12px">
+            ⚠ No goal set — runs will be marked "finished" but not "accomplished". Add a success criterion below or a JSON Schema.
+          </div>
+          <div class="form-group">
+            <label class="form-label">Success criterion <span style="color:var(--text-muted);font-weight:normal">(plain English)</span></label>
+            <textarea id="cron-success-criteria-text" rows="3" placeholder="e.g. 'A daily briefing email was sent to nathan@example.com containing the top 3 overnight items.'" oninput="updateGoalWarning()"></textarea>
+            <div class="form-hint">An evaluator sub-agent reads the run's output and this criterion, then emits pass/fail with reasoning.</div>
+          </div>
+          <div class="form-group" style="margin-bottom:0">
+            <details>
+              <summary style="cursor:pointer;font-size:12px;color:var(--text-secondary);font-weight:500;padding:6px 0">▾ Success schema (JSON Schema, advanced)</summary>
+              <div style="margin-top:8px">
+                <textarea id="cron-success-schema" rows="6" placeholder='{ "type": "object", "required": ["sent"], "properties": { "sent": { "type": "boolean" } } }' style="font-family:'JetBrains Mono',monospace;font-size:11px" oninput="updateGoalWarning()"></textarea>
+                <div class="form-hint">JSON Schema validated against the agent's <code>structured_output</code>. Mechanically successful = parses + validates.</div>
+              </div>
+            </details>
+          </div>
+        </div>
         <!-- Skills & tools: pinned skills + MCP + tools + tags -->
         <div class="cron-section-card">
           <h4>Skills &amp; tools</h4>
@@ -20074,6 +20142,14 @@ if('serviceWorker' in navigator){navigator.serviceWorker.getRegistrations().then
                 <div class="form-hint">Run inside a project directory. Agent gets that project's CLAUDE.md.</div>
               </div>
             </div>
+            <!-- PRD Phase 1: read scope beyond cwd. One absolute path per line. -->
+            <div class="form-row">
+              <div class="form-group" style="flex:1">
+                <label class="form-label">Additional read directories <span style="color:var(--text-muted);font-weight:normal">(optional)</span></label>
+                <textarea id="cron-add-dirs" rows="2" placeholder="/Users/me/notes&#10;/Users/me/clients/acme" style="font-family:'JetBrains Mono',monospace;font-size:11px"></textarea>
+                <div class="form-hint">One absolute path per line. The agent gets read access to these in addition to the Project Context cwd.</div>
+              </div>
+            </div>
             <div class="form-row">
               <div class="form-group">
                 <label class="form-label">Mode</label>
@@ -23198,6 +23274,22 @@ function renderScheduledTaskCard(task) {
     var ok = lr.status === 'ok';
     var statusIcon = ok ? '<span style="color:var(--green)">&#10003;</span>' : '<span style="color:var(--red)">&#10007;</span>';
     lastRunHtml = statusIcon + ' ' + esc(lr.status || 'unknown') + ' · ' + esc(timeAgo(lr.finishedAt || lr.startedAt || ''));
+    // PRD Phase 1.1: goal pill. Orthogonal to status — a run can be status='ok'
+    // but goalCheck.status='fail' (the agent finished cleanly without
+    // accomplishing the stated goal). That's exactly the failure mode the
+    // PRD's goal-orientation feature is designed to surface.
+    if (lr.goalCheck) {
+      var gc = lr.goalCheck;
+      var gIcon = gc.status === 'pass' ? '🎯' : gc.status === 'fail' ? '✗' : gc.status === 'error' ? '⚠' : '';
+      var gColor = gc.status === 'pass' ? 'var(--green)' : gc.status === 'fail' ? 'var(--red)' : 'var(--yellow)';
+      var gLabel = gc.status === 'pass' ? 'goal met' : gc.status === 'fail' ? 'goal not met' : gc.status === 'error' ? 'goal eval failed' : '';
+      var gTip = '';
+      if (gc.evaluatorReason) gTip = gc.evaluatorReason;
+      else if (Array.isArray(gc.schemaErrors) && gc.schemaErrors.length) gTip = 'Schema errors: ' + gc.schemaErrors.join('; ');
+      if (gIcon && gLabel) {
+        lastRunHtml += ' <span style="color:' + gColor + ';font-size:11px;font-weight:500" title="' + esc(gTip || gLabel) + '">· ' + gIcon + ' ' + esc(gLabel) + '</span>';
+      }
+    }
     // "ran with: …" — surface the skills + MCP that were live for this run.
     var ranWith = [];
     if (Array.isArray(lr.skillsApplied) && lr.skillsApplied.length > 0) {
@@ -23304,8 +23396,23 @@ function renderRecentHistoryList(runs) {
       var preview = String(entry.outputPreview).slice(0, 140);
       errorPreview = '<div style="font-size:11px;color:var(--text-muted);margin-top:2px;word-break:break-word">' + esc(preview) + '</div>';
     }
-    rowsHtml += '<div class="history-row" data-trace-job="' + esc(jobName) + '" style="display:grid;grid-template-columns:24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;align-items:start;padding:8px 14px;border-bottom:1px solid var(--border);cursor:pointer">'
+    // PRD Phase 1.1: goal cell. Empty cell when no goal is configured:
+    // runGoalCheck returns undefined in that case, so goalCheck is omitted from
+    // the run entry entirely (missing field == no goal). The cell stays present
+    // in the grid for column alignment.
+    var goalCellHtml = '<div></div>';
+    if (entry.goalCheck) {
+      var gc2 = entry.goalCheck;
+      var gIcon2 = gc2.status === 'pass' ? '🎯' : gc2.status === 'fail' ? '✗' : gc2.status === 'error' ? '⚠' : '';
+      var gColor2 = gc2.status === 'pass' ? 'var(--green)' : gc2.status === 'fail' ? 'var(--red)' : 'var(--yellow)';
+      var gTip2 = gc2.evaluatorReason
+        ? gc2.evaluatorReason
+        : (Array.isArray(gc2.schemaErrors) && gc2.schemaErrors.length ? 'Schema errors: ' + gc2.schemaErrors.join('; ') : gc2.status);
+      goalCellHtml = '<div style="color:' + gColor2 + ';font-size:13px;line-height:18px;text-align:center" title="' + esc(gTip2) + '">' + gIcon2 + '</div>';
+    }
+    rowsHtml += '<div class="history-row" data-trace-job="' + esc(jobName) + '" style="display:grid;grid-template-columns:24px 24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;align-items:start;padding:8px 14px;border-bottom:1px solid var(--border);cursor:pointer">'
       + '<div style="color:' + statusColor + ';font-size:14px;line-height:18px;text-align:center" title="' + esc(status) + '">' + statusIcon + '</div>'
+      + goalCellHtml
       + '<div style="min-width:0">'
         + '<div style="font-weight:500;color:var(--text-primary);font-size:13px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap" title="' + esc(jobName) + '">' + esc(jobName) + attemptLabel + '</div>'
         + errorPreview
@@ -23316,8 +23423,10 @@ function renderRecentHistoryList(runs) {
       + '</div>';
   }
   return '<div class="history-list" style="background:var(--bg-secondary);border:1px solid var(--border);border-radius:var(--radius)">'
-    + '<div style="display:grid;grid-template-columns:24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;padding:8px 14px;border-bottom:1px solid var(--border);font-size:11px;color:var(--text-muted);text-transform:uppercase;letter-spacing:0.04em;font-weight:500">'
-      + '<div></div><div>Task</div><div>Started</div><div>Duration</div><div></div>'
+    + '<div style="display:grid;grid-template-columns:24px 24px minmax(180px,1.2fr) minmax(180px,1fr) 90px auto;gap:10px;padding:8px 14px;border-bottom:1px solid var(--border);font-size:11px;color:var(--text-muted);text-transform:uppercase;letter-spacing:0.04em;font-weight:500">'
+      + '<div title="Run status (ok / error / etc.)"></div>'
+      + '<div title="Goal check result — orthogonal to run status">Goal</div>'
+      + '<div>Task</div><div>Started</div><div>Duration</div><div></div>'
     + '</div>'
     + rowsHtml
     + '</div>';
@@ -24719,6 +24828,18 @@ function renderCronLegacyBanner(job) {
     + '</div>';
 }
+// PRD Phase 1: toggles the non-blocking "No goal set" banner in the Goal
+// section. The banner is visible only while BOTH the plain-English success
+// criterion and the JSON Schema textarea are blank (after trimming); filling
+// in either one hides it. Does nothing if the banner element is missing.
+function updateGoalWarning() {
+  var readTrimmed = function (id) {
+    var el = document.getElementById(id);
+    var raw = (el === null || el === undefined) ? undefined : el.value;
+    return (raw || '').trim();
+  };
+  var criterionText = readTrimmed('cron-success-criteria-text');
+  var schemaText = readTrimmed('cron-success-schema');
+  var banner = document.getElementById('cron-goal-warning');
+  if (banner) {
+    banner.style.display = (criterionText || schemaText) ? 'none' : '';
+  }
+}
 // One-click migration: flip predictable=true AND save immediately so the
 // user doesn't have to remember to also click Save Changes.
 async function enablePredictableFromBanner() {
@@ -24760,6 +24881,11 @@ function openCreateCronModal(agentSlug) {
   toggleUnleashedOptions();
   document.getElementById('cron-prompt').value = '';
   document.getElementById('cron-context').value = '';
+  // PRD Phase 1 goal fields — empty by default. Warning banner will show.
+  var sct = document.getElementById('cron-success-criteria-text'); if (sct) sct.value = '';
+  var ssc = document.getElementById('cron-success-schema'); if (ssc) ssc.value = '';
+  var addDirsEl = document.getElementById('cron-add-dirs'); if (addDirsEl) addDirsEl.value = '';
+  if (typeof updateGoalWarning === 'function') updateGoalWarning();
   document.getElementById('cron-training-section').style.display = 'none';
   document.getElementById('cron-train-btn').style.display = '';
   resetCronTrainingChat();
@@ -24806,6 +24932,30 @@ function openEditCronModal(jobName) {
   toggleUnleashedOptions();
   document.getElementById('cron-prompt').value = job.prompt || '';
   document.getElementById('cron-context').value = job.context || '';
+  // PRD Phase 1: load goal fields. Accept either casing — old YAML may have
+  // success_criteria as a list (legacy); the parser already coalesces those
+  // into successCriteriaText on read, but defend here too in case the API
+  // shape differs from what the parser produces.
+  var sctE = document.getElementById('cron-success-criteria-text');
+  if (sctE) {
+    var sctVal = job.successCriteriaText || job.success_criteria_text || '';
+    if (!sctVal && Array.isArray(job.successCriteria || job.success_criteria)) {
+      sctVal = (job.successCriteria || job.success_criteria || []).join('\\n');
+    }
+    sctE.value = sctVal;
+  }
+  var sscE = document.getElementById('cron-success-schema');
+  if (sscE) {
+    var sscObj = job.successSchema || job.success_schema;
+    sscE.value = (sscObj && typeof sscObj === 'object') ? JSON.stringify(sscObj, null, 2) : '';
+  }
+  var addDirsE = document.getElementById('cron-add-dirs');
+  if (addDirsE) {
+    var addDirsArr = Array.isArray(job.addDirs) ? job.addDirs
+      : (Array.isArray(job.add_dirs) ? job.add_dirs : []);
+    addDirsE.value = addDirsArr.join('\\n');
+  }
+  if (typeof updateGoalWarning === 'function') updateGoalWarning();
   document.getElementById('cron-training-section').style.display = 'none';
   document.getElementById('cron-train-btn').style.display = '';
   resetCronTrainingChat();
@@ -25001,6 +25151,11 @@ function captureCronModalSnapshot() {
     v('cron-workdir'),
     v('cron-allowed-tools'),
     v('cron-category'),
+    // PRD Phase 1 goal fields — included in dirty check so leaving the
+    // modal with an unsaved success_schema or success_criteria_text prompts.
+    v('cron-success-criteria-text'),
+    v('cron-success-schema'),
+    v('cron-add-dirs'),
     (document.getElementById('cron-predictable') || {}).checked ? '1' : '0',
     JSON.stringify(_cronSelectedSkills || []),
     JSON.stringify(_cronSelectedMcp || []),
@@ -25025,6 +25180,9 @@ function isCronModalDirty() {
     v('cron-workdir'),
     v('cron-allowed-tools'),
     v('cron-category'),
+    v('cron-success-criteria-text'),
+    v('cron-success-schema'),
+    v('cron-add-dirs'),
     (document.getElementById('cron-predictable') || {}).checked ? '1' : '0',
     JSON.stringify(_cronSelectedSkills || []),
     JSON.stringify(_cronSelectedMcp || []),
@@ -25199,6 +25357,34 @@ async function saveCronJob() {
   }
   if (!prompt) { toast('Prompt is required — tell the agent what to do', 'error'); document.getElementById('cron-prompt').focus(); return; }
+  // PRD Phase 1 goal fields. successCriteriaText is freeform; successSchema
+  // is parsed JSON. Validate JSON early so the user gets a clean error before
+  // the round-trip. Empty schema is fine — we send null (edit) or undefined (create).
+  var successCriteriaText = (document.getElementById('cron-success-criteria-text')?.value || '').trim();
+  var successSchemaRaw = (document.getElementById('cron-success-schema')?.value || '').trim();
+  var successSchema;
+  if (successSchemaRaw) {
+    try {
+      successSchema = JSON.parse(successSchemaRaw);
+      if (!successSchema || typeof successSchema !== 'object' || Array.isArray(successSchema)) {
+        toast('Success schema must be a JSON object', 'error');
+        document.getElementById('cron-success-schema').focus();
+        return;
+      }
+    } catch (e) {
+      toast('Success schema is not valid JSON: ' + (e.message || String(e)), 'error');
+      document.getElementById('cron-success-schema').focus();
+      return;
+    }
+  }
+  // add_dirs: one absolute path per line. Trim and drop blanks (duplicates are kept).
+  var addDirsRaw = (document.getElementById('cron-add-dirs')?.value || '').split(/\\r?\\n/);
+  var addDirs = addDirsRaw.map(function(s){ return s.trim(); }).filter(Boolean);
+  // Quick sanity — warn but don't block on relative paths.
+  if (addDirs.some(function(p){ return !p.startsWith('/') && !p.startsWith('~'); })) {
+    toast('Heads up: add_dirs entries should be absolute paths.', 'info');
+  }
   const body = {
     name, schedule, tier, prompt, enabled: true,
     work_dir: work_dir || undefined, mode, max_hours, max_retries, after, context,
@@ -25216,6 +25402,10 @@ async function saveCronJob() {
     tags: editingCronJob ? _cronTags : (_cronTags.length ? _cronTags : undefined),
     category: editingCronJob ? (category || '') : category,
     predictable,
+    // PRD Phase 1 goal-orientation. PUT delete-on-empty pattern below.
+    successCriteriaText: editingCronJob ? successCriteriaText : (successCriteriaText || undefined),
+    successSchema: editingCronJob ? (successSchema || null) : (successSchema || undefined),
+    addDirs: editingCronJob ? addDirs : (addDirs.length ? addDirs : undefined),
   };
   var wasEditing = !!editingCronJob;

package/dist/gateway/cron-scheduler.js CHANGED Viewed

@@ -113,6 +113,24 @@ export function parseCronJobs() {
         const successCriteria = Array.isArray(job.success_criteria)
             ? job.success_criteria.map(c => String(c))
             : undefined;
+        // PRD Phase 1: prefer success_criteria_text (free-form). On read, fall
+        // back to joining the legacy success_criteria string[] so legacy YAML
+        // keeps rendering in the new editor surface. Writes go to the new field.
+        let successCriteriaText = typeof job.success_criteria_text === 'string'
+            ? String(job.success_criteria_text)
+            : (typeof job.successCriteriaText === 'string' ? String(job.successCriteriaText) : undefined);
+        if (!successCriteriaText && Array.isArray(successCriteria) && successCriteria.length > 0) {
+            successCriteriaText = successCriteria.join('\n');
+        }
+        // PRD Phase 1: JSON Schema validated against ResultMessage.structured_output.
+        // Accept either snake_case (success_schema) or camelCase from API. Stored
+        // as a plain object; ajv is loaded lazily at validation time.
+        const successSchemaRaw = job.success_schema ?? job.successSchema;
+        const successSchema = (successSchemaRaw && typeof successSchemaRaw === 'object' && !Array.isArray(successSchemaRaw))
+            ? successSchemaRaw
+            : undefined;
+        // PRD Phase 1: read scope beyond cwd. Accept either casing.
+        const addDirs = normalizeStringArray(job.add_dirs ?? job.addDirs);
         const alwaysDeliver = job.always_deliver === true ? true : undefined;
         const context = job.context != null ? String(job.context) : undefined;
         const preCheck = job.pre_check != null ? String(job.pre_check) : undefined;
@@ -140,7 +158,8 @@ export function parseCronJobs() {
         }
         jobs.push({
             name, schedule, prompt, enabled, tier, maxTurns, model, workDir, mode,
-            maxHours, maxRetries, after, successCriteria, alwaysDeliver, context, preCheck, agentSlug,
+            maxHours, maxRetries, after, successCriteria, successCriteriaText, successSchema, addDirs,
+            alwaysDeliver, context, preCheck, agentSlug,
             skills, allowedTools, allowedMcpServers, tags, category, predictable,
         });
     }
@@ -187,6 +206,18 @@ export function parseAgentCronJobs(agentsDir) {
                 const successCriteria = Array.isArray(job.success_criteria)
                     ? job.success_criteria.map(c => String(c))
                     : undefined;
+                // PRD Phase 1 fields — symmetric with global parser above.
+                let successCriteriaText = typeof job.success_criteria_text === 'string'
+                    ? String(job.success_criteria_text)
+                    : (typeof job.successCriteriaText === 'string' ? String(job.successCriteriaText) : undefined);
+                if (!successCriteriaText && Array.isArray(successCriteria) && successCriteria.length > 0) {
+                    successCriteriaText = successCriteria.join('\n');
+                }
+                const successSchemaRaw = job.success_schema ?? job.successSchema;
+                const successSchema = (successSchemaRaw && typeof successSchemaRaw === 'object' && !Array.isArray(successSchemaRaw))
+                    ? successSchemaRaw
+                    : undefined;
+                const addDirs = normalizeStringArray(job.add_dirs ?? job.addDirs);
                 const context = job.context != null ? String(job.context) : undefined;
                 const preCheck = job.pre_check != null ? String(job.pre_check) : undefined;
                 // ── Trick capabilities — symmetric with global parser ─────────
@@ -210,7 +241,9 @@ export function parseAgentCronJobs(agentsDir) {
                 allJobs.push({
                     name: `${slug}:${name}`,
                     schedule, prompt, enabled, tier, maxTurns, model, workDir,
-                    mode, maxHours, maxRetries, after, successCriteria, context, preCheck,
+                    mode, maxHours, maxRetries, after,
+                    successCriteria, successCriteriaText, successSchema, addDirs,
+                    context, preCheck,
                     agentSlug: slug,
                     skills, allowedTools, allowedMcpServers, tags, category, predictable,
                 });
@@ -1203,6 +1236,23 @@ export class CronScheduler {
                             this.gateway.injectContext(`discord:user:${DISCORD_OWNER_ID}`, `[Scheduled cron: ${job.name}]`, response);
                         }
                     }
+                    // PRD Phase 1.1: goal-orientation. If the Task has successSchema or
+                    // successCriteriaText, run the evaluator now (before logging) so the
+                    // entry carries the goalCheck verdict. Errors here NEVER block
+                    // logging — runGoalCheck catches its own throws and emits
+                    // status='error' on the goalCheck instead.
+                    if (job.successSchema || (job.successCriteriaText && job.successCriteriaText.trim())) {
+                        try {
+                            const { runGoalCheck } = await import('../agent/goal-evaluator.js');
+                            const goalCheck = await runGoalCheck(response ?? '', job);
+                            if (goalCheck)
+                                entry.goalCheck = goalCheck;
+                        }
+                        catch (err) {
+                            logger.warn({ err, job: job.name }, 'Goal evaluator failed — proceeding without goalCheck');
+                            entry.goalCheck = { status: 'error', mode: 'evaluator', evaluatorReason: `evaluator orchestrator threw: ${String(err).slice(0, 200)}` };
+                        }
+                    }
                     this._logRun(entry);
                     this.logAutonomy('completed', job, { durationMs: entry.durationMs, deliveryFailed: entry.deliveryFailed, advisorApplied: !!advisorApplied });
                     // Fire-and-forget: extract procedural skill from successful long-running cron jobs

package/dist/types.d.ts CHANGED Viewed

@@ -328,7 +328,22 @@ export interface CronJobDefinition {
     maxRetries?: number;
     after?: string;
     agentSlug?: string;
+    /** @deprecated Use successCriteriaText (free-text) or successSchema (JSON Schema)
+     *  per PRD Phase 1. successCriteria is kept readable for one release; on read,
+     *  parseCronJobs coalesces it into successCriteriaText. */
     successCriteria?: string[];
+    /** PRD Phase 1: free-text "this task is done when…". An evaluator sub-agent reads
+     *  the run's final state and the criterion and emits a pass/fail with reasoning.
+     *  Stored as RunEvaluation on the Run. Optional but recommended. */
+    successCriteriaText?: string;
+    /** PRD Phase 1: JSON Schema validated against ResultMessage.structured_output.
+     *  If it parses, the run is mechanically successful. The Task editor shows a
+     *  non-blocking "Goal not set" warning when neither this nor successCriteriaText
+     *  is present. */
+    successSchema?: Record<string, unknown>;
+    /** PRD Phase 1: read scope beyond the cwd (workDir). Surfaced as a chip list
+     *  in the editor's Scope tab. The runner passes these to the SDK as add_dirs. */
+    addDirs?: string[];
     alwaysDeliver?: boolean;
     context?: string;
     preCheck?: string;
@@ -432,6 +447,24 @@ export interface CronRunEntry {
     allowedToolsApplied?: string[];
     /** MCP servers live for this run (post profile + trick allowlist intersection). */
     mcpServersApplied?: string[];
+    /** PRD Phase 1: did the run accomplish what it was supposed to?
+     *  Computed at run-end when the Task has successSchema or successCriteriaText.
+     *  - status='pass'      both configured checks passed (or the only one configured did)
+     *  - status='fail'      a configured check failed
+     *  - status='skipped'   no goal configured on the Task (don't show the pill)
+     *  - status='error'     evaluator/validator threw; does NOT mark the run failed
+     *  This is orthogonal to CronRunEntry.status — a run can be status='ok' with
+     *  goalCheck.status='fail' (the agent finished cleanly but didn't accomplish
+     *  the stated goal), and that's the failure mode the PRD is designed to surface. */
+    goalCheck?: {
+        status: 'pass' | 'fail' | 'skipped' | 'error';
+        /** Which evaluators ran. 'both' means the schema check and the evaluator both ran. */
+        mode: 'schema' | 'evaluator' | 'both';
+        schemaPass?: boolean;
+        schemaErrors?: string[];
+        evaluatorPass?: boolean;
+        evaluatorReason?: string;
+    };
 }
 export interface Models {
     haiku: string;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clementine-agent",
-  "version": "1.18.76",
+  "version": "1.18.78",
   "description": "Clementine — Personal AI Assistant (TypeScript)",
   "type": "module",
   "main": "dist/index.js",