clementine-agent 1.0.29 → 1.0.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/channels/slack.js
CHANGED
|
@@ -205,32 +205,42 @@ export async function startSlack(gateway, dispatcher, slackBotManager) {
|
|
|
205
205
|
* Returns true on success.
|
|
206
206
|
*
|
|
207
207
|
* Session key formats:
|
|
208
|
-
* slack:user:{userId}
|
|
208
|
+
* slack:team:{teamId}:user:{userId} → DM to user (workspace-namespaced, current format)
|
|
209
|
+
* slack:team:{teamId}:dm:{userId} → DM to user (workspace-namespaced)
|
|
210
|
+
* slack:user:{userId} → DM to user (legacy, pre-namespacing)
|
|
211
|
+
* slack:dm:{userId} → DM to user (legacy)
|
|
209
212
|
* slack:channel:{channelId}:{userId} → post in channel
|
|
210
213
|
* slack:channel:{channelId}:{slug}:{userId} → post in channel (agent-scoped chat)
|
|
211
|
-
* slack:dm:{userId} → DM to user
|
|
212
214
|
* slack:agent:{slug}:{userId} → DM to user (agent-scoped)
|
|
213
215
|
*/
|
|
214
216
|
async function trySlackSessionRouting(sessionKey, text) {
|
|
215
217
|
const parts = sessionKey.split(':');
|
|
216
218
|
if (parts[0] !== 'slack' || parts.length < 3)
|
|
217
219
|
return false;
|
|
218
|
-
|
|
220
|
+
// Strip the `team:{teamId}:` workspace prefix if present so downstream
|
|
221
|
+
// routing logic stays format-agnostic. The current bolt app is connected
|
|
222
|
+
// to a single workspace, so we use the existing client regardless of which
|
|
223
|
+
// teamId the session names.
|
|
224
|
+
let effectiveParts = parts;
|
|
225
|
+
if (parts[1] === 'team' && parts.length >= 4) {
|
|
226
|
+
effectiveParts = ['slack', ...parts.slice(3)];
|
|
227
|
+
}
|
|
228
|
+
const kind = effectiveParts[1];
|
|
219
229
|
try {
|
|
220
|
-
if ((kind === 'user' || kind === 'dm') &&
|
|
221
|
-
const dm = await app.client.conversations.open({ users:
|
|
230
|
+
if ((kind === 'user' || kind === 'dm') && effectiveParts[2]) {
|
|
231
|
+
const dm = await app.client.conversations.open({ users: effectiveParts[2] });
|
|
222
232
|
const channelId = dm.channel?.id;
|
|
223
233
|
if (!channelId)
|
|
224
234
|
return false;
|
|
225
235
|
await sendChunkedSlack(app.client, channelId, mdToSlack(text));
|
|
226
236
|
return true;
|
|
227
237
|
}
|
|
228
|
-
if (kind === 'channel' &&
|
|
229
|
-
await sendChunkedSlack(app.client,
|
|
238
|
+
if (kind === 'channel' && effectiveParts[2]) {
|
|
239
|
+
await sendChunkedSlack(app.client, effectiveParts[2], mdToSlack(text));
|
|
230
240
|
return true;
|
|
231
241
|
}
|
|
232
|
-
if (kind === 'agent' &&
|
|
233
|
-
const dm = await app.client.conversations.open({ users:
|
|
242
|
+
if (kind === 'agent' && effectiveParts[3]) {
|
|
243
|
+
const dm = await app.client.conversations.open({ users: effectiveParts[3] });
|
|
234
244
|
const channelId = dm.channel?.id;
|
|
235
245
|
if (!channelId)
|
|
236
246
|
return false;
|
|
@@ -450,9 +450,16 @@ export class CronScheduler {
|
|
|
450
450
|
this.watchAgentsDir();
|
|
451
451
|
this.watchWorkflowDir();
|
|
452
452
|
this.watchTriggers();
|
|
453
|
+
// Deep-mode jobs are owned by the router (_deliverDeepResult). The
|
|
454
|
+
// cron-scheduler callbacks below only dispatch for cron-originated runs;
|
|
455
|
+
// phase updates for deep-mode runs get routed back to the originating
|
|
456
|
+
// session instead of fanning out to every registered channel.
|
|
457
|
+
const isDeepMode = (jobName) => jobName.startsWith('deep-');
|
|
453
458
|
// Wire up push notifications for unleashed task completions
|
|
454
459
|
this.gateway.setUnleashedCompleteCallback((jobName, result) => {
|
|
455
460
|
this.completedJobs.set(jobName, Date.now());
|
|
461
|
+
if (isDeepMode(jobName))
|
|
462
|
+
return; // router handles delivery via _deliverDeepResult
|
|
456
463
|
if (result && result !== '__NOTHING__') {
|
|
457
464
|
const slug = jobName.includes(':') ? jobName.split(':')[0] : undefined;
|
|
458
465
|
// Strip system metadata for clean conversational delivery
|
|
@@ -473,7 +480,15 @@ export class CronScheduler {
|
|
|
473
480
|
const cleanOutput = output
|
|
474
481
|
.replace(/^STATUS SUMMARY:?\s*/im, '')
|
|
475
482
|
.slice(0, 500);
|
|
476
|
-
|
|
483
|
+
// For deep-mode runs, target the originating session so the progress
|
|
484
|
+
// update lands in the same Discord DM / Slack thread / dashboard window.
|
|
485
|
+
const deepSessionKey = isDeepMode(jobName) ? this.gateway.findDeepTaskSessionKey(jobName) : null;
|
|
486
|
+
const ctx = {};
|
|
487
|
+
if (slug)
|
|
488
|
+
ctx.agentSlug = slug;
|
|
489
|
+
if (deepSessionKey)
|
|
490
|
+
ctx.sessionKey = deepSessionKey;
|
|
491
|
+
this.dispatcher.send(`Still working on it — ${cleanOutput}`, ctx).catch(err => logger.debug({ err }, 'Failed to send phase progress notification'));
|
|
477
492
|
});
|
|
478
493
|
// Wire up real-time progress summaries (throttled to max 1 per 5 minutes)
|
|
479
494
|
const lastProgressSent = new Map();
|
|
@@ -484,7 +499,13 @@ export class CronScheduler {
|
|
|
484
499
|
return; // throttle: 1 per 5 minutes
|
|
485
500
|
lastProgressSent.set(jobName, now);
|
|
486
501
|
const slug = jobName.includes(':') ? jobName.split(':')[0] : undefined;
|
|
487
|
-
|
|
502
|
+
const deepSessionKey = isDeepMode(jobName) ? this.gateway.findDeepTaskSessionKey(jobName) : null;
|
|
503
|
+
const ctx = {};
|
|
504
|
+
if (slug)
|
|
505
|
+
ctx.agentSlug = slug;
|
|
506
|
+
if (deepSessionKey)
|
|
507
|
+
ctx.sessionKey = deepSessionKey;
|
|
508
|
+
this.dispatcher.send(summary.slice(0, 300), ctx).catch(err => logger.debug({ err }, 'Failed to send phase progress summary'));
|
|
488
509
|
});
|
|
489
510
|
logger.info(`Cron scheduler started with ${this.jobs.length} jobs`);
|
|
490
511
|
}
|
|
@@ -65,4 +65,22 @@ export declare function computeBrokenJobs(now?: number): BrokenJob[];
|
|
|
65
65
|
* Returns the jobs that triggered a fresh notification (mostly for tests/logs).
|
|
66
66
|
*/
|
|
67
67
|
export declare function runFailureSweep(send: (text: string) => Promise<unknown>, gateway?: import('./router.js').Gateway, now?: number): Promise<BrokenJob[]>;
|
|
68
|
+
export interface StaleCronJob {
|
|
69
|
+
jobName: string;
|
|
70
|
+
agentSlug?: string;
|
|
71
|
+
schedule: string;
|
|
72
|
+
/** ISO timestamp of the job's most recent run, or null if it has never run. */
|
|
73
|
+
lastRunAt: string | null;
|
|
74
|
+
/** Expected time between consecutive scheduled runs, in ms. */
|
|
75
|
+
expectedIntervalMs: number;
|
|
76
|
+
/** How far past the *second* expected tick we are, in minutes. */
|
|
77
|
+
overdueMinutes: number;
|
|
78
|
+
}
|
|
79
|
+
/**
|
|
80
|
+
* Walk enabled cron jobs and find ones that haven't run in at least
|
|
81
|
+
* SLA_MISSED_TICKS × their expected interval. Distinct from the broken-job
|
|
82
|
+
* detector which needs actual error entries — this catches the opposite
|
|
83
|
+
* failure mode: a job that silently stopped firing at all.
|
|
84
|
+
*/
|
|
85
|
+
export declare function computeStaleCronJobs(now?: number): Promise<StaleCronJob[]>;
|
|
68
86
|
//# sourceMappingURL=failure-monitor.d.ts.map
|
|
@@ -20,6 +20,7 @@ import path from 'node:path';
|
|
|
20
20
|
import Database from 'better-sqlite3';
|
|
21
21
|
import pino from 'pino';
|
|
22
22
|
import { BASE_DIR, MEMORY_DB_PATH } from '../config.js';
|
|
23
|
+
import { logAuditJsonl } from '../agent/hooks.js';
|
|
23
24
|
const logger = pino({ name: 'clementine.failure-monitor' });
|
|
24
25
|
const RUNS_DIR = path.join(BASE_DIR, 'cron', 'runs');
|
|
25
26
|
const ADVISOR_EVENTS_FILE = path.join(BASE_DIR, 'cron', 'advisor-events.jsonl');
|
|
@@ -40,12 +41,15 @@ const NOTIFY_COOLDOWN_HOURS = 24;
|
|
|
40
41
|
function loadState() {
|
|
41
42
|
try {
|
|
42
43
|
if (!existsSync(STATE_FILE))
|
|
43
|
-
return { notified: {} };
|
|
44
|
+
return { notified: {}, staleNotified: {} };
|
|
44
45
|
const raw = JSON.parse(readFileSync(STATE_FILE, 'utf-8'));
|
|
45
|
-
return {
|
|
46
|
+
return {
|
|
47
|
+
notified: raw.notified ?? {},
|
|
48
|
+
staleNotified: raw.staleNotified ?? {},
|
|
49
|
+
};
|
|
46
50
|
}
|
|
47
51
|
catch {
|
|
48
|
-
return { notified: {} };
|
|
52
|
+
return { notified: {}, staleNotified: {} };
|
|
49
53
|
}
|
|
50
54
|
}
|
|
51
55
|
function saveState(state) {
|
|
@@ -470,6 +474,15 @@ export async function runFailureSweep(send, gateway, now = Date.now()) {
|
|
|
470
474
|
logger.warn({ err }, 'Suspicious-run grading pre-pass failed (non-fatal)');
|
|
471
475
|
}
|
|
472
476
|
}
|
|
477
|
+
// SLA sweep runs unconditionally — catches jobs that silently stopped
|
|
478
|
+
// firing entirely, which the broken-job detector can't see because those
|
|
479
|
+
// jobs produce no error entries.
|
|
480
|
+
try {
|
|
481
|
+
await runSlaSweep(send, now);
|
|
482
|
+
}
|
|
483
|
+
catch (err) {
|
|
484
|
+
logger.warn({ err }, 'SLA sweep failed (non-fatal)');
|
|
485
|
+
}
|
|
473
486
|
const broken = computeBrokenJobs(now);
|
|
474
487
|
if (broken.length === 0) {
|
|
475
488
|
// Clear cooldowns AND diagnostic cache entries for jobs that recovered.
|
|
@@ -545,6 +558,67 @@ export async function runFailureSweep(send, gateway, now = Date.now()) {
|
|
|
545
558
|
}
|
|
546
559
|
return fresh;
|
|
547
560
|
}
|
|
561
|
+
/**
|
|
562
|
+
* Detect and notify about jobs whose last run is >= SLA_MISSED_TICKS
|
|
563
|
+
* expected intervals old. Each stale job gets one notification per
|
|
564
|
+
* SLA_NOTIFY_COOLDOWN_HOURS window. Always emits cron_sla_breach to
|
|
565
|
+
* audit.jsonl regardless of cooldown so the trace record is complete.
|
|
566
|
+
*/
|
|
567
|
+
async function runSlaSweep(send, now) {
|
|
568
|
+
const stale = await computeStaleCronJobs(now);
|
|
569
|
+
if (stale.length === 0) {
|
|
570
|
+
// Clear cooldowns for jobs that recovered (are no longer stale).
|
|
571
|
+
const state = loadState();
|
|
572
|
+
let mutated = false;
|
|
573
|
+
for (const name of Object.keys(state.staleNotified)) {
|
|
574
|
+
if (!stale.find(s => s.jobName === name)) {
|
|
575
|
+
delete state.staleNotified[name];
|
|
576
|
+
mutated = true;
|
|
577
|
+
}
|
|
578
|
+
}
|
|
579
|
+
if (mutated)
|
|
580
|
+
saveState(state);
|
|
581
|
+
return;
|
|
582
|
+
}
|
|
583
|
+
// Emit audit events for every stale detection so downstream
|
|
584
|
+
// tooling (dashboards, alerts) has a complete record.
|
|
585
|
+
for (const job of stale) {
|
|
586
|
+
logAuditJsonl({
|
|
587
|
+
event_type: 'cron_sla_breach',
|
|
588
|
+
jobName: job.jobName,
|
|
589
|
+
agent_slug: job.agentSlug,
|
|
590
|
+
schedule: job.schedule,
|
|
591
|
+
lastRunAt: job.lastRunAt,
|
|
592
|
+
expectedIntervalMs: job.expectedIntervalMs,
|
|
593
|
+
overdueMinutes: job.overdueMinutes,
|
|
594
|
+
});
|
|
595
|
+
}
|
|
596
|
+
// Apply per-job notify cooldown so we don't spam.
|
|
597
|
+
const state = loadState();
|
|
598
|
+
const cooldownMs = SLA_NOTIFY_COOLDOWN_HOURS * 60 * 60 * 1000;
|
|
599
|
+
const fresh = [];
|
|
600
|
+
for (const job of stale) {
|
|
601
|
+
const prev = state.staleNotified[job.jobName];
|
|
602
|
+
if (prev && now - Date.parse(prev.lastNotifiedAt) < cooldownMs)
|
|
603
|
+
continue;
|
|
604
|
+
fresh.push(job);
|
|
605
|
+
}
|
|
606
|
+
if (fresh.length === 0)
|
|
607
|
+
return;
|
|
608
|
+
try {
|
|
609
|
+
await send(formatStaleReport(fresh));
|
|
610
|
+
const stamp = new Date(now).toISOString();
|
|
611
|
+
for (const job of fresh) {
|
|
612
|
+
state.staleNotified[job.jobName] = { lastNotifiedAt: stamp, lastRunAt: job.lastRunAt };
|
|
613
|
+
}
|
|
614
|
+
saveState(state);
|
|
615
|
+
appendAuditLog('sla_notified', fresh.map(j => j.jobName));
|
|
616
|
+
logger.info({ count: fresh.length, jobs: fresh.map(j => j.jobName) }, 'SLA monitor: notified owner about stale jobs');
|
|
617
|
+
}
|
|
618
|
+
catch (err) {
|
|
619
|
+
logger.warn({ err }, 'SLA monitor: notification dispatch failed');
|
|
620
|
+
}
|
|
621
|
+
}
|
|
548
622
|
function appendAuditLog(action, jobNames) {
|
|
549
623
|
try {
|
|
550
624
|
const auditPath = path.join(BASE_DIR, 'cron', 'failure-monitor.log');
|
|
@@ -606,4 +680,73 @@ async function loadJobPrompt(jobName) {
|
|
|
606
680
|
return null;
|
|
607
681
|
}
|
|
608
682
|
}
|
|
683
|
+
/** A job is "stale" if it has missed >= this many expected ticks. */
|
|
684
|
+
const SLA_MISSED_TICKS = 3;
|
|
685
|
+
/** Absolute floor so sub-hourly cron jobs don't alert too aggressively. */
|
|
686
|
+
const SLA_MIN_OVERDUE_MS = 30 * 60 * 1000;
|
|
687
|
+
/** Don't re-DM the owner about the same stale job within this window. */
|
|
688
|
+
const SLA_NOTIFY_COOLDOWN_HOURS = 12;
|
|
689
|
+
/**
|
|
690
|
+
* Walk enabled cron jobs and find ones that haven't run in at least
|
|
691
|
+
* SLA_MISSED_TICKS × their expected interval. Distinct from the broken-job
|
|
692
|
+
* detector which needs actual error entries — this catches the opposite
|
|
693
|
+
* failure mode: a job that silently stopped firing at all.
|
|
694
|
+
*/
|
|
695
|
+
export async function computeStaleCronJobs(now = Date.now()) {
|
|
696
|
+
const { parseCronJobs, parseAgentCronJobs, CronRunLog } = await import('./cron-scheduler.js');
|
|
697
|
+
const { AGENTS_DIR } = await import('../config.js');
|
|
698
|
+
const cronParser = await import('cron-parser');
|
|
699
|
+
const jobs = [...parseCronJobs(), ...parseAgentCronJobs(AGENTS_DIR)];
|
|
700
|
+
const runLog = new CronRunLog();
|
|
701
|
+
const stale = [];
|
|
702
|
+
for (const job of jobs) {
|
|
703
|
+
if (!job.enabled)
|
|
704
|
+
continue;
|
|
705
|
+
if (job.mode === 'unleashed')
|
|
706
|
+
continue; // one-shot, no recurring SLA
|
|
707
|
+
// Normalize schedule — node-cron accepts 6-field (with seconds) but
|
|
708
|
+
// cron-parser only takes 5-field.
|
|
709
|
+
const fields = job.schedule.trim().split(/\s+/);
|
|
710
|
+
const expr = fields.length === 6 ? fields.slice(1).join(' ') : job.schedule;
|
|
711
|
+
let intervalMs;
|
|
712
|
+
try {
|
|
713
|
+
const parser = cronParser.CronExpressionParser.parse(expr);
|
|
714
|
+
const next = parser.next().toDate().getTime();
|
|
715
|
+
const prev = parser.prev().toDate().getTime();
|
|
716
|
+
intervalMs = next - prev;
|
|
717
|
+
}
|
|
718
|
+
catch {
|
|
719
|
+
continue; // malformed schedule — separate concern
|
|
720
|
+
}
|
|
721
|
+
if (intervalMs <= 0)
|
|
722
|
+
continue;
|
|
723
|
+
const recent = runLog.readRecent(job.name, 1);
|
|
724
|
+
const lastRunAt = recent[0]?.startedAt ?? null;
|
|
725
|
+
const lastRunMs = lastRunAt ? Date.parse(lastRunAt) : 0;
|
|
726
|
+
const threshold = Math.max(intervalMs * SLA_MISSED_TICKS, SLA_MIN_OVERDUE_MS);
|
|
727
|
+
const sinceLastRun = now - lastRunMs;
|
|
728
|
+
if (sinceLastRun <= threshold)
|
|
729
|
+
continue;
|
|
730
|
+
stale.push({
|
|
731
|
+
jobName: job.name,
|
|
732
|
+
agentSlug: job.agentSlug,
|
|
733
|
+
schedule: job.schedule,
|
|
734
|
+
lastRunAt,
|
|
735
|
+
expectedIntervalMs: intervalMs,
|
|
736
|
+
overdueMinutes: Math.round((sinceLastRun - intervalMs) / 60_000),
|
|
737
|
+
});
|
|
738
|
+
}
|
|
739
|
+
return stale;
|
|
740
|
+
}
|
|
741
|
+
function formatStaleReport(stale) {
|
|
742
|
+
const lines = ['**Cron SLA breach — jobs that should have run but didn\'t:**', ''];
|
|
743
|
+
for (const job of stale) {
|
|
744
|
+
const last = job.lastRunAt ? `last ran ${new Date(job.lastRunAt).toISOString().slice(0, 16).replace('T', ' ')}` : 'never run';
|
|
745
|
+
const intervalMin = Math.round(job.expectedIntervalMs / 60_000);
|
|
746
|
+
lines.push(`- **${job.jobName}** (${job.schedule}, every ${intervalMin}m) — ${last}, overdue by ~${job.overdueMinutes}m`);
|
|
747
|
+
}
|
|
748
|
+
lines.push('');
|
|
749
|
+
lines.push('The scheduler may be stuck, the job may have thrown before logging, or it may have been silently disabled. Check the dashboard Scheduled Tasks panel.');
|
|
750
|
+
return lines.join('\n');
|
|
751
|
+
}
|
|
609
752
|
//# sourceMappingURL=failure-monitor.js.map
|
package/dist/gateway/router.d.ts
CHANGED
|
@@ -75,6 +75,13 @@ export declare class Gateway {
|
|
|
75
75
|
constructor(assistant: PersonalAssistant);
|
|
76
76
|
/** Get or create a session state entry. */
|
|
77
77
|
private getSession;
|
|
78
|
+
/**
|
|
79
|
+
* Reverse-lookup the session key that owns a given deep-mode jobName.
|
|
80
|
+
* Used by the cron-scheduler callbacks so phase-progress and completion
|
|
81
|
+
* messages can be routed back to the originating channel instead of
|
|
82
|
+
* fanning out to every registered sender.
|
|
83
|
+
*/
|
|
84
|
+
findDeepTaskSessionKey(jobName: string): string | null;
|
|
78
85
|
getAgentManager(): AgentManager;
|
|
79
86
|
getTeamRouter(): TeamRouter;
|
|
80
87
|
getTeamBus(): TeamBus;
|
package/dist/gateway/router.js
CHANGED
|
@@ -322,6 +322,19 @@ export class Gateway {
|
|
|
322
322
|
}
|
|
323
323
|
return s;
|
|
324
324
|
}
|
|
325
|
+
/**
|
|
326
|
+
* Reverse-lookup the session key that owns a given deep-mode jobName.
|
|
327
|
+
* Used by the cron-scheduler callbacks so phase-progress and completion
|
|
328
|
+
* messages can be routed back to the originating channel instead of
|
|
329
|
+
* fanning out to every registered sender.
|
|
330
|
+
*/
|
|
331
|
+
findDeepTaskSessionKey(jobName) {
|
|
332
|
+
for (const [key, sess] of this.sessions) {
|
|
333
|
+
if (sess.deepTask?.jobName === jobName)
|
|
334
|
+
return key;
|
|
335
|
+
}
|
|
336
|
+
return null;
|
|
337
|
+
}
|
|
325
338
|
// ── Team system accessors ──────────────────────────────────────────
|
|
326
339
|
getAgentManager() {
|
|
327
340
|
if (!this._agentManager) {
|