principles-disciple 1.14.0 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openclaw.plugin.json +1 -1
- package/package.json +4 -2
- package/scripts/bootstrap-rules.mjs +66 -0
- package/scripts/validate-live-path.ts +356 -0
- package/src/core/bootstrap-rules.ts +177 -0
- package/src/core/principle-tree-migration.ts +196 -0
- package/src/service/evolution-worker.ts +81 -61
- package/src/service/monitoring-query-service.ts +277 -0
- package/src/service/nocturnal-service.ts +9 -1
- package/src/service/subagent-workflow/nocturnal-workflow-manager.ts +10 -2
- package/tests/core/bootstrap-rules.test.ts +582 -0
- package/tests/core/principle-tree-migration.test.ts +77 -0
- package/tests/scripts/validate-live-path.test.ts +286 -0
- package/tests/service/evolution-worker.nocturnal.test.ts +208 -0
- package/tests/service/monitoring-query-service.test.ts +113 -0
- package/tests/service/nocturnal-runtime-hardening.test.ts +85 -0
- package/ui/src/charts.tsx +4 -1
- package/ui/src/pages/ThinkingModelsPage.tsx +9 -1
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Principle Tree Migration — Migrates trainingStore to tree.principles
|
|
3
|
+
*
|
|
4
|
+
* This migration handles the Phase 11 gap: existing principles in trainingStore
|
|
5
|
+
* were never written to tree.principles, blocking the Rule/Implementation layer.
|
|
6
|
+
*
|
|
7
|
+
* Usage:
|
|
8
|
+
* - Called automatically via runMigrationIfNeeded() during plugin initialization
|
|
9
|
+
* - Or run manually: node scripts/migrate-principle-tree.mjs <workspace-dir>
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import * as fs from 'fs';
|
|
13
|
+
import * as path from 'path';
|
|
14
|
+
import {
|
|
15
|
+
loadLedger,
|
|
16
|
+
saveLedger,
|
|
17
|
+
type LedgerPrinciple,
|
|
18
|
+
} from './principle-tree-ledger.js';
|
|
19
|
+
import type { LegacyPrincipleTrainingState } from './principle-tree-ledger.js';
|
|
20
|
+
import { SystemLogger } from './system-logger.js';
|
|
21
|
+
|
|
22
|
+
/**
 * Outcome of one principle-tree migration run: aggregate counters plus a
 * per-principle record of what happened.
 */
export interface PrincipleTreeMigrationResult {
  /** Number of principles recorded with status `'migrated'`. */
  migratedCount: number;
  /** Number of principles recorded with status `'skipped'`. */
  skippedCount: number;
  /** Number of principles recorded with status `'error'`. */
  errorCount: number;
  /** One entry per processed principle, in processing order. */
  details: {
    /** Key of the principle in the ledger. */
    principleId: string;
    /** What the migration did with this principle. */
    status: 'migrated' | 'skipped' | 'error';
    /** Optional explanation (set for skipped and error entries). */
    reason?: string;
  }[];
}
|
|
32
|
+
|
|
33
|
+
/**
 * Report whether the ledger under `stateDir` still has principles that exist
 * only in `trainingStore`.
 *
 * @param stateDir - Directory passed to `loadLedger`.
 * @returns `true` when at least one `trainingStore` key has no truthy entry
 *          in `tree.principles`; `false` otherwise (including an empty store).
 */
export function needsMigration(stateDir: string): boolean {
  const { trainingStore, tree } = loadLedger(stateDir);
  for (const principleId of Object.keys(trainingStore)) {
    if (!tree.principles[principleId]) {
      return true;
    }
  }
  return false;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Create a minimal LedgerPrinciple from a LegacyPrincipleTrainingState.
 *
 * Only `internalizationStatus`, `evaluability` and `complianceRate` are taken
 * from the legacy state; every other field is filled with a placeholder or
 * default because the legacy record does not carry it.
 *
 * @param principleId - Ledger key, reused as the principle `id`.
 * @param state - Legacy training state to convert.
 * @param now - Timestamp string stored as both `createdAt` and `updatedAt`.
 * @returns A version-1 principle with empty rule/pain/conflict links.
 */
function trainingStateToTreePrinciple(
  principleId: string,
  state: LegacyPrincipleTrainingState,
  now: string
): LedgerPrinciple {
  // Convert the 0-1 compliance rate to the 0-100 adherence scale. Legacy
  // records may lack the field or hold a non-finite value; fall back to 0 and
  // clamp so NaN or out-of-range numbers never reach the ledger.
  const adherenceRate = Number.isFinite(state.complianceRate)
    ? Math.min(100, Math.max(0, state.complianceRate * 100))
    : 0;

  return {
    id: principleId,
    version: 1,
    text: `Principle ${principleId}`, // Minimal text, will be enriched from PRINCIPLES.md if available
    triggerPattern: '', // Unknown from legacy data
    action: '', // Unknown from legacy data
    status: mapInternalizationStatusToPrincipleStatus(state.internalizationStatus),
    priority: 'P1', // Default priority
    scope: 'general',
    evaluability: state.evaluability,
    valueScore: 0,
    adherenceRate,
    painPreventedCount: 0,
    derivedFromPainIds: [],
    ruleIds: [],
    conflictsWithPrincipleIds: [],
    createdAt: now,
    updatedAt: now,
  };
}
|
|
69
|
+
|
|
70
|
+
/**
 * Translate a legacy internalization status into a tree principle status.
 *
 * `'internalized'` and `'deployed_pending_eval'` map to `'active'`; every
 * other value (including unrecognized ones) maps to `'candidate'`.
 * `'deprecated'` is part of the return type but is never produced here.
 */
function mapInternalizationStatusToPrincipleStatus(
  status: LegacyPrincipleTrainingState['internalizationStatus']
): 'candidate' | 'active' | 'deprecated' {
  const isLive = status === 'internalized' || status === 'deployed_pending_eval';
  return isLive ? 'active' : 'candidate';
}
|
|
90
|
+
|
|
91
|
+
/**
 * Migrate trainingStore principles to tree.principles.
 *
 * Idempotent: only principles that do not yet exist in tree.principles are
 * written. Each processed principle gets exactly one entry in
 * `result.details`, and the matching counter is incremented.
 *
 * Never throws for ledger or conversion failures:
 * - a failure while handling one principle is recorded as an `'error'` detail
 *   and the loop continues with the next principle;
 * - a failure of the run as a whole (for example the initial ledger load) is
 *   recorded as an `'error'` detail with `principleId: '*'`, so callers can
 *   detect it through `errorCount` even when no `workspaceDir` is given.
 *
 * @param stateDir - Directory passed to `loadLedger` / `saveLedger`.
 * @param workspaceDir - When provided, progress and errors are also written
 *                       through `SystemLogger.log`.
 * @returns Counters and per-principle details for this run.
 */
export function migratePrincipleTree(
  stateDir: string,
  workspaceDir?: string
): PrincipleTreeMigrationResult {
  const result: PrincipleTreeMigrationResult = {
    migratedCount: 0,
    skippedCount: 0,
    errorCount: 0,
    details: [],
  };

  const recordSkipped = (principleId: string): void => {
    result.skippedCount++;
    result.details.push({
      principleId,
      status: 'skipped',
      reason: 'Already exists in tree.principles',
    });
  };

  try {
    const ledger = loadLedger(stateDir);
    const now = new Date().toISOString();

    for (const [principleId, state] of Object.entries(ledger.trainingStore)) {
      // Skip if already exists in tree.principles
      if (ledger.tree.principles[principleId]) {
        recordSkipped(principleId);
        continue;
      }

      try {
        const treePrinciple = trainingStateToTreePrinciple(principleId, state, now);

        // The ledger is re-read immediately before each write (presumably so
        // entries saved since the first load are not overwritten — confirm
        // against the ledger's locking model).
        const nextLedger = loadLedger(stateDir);
        if (nextLedger.tree.principles[principleId]) {
          // Someone else added it in the meantime: nothing was written, so
          // report it as skipped rather than migrated.
          recordSkipped(principleId);
          continue;
        }

        nextLedger.tree.principles[principleId] = treePrinciple;
        nextLedger.tree.lastUpdated = now;
        saveLedger(stateDir, nextLedger);

        result.migratedCount++;
        result.details.push({
          principleId,
          status: 'migrated',
        });

        if (workspaceDir) {
          SystemLogger.log(
            workspaceDir,
            'PRINCIPLE_TREE_MIGRATED',
            `Migrated ${principleId} from trainingStore to tree.principles`
          );
        }
      } catch (err) {
        result.errorCount++;
        result.details.push({
          principleId,
          status: 'error',
          reason: String(err),
        });

        if (workspaceDir) {
          SystemLogger.log(
            workspaceDir,
            'PRINCIPLE_TREE_MIGRATION_ERROR',
            `Failed to migrate ${principleId}: ${String(err)}`
          );
        }
      }
    }

    if (workspaceDir && result.migratedCount > 0) {
      SystemLogger.log(
        workspaceDir,
        'PRINCIPLE_TREE_MIGRATION_COMPLETE',
        `Migrated ${result.migratedCount} principles to tree.principles (${result.skippedCount} skipped, ${result.errorCount} errors)`
      );
    }
  } catch (err) {
    // Surface the run-level failure in the result; without this a caller that
    // passes no workspaceDir would see an all-zero result and assume success.
    result.errorCount++;
    result.details.push({
      principleId: '*',
      status: 'error',
      reason: `Migration failed: ${String(err)}`,
    });

    if (workspaceDir) {
      SystemLogger.log(
        workspaceDir,
        'PRINCIPLE_TREE_MIGRATION_FAILED',
        `Migration failed: ${String(err)}`
      );
    }
  }

  return result;
}
|
|
183
|
+
|
|
184
|
+
/**
 * Initialization entry point: run the principle-tree migration only when
 * `needsMigration` reports pending work.
 *
 * @param stateDir - Directory holding the ledger.
 * @param workspaceDir - Optional workspace directory forwarded to
 *                       `migratePrincipleTree`.
 * @returns The migration result, or `null` when no migration was needed.
 */
export function runMigrationIfNeeded(
  stateDir: string,
  workspaceDir?: string
): PrincipleTreeMigrationResult | null {
  return needsMigration(stateDir) ? migratePrincipleTree(stateDir, workspaceDir) : null;
}
|
|
@@ -188,6 +188,30 @@ export interface RecentPainContext {
|
|
|
188
188
|
recentMaxPainScore: number;
|
|
189
189
|
}
|
|
190
190
|
|
|
191
|
+
/**
 * Decide whether a nocturnal snapshot carries enough data to start a workflow.
 *
 * - A snapshot without a non-empty string `sessionId` is never usable.
 * - A snapshot whose `_dataSource` is anything other than
 *   `'pain_context_fallback'` is usable as-is.
 * - A fallback snapshot is usable only if one of the tracked counters in
 *   `stats` is greater than zero, or `recentPain` is a non-empty array.
 */
function hasUsableNocturnalSnapshot(snapshotData: Record<string, unknown> | undefined): boolean {
  const sessionId = snapshotData?.sessionId;
  if (!snapshotData || typeof sessionId !== 'string' || sessionId.length === 0) {
    return false;
  }

  const isFallback = snapshotData._dataSource === 'pain_context_fallback';
  if (!isFallback) {
    return true;
  }

  const { stats: rawStats, recentPain } = snapshotData;

  // Counters that indicate the session actually did something.
  const activityKeys = [
    'totalAssistantTurns',
    'totalToolCalls',
    'failureCount',
    'totalPainEvents',
    'totalGateBlocks',
  ];
  if (rawStats && typeof rawStats === 'object') {
    const counters = rawStats as Record<string, number | null | undefined>;
    for (const key of activityKeys) {
      if (Number(counters[key] ?? 0) > 0) {
        return true;
      }
    }
  }

  return Array.isArray(recentPain) && recentPain.length > 0;
}
|
|
214
|
+
|
|
191
215
|
export interface EvolutionQueueItem {
|
|
192
216
|
// Core identity
|
|
193
217
|
id: string;
|
|
@@ -1367,42 +1391,16 @@ async function processEvolutionQueue(wctx: WorkspaceContext, logger: PluginLogge
|
|
|
1367
1391
|
logger?.info?.(`[PD:EvolutionWorker] Processing sleep_reflection task ${sleepTask.id}`);
|
|
1368
1392
|
}
|
|
1369
1393
|
|
|
1370
|
-
|
|
1371
|
-
//
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
nocturnalManager = new NocturnalWorkflowManager({
|
|
1376
|
-
workspaceDir: wctx.workspaceDir,
|
|
1377
|
-
stateDir: wctx.stateDir,
|
|
1378
|
-
logger: api.logger,
|
|
1379
|
-
runtimeAdapter: new OpenClawTrinityRuntimeAdapter(api),
|
|
1380
|
-
});
|
|
1381
|
-
} else {
|
|
1382
|
-
// Cannot create manager without api (runtimeAdapter required)
|
|
1383
|
-
sleepTask.status = 'failed';
|
|
1384
|
-
sleepTask.completed_at = new Date().toISOString();
|
|
1385
|
-
sleepTask.resolution = 'failed_max_retries';
|
|
1386
|
-
sleepTask.lastError = 'No API available to create NocturnalWorkflowManager';
|
|
1387
|
-
sleepTask.retryCount = (sleepTask.retryCount ?? 0) + 1;
|
|
1388
|
-
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} skipped: no API`);
|
|
1389
|
-
continue;
|
|
1390
|
-
}
|
|
1391
|
-
|
|
1392
|
-
// eslint-disable-next-line @typescript-eslint/init-declarations -- assigned in both if/else branches
|
|
1393
|
-
let workflowId: string;
|
|
1394
|
+
let workflowId: string | undefined;
|
|
1395
|
+
// eslint-disable-next-line @typescript-eslint/init-declarations -- assigned when runtime API is available
|
|
1396
|
+
let nocturnalManager: NocturnalWorkflowManager;
|
|
1397
|
+
// eslint-disable-next-line @typescript-eslint/init-declarations -- assigned only for newly started workflows
|
|
1398
|
+
let snapshotData: Record<string, unknown> | undefined;
|
|
1394
1399
|
|
|
1395
1400
|
if (isPollingTask) {
|
|
1396
|
-
//
|
|
1397
|
-
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion -- Reason: isPollingTask flag is only set when resultRef is expected to be present
|
|
1401
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion -- Reason: polling path requires existing resultRef
|
|
1398
1402
|
workflowId = sleepTask.resultRef!;
|
|
1399
1403
|
} else {
|
|
1400
|
-
// Start workflow via NocturnalWorkflowManager instead of direct executeNocturnalReflectionAsync
|
|
1401
|
-
// Pass taskId in metadata for correlation
|
|
1402
|
-
|
|
1403
|
-
// #181: Build a proper snapshot from trajectory.db instead of hardcoded zeros
|
|
1404
|
-
// eslint-disable-next-line @typescript-eslint/init-declarations -- undefined is valid zero value, assigned conditionally in if/fallback blocks
|
|
1405
|
-
let snapshotData: Record<string, unknown> | undefined;
|
|
1406
1404
|
if (sleepTask.recentPainContext) {
|
|
1407
1405
|
try {
|
|
1408
1406
|
const extractor = createNocturnalTrajectoryExtractor(wctx.workspaceDir);
|
|
@@ -1412,16 +1410,14 @@ async function processEvolutionQueue(wctx: WorkspaceContext, logger: PluginLogge
|
|
|
1412
1410
|
sessionId: fullSnapshot.sessionId,
|
|
1413
1411
|
sessionStart: fullSnapshot.startedAt,
|
|
1414
1412
|
stats: fullSnapshot.stats,
|
|
1415
|
-
recentPain: fullSnapshot.painEvents.slice(-5),
|
|
1413
|
+
recentPain: fullSnapshot.painEvents.slice(-5),
|
|
1416
1414
|
};
|
|
1417
1415
|
}
|
|
1418
1416
|
} catch (snapErr) {
|
|
1419
1417
|
logger?.warn?.(`[PD:EvolutionWorker] Failed to build trajectory snapshot for ${sleepTask.id}: ${String(snapErr)}`);
|
|
1420
1418
|
}
|
|
1421
1419
|
}
|
|
1422
|
-
// Fallback: use pain context only if trajectory extractor failed
|
|
1423
1420
|
if (!snapshotData && sleepTask.recentPainContext) {
|
|
1424
|
-
// #200: Log fallback usage to make data gaps visible
|
|
1425
1421
|
logger?.warn?.(`[PD:EvolutionWorker] Using pain-context fallback for ${sleepTask.id}: trajectory stats unavailable (stats will be partial)`);
|
|
1426
1422
|
snapshotData = {
|
|
1427
1423
|
sessionId: sleepTask.id,
|
|
@@ -1434,31 +1430,63 @@ async function processEvolutionQueue(wctx: WorkspaceContext, logger: PluginLogge
|
|
|
1434
1430
|
totalGateBlocks: 0,
|
|
1435
1431
|
},
|
|
1436
1432
|
recentPain: sleepTask.recentPainContext.mostRecent ? [sleepTask.recentPainContext.mostRecent] : [],
|
|
1437
|
-
// #200: Mark data source so downstream can handle appropriately
|
|
1438
1433
|
_dataSource: 'pain_context_fallback',
|
|
1439
1434
|
};
|
|
1440
1435
|
}
|
|
1441
1436
|
|
|
1437
|
+
if (!hasUsableNocturnalSnapshot(snapshotData)) {
|
|
1438
|
+
sleepTask.status = 'failed';
|
|
1439
|
+
sleepTask.completed_at = new Date().toISOString();
|
|
1440
|
+
sleepTask.resolution = 'failed_max_retries';
|
|
1441
|
+
sleepTask.lastError = 'sleep_reflection failed: missing_usable_snapshot (skipReason: empty_fallback_snapshot)';
|
|
1442
|
+
sleepTask.retryCount = (sleepTask.retryCount ?? 0) + 1;
|
|
1443
|
+
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} rejected: missing usable snapshot`);
|
|
1444
|
+
continue;
|
|
1445
|
+
}
|
|
1446
|
+
}
|
|
1447
|
+
|
|
1448
|
+
if (!api) {
|
|
1449
|
+
sleepTask.status = 'failed';
|
|
1450
|
+
sleepTask.completed_at = new Date().toISOString();
|
|
1451
|
+
sleepTask.resolution = 'failed_max_retries';
|
|
1452
|
+
sleepTask.lastError = 'No API available to create NocturnalWorkflowManager';
|
|
1453
|
+
sleepTask.retryCount = (sleepTask.retryCount ?? 0) + 1;
|
|
1454
|
+
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} skipped: no API`);
|
|
1455
|
+
continue;
|
|
1456
|
+
}
|
|
1457
|
+
|
|
1458
|
+
nocturnalManager = new NocturnalWorkflowManager({
|
|
1459
|
+
workspaceDir: wctx.workspaceDir,
|
|
1460
|
+
stateDir: wctx.stateDir,
|
|
1461
|
+
logger: api.logger,
|
|
1462
|
+
runtimeAdapter: new OpenClawTrinityRuntimeAdapter(api),
|
|
1463
|
+
});
|
|
1464
|
+
|
|
1465
|
+
if (!isPollingTask) {
|
|
1442
1466
|
const workflowHandle = await nocturnalManager.startWorkflow(nocturnalWorkflowSpec, {
|
|
1443
1467
|
parentSessionId: `sleep_reflection:${sleepTask.id}`,
|
|
1444
1468
|
workspaceDir: wctx.workspaceDir,
|
|
1445
1469
|
taskInput: {},
|
|
1446
1470
|
metadata: {
|
|
1447
1471
|
snapshot: snapshotData,
|
|
1448
|
-
|
|
1449
|
-
// via executeNocturnalReflectionAsync when no principleId is provided
|
|
1450
|
-
taskId: sleepTask.id, // NOC-14: correlation ID for evolution worker
|
|
1451
|
-
// Pass painContext to Selector for principle ranking bias
|
|
1472
|
+
taskId: sleepTask.id,
|
|
1452
1473
|
painContext: sleepTask.recentPainContext,
|
|
1453
1474
|
},
|
|
1454
1475
|
});
|
|
1455
|
-
|
|
1456
|
-
// Store workflowId on task for polling on subsequent cycles
|
|
1457
1476
|
sleepTask.resultRef = workflowHandle.workflowId;
|
|
1458
|
-
// eslint-disable-next-line @typescript-eslint/prefer-destructuring -- Reason: workflowId is reassignable outer let - destructuring would shadow
|
|
1459
1477
|
workflowId = workflowHandle.workflowId;
|
|
1460
1478
|
}
|
|
1461
1479
|
|
|
1480
|
+
if (!workflowId) {
|
|
1481
|
+
sleepTask.status = 'failed';
|
|
1482
|
+
sleepTask.completed_at = new Date().toISOString();
|
|
1483
|
+
sleepTask.resolution = 'failed_max_retries';
|
|
1484
|
+
sleepTask.lastError = 'sleep_reflection failed: missing_workflow_id';
|
|
1485
|
+
sleepTask.retryCount = (sleepTask.retryCount ?? 0) + 1;
|
|
1486
|
+
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} missing workflow id after startup`);
|
|
1487
|
+
continue;
|
|
1488
|
+
}
|
|
1489
|
+
|
|
1462
1490
|
// Workflow is running asynchronously. Check if it completed in this cycle
|
|
1463
1491
|
// by polling getWorkflowDebugSummary.
|
|
1464
1492
|
const summary = await nocturnalManager.getWorkflowDebugSummary(workflowId);
|
|
@@ -1490,16 +1518,12 @@ async function processEvolutionQueue(wctx: WorkspaceContext, logger: PluginLogge
|
|
|
1490
1518
|
sleepTask.lastError = detailedError;
|
|
1491
1519
|
sleepTask.retryCount = (sleepTask.retryCount ?? 0) + 1;
|
|
1492
1520
|
|
|
1521
|
+
sleepTask.status = 'failed';
|
|
1522
|
+
sleepTask.completed_at = new Date().toISOString();
|
|
1523
|
+
sleepTask.resolution = 'failed_max_retries';
|
|
1493
1524
|
if (isExpectedSubagentError(errorReason)) {
|
|
1494
|
-
|
|
1495
|
-
sleepTask.status = 'completed';
|
|
1496
|
-
sleepTask.completed_at = new Date().toISOString();
|
|
1497
|
-
sleepTask.resolution = 'stub_fallback';
|
|
1498
|
-
logger?.info?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} workflow completed with stub fallback (expected subagent error: ${errorReason})`);
|
|
1525
|
+
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} background runtime unavailable: ${errorReason}`);
|
|
1499
1526
|
} else {
|
|
1500
|
-
sleepTask.status = 'failed';
|
|
1501
|
-
sleepTask.completed_at = new Date().toISOString();
|
|
1502
|
-
sleepTask.resolution = 'failed_max_retries';
|
|
1503
1527
|
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} workflow failed: ${sleepTask.lastError}`);
|
|
1504
1528
|
}
|
|
1505
1529
|
} else {
|
|
@@ -1511,18 +1535,14 @@ async function processEvolutionQueue(wctx: WorkspaceContext, logger: PluginLogge
|
|
|
1511
1535
|
// #202: Handle expected subagent unavailability (e.g., process isolation in daemon mode)
|
|
1512
1536
|
// When subagent is unavailable due to gateway running in separate process,
|
|
1513
1537
|
// the task is still marked failed (no stub fallback); it is only logged as a warning instead of an error.
|
|
1538
|
+
sleepTask.status = 'failed';
|
|
1539
|
+
sleepTask.completed_at = new Date().toISOString();
|
|
1540
|
+
sleepTask.resolution = 'failed_max_retries';
|
|
1541
|
+
sleepTask.lastError = String(taskErr);
|
|
1542
|
+
sleepTask.retryCount = (sleepTask.retryCount ?? 0) + 1;
|
|
1514
1543
|
if (isExpectedSubagentError(taskErr)) {
|
|
1515
|
-
sleepTask.
|
|
1516
|
-
sleepTask.completed_at = new Date().toISOString();
|
|
1517
|
-
sleepTask.resolution = 'stub_fallback';
|
|
1518
|
-
sleepTask.lastError = String(taskErr);
|
|
1519
|
-
logger?.info?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} completed with stub fallback (subagent unavailable)`);
|
|
1544
|
+
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} background runtime unavailable: ${String(taskErr)}`);
|
|
1520
1545
|
} else {
|
|
1521
|
-
sleepTask.status = 'failed';
|
|
1522
|
-
sleepTask.completed_at = new Date().toISOString();
|
|
1523
|
-
sleepTask.resolution = 'failed_max_retries';
|
|
1524
|
-
sleepTask.lastError = String(taskErr);
|
|
1525
|
-
sleepTask.retryCount = (sleepTask.retryCount ?? 0) + 1;
|
|
1526
1546
|
logger?.error?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} threw: ${taskErr}`);
|
|
1527
1547
|
}
|
|
1528
1548
|
}
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import { WorkflowStore } from './subagent-workflow/workflow-store.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Read-only monitoring queries over Nocturnal workflows and their Trinity
 * stages (dreamer, philosopher, scribe), backed by a WorkflowStore.
 * Keeps query logic out of the API route handlers.
 */
export class MonitoringQueryService {
  private readonly workspaceDir: string;
  private readonly store: WorkflowStore;

  /**
   * @param workspaceDir - Workspace directory used to open the WorkflowStore.
   */
  constructor(workspaceDir: string) {
    this.workspaceDir = workspaceDir;
    this.store = new WorkflowStore({ workspaceDir });
  }

  /** Release the underlying WorkflowStore. */
  dispose(): void {
    this.store.dispose();
  }

  /**
   * List workflows, optionally filtered, and flag the ones that look stuck.
   *
   * A workflow is reported with state `'stuck'` when its stored state is
   * `'active'` and more time has passed since `created_at` than its
   * `timeoutMs` metadata value (15 minutes when the metadata has none).
   *
   * @param filters - `state` is forwarded to the store; `type` is matched
   *                  against `workflow_type` after the store query.
   * @returns One entry per matching workflow.
   */
  getWorkflows(filters: { state?: string; type?: string } = {}): WorkflowListResponse {
    const { state: stateFilter, type: typeFilter } = filters;

    const listed = stateFilter
      ? this.store.listWorkflows(stateFilter)
      : this.store.listWorkflows();
    const selected = typeFilter
      ? listed.filter((wf) => wf.workflow_type === typeFilter)
      : listed;

    const now = Date.now();
    const defaultTimeoutMs = 15 * 60 * 1000; // 15 minutes

    const workflows = selected.map((wf) => {
      const timeoutMs = parseWorkflowMetadata(wf.metadata_json).timeoutMs ?? defaultTimeoutMs;
      const elapsed = now - wf.created_at;
      const isStuck = wf.state === 'active' && elapsed > timeoutMs;

      return {
        workflowId: wf.workflow_id,
        type: wf.workflow_type,
        state: isStuck ? 'stuck' : wf.state,
        duration: elapsed,
        createdAt: new Date(wf.created_at).toISOString(),
        stuckDuration: isStuck ? elapsed : null,
      };
    });

    return { workflows };
  }

  /**
   * Report the status of each Trinity stage for one workflow.
   *
   * Stage status is derived from the first `trinity_<stage>_start`,
   * `_complete` and `_failed` events: no start event means `'pending'`;
   * otherwise a failed event wins over a complete event, and a started stage
   * with neither is `'running'`.
   *
   * @param workflowId - Workflow ID to query.
   * @returns Stage information, or `null` if the store has no such workflow.
   */
  getTrinityStatus(workflowId: string): TrinityStatusResponse | null {
    if (!this.store.getWorkflow(workflowId)) {
      return null;
    }

    const events = this.store.getEvents(workflowId);
    const stageOutputs = this.store.getStageOutputs(workflowId);
    const stageNames = ['dreamer', 'philosopher', 'scribe'] as const;

    const stages: TrinityStageInfo[] = stageNames.map((stage) => {
      const startEvent = events.find((e) => e.event_type === `trinity_${stage}_start`);
      const completeEvent = events.find((e) => e.event_type === `trinity_${stage}_complete`);
      const failedEvent = events.find((e) => e.event_type === `trinity_${stage}_failed`);

      const status: TrinityStageInfo['status'] = !startEvent
        ? 'pending'
        : failedEvent
          ? 'failed'
          : completeEvent
            ? 'completed'
            : 'running';
      // A reason is only reported for a started stage that has a failed event.
      const reason = startEvent && failedEvent ? failedEvent.reason : undefined;

      let outputCount = 0;
      for (const output of stageOutputs) {
        if (output.stage === stage) {
          outputCount++;
        }
      }

      // Duration runs from the start event to the complete event, or to the
      // failed event when there is no complete event.
      const endEvent = completeEvent || failedEvent;
      const duration =
        startEvent && endEvent ? endEvent.created_at - startEvent.created_at : undefined;

      return { stage, status, reason, outputCount, duration };
    });

    return { workflowId, stages };
  }

  /**
   * Aggregate health metrics across every workflow in the store.
   *
   * Per-stage counters are taken from all workflows; `totalCalls`,
   * `avgDuration` and `failureRate` only consider workflows in a terminal
   * state (`completed`, `terminal_error`, `expired`, `cleanup_pending`).
   *
   * @returns Aggregate health statistics.
   */
  getTrinityHealth(): TrinityHealthResponse {
    const stageStats = {
      dreamer: { total: 0, completed: 0, failed: 0 },
      philosopher: { total: 0, completed: 0, failed: 0 },
      scribe: { total: 0, completed: 0, failed: 0 },
    };
    const stageNames = ['dreamer', 'philosopher', 'scribe'] as const;

    let totalCalls = 0;
    let totalDuration = 0;
    let failedCalls = 0;

    for (const workflow of this.store.listWorkflows()) {
      // Index the event types once instead of rescanning the list per check.
      const seenTypes = new Set<string>();
      for (const event of this.store.getEvents(workflow.workflow_id)) {
        seenTypes.add(event.event_type);
      }

      let workflowFailed = false;
      for (const stage of stageNames) {
        const stats = stageStats[stage];
        if (seenTypes.has(`trinity_${stage}_start`)) {
          stats.total++;
        }
        if (seenTypes.has(`trinity_${stage}_complete`)) {
          stats.completed++;
        }
        if (seenTypes.has(`trinity_${stage}_failed`)) {
          stats.failed++;
          workflowFailed = true;
        }
      }

      const { state } = workflow;
      const endedBadly = state === 'terminal_error' || state === 'expired';
      const isTerminal = state === 'completed' || endedBadly || state === 'cleanup_pending';
      if (!isTerminal) {
        continue;
      }

      // Terminal workflows without a recorded duration fall back to the time
      // elapsed since creation.
      totalCalls++;
      totalDuration += workflow.duration_ms ?? (Date.now() - workflow.created_at);
      if (workflowFailed || endedBadly) {
        failedCalls++;
      }
    }

    const avgDuration = totalCalls > 0 ? totalDuration / totalCalls : 0;
    const failureRate = totalCalls > 0 ? failedCalls / totalCalls : 0;

    return {
      totalCalls,
      avgDuration: Math.round(avgDuration),
      failureRate: Number(failureRate.toFixed(4)),
      stageStats,
    };
  }
}
|
|
208
|
+
|
|
209
|
+
/**
 * Extract the timeout configuration from a workflow's metadata JSON.
 *
 * The JSON is untrusted persisted data, so the value is validated instead of
 * cast: `timeoutMs` is returned only when it is a finite, non-negative
 * number. Anything else (invalid JSON, a non-object or array payload, a
 * missing or malformed `timeoutMs`) yields `{}` so the caller's default
 * timeout applies rather than a comparison against NaN or a negative value.
 *
 * @param metadataJson - Raw metadata JSON string.
 * @returns `{ timeoutMs }` when a valid timeout is present, otherwise `{}`.
 */
function parseWorkflowMetadata(metadataJson: string): { timeoutMs?: number } {
  let parsed: unknown;
  try {
    parsed = JSON.parse(metadataJson);
  } catch {
    return {};
  }

  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return {};
  }

  const { timeoutMs } = parsed as { timeoutMs?: unknown };
  if (typeof timeoutMs === 'number' && Number.isFinite(timeoutMs) && timeoutMs >= 0) {
    return { timeoutMs };
  }
  return {};
}
|
|
217
|
+
|
|
218
|
+
/**
 * Payload of the workflow listing endpoint.
 */
export interface WorkflowListResponse {
  /** Matching workflows, each enriched with stuck detection. */
  workflows: Array<WorkflowInfo>;
}
|
|
224
|
+
|
|
225
|
+
/**
 * One workflow as reported by the listing endpoint, including the result of
 * stuck detection.
 */
export interface WorkflowInfo {
  /** Identifier of the workflow. */
  workflowId: string;
  /** Workflow type as stored on the workflow record. */
  type: string;
  /** Stored workflow state, or `'stuck'` when stuck detection fired. */
  state: string;
  /** Time elapsed between the workflow's creation and the query. */
  duration: number;
  /** Creation time as an ISO 8601 string. */
  createdAt: string;
  /** Same value as `duration` when the workflow is stuck, otherwise `null`. */
  stuckDuration: number | null;
}
|
|
236
|
+
|
|
237
|
+
/**
 * Payload of the Trinity status endpoint for a single workflow.
 */
export interface TrinityStatusResponse {
  /** Identifier of the workflow that was queried. */
  workflowId: string;
  /** One entry per Trinity stage. */
  stages: Array<TrinityStageInfo>;
}
|
|
244
|
+
|
|
245
|
+
/**
 * State of a single Trinity stage within one workflow.
 */
export interface TrinityStageInfo {
  /** Which Trinity stage this entry describes. */
  stage: 'dreamer' | 'philosopher' | 'scribe';
  /** Stage status derived from the workflow's stage events. */
  status: 'pending' | 'running' | 'completed' | 'failed';
  /** Failure reason; only present for failed stages. */
  reason?: string;
  /** Number of stored outputs attributed to this stage. */
  outputCount: number;
  /** Time between the stage's start event and its end event, when both exist. */
  duration?: number;
}
|
|
255
|
+
|
|
256
|
+
/**
 * Payload of the Trinity health metrics endpoint.
 */
export interface TrinityHealthResponse {
  /** Number of workflows in a terminal state that were counted. */
  totalCalls: number;
  /** Mean duration of the counted workflows, rounded to an integer. */
  avgDuration: number;
  /** Fraction of counted workflows that failed, rounded to four decimals. */
  failureRate: number;
  /** Per-stage counters keyed by Trinity stage name. */
  stageStats: Record<'dreamer' | 'philosopher' | 'scribe', StageStats>;
}
|
|
269
|
+
|
|
270
|
+
/**
 * Counters for a single Trinity stage, aggregated across workflows.
 */
export interface StageStats {
  /** Workflows in which the stage has a start event. */
  total: number;
  /** Workflows in which the stage has a complete event. */
  completed: number;
  /** Workflows in which the stage has a failed event. */
  failed: number;
}
|