plugin-cluster-manager 1.1.7 → 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/client.js +1 -0
- package/dist/client/AclCacheManager.d.ts +2 -0
- package/dist/client/CacheMonitor.d.ts +2 -0
- package/dist/client/ClusterManagerLayout.d.ts +2 -0
- package/dist/client/ClusterNodes.d.ts +2 -0
- package/dist/client/ContainerOrchestrator.d.ts +2 -0
- package/dist/client/Doctor.d.ts +2 -0
- package/dist/client/EventQueueMonitor.d.ts +2 -0
- package/dist/client/LockMonitor.d.ts +2 -0
- package/dist/client/NginxCacheManager.d.ts +2 -0
- package/dist/client/PackageInstaller.d.ts +2 -0
- package/dist/client/PluginOperations.d.ts +2 -0
- package/dist/client/RedisMonitor.d.ts +2 -0
- package/dist/client/TaskManager.d.ts +2 -0
- package/dist/client/WorkflowExecutions.d.ts +2 -0
- package/dist/client/index.d.ts +5 -0
- package/dist/client/index.js +1 -1
- package/dist/client/utils/clientSafeCache.d.ts +3 -0
- package/dist/client/utils/requestDedupInterceptor.d.ts +2 -0
- package/dist/client/utils.d.ts +12 -0
- package/dist/externalVersion.js +5 -5
- package/dist/index.d.ts +2 -0
- package/dist/locale/en-US.json +97 -1
- package/dist/locale/vi-VN.json +98 -1
- package/dist/locale/zh-CN.json +98 -1
- package/dist/server/actions/acl-cache.d.ts +53 -0
- package/dist/server/actions/acl-cache.js +1 -1
- package/dist/server/actions/cache-monitor.d.ts +33 -0
- package/dist/server/actions/cache-monitor.js +301 -0
- package/dist/server/actions/cluster-nodes.d.ts +64 -0
- package/dist/server/actions/cluster-nodes.js +394 -10
- package/dist/server/actions/doctor.d.ts +82 -0
- package/dist/server/actions/doctor.js +1250 -0
- package/dist/server/actions/event-queue-monitor.d.ts +13 -0
- package/dist/server/actions/lock-monitor.d.ts +19 -0
- package/dist/server/actions/orchestrator.d.ts +58 -0
- package/dist/server/actions/package-manager.d.ts +6 -0
- package/dist/server/actions/plugin-operations.d.ts +6 -0
- package/dist/server/actions/redis-monitor.d.ts +12 -0
- package/dist/server/actions/tasks.d.ts +7 -0
- package/dist/server/actions/workflow-executions.d.ts +7 -0
- package/dist/server/adapters/redis-lock-adapter.d.ts +15 -0
- package/dist/server/adapters/redis-node-registry.d.ts +12 -0
- package/dist/server/adapters/redis-pubsub-adapter.d.ts +16 -0
- package/dist/server/collections/app.d.ts +8 -0
- package/dist/server/collections/cluster-manager-acl-cache.d.ts +22 -0
- package/dist/server/collections/cluster-manager-cache-mgr.d.ts +22 -0
- package/dist/server/collections/cluster-manager-cluster.d.ts +22 -0
- package/dist/server/collections/cluster-manager-doctor-runs.d.ts +3 -0
- package/dist/server/collections/cluster-manager-doctor-runs.js +52 -0
- package/dist/server/collections/cluster-manager-doctor.d.ts +18 -0
- package/dist/server/collections/cluster-manager-doctor.js +44 -0
- package/dist/server/collections/cluster-manager-lock.d.ts +22 -0
- package/dist/server/collections/cluster-manager-plugins.d.ts +18 -0
- package/dist/server/collections/cluster-manager-queue.d.ts +22 -0
- package/dist/server/collections/cluster-manager-redis.d.ts +22 -0
- package/dist/server/collections/cluster-manager-workflow.d.ts +22 -0
- package/dist/server/collections/cluster-manager.d.ts +22 -0
- package/dist/server/collections/orchestrator-settings.d.ts +59 -0
- package/dist/server/collections/orchestrator-stacks.d.ts +102 -0
- package/dist/server/collections/worker-orchestrator.d.ts +22 -0
- package/dist/server/collections/worker-packages-configs.d.ts +3 -0
- package/dist/server/collections/worker-packages.d.ts +22 -0
- package/dist/server/hooks/cacheInvalidationHooks.d.ts +1 -0
- package/dist/server/hooks/cacheInvalidationHooks.js +81 -0
- package/dist/server/index.d.ts +1 -0
- package/dist/server/middlewares/listMetaCacheMiddleware.d.ts +2 -0
- package/dist/server/middlewares/listMetaCacheMiddleware.js +79 -0
- package/dist/server/orchestrator/PackageManager.d.ts +39 -0
- package/dist/server/orchestrator/PackageManager.js +83 -27
- package/dist/server/orchestrator/docker-adapter.d.ts +41 -0
- package/dist/server/orchestrator/index.d.ts +4 -0
- package/dist/server/orchestrator/k8s-adapter.d.ts +50 -0
- package/dist/server/orchestrator/leader-election.d.ts +48 -0
- package/dist/server/orchestrator/types.d.ts +84 -0
- package/dist/server/plugin.d.ts +26 -0
- package/dist/server/plugin.js +70 -8
- package/dist/server/utils/node.d.ts +6 -0
- package/dist/server/utils/redis.d.ts +29 -0
- package/dist/server/utils/versionManager.d.ts +10 -0
- package/dist/server/utils/versionManager.js +91 -0
- package/dist/shared/packages.d.ts +23 -0
- package/package.json +41 -41
- package/server.js +1 -0
- package/src/client/CacheMonitor.tsx +166 -179
- package/src/client/ClusterManagerLayout.tsx +48 -42
- package/src/client/ClusterNodes.tsx +691 -418
- package/src/client/Doctor.tsx +559 -0
- package/src/client/NginxCacheManager.tsx +415 -0
- package/src/client/PluginOperations.tsx +234 -234
- package/src/client/index.tsx +22 -14
- package/src/client/utils/clientSafeCache.ts +41 -0
- package/src/client/utils/requestDedupInterceptor.ts +213 -0
- package/src/locale/en-US.json +97 -1
- package/src/locale/vi-VN.json +98 -1
- package/src/locale/zh-CN.json +98 -1
- package/src/server/__tests__/doctor.test.ts +53 -0
- package/src/server/actions/acl-cache.ts +272 -272
- package/src/server/actions/cache-monitor.ts +453 -116
- package/src/server/actions/cluster-nodes.ts +882 -378
- package/src/server/actions/doctor.ts +1540 -0
- package/src/server/collections/cluster-manager-doctor-runs.ts +23 -0
- package/src/server/collections/cluster-manager-doctor.ts +19 -0
- package/src/server/hooks/cacheInvalidationHooks.ts +58 -0
- package/src/server/middlewares/listMetaCacheMiddleware.ts +55 -0
- package/src/server/orchestrator/PackageManager.ts +19 -15
- package/src/server/plugin.ts +353 -263
- package/src/server/utils/versionManager.ts +69 -0
|
@@ -0,0 +1,1540 @@
|
|
|
1
|
+
import { Context } from '@nocobase/actions';
|
|
2
|
+
import Application from '@nocobase/server';
|
|
3
|
+
import crypto from 'crypto';
|
|
4
|
+
import { promises as fsp } from 'fs';
|
|
5
|
+
import os from 'os';
|
|
6
|
+
import path from 'path';
|
|
7
|
+
import { RedisNodeRegistry } from '../adapters/redis-node-registry';
|
|
8
|
+
import type { ContainerInfo, StackConfig } from '../orchestrator/types';
|
|
9
|
+
import { getLocalNodeId } from '../utils/node';
|
|
10
|
+
import { getRedisClient, scanKeys } from '../utils/redis';
|
|
11
|
+
import { packagesFromConfig, type CustomPackageMap, type WorkerPackageMap } from '../../shared/packages';
|
|
12
|
+
|
|
13
|
+
/** Redis key under which the JSON state of the currently running diagnostic session is stored. */
const ACTIVE_RUN_KEY = 'cluster-manager:doctor:active';
/** Key prefix for doctor responses (consumer is outside this part of the file). */
const RESPONSE_KEY_PREFIX = 'cluster-manager:doctor-response:';
/** Key prefix of the per-run lock taken while a run is being finished. */
const FINISH_LOCK_PREFIX = 'cluster-manager:doctor:finish-lock:';

/** Duration used when the caller supplies no usable value. */
const DEFAULT_DURATION_MS = 2 * 60 * 1000;
/** Upper bound applied by `clampDuration`. */
const MAX_DURATION_MS = 2 * 60 * 1000;
/** Lower bound applied by `clampDuration`. */
const MIN_DURATION_MS = 10 * 1000;
/** Grace period added after a run's deadline before its active-run marker is considered stale. */
const LOCK_BUFFER_MS = 30 * 1000;

/** TTL for snapshot responses, in seconds (usage not visible in this part of the file). */
const SNAPSHOT_RESPONSE_TTL_SECONDS = 90;
/** TTL of the finish lock, in seconds. */
const FINISH_LOCK_TTL_SECONDS = 90;

/** Hard cap on the number of log lines collected per node. */
const MAX_NODE_LOG_LINES = 800;
/** Cap on container log lines (usage not visible in this part of the file). */
const MAX_CONTAINER_LOG_LINES = 200;
/** File-name prefixes of the log files that are eligible for collection. */
const LOG_PREFIXES = ['system', 'system_error', 'request'];
|
|
25
|
+
|
|
26
|
+
/**
 * Minimal repository surface this module relies on.
 * All option bags are opaque records; `count` may be absent on some repositories.
 */
interface RepositoryLike {
  /** Resolves to the first matching record, or `null` when nothing matches. */
  findOne(options?: Record<string, unknown>): Promise<ModelLike | null>;
  /** Resolves to every matching record. */
  find(options?: Record<string, unknown>): Promise<ModelLike[]>;
  /** Creates a record and resolves to it. */
  create(options: Record<string, unknown>): Promise<ModelLike>;
  /** Updates records; the resolved value is not inspected here. */
  update(options: Record<string, unknown>): Promise<unknown>;
  /** Optional counting helper. */
  count?(options?: Record<string, unknown>): Promise<number>;
}
|
|
33
|
+
|
|
34
|
+
/** Minimal record shape: a field getter plus an optional plain-object serializer. */
interface ModelLike {
  /** Reads a single field by name. */
  get(name: string): unknown;
  /** Serializes the record to a plain object, when the record supports it. */
  toJSON?(): Record<string, unknown>;
}
|
|
38
|
+
|
|
39
|
+
/** Subset of a Redis client used by this module; raw commands go through `sendCommand`. */
interface RedisLike {
  /** Sends a raw command given as an array of string arguments. */
  sendCommand(command: string[]): Promise<unknown>;
  /** Optional connectivity probe. */
  ping?(): Promise<unknown>;
  /** Optional server information dump. */
  info?(): Promise<string>;
}
|
|
44
|
+
|
|
45
|
+
/** Subset of a pub/sub manager: publishing plus an optional connectivity check. */
interface PubSubLike {
  /** Publishes a payload on a channel; may complete synchronously or asynchronously. */
  publish(channel: string, payload: unknown): Promise<unknown> | unknown;
  /** Optional connection state probe; may answer synchronously or asynchronously. */
  isConnected?(): Promise<boolean> | boolean;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Narrow, structural view of the application object.
 * `getApp` casts the real `Application` to this shape.
 */
interface AppRuntime {
  /** Application name, when set. */
  name?: string;
  /** Database access: repositories, optional collection lookup and raw SQL. */
  db: {
    getRepository(name: string): RepositoryLike;
    hasCollection?(name: string): boolean;
    sequelize?: {
      query(sql: string): Promise<unknown>;
    };
  };
  /** Plugin manager lookup, when available. */
  pm?: {
    get(name: string): unknown;
  };
  /** Pub/sub manager, when available. */
  pubSubManager?: PubSubLike;
  /** Event queue handle; treated as opaque. */
  eventQueue?: unknown;
  /** Application logger; `debug` is optional. */
  logger: {
    info(message: string, meta?: Record<string, unknown>): unknown;
    warn(message: string, meta?: Record<string, unknown>): unknown;
    error(message: string, meta?: Record<string, unknown>): unknown;
    debug?(message: string, meta?: Record<string, unknown>): unknown;
  };
}
|
|
71
|
+
|
|
72
|
+
/** State of a diagnostic session that is in progress (serialized as JSON when kept in Redis). */
interface DoctorActiveState {
  /** Identifier of the run. */
  runId: string;
  /** Start time as a date string. */
  startedAt: string;
  /** Time at which the run is due to finish, as a date string. */
  deadlineAt: string;
  /** Planned run length in milliseconds. */
  durationMs: number;
  /** Label of the user who started the run, when known. */
  startedBy?: string;
}
|
|
79
|
+
|
|
80
|
+
/** Loosely-typed description of a cluster node; every field is optional. */
interface DoctorNodeRecord {
  id?: string;
  name?: string;
  hostname?: string;
  appVersion?: string;
  /** Worker mode string of the node. */
  workerMode?: string;
  /** Whether the node is flagged as a sandbox. */
  isSandbox?: boolean;
  status?: string;
  /** Last heartbeat as a number (presumably epoch milliseconds; confirm against the producer). */
  lastHeartbeatAt?: number;
  pid?: number;
}
|
|
91
|
+
|
|
92
|
+
/** One collected log line together with the fields extracted from it. */
interface DiagnosticLogLine {
  /** Name of the log file the line came from. */
  source: string;
  /** The redacted line text. */
  line: string;
  /** Timestamp found in the line, if any. */
  timestamp?: string;
  /** Lower-cased log level, if one was found. */
  level?: string;
  /** Message portion of the line, if it could be extracted. */
  message?: string;
  /** Stack trace carried by a structured line, if any. */
  stack?: string;
}
|
|
100
|
+
|
|
101
|
+
/** Aggregate for one normalized error/warning message. */
interface LogSignature {
  /** Normalized message text used for grouping. */
  signature: string;
  /** Level shared by the grouped lines. */
  level: string;
  /** Number of lines that fell into this group. */
  count: number;
  /** Timestamp recorded for the first occurrence, when available. */
  firstSeen?: string;
  /** Timestamp recorded for the last occurrence, when available. */
  lastSeen?: string;
  /** Distinct log files the group was seen in. */
  sources: string[];
  /** A few example lines, truncated. */
  samples: string[];
}
|
|
110
|
+
|
|
111
|
+
/** Summary produced from a batch of collected log lines. */
interface LogAnalysis {
  /** Number of lines that were analyzed. */
  totalLines: number;
  /** Line count per log level. */
  levels: Record<string, number>;
  /** Grouped error/warning signatures. */
  signatures: LogSignature[];
}
|
|
116
|
+
|
|
117
|
+
/** Per-plugin comparison of the database row against what is loaded in the running process. */
interface PluginSnapshot {
  name?: string;
  packageName?: string;
  /** Enabled flag taken from the plugin row. */
  enabled?: boolean;
  /** Version recorded in the plugin row. */
  dbVersion?: string;
  /** Whether a plugin instance was found in the plugin manager. */
  loaded: boolean;
  /** Version reported by the loaded plugin instance, if any. */
  runtimeVersion?: string;
}
|
|
125
|
+
|
|
126
|
+
/** Diagnostic snapshot of a single node: process, OS, environment, plugins and logs. */
interface DoctorNodeSnapshot {
  /** Identifier of the node the snapshot was taken on. */
  nodeId: string;
  /** Process-level identity and runtime facts. */
  node: {
    hostname: string;
    pid: number;
    workerMode: string;
    role: string;
    appVersion: string;
    nodeVersion: string;
    platform: string;
    arch: string;
    uptime: number;
    isSandbox: boolean;
  };
  /** Memory usage of the process. */
  memory: NodeJS.MemoryUsage;
  /** Host-level memory and CPU figures. */
  os: {
    totalMemory: number;
    freeMemory: number;
    cpuCount: number;
    loadAvg: number[];
  };
  /** Selected environment variables. */
  env: Record<string, string | undefined>;
  /** Plugin state as seen by this node. */
  plugins: PluginSnapshot[];
  /** Collected log files, lines and their analysis. */
  logs: {
    files: Array<{ file: string; lineCount: number }>;
    lines: DiagnosticLogLine[];
    analysis: LogAnalysis;
  };
  /** Time the snapshot was taken, as a date string. */
  collectedAt: string;
  /** Error text, when one is attached to the snapshot. */
  error?: string;
}
|
|
157
|
+
|
|
158
|
+
/** Options controlling what a local snapshot collects. */
interface DoctorSnapshotOptions {
  /** Run the snapshot belongs to, if any. */
  runId?: string;
  /** Lower bound (epoch milliseconds) for timestamped log lines. */
  sinceMs?: number;
  /** Upper bound (epoch milliseconds) for timestamped log lines. */
  untilMs?: number;
  /** Requested maximum number of log lines. */
  maxLines?: number;
}
|
|
164
|
+
|
|
165
|
+
/** Package names grouped by package manager, after normalization. */
interface NormalizedPackages {
  /** apt package names. */
  apt: string[];
  /** npm package names. */
  npm: string[];
  /** Python package names. */
  python: string[];
}
|
|
170
|
+
|
|
171
|
+
/**
 * Raised when a diagnostic session cannot start because another one is
 * already active. `activeRun` carries the state of the blocking run when
 * it could be read, otherwise `null`.
 */
class ActiveDoctorRunError extends Error {
  constructor(public activeRun: DoctorActiveState | null) {
    super('A diagnostic session is already running.');
    // Give the error a distinguishable name instead of the inherited "Error".
    this.name = 'ActiveDoctorRunError';
    // Keeps `instanceof` reliable when the class is compiled to a target
    // where extending built-ins loses the prototype chain.
    Object.setPrototypeOf(this, new.target.prototype);
  }
}
|
|
176
|
+
|
|
177
|
+
/** Pending auto-finish timers of this process, keyed by run id. */
const timers: Map<string, NodeJS.Timeout> = new Map();
/** In-process record of the active run, used when no Redis client is available. */
let localActiveRun: DoctorActiveState | null = null;
|
|
179
|
+
|
|
180
|
+
/** Re-types the application object as the narrow `AppRuntime` view; no runtime conversion happens. */
function getApp(app: Application): AppRuntime {
  const untyped: unknown = app;
  return untyped as AppRuntime;
}
|
|
183
|
+
|
|
184
|
+
/**
 * Returns the request payload for an action: the action's `values` when
 * truthy, otherwise the raw request body, otherwise an empty object.
 */
function getPayload(ctx: Context) {
  const values = ctx.action.params.values;
  if (values) {
    return values as Record<string, unknown>;
  }
  const withRequest = ctx as unknown as { request?: { body?: unknown } };
  const body = withRequest.request?.body;
  return (body || {}) as Record<string, unknown>;
}
|
|
189
|
+
|
|
190
|
+
/**
 * Converts a requested duration to a number and clamps it into
 * [MIN_DURATION_MS, MAX_DURATION_MS]. Values that convert to 0 or NaN
 * fall back to DEFAULT_DURATION_MS before clamping.
 */
function clampDuration(value: unknown) {
  const requested = Number(value);
  const duration = requested ? requested : DEFAULT_DURATION_MS;
  if (duration < MIN_DURATION_MS) {
    return MIN_DURATION_MS;
  }
  if (duration > MAX_DURATION_MS) {
    return MAX_DURATION_MS;
  }
  return duration;
}
|
|
194
|
+
|
|
195
|
+
/**
 * Serializes a record to a plain object.
 * Returns `null` for a missing record and `{}` when the record has no `toJSON`.
 */
function modelToJSON(model: ModelLike | null | undefined): Record<string, unknown> | null {
  if (!model) {
    return null;
  }
  return typeof model.toJSON === 'function' ? model.toJSON() : {};
}
|
|
202
|
+
|
|
203
|
+
/**
 * Reads `key` from either a record with a `get` method or a plain object.
 * The getter is used whenever the object exposes a function named `get`.
 */
function getModelValue(model: ModelLike | Record<string, unknown>, key: string): unknown {
  const hasGetter = 'get' in model && typeof model.get === 'function';
  if (!hasGetter) {
    return (model as Record<string, unknown>)[key];
  }
  return (model as ModelLike).get(key);
}
|
|
209
|
+
|
|
210
|
+
/** Looks up the repository called `name` on the application's database. */
function getRepository(app: Application, name: string): RepositoryLike {
  const { db } = getApp(app);
  return db.getRepository(name);
}
|
|
213
|
+
|
|
214
|
+
/**
 * Builds a display label for the current user: the first truthy value of
 * nickname, username or id, or the string 'unknown'.
 */
function getUserLabel(ctx: Context) {
  const withState = ctx as unknown as { state?: { currentUser?: Record<string, unknown> } };
  const user = withState.state?.currentUser;
  const label = user?.nickname || user?.username || user?.id || 'unknown';
  return String(label);
}
|
|
219
|
+
|
|
220
|
+
/** Extracts a printable message: `message` for Error instances, `String(value)` for anything else. */
function getErrorMessage(error: unknown) {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
|
|
223
|
+
|
|
224
|
+
/**
 * Leniently decodes a value that may be a JSON string.
 *
 * - Falsy input yields `fallback`.
 * - Non-string input is assumed to be already decoded and is returned as is.
 * - String input is parsed; invalid JSON yields `fallback`.
 * - A string that parses to `null` also yields `fallback`, so a stored
 *   "null" behaves the same as a raw `null` and callers that immediately
 *   read properties of the result do not crash.
 *
 * @param value Raw value, typically a database column or a Redis reply.
 * @param fallback Value returned when nothing usable can be decoded.
 */
function parseJson<T>(value: unknown, fallback: T): T {
  if (!value) return fallback;
  if (typeof value !== 'string') return value as T;
  try {
    const parsed: unknown = JSON.parse(value);
    return parsed === null || parsed === undefined ? fallback : (parsed as T);
  } catch {
    return fallback;
  }
}
|
|
233
|
+
|
|
234
|
+
/**
 * Turns an arbitrary value into a list of unique, trimmed, non-empty strings.
 * Non-arrays yield an empty list; non-string entries are ignored; the first
 * occurrence of each string determines its position.
 */
function normalizeList(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  const unique = new Set<string>();
  for (const item of value) {
    if (typeof item !== 'string') {
      continue;
    }
    const trimmed = item.trim();
    if (trimmed) {
      unique.add(trimmed);
    }
  }
  return [...unique];
}
|
|
245
|
+
|
|
246
|
+
/** Normalizes each package list of a worker package map; a missing map yields three empty lists. */
function normalizePackageMap(packages?: WorkerPackageMap): NormalizedPackages {
  const apt = normalizeList(packages?.apt);
  const npm = normalizeList(packages?.npm);
  const python = normalizeList(packages?.python);
  return { apt, npm, python };
}
|
|
253
|
+
|
|
254
|
+
/**
 * Decodes a custom package map (JSON string or object) and normalizes its
 * `python`, `node` and `npm` lists. Undecodable input is treated as three
 * empty lists.
 */
function parseCustomPackages(value: unknown): CustomPackageMap {
  const emptyMap: CustomPackageMap = { python: [], node: [], npm: [] };
  const custom = parseJson<CustomPackageMap>(value, emptyMap);
  const python = normalizeList(custom.python);
  const node = normalizeList(custom.node);
  const npm = normalizeList(custom.npm);
  return { python, node, npm };
}
|
|
262
|
+
|
|
263
|
+
/**
 * Decodes a package whitelist (JSON string or object) into normalized lists.
 * The `npm` and `node` entries of the whitelist are merged into `npm`.
 *
 * Each list is passed through `normalizeList` before merging, so a
 * malformed entry (a bare string, an object, a number) is ignored instead
 * of being spread into characters or throwing. A whitelist that decodes to
 * a non-object is treated as empty.
 *
 * @param value Raw whitelist value.
 */
function parsePackageWhitelist(value: unknown): NormalizedPackages {
  const decoded = parseJson<{ apt?: unknown; npm?: unknown; node?: unknown; python?: unknown } | null>(value, {});
  const whitelist = decoded !== null && typeof decoded === 'object' ? decoded : {};
  return {
    apt: normalizeList(whitelist.apt),
    // Normalize both sources first, then de-duplicate the merged list.
    npm: normalizeList([...normalizeList(whitelist.npm), ...normalizeList(whitelist.node)]),
    python: normalizeList(whitelist.python),
  };
}
|
|
271
|
+
|
|
272
|
+
/** Returns, per package manager, the expected packages that are not in the installed list. */
function diffPackages(expected: NormalizedPackages, installed: NormalizedPackages): NormalizedPackages {
  const missingFrom = (wanted: string[], present: string[]): string[] => {
    const have = new Set(present);
    return wanted.filter((pkg) => !have.has(pkg));
  };
  return {
    apt: missingFrom(expected.apt, installed.apt),
    npm: missingFrom(expected.npm, installed.npm),
    python: missingFrom(expected.python, installed.python),
  };
}
|
|
279
|
+
|
|
280
|
+
/** Total number of package names across apt, npm and python. */
function countPackages(packages: NormalizedPackages) {
  const { apt, npm, python } = packages;
  return [apt, npm, python].reduce((total, list) => total + list.length, 0);
}
|
|
283
|
+
|
|
284
|
+
/**
 * Classifies a node. Sandbox nodes are always 'sandbox'; otherwise the
 * worker modes 'worker', 'task' and '*' map to 'worker' and everything
 * else (including a missing mode) maps to 'app'.
 */
function getNodeRole(node: { workerMode?: string; isSandbox?: boolean }): 'app' | 'worker' | 'sandbox' {
  if (node.isSandbox) {
    return 'sandbox';
  }
  const mode = node.workerMode || 'main';
  const workerModes = ['worker', 'task', '*'];
  if (workerModes.includes(mode)) {
    return 'worker';
  }
  return 'app';
}
|
|
291
|
+
|
|
292
|
+
/**
 * Returns a fixed allow-list of environment variables. Every listed key is
 * present in the result; unset variables have the value `undefined`.
 */
function getSafeEnv() {
  const {
    APP_ENV,
    APP_NAME,
    APP_ROLE,
    APP_PORT,
    CLUSTER_MODE,
    WORKER_MODE,
    LOGGER_LEVEL,
    LOGGER_FORMAT,
    LOGGER_TRANSPORT,
    NOCOBASE_VERSION,
    DB_DIALECT,
  } = process.env;
  return {
    APP_ENV,
    APP_NAME,
    APP_ROLE,
    APP_PORT,
    CLUSTER_MODE,
    WORKER_MODE,
    LOGGER_LEVEL,
    LOGGER_FORMAT,
    LOGGER_TRANSPORT,
    NOCOBASE_VERSION,
    DB_DIALECT,
  };
}
|
|
307
|
+
|
|
308
|
+
/**
 * Masks credentials in a piece of log text.
 *
 * Four patterns are replaced, in this order:
 * 1. JSON string members whose key ends in a sensitive word
 *    (`"password":"x"`, `"accessToken": "x"`). The value is swapped for
 *    "[REDACTED]" and the surrounding JSON stays valid.
 * 2. `key=value` pairs whose key ends in a sensitive word.
 * 3. `Bearer <token>` sequences.
 * 4. `user:password@` credentials embedded in URLs.
 *
 * @param value Text to sanitize.
 * @returns The text with matched secrets replaced by "[REDACTED]".
 */
function redactText(value: string) {
  return value
    .replace(
      // The value part accepts escaped characters so an embedded \" does not end the match early.
      /("[\w.-]*(?:authorization|cookie|token|secret|password|passwd|pwd|api[-_]?key)"\s*:\s*)"(?:[^"\\]|\\.)*"/gi,
      '$1"[REDACTED]"',
    )
    .replace(
      /(authorization|cookie|set-cookie|token|secret|password|passwd|pwd|api[-_]?key)=([^,\s&]+)/gi,
      '$1=[REDACTED]',
    )
    .replace(/(Bearer\s+)[A-Za-z0-9._~+/=-]+/gi, '$1[REDACTED]')
    .replace(/:\/\/([^:/\s]+):([^@/\s]+)@/g, '://[REDACTED]:[REDACTED]@');
}
|
|
317
|
+
|
|
318
|
+
/**
 * Resolves the directory that holds this application's log files:
 * `<base>/<appName>`, where the base is LOGGER_BASE_PATH or
 * `<cwd>/storage/logs`, and the app name is APP_NAME, the application's
 * own name, or 'main'.
 */
function getLogDir(app: Application) {
  const runtime = getApp(app);
  const defaultBase = path.resolve(process.cwd(), 'storage', 'logs');
  const basePath = process.env.LOGGER_BASE_PATH || defaultBase;
  const appName = process.env.APP_NAME || runtime.name || 'main';
  return path.resolve(basePath, appName);
}
|
|
324
|
+
|
|
325
|
+
/**
 * Reads up to `maxLines` non-blank lines from the end of a text file
 * without loading the whole file. The read window is at most
 * `maxLines * 2048` bytes, so very long lines can reduce the number of
 * lines returned.
 *
 * Only complete lines are returned: when the window starts inside the
 * file, the line that straddles the window start is discarded instead of
 * being returned truncated. Any failure (missing file, permissions, ...)
 * yields an empty list.
 *
 * @param filePath Path of the file to read.
 * @param maxLines Maximum number of lines; values below 1 yield `[]`.
 * @returns The last lines of the file, oldest first.
 */
async function readTailLines(filePath: string, maxLines: number): Promise<string[]> {
  const limit = Math.floor(maxLines);
  // Guards `slice(-0)`, which would otherwise return every line.
  if (!(limit > 0)) return [];

  try {
    const fh = await fsp.open(filePath, 'r');
    try {
      // Stat through the open handle so size and content refer to the same file.
      const { size } = await fh.stat();
      const windowSize = Math.min(size, limit * 2048);
      const windowStart = size - windowSize;
      // Start one byte early: the first segment then ends at the first line
      // break at or after that byte, and dropping it leaves only lines that
      // begin inside the window.
      const readStart = Math.max(0, windowStart - 1);
      const length = size - readStart;
      const buffer = Buffer.alloc(length);

      // A single read may return fewer bytes than requested.
      let filled = 0;
      while (filled < length) {
        const { bytesRead } = await fh.read(buffer, filled, length - filled, readStart + filled);
        if (bytesRead === 0) break; // File shrank since stat.
        filled += bytesRead;
      }

      const segments = buffer.subarray(0, filled).toString('utf8').split(/\r?\n/);
      if (readStart > 0) {
        segments.shift();
      }
      return segments.filter((line) => line.trim()).slice(-limit);
    } finally {
      await fh.close();
    }
  } catch {
    // Best effort: an unreadable log file contributes no lines.
    return [];
  }
}
|
|
345
|
+
|
|
346
|
+
/**
 * Extracts a timestamp (epoch milliseconds) from a log line.
 *
 * A JSON line with a truthy `timestamp` field is decided by that field
 * alone. Otherwise the first `YYYY-MM-DD HH:MM:SS`-style timestamp in the
 * text is used. Returns `null` when nothing parsable is found.
 */
function parseLogTimestamp(line: string): number | null {
  const toMillis = (text: string): number | null => {
    const millis = Date.parse(text);
    return Number.isFinite(millis) ? millis : null;
  };

  try {
    const { timestamp } = JSON.parse(line) as { timestamp?: string };
    if (timestamp) {
      return toMillis(timestamp);
    }
  } catch {
    // Not JSON (or not an object): continue with plain-text detection.
  }

  const found = /(\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?)/.exec(line);
  if (!found) {
    return null;
  }
  const stamp = found[1];
  // Date.parse expects the ISO "T" separator.
  return toMillis(stamp.includes('T') ? stamp : stamp.replace(' ', 'T'));
}
|
|
363
|
+
|
|
364
|
+
/**
 * Converts one raw log line into a `DiagnosticLogLine`.
 *
 * The raw line is parsed as JSON before any redaction. Redacting first can
 * swallow a closing quote or brace (for example a message ending in
 * `token=abc"}`), which would make the line unparsable and lose its
 * level and timestamp. Text that leaves this function (`line`, `message`,
 * `stack`) is always redacted.
 *
 * Lines that are not JSON objects are handled as plain text: the level is
 * taken from a `level=<x>` or `[<x>` marker and the message is the line
 * with a leading date prefix removed.
 *
 * In both paths the level is lower-cased and `warning` becomes `warn`.
 *
 * @param source Name of the file the line came from.
 * @param rawLine The unmodified log line.
 */
function parseDiagnosticLine(source: string, rawLine: string): DiagnosticLogLine {
  const redacted = redactText(rawLine);
  const normalizeLevel = (level: string) => level.toLowerCase().replace('warning', 'warn');

  let parsed: unknown;
  try {
    parsed = JSON.parse(rawLine);
  } catch {
    parsed = undefined;
  }

  if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
    const record = parsed as Record<string, unknown>;
    return {
      source,
      line: redacted,
      timestamp: typeof record.timestamp === 'string' ? record.timestamp : undefined,
      level: typeof record.level === 'string' ? normalizeLevel(record.level) : undefined,
      // Fields come from the unredacted JSON, so redact them individually.
      message: typeof record.message === 'string' ? redactText(record.message) : undefined,
      stack: typeof record.stack === 'string' ? redactText(record.stack) : undefined,
    };
  }

  const levelMatch = redacted.match(/\b(?:level=|\[)(error|warn|warning|info|debug|trace)\b/i);
  const timestamp = parseLogTimestamp(redacted);
  return {
    source,
    line: redacted,
    timestamp: timestamp ? new Date(timestamp).toISOString() : undefined,
    level: levelMatch?.[1] ? normalizeLevel(levelMatch[1]) : undefined,
    message: redacted.replace(/^\d{4}-\d{2}-\d{2}[^[]*/, '').trim(),
  };
}
|
|
388
|
+
|
|
389
|
+
/**
 * Reduces a log message to a grouping signature by replacing variable
 * parts with placeholders (timestamps, UUIDs, long hex strings, integers,
 * in that order), collapsing whitespace and truncating to 220 characters.
 */
function normalizeSignature(value: string) {
  const withoutTimes = value.replace(
    /\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?/g,
    '<time>',
  );
  const withoutUuids = withoutTimes.replace(
    /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi,
    '<uuid>',
  );
  const withoutHex = withoutUuids.replace(/\b[0-9a-f]{16,}\b/gi, '<hex>');
  const withoutNumbers = withoutHex.replace(/\b\d+\b/g, '<num>');
  const collapsed = withoutNumbers.replace(/\s+/g, ' ').trim();
  return collapsed.slice(0, 220);
}
|
|
399
|
+
|
|
400
|
+
/**
 * Summarizes a batch of log lines.
 *
 * Every line is counted under its level. Lines without an explicit level
 * are classified by searching the text for "error" and then "warn", and
 * default to 'info'. Error and warning lines are grouped by a normalized
 * signature built from the stack, the message, or the whole line.
 *
 * `firstSeen` / `lastSeen` are the earliest and latest timestamps of the
 * group regardless of the order in which lines are supplied (lines are
 * gathered file by file and are not globally chronological). Timestamps
 * that cannot be parsed fall back to encounter order.
 *
 * @param lines Lines to analyze.
 * @returns Totals per level and the 50 most frequent signatures.
 */
function analyzeLogLines(lines: DiagnosticLogLine[]): LogAnalysis {
  const levels: Record<string, number> = {};
  const signatures = new Map<string, LogSignature>();

  // True when `candidate` should replace `current` as the earliest timestamp.
  const isEarlier = (candidate: string, current?: string) => {
    if (!current) return true;
    const a = Date.parse(candidate);
    const b = Date.parse(current);
    return Number.isFinite(a) && Number.isFinite(b) && a < b;
  };
  // True when `candidate` should replace `current` as the latest timestamp.
  const isLaterOrUnknown = (candidate: string, current?: string) => {
    if (!current) return true;
    const a = Date.parse(candidate);
    const b = Date.parse(current);
    // Without comparable values, keep the most recently encountered one.
    if (!Number.isFinite(a) || !Number.isFinite(b)) return true;
    return a >= b;
  };

  for (const item of lines) {
    const level = item.level || (/error/i.test(item.line) ? 'error' : /warn/i.test(item.line) ? 'warn' : 'info');
    levels[level] = (levels[level] || 0) + 1;

    if (level !== 'error' && level !== 'warn') {
      continue;
    }

    const base = item.stack || item.message || item.line;
    const signature = normalizeSignature(base);
    const key = `${level}:${signature}`;
    const existing = signatures.get(key);
    if (!existing) {
      signatures.set(key, {
        signature,
        level,
        count: 1,
        firstSeen: item.timestamp,
        lastSeen: item.timestamp,
        sources: [item.source],
        samples: [item.line.slice(0, 1000)],
      });
      continue;
    }

    existing.count++;
    if (item.timestamp) {
      if (isEarlier(item.timestamp, existing.firstSeen)) {
        existing.firstSeen = item.timestamp;
      }
      if (isLaterOrUnknown(item.timestamp, existing.lastSeen)) {
        existing.lastSeen = item.timestamp;
      }
    }
    if (!existing.sources.includes(item.source)) {
      existing.sources.push(item.source);
    }
    if (existing.samples.length < 3) {
      existing.samples.push(item.line.slice(0, 1000));
    }
  }

  return {
    totalLines: lines.length,
    levels,
    signatures: [...signatures.values()].sort((a, b) => b.count - a.count).slice(0, 50),
  };
}
|
|
444
|
+
|
|
445
|
+
/**
 * Chooses which log files to read.
 *
 * Each `.log` file is assigned to the longest prefix it starts with, so
 * `system_error*` files do not count as `system*` files. Files are then
 * taken round-robin across the prefix families, greatest name first within
 * each family, until `maxFiles` are selected. This stops a single family
 * with many rotated files from using up the whole budget. The result is
 * ordered by name, descending.
 */
function pickDoctorLogFiles(names: string[], prefixes: string[], maxFiles: number): string[] {
  const longestFirst = [...prefixes].sort((a, b) => b.length - a.length);
  const families = new Map<string, string[]>();
  for (const name of names) {
    if (!name.endsWith('.log')) continue;
    const prefix = longestFirst.find((candidate) => name.startsWith(candidate));
    if (prefix === undefined) continue;
    const family = families.get(prefix);
    if (family) {
      family.push(name);
    } else {
      families.set(prefix, [name]);
    }
  }

  const ordered = [...families.values()].map((family) => [...family].sort().reverse());
  const picked: string[] = [];
  for (let round = 0; picked.length < maxFiles; round++) {
    let added = false;
    for (const family of ordered) {
      if (round < family.length && picked.length < maxFiles) {
        picked.push(family[round]);
        added = true;
      }
    }
    if (!added) break;
  }
  return picked.sort().reverse();
}

/**
 * Collects recent lines from this node's log files and analyzes them.
 *
 * Up to 12 files whose names start with one of LOG_PREFIXES are read from
 * the log directory, spread evenly across the prefix families. From each
 * file only the tail is read. Lines with a recognizable timestamp are kept
 * only if it falls inside [sinceMs, untilMs]; lines without one are always
 * kept. The result is capped at `maxLines` (never more than
 * MAX_NODE_LOG_LINES).
 *
 * A missing or unreadable log directory yields an empty result.
 *
 * @param app Application whose log directory is read.
 * @param options Time window and line limit.
 */
async function readDiagnosticLogs(app: Application, options: DoctorSnapshotOptions) {
  const logDir = getLogDir(app);
  const maxLines = Math.min(Number(options.maxLines) || MAX_NODE_LOG_LINES, MAX_NODE_LOG_LINES);
  const sinceMs = Number(options.sinceMs) || 0;
  const untilMs = Number(options.untilMs) || Date.now();
  const files: Array<{ file: string; lineCount: number }> = [];
  const lines: DiagnosticLogLine[] = [];

  let names: string[] = [];
  try {
    names = await fsp.readdir(logDir);
  } catch {
    return { files, lines, analysis: analyzeLogLines(lines) };
  }

  const candidates = pickDoctorLogFiles(names, LOG_PREFIXES, 12);

  const perFileLimit = Math.max(50, Math.ceil(maxLines / Math.max(candidates.length, 1)));
  for (const file of candidates) {
    const filePath = path.resolve(logDir, file);
    const rawLines = await readTailLines(filePath, perFileLimit);
    const parsedLines = rawLines
      .map((line) => parseDiagnosticLine(file, line))
      .filter((line) => {
        const timestamp = line.timestamp ? Date.parse(line.timestamp) : parseLogTimestamp(line.line);
        // Lines without a usable timestamp cannot be windowed, so keep them.
        if (!timestamp) return true;
        return timestamp >= sinceMs && timestamp <= untilMs;
      });

    if (parsedLines.length > 0) {
      files.push({ file, lineCount: parsedLines.length });
      lines.push(...parsedLines);
    }
  }

  const limitedLines = lines.slice(-maxLines);
  return {
    files,
    lines: limitedLines,
    analysis: analyzeLogLines(limitedLines),
  };
}
|
|
491
|
+
|
|
492
|
+
/**
 * Loads all rows of the `applicationPlugins` repository, sorted by name,
 * as plain objects. Any failure results in an empty list.
 */
async function getApplicationPluginRows(app: Application) {
  try {
    const pluginRows = await getRepository(app, 'applicationPlugins').find({ sort: ['name'] });
    const plainRows: Record<string, unknown>[] = [];
    for (const pluginRow of pluginRows) {
      plainRows.push(modelToJSON(pluginRow) || {});
    }
    return plainRows;
  } catch {
    return [];
  }
}
|
|
501
|
+
|
|
502
|
+
/**
 * Looks up a loaded plugin instance, first by plugin name and then by
 * package name. Non-string identifiers are looked up as ''. Returns `null`
 * when there is no plugin manager or neither lookup yields a truthy value.
 */
function getLoadedPlugin(app: Application, name?: unknown, packageName?: unknown) {
  const manager = getApp(app).pm;
  if (!manager?.get) {
    return null;
  }
  const byName = manager.get(typeof name === 'string' ? name : '');
  if (byName) {
    return byName;
  }
  const byPackage = manager.get(typeof packageName === 'string' ? packageName : '');
  return byPackage || null;
}
|
|
509
|
+
|
|
510
|
+
/**
 * Reads the version of a loaded plugin instance, preferring
 * `options.packageJson.version` over the instance's own `version`.
 * Returns `undefined` when neither is available.
 */
function getRuntimePluginVersion(instance: unknown) {
  type VersionedPlugin = {
    version?: string;
    options?: {
      packageJson?: {
        version?: string;
      };
    };
  };
  const plugin = instance as VersionedPlugin | null;
  const packageVersion = plugin?.options?.packageJson?.version;
  if (packageVersion) {
    return packageVersion;
  }
  return plugin?.version;
}
|
|
521
|
+
|
|
522
|
+
/**
 * Builds one `PluginSnapshot` per plugin row, pairing the stored row
 * (name, package, enabled flag, version) with whether an instance is
 * loaded in this process and which version that instance reports.
 */
async function getLocalPluginSnapshot(app: Application): Promise<PluginSnapshot[]> {
  const asString = (field: unknown) => (typeof field === 'string' ? field : undefined);
  const pluginRows = await getApplicationPluginRows(app);
  const snapshots: PluginSnapshot[] = [];
  for (const pluginRow of pluginRows) {
    const loadedInstance = getLoadedPlugin(app, pluginRow.name, pluginRow.packageName);
    snapshots.push({
      name: asString(pluginRow.name),
      packageName: asString(pluginRow.packageName),
      enabled: Boolean(pluginRow.enabled),
      dbVersion: asString(pluginRow.version),
      loaded: Boolean(loadedInstance),
      runtimeVersion: getRuntimePluginVersion(loadedInstance),
    });
  }
  return snapshots;
}
|
|
536
|
+
|
|
537
|
+
/**
 * Collects the diagnostic snapshot of the current process: identity and
 * runtime facts, memory and host figures, allow-listed environment
 * variables, plugin state and recent logs.
 *
 * @param app Application to inspect.
 * @param options Log window and line limit, forwarded to the log reader.
 */
export async function collectLocalDoctorSnapshot(
  app: Application,
  options: DoctorSnapshotOptions = {},
): Promise<DoctorNodeSnapshot> {
  const workerMode = process.env.WORKER_MODE || 'main';
  const isSandbox = process.env.SKILL_HUB_SANDBOX === 'true';
  const hostname = os.hostname();
  const node = {
    hostname,
    pid: process.pid,
    workerMode,
    role: getNodeRole({ workerMode, isSandbox }),
    // Falls back to the Node.js version string when NOCOBASE_VERSION is unset.
    appVersion: process.env.NOCOBASE_VERSION || process.version,
    nodeVersion: process.version,
    platform: process.platform,
    arch: process.arch,
    uptime: process.uptime(),
    isSandbox,
  };

  const nodeId = getLocalNodeId(app);
  const memory = process.memoryUsage();
  const hostStats = {
    totalMemory: os.totalmem(),
    freeMemory: os.freemem(),
    cpuCount: os.cpus().length,
    loadAvg: os.loadavg(),
  };
  const env = getSafeEnv();
  // Plugins first, then logs, one after the other.
  const plugins = await getLocalPluginSnapshot(app);
  const logs = await readDiagnosticLogs(app, options);
  const collectedAt = new Date().toISOString();

  return { nodeId, node, memory, os: hostStats, env, plugins, logs, collectedAt };
}
|
|
571
|
+
|
|
572
|
+
/**
 * Returns the state of the diagnostic run currently marked active, or
 * `null` when there is none.
 *
 * With Redis the state is read from ACTIVE_RUN_KEY. Without Redis the
 * in-process record is used; it is dropped once its deadline plus
 * LOCK_BUFFER_MS has passed.
 */
async function getActiveRunState(app: Application): Promise<DoctorActiveState | null> {
  const redis = getRedisClient(app) as RedisLike | null;

  if (!redis) {
    if (!localActiveRun) {
      return null;
    }
    const staleAfter = Date.parse(localActiveRun.deadlineAt) + LOCK_BUFFER_MS;
    if (staleAfter < Date.now()) {
      localActiveRun = null;
      return null;
    }
    return localActiveRun;
  }

  const raw = await redis.sendCommand(['GET', ACTIVE_RUN_KEY]);
  const hasPayload = typeof raw === 'string' && Boolean(raw);
  return hasPayload ? parseJson<DoctorActiveState | null>(raw, null) : null;
}
|
|
587
|
+
|
|
588
|
+
/**
 * Marks `state` as the active diagnostic run.
 *
 * With Redis the marker is written with SET NX and a TTL of the run
 * duration plus LOCK_BUFFER_MS. Without Redis the in-process record is
 * used.
 *
 * @throws ActiveDoctorRunError when another run is already marked active.
 */
async function acquireActiveRun(app: Application, state: DoctorActiveState) {
  const redis = getRedisClient(app) as RedisLike | null;

  if (!redis) {
    const blocking = await getActiveRunState(app);
    if (blocking) {
      throw new ActiveDoctorRunError(blocking);
    }
    localActiveRun = state;
    return;
  }

  const ttlMs = state.durationMs + LOCK_BUFFER_MS;
  const command = ['SET', ACTIVE_RUN_KEY, JSON.stringify(state), 'NX', 'PX', String(ttlMs)];
  const reply = await redis.sendCommand(command);
  if (reply === 'OK') {
    return;
  }
  throw new ActiveDoctorRunError(await getActiveRunState(app));
}
|
|
605
|
+
|
|
606
|
+
/**
 * Clears the active-run marker, but only if it belongs to `runId`.
 * With Redis the marker is read and then deleted in two separate commands.
 */
async function releaseActiveRun(app: Application, runId: string) {
  const redis = getRedisClient(app) as RedisLike | null;

  if (!redis) {
    if (localActiveRun?.runId === runId) {
      localActiveRun = null;
    }
    return;
  }

  const current = await getActiveRunState(app);
  if (current?.runId !== runId) {
    return;
  }
  await redis.sendCommand(['DEL', ACTIVE_RUN_KEY]);
}
|
|
619
|
+
|
|
620
|
+
/**
 * Tries to take the finish lock of a run (SET NX with a TTL of
 * FINISH_LOCK_TTL_SECONDS; the stored value is this process's pid).
 *
 * @returns `true` when the lock was obtained, or when no Redis client is
 * available.
 */
async function acquireFinishLock(app: Application, runId: string) {
  const redis = getRedisClient(app) as RedisLike | null;
  if (!redis) {
    return true;
  }
  const lockKey = `${FINISH_LOCK_PREFIX}${runId}`;
  const owner = process.pid.toString();
  const command = ['SET', lockKey, owner, 'NX', 'EX', String(FINISH_LOCK_TTL_SECONDS)];
  const reply = await redis.sendCommand(command);
  return reply === 'OK';
}
|
|
633
|
+
|
|
634
|
+
/** Deletes the finish lock of a run; does nothing when no Redis client is available. */
async function releaseFinishLock(app: Application, runId: string) {
  const redis = getRedisClient(app) as RedisLike | null;
  if (redis) {
    const lockKey = `${FINISH_LOCK_PREFIX}${runId}`;
    await redis.sendCommand(['DEL', lockKey]);
  }
}
|
|
639
|
+
|
|
640
|
+
/** Cancels and forgets the pending auto-finish timer of a run, if one is registered. */
function clearRunTimer(runId: string) {
  const pending = timers.get(runId);
  if (!pending) {
    return;
  }
  clearTimeout(pending);
  timers.delete(runId);
}
|
|
647
|
+
|
|
648
|
+
/**
 * Arms a timer that finishes the run with reason 'timeout' once
 * `deadlineAt` is reached (immediately if it has already passed). A timer
 * previously registered for the same run is replaced. Failures of the
 * automatic finish are logged, not rethrown.
 */
function scheduleAutoFinish(app: Application, runId: string, deadlineAt: string) {
  clearRunTimer(runId);

  const remainingMs = Date.parse(deadlineAt) - Date.now();
  const onDeadline = () => {
    finishDoctorRun(app, runId, 'timeout').catch((error) => {
      getApp(app).logger.error(
        `[ClusterDoctor] Failed to auto-finish diagnostic run ${runId}: ${getErrorMessage(error)}`,
      );
    });
  };

  timers.set(runId, setTimeout(onDeadline, Math.max(0, remainingMs)));
}
|
|
660
|
+
|
|
661
|
+
/**
 * Looks up a single record in the `clusterManagerDoctorRuns` repository by
 * its `runId`.
 */
async function getRunById(app: Application, runId: string) {
  const runs = getRepository(app, 'clusterManagerDoctorRuns');
  const filter = { runId };
  return runs.findOne({ filter });
}
|
|
665
|
+
|
|
666
|
+
/**
 * Returns the most recently created record from `clusterManagerDoctorRuns`
 * (sorted by `createdAt` descending), or `null` when there is none.
 */
async function getLatestRun(app: Application) {
  const [newest] = await getRepository(app, 'clusterManagerDoctorRuns').find({
    sort: ['-createdAt'],
    limit: 1,
  });
  return newest || null;
}
|
|
671
|
+
|
|
672
|
+
/**
 * Converts a plain run record into a `DoctorActiveState`.
 *
 * @param run Plain-object form of a run record.
 * @returns The active-state view of the run, or `null` when `runId`,
 *          `startedAt` or `deadlineAt` is missing or when either timestamp
 *          cannot be interpreted as a valid date.
 */
function runJsonToActiveState(run: Record<string, unknown>): DoctorActiveState | null {
  if (!run.runId || !run.startedAt || !run.deadlineAt) return null;

  // Date instances and epoch numbers are used as-is: round-tripping a Date
  // through String() drops its milliseconds, and a stringified epoch number
  // is not parseable at all.
  const toDate = (value: unknown): Date | null => {
    let date: Date;
    if (value instanceof Date) {
      date = new Date(value.getTime());
    } else if (typeof value === 'number') {
      date = new Date(value);
    } else {
      date = new Date(String(value));
    }
    // Invalid dates would make toISOString() throw a RangeError below.
    return Number.isNaN(date.getTime()) ? null : date;
  };

  const startedAt = toDate(run.startedAt);
  const deadlineAt = toDate(run.deadlineAt);
  if (!startedAt || !deadlineAt) return null;

  return {
    runId: String(run.runId),
    startedAt: startedAt.toISOString(),
    deadlineAt: deadlineAt.toISOString(),
    durationMs: Number(run.durationMs || DEFAULT_DURATION_MS),
    startedBy: typeof run.startedBy === 'string' ? run.startedBy : undefined,
  };
}
|
|
682
|
+
|
|
683
|
+
/**
 * Derives a blocking active-run state from the latest stored run.
 *
 * A state is returned only when the latest run has status `'running'`, can be
 * converted by `runJsonToActiveState`, and its deadline plus `LOCK_BUFFER_MS`
 * has not yet passed. In every other case the result is `null`.
 */
async function getBlockingRunStateFromDb(app: Application) {
  const latest = modelToJSON(await getLatestRun(app));
  if (latest?.status !== 'running') {
    return null;
  }

  const state = runJsonToActiveState(latest);
  if (!state) {
    return null;
  }

  const expiresAt = Date.parse(state.deadlineAt) + LOCK_BUFFER_MS;
  return expiresAt <= Date.now() ? null : state;
}
|
|
696
|
+
|
|
697
|
+
/**
 * Applies `values` to the `clusterManagerDoctorRuns` record(s) matching
 * `runId`.
 */
async function updateRun(app: Application, runId: string, values: Record<string, unknown>) {
  const runs = getRepository(app, 'clusterManagerDoctorRuns');
  const filter = { runId };
  await runs.update({ filter, values });
}
|
|
701
|
+
|
|
702
|
+
/**
 * Lists the cluster nodes known to the node registry.
 *
 * Uses the `nodeRegistry` exposed by the `plugin-cluster-manager` plugin when
 * present; otherwise a new `RedisNodeRegistry` is constructed for the call.
 */
async function getClusterNodes(app: Application): Promise<DoctorNodeRecord[]> {
  const clusterPlugin = getApp(app).pm?.get('plugin-cluster-manager') as { nodeRegistry?: RedisNodeRegistry } | null;
  const shared = clusterPlugin?.nodeRegistry;
  const registry = shared == null ? new RedisNodeRegistry(app) : shared;
  return registry.getNodes();
}
|
|
707
|
+
|
|
708
|
+
/**
 * Obtains a diagnostic snapshot for `node`.
 *
 * The local node is served directly by `collectLocalDoctorSnapshot`. For any
 * other node a collect request is published on
 * `cluster-manager:doctor-collect:<nodeId>` and the response key in Redis is
 * polled every 250 ms, up to 60 times.
 *
 * @throws Error when the node has no id, when Redis or PubSub is missing,
 *         when the response payload lacks `node`/`logs` (the payload's own
 *         `error` text is used when present), or when no response arrives.
 */
async function requestRemoteSnapshot(
  app: Application,
  node: DoctorNodeRecord,
  options: DoctorSnapshotOptions,
): Promise<DoctorNodeSnapshot> {
  const targetNodeId = node.id;
  if (!targetNodeId) {
    throw new Error('Node does not have an id.');
  }
  if (targetNodeId === getLocalNodeId(app)) {
    return collectLocalDoctorSnapshot(app, options);
  }

  const redis = getRedisClient(app) as RedisLike | null;
  const pubSub = getApp(app).pubSubManager;
  if (!(redis && pubSub)) {
    throw new Error('Redis/PubSub is not available for remote diagnostic snapshot collection.');
  }

  // A random request id ties the published request to its response key.
  const requestId = crypto.randomBytes(8).toString('hex');
  const responseKey = `${RESPONSE_KEY_PREFIX}${requestId}`;
  const { runId, sinceMs, untilMs, maxLines } = options;
  const request = { requestId, targetNodeId, runId, sinceMs, untilMs, maxLines };
  await pubSub.publish(`cluster-manager:doctor-collect:${targetNodeId}`, JSON.stringify(request));

  let attemptsLeft = 60;
  while (attemptsLeft-- > 0) {
    await new Promise((resolve) => setTimeout(resolve, 250));

    const raw = await redis.sendCommand(['GET', responseKey]);
    if (typeof raw !== 'string' || !raw) {
      continue;
    }

    // The response is consumed exactly once, whether or not it is usable.
    await redis.sendCommand(['DEL', responseKey]);
    const snapshot = parseJson<Partial<DoctorNodeSnapshot>>(raw, {});
    if (snapshot.node && snapshot.logs) {
      return snapshot as DoctorNodeSnapshot;
    }
    throw new Error(snapshot.error || 'Invalid diagnostic snapshot response payload.');
  }

  throw new Error(`Timeout waiting for diagnostic snapshot from ${targetNodeId}.`);
}
|
|
757
|
+
|
|
758
|
+
/**
 * Collects one diagnostic snapshot per node, requesting them in parallel.
 *
 * When `nodes` is empty, a single target describing the local node is used.
 * A node whose snapshot request fails does not fail the whole collection:
 * it is represented by a placeholder snapshot built from the registry record,
 * with zeroed metrics, empty logs and the failure text in `error`.
 *
 * @param app     Application used to reach the nodes.
 * @param nodes   Registry records of the nodes to query.
 * @param options Snapshot options forwarded to every node.
 */
async function collectNodeSnapshots(
  app: Application,
  nodes: DoctorNodeRecord[],
  options: DoctorSnapshotOptions,
): Promise<DoctorNodeSnapshot[]> {
  const targets = nodes.length > 0 ? nodes : [{ id: getLocalNodeId(app), hostname: os.hostname() }];
  return Promise.all(
    targets.map(async (node) => {
      try {
        return await requestRemoteSnapshot(app, node, options);
      } catch (error) {
        const workerMode = node.workerMode || 'unknown';
        // The failing node's memory is unknown. Report zeros like the other
        // placeholder metrics, not the memory of the process running this code.
        const unknownMemory: NodeJS.MemoryUsage = {
          rss: 0,
          heapTotal: 0,
          heapUsed: 0,
          external: 0,
          arrayBuffers: 0,
        };
        return {
          nodeId: node.id || `${node.hostname || 'unknown'}:${node.pid || 'unknown'}`,
          node: {
            hostname: node.hostname || 'unknown',
            pid: Number(node.pid || 0),
            workerMode,
            role: getNodeRole({ workerMode, isSandbox: node.isSandbox }),
            appVersion: node.appVersion || '',
            nodeVersion: '',
            platform: '',
            arch: '',
            uptime: 0,
            isSandbox: Boolean(node.isSandbox),
          },
          memory: unknownMemory,
          os: { totalMemory: 0, freeMemory: 0, cpuCount: 0, loadAvg: [] },
          env: {},
          plugins: [],
          logs: { files: [], lines: [], analysis: analyzeLogLines([]) },
          collectedAt: new Date().toISOString(),
          error: getErrorMessage(error),
        };
      }
    }),
  );
}
|
|
796
|
+
|
|
797
|
+
/**
 * Merges log signatures from node snapshots and container diagnostics into a
 * single ranking.
 *
 * Signatures are grouped by `level:signature`. For each group the counts are
 * summed, the contributing node ids and sources are collected without
 * duplicates, and at most three samples are kept. Container signatures are
 * attributed to `container:<id>`.
 *
 * The input signatures are never modified; each aggregate entry owns its
 * own `nodes`, `sources` and `samples` arrays.
 *
 * @param snapshots            Node snapshots whose `logs.analysis.signatures` are merged.
 * @param containerDiagnostics Orchestrator diagnostics; only an array-valued `stacks` is read.
 * @returns Up to 30 aggregated signatures, ordered by descending count.
 */
function aggregateTopSignatures(snapshots: DoctorNodeSnapshot[], containerDiagnostics: Record<string, unknown>) {
  const signatures = new Map<string, LogSignature & { nodes: string[] }>();
  const collect = (nodeId: string, items: LogSignature[]) => {
    for (const item of items) {
      const key = `${item.level}:${item.signature}`;
      const existing = signatures.get(key);
      if (!existing) {
        signatures.set(key, {
          ...item,
          nodes: [nodeId],
          // Copy `sources`: the spread above only copies the reference, and
          // later merges push into this array.
          sources: [...item.sources],
          samples: item.samples.slice(0, 3),
        });
        continue;
      }

      existing.count += item.count;
      if (!existing.nodes.includes(nodeId)) {
        existing.nodes.push(nodeId);
      }
      for (const source of item.sources) {
        if (!existing.sources.includes(source)) existing.sources.push(source);
      }
      // Top up to three samples in total.
      existing.samples.push(...item.samples.slice(0, Math.max(0, 3 - existing.samples.length)));
    }
  };

  snapshots.forEach((snapshot) => collect(snapshot.nodeId, snapshot.logs.analysis.signatures));

  const containerStacks = Array.isArray(containerDiagnostics.stacks) ? containerDiagnostics.stacks : [];
  for (const stack of containerStacks as Array<{
    containers?: Array<{ id: string; logs?: { analysis?: LogAnalysis } }>;
  }>) {
    for (const container of stack.containers || []) {
      collect(`container:${container.id}`, container.logs?.analysis?.signatures || []);
    }
  }

  return [...signatures.values()].sort((a, b) => b.count - a.count).slice(0, 30);
}
|
|
831
|
+
|
|
832
|
+
/**
 * Summarises application and runtime versions across the collected snapshots.
 *
 * `versionDrift` is true when more than one distinct non-empty `appVersion`
 * was reported. `runtimeDrift` is true when more than one distinct
 * `nodeVersion:platform:arch` combination was reported; snapshots that carry
 * no runtime information at all (all three fields empty) are ignored for that
 * comparison so that they do not count as a separate runtime.
 *
 * @param nodes     Registry records, echoed back as `registryNodes`.
 * @param snapshots Snapshots whose `node` section is inspected.
 */
function buildVersionDiagnostics(nodes: DoctorNodeRecord[], snapshots: DoctorNodeSnapshot[]) {
  const nodeVersions = snapshots.map((snapshot) => ({
    nodeId: snapshot.nodeId,
    hostname: snapshot.node.hostname,
    role: snapshot.node.role,
    appVersion: snapshot.node.appVersion,
    nodeVersion: snapshot.node.nodeVersion,
    platform: snapshot.node.platform,
    arch: snapshot.node.arch,
  }));
  const appVersions = new Set(nodeVersions.map((node) => node.appVersion).filter(Boolean));
  const runtimeVersions = new Set(
    nodeVersions
      // Filter before formatting: the formatted key of an empty record is
      // "::", which is truthy and would survive a Boolean filter.
      .filter((node) => Boolean(node.nodeVersion || node.platform || node.arch))
      .map((node) => `${node.nodeVersion}:${node.platform}:${node.arch}`),
  );

  return {
    registryNodes: nodes,
    nodeVersions,
    versionDrift: appVersions.size > 1,
    runtimeDrift: runtimeVersions.size > 1,
  };
}
|
|
854
|
+
|
|
855
|
+
/**
 * Compares the plugin records stored in the database with what each node
 * snapshot reports.
 *
 * For every plugin row the per-node load state and runtime version are
 * listed. `versionDrift` is set when loaded nodes disagree on the runtime
 * version or when any of them differs from the database version.
 * `loadDrift` is set for an enabled plugin when the nodes disagree on whether
 * it is loaded.
 *
 * Snapshots that carry an `error` still appear in the per-plugin `nodes`
 * list, but they are left out of the load-drift comparison.
 *
 * @param pluginRows Plain plugin records (`name`, `packageName`, `version`, `enabled`).
 * @param snapshots  Node snapshots providing the runtime plugin lists.
 */
function buildPluginDiagnostics(pluginRows: Record<string, unknown>[], snapshots: DoctorNodeSnapshot[]) {
  const plugins = pluginRows.map((plugin) => {
    const name = typeof plugin.name === 'string' ? plugin.name : '';
    const packageName = typeof plugin.packageName === 'string' ? plugin.packageName : '';
    const dbVersion = typeof plugin.version === 'string' ? plugin.version : undefined;
    const nodeStates = snapshots.map((snapshot) => {
      // An empty identifier must not match: otherwise a row without a
      // package name would pair with any runtime plugin that also lacks one.
      const nodePlugin = snapshot.plugins.find(
        (item) => (name !== '' && item.name === name) || (packageName !== '' && item.packageName === packageName),
      );
      return {
        nodeId: snapshot.nodeId,
        hostname: snapshot.node.hostname,
        loaded: Boolean(nodePlugin?.loaded),
        runtimeVersion: nodePlugin?.runtimeVersion,
      };
    });
    const runtimeVersions = Array.from(
      new Set(nodeStates.filter((item) => item.loaded && item.runtimeVersion).map((item) => item.runtimeVersion)),
    );
    const hasVersionMismatch =
      runtimeVersions.length > 1 ||
      Boolean(dbVersion && runtimeVersions.some((version) => version && version !== dbVersion));
    // Failed snapshots (those with `error`) are skipped: their state is unknown.
    const loadedValues = new Set(
      nodeStates.filter((_, index) => !snapshots[index]?.error).map((item) => item.loaded),
    );

    return {
      name,
      packageName,
      enabled: Boolean(plugin.enabled),
      dbVersion,
      runtimeVersions,
      versionDrift: hasVersionMismatch,
      loadDrift: Boolean(plugin.enabled) && loadedValues.size > 1,
      nodes: nodeStates,
    };
  });

  return {
    plugins,
    versionDrifts: plugins.filter((plugin) => plugin.versionDrift),
    loadDrifts: plugins.filter((plugin) => plugin.loadDrift),
  };
}
|
|
895
|
+
|
|
896
|
+
/**
 * Builds the normalized list of packages that worker nodes are expected to
 * have, from the first `workerPackagesConfigs` record.
 *
 * The configured apt/python/npm packages are combined with the parsed
 * `customPackages` (custom `node` and `npm` entries both go to npm). When no
 * record exists, the result of normalising an empty configuration is
 * returned. Any error during the lookup yields empty package lists.
 */
async function getExpectedPackages(app: Application): Promise<NormalizedPackages> {
  try {
    const config = await getRepository(app, 'workerPackagesConfigs').findOne();
    if (!config) {
      return normalizePackageMap(packagesFromConfig({}));
    }

    const configured = packagesFromConfig({
      aptPackages: getModelValue(config, 'aptPackages'),
      pythonPackages: getModelValue(config, 'pythonPackages'),
      npmPackages: getModelValue(config, 'npmPackages'),
    });
    const custom = parseCustomPackages(getModelValue(config, 'customPackages'));

    const npm = [...(configured.npm || []), ...(custom.node || []), ...(custom.npm || [])];
    const python = [...(configured.python || []), ...(custom.python || [])];
    return normalizePackageMap({ apt: configured.apt, npm, python });
  } catch {
    // Best effort: a missing or unreadable configuration means "nothing expected".
    return { apt: [], npm: [], python: [] };
  }
}
|
|
918
|
+
|
|
919
|
+
/**
 * Reports, for every node whose role is not `'app'`, which expected packages
 * are missing according to the package status stored in Redis.
 *
 * The status is looked up under up to three keys (by node id, hostname and
 * name) and the first one holding parseable JSON wins. A node is flagged as
 * drifting when no status was found, when its `initStatus` is not
 * `'succeeded'`, or when at least one expected package is missing.
 *
 * Without a Redis client the result has `available: false` and no node data.
 */
async function getPackageDiagnostics(app: Application, nodes: DoctorNodeRecord[]) {
  const redis = getRedisClient(app) as RedisLike | null;
  const expectedPackages = await getExpectedPackages(app);

  if (!redis) {
    return {
      available: false,
      expectedPackages,
      nodes: [],
      packageDrifts: [],
    };
  }

  const nodePackages = [];
  const workerNodes = nodes.filter((item) => getNodeRole(item) !== 'app');

  for (const node of workerNodes) {
    const candidateKeys: string[] = [];
    if (node.id) candidateKeys.push(`cluster-manager:pkg-status:${node.id}`);
    if (node.hostname) candidateKeys.push(`orchestrator:pkg-status:${node.hostname}`);
    if (node.name) candidateKeys.push(`orchestrator:pkg-status:${node.name}`);

    let status: Record<string, unknown> | null = null;
    for (const key of candidateKeys) {
      const raw = await redis.sendCommand(['GET', key]);
      if (typeof raw !== 'string' || !raw) {
        continue;
      }
      status = parseJson<Record<string, unknown> | null>(raw, null);
      if (status) {
        break;
      }
    }

    const installedPackages = parsePackageWhitelist(status?.packageWhitelist);
    const missingPackages = diffPackages(expectedPackages, installedPackages);
    nodePackages.push({
      nodeId: node.id,
      hostname: node.hostname,
      role: getNodeRole(node),
      status: status?.initStatus || 'unknown',
      lastInitAt: status?.lastInitAt || null,
      installedPackages,
      missingPackages,
      drift: !status || status.initStatus !== 'succeeded' || countPackages(missingPackages) > 0,
    });
  }

  return {
    available: true,
    expectedPackages,
    nodes: nodePackages,
    packageDrifts: nodePackages.filter((item) => item.drift),
  };
}
|
|
968
|
+
|
|
969
|
+
/**
 * Counts the records of `collection`, optionally restricted by `filter`.
 *
 * Returns `null` when the count cannot be produced: the database reports
 * that the collection does not exist, the repository has no `count` method,
 * or any step throws.
 */
async function safeCount(app: Application, collection: string, filter?: Record<string, unknown>) {
  try {
    const db = getApp(app).db;
    if (db.hasCollection && !db.hasCollection(collection)) {
      return null;
    }

    const repo = getRepository(app, collection);
    if (!repo.count) {
      return null;
    }

    const query = filter ? { filter } : undefined;
    return await repo.count(query);
  } catch {
    return null;
  }
}
|
|
981
|
+
|
|
982
|
+
/**
 * Gathers database health information: a `SELECT 1` ping with its latency,
 * followed by record counts per status for async tasks, workflow executions
 * and jobs, and enabled/disabled totals for application plugins.
 *
 * Counts come from `safeCount`, so an individual count may be `null`.
 * The queries are issued one after another, in the order listed below.
 */
async function getDatabaseDiagnostics(app: Application) {
  const pingDatabase = async (): Promise<{ ok: boolean; latencyMs?: number; error?: string }> => {
    const startedAt = Date.now();
    try {
      await getApp(app).db.sequelize?.query('SELECT 1');
      return { ok: true, latencyMs: Date.now() - startedAt };
    } catch (error) {
      return { ok: false, error: getErrorMessage(error) };
    }
  };
  const countWhere = (collection: string, filter?: Record<string, unknown>) => safeCount(app, collection, filter);

  const ping = await pingDatabase();

  const asyncTasks = {
    pending: await countWhere('asyncTasks', { status: null }),
    running: await countWhere('asyncTasks', { status: 0 }),
    failed: await countWhere('asyncTasks', { status: -1 }),
    canceled: await countWhere('asyncTasks', { status: -2 }),
  };
  const workflowExecutions = {
    queued: await countWhere('executions', { status: null }),
    running: await countWhere('executions', { status: 0 }),
    failed: await countWhere('executions', { status: -1 }),
    canceled: await countWhere('executions', { status: -4 }),
  };
  const jobs = {
    pending: await countWhere('jobs', { status: 0 }),
    failed: await countWhere('jobs', { status: -1 }),
    canceled: await countWhere('jobs', { status: -4 }),
  };
  const applicationPlugins = {
    total: await countWhere('applicationPlugins'),
    enabled: await countWhere('applicationPlugins', { enabled: true }),
    disabled: await countWhere('applicationPlugins', { enabled: false }),
  };

  return { ping, asyncTasks, workflowExecutions, jobs, applicationPlugins };
}
|
|
1018
|
+
|
|
1019
|
+
/**
 * Parses the text returned by the Redis `INFO` command.
 *
 * Lines starting with `#` open a section named after the lower-cased header
 * text. `key:value` lines are stored in the current section, split at the
 * first colon. Entries that appear before any header go to a section called
 * `default`. Blank lines and lines without a usable key are skipped.
 *
 * @param raw Raw `INFO` output (LF or CRLF line endings).
 * @returns Map of section name to its key/value pairs.
 */
function parseRedisInfo(raw: string): Record<string, Record<string, string>> {
  const sections: Record<string, Record<string, string>> = {};
  const bucketFor = (name: string): Record<string, string> => {
    const bucket = sections[name] || {};
    sections[name] = bucket;
    return bucket;
  };

  let current = 'default';
  raw
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((entry) => entry.length > 0)
    .forEach((entry) => {
      if (entry.startsWith('#')) {
        current = entry.replace(/^#\s*/, '').toLowerCase();
        bucketFor(current);
        return;
      }

      const separator = entry.indexOf(':');
      if (separator <= 0) {
        return;
      }
      bucketFor(current)[entry.slice(0, separator)] = entry.slice(separator + 1);
    });

  return sections;
}
|
|
1038
|
+
|
|
1039
|
+
/**
 * Gathers Redis health information.
 *
 * Sends `PING`, reads `INFO` (memory, stats and clients sections), `DBSIZE`,
 * scans for `nocobase:lock:*` keys (via `scanKeys`, called with 200) and fetches the
 * ten most recent slow-log entries, whose command text is passed through
 * `redactText`.
 *
 * `latencyMs` is the round-trip time of the `PING` alone; the remaining
 * commands are not included in it.
 *
 * @returns `{ available: false }` when there is no Redis client,
 *          `{ available: false, error }` when any command fails, otherwise
 *          the collected metrics with `available: true`.
 */
async function getRedisDiagnostics(app: Application) {
  const redis = getRedisClient(app) as RedisLike | null;
  if (!redis) {
    return { available: false };
  }

  try {
    const startedAt = Date.now();
    await redis.sendCommand(['PING']);
    // Capture the latency right after the PING so the slower diagnostic
    // commands below do not inflate it.
    const latencyMs = Date.now() - startedAt;

    const rawInfo = typeof redis.info === 'function' ? await redis.info() : String(await redis.sendCommand(['INFO']));
    const info = parseRedisInfo(rawInfo);
    const memory = info.memory || {};
    const stats = info.stats || {};
    const clients = info.clients || {};
    const dbSize = Number(await redis.sendCommand(['DBSIZE'])) || 0;
    const lockKeys = await scanKeys(redis, 'nocobase:lock:*', 200);
    const rawSlowlog = (await redis.sendCommand(['SLOWLOG', 'GET', '10'])) as unknown[];
    const slowlog = Array.isArray(rawSlowlog)
      ? rawSlowlog.map((entry) => {
          // Read positionally: id, timestamp, duration (µs), command arguments.
          const item = Array.isArray(entry) ? entry : [];
          const command = Array.isArray(item[3]) ? item[3].join(' ') : String(item[3] || '');
          return {
            id: item[0],
            timestamp: item[1],
            durationUs: item[2],
            command: redactText(command),
          };
        })
      : [];
    return {
      available: true,
      latencyMs,
      memory: {
        used: memory.used_memory_human,
        usedBytes: Number(memory.used_memory || 0),
        peak: memory.used_memory_peak_human,
        fragmentationRatio: Number(memory.mem_fragmentation_ratio || 0),
      },
      clients: {
        connected: Number(clients.connected_clients || 0),
        blocked: Number(clients.blocked_clients || 0),
      },
      stats: {
        opsPerSec: Number(stats.instantaneous_ops_per_sec || 0),
        evictedKeys: Number(stats.evicted_keys || 0),
        expiredKeys: Number(stats.expired_keys || 0),
      },
      dbSize,
      activeLocks: lockKeys.length,
      slowlog,
    };
  } catch (error) {
    return { available: false, error: getErrorMessage(error) };
  }
}
|
|
1094
|
+
|
|
1095
|
+
/**
 * Describes the application's event queue: connection state, adapter class
 * name, and for each registered channel its concurrency, interval and number
 * of pending items.
 *
 * Pending counts are only available when the adapter exposes a `queues` map
 * and the event queue provides `getFullChannel`; otherwise `pending` is
 * `null` for every channel and contributes nothing to `totalPending`.
 *
 * @returns `{ available: false }` when the application has no event queue.
 */
async function getQueueDiagnostics(app: Application) {
  const eventQueue = getApp(app).eventQueue as unknown as
    | {
        isConnected?(): boolean;
        adapter?: unknown;
        events?: Map<string, { concurrency?: number; interval?: number; shared?: boolean }>;
        getFullChannel?(channel: string, shared?: boolean): string;
      }
    | undefined;
  if (!eventQueue) {
    return { available: false };
  }

  const adapter = eventQueue.adapter as
    | { constructor?: { name?: string }; queues?: Map<string, unknown[]> }
    | undefined;
  const registered = eventQueue.events || new Map();

  const channels = [];
  for (const [channel, options] of registered) {
    const queues = adapter?.queues;
    const pending: number | null =
      queues && eventQueue.getFullChannel
        ? queues.get(eventQueue.getFullChannel(channel, options.shared))?.length || 0
        : null;
    channels.push({
      channel,
      concurrency: options.concurrency || 1,
      interval: options.interval || 250,
      pending,
    });
  }

  const totalPending = channels.reduce((sum, item) => sum + (item.pending || 0), 0);
  return {
    available: true,
    connected: eventQueue.isConnected?.() || false,
    adapter: adapter?.constructor?.name || 'unknown',
    channels,
    totalPending,
  };
}
|
|
1135
|
+
|
|
1136
|
+
/**
 * Collects container diagnostics through the orchestrator adapter exposed by
 * the `plugin-cluster-manager` plugin.
 *
 * Up to 10 `orchestratorStacks` records are read (sorted by name) and those
 * not explicitly disabled are inspected. For each stack, the first 10
 * containers are enriched with stats (running containers only) and an
 * analysis of their recent log lines, restricted to the
 * `options.sinceMs`..`options.untilMs` window. Lines without a recognisable
 * timestamp are kept.
 *
 * Failures are reported in place: a failing stats or log call becomes an
 * `error` field on that part of the container, a failing container listing
 * becomes an `error` on the stack entry, and a failing stack lookup is
 * returned as a top-level `error` with no stacks.
 */
async function getOrchestratorDiagnostics(app: Application, options: DoctorSnapshotOptions) {
  const plugin = getApp(app).pm?.get('plugin-cluster-manager') as {
    orchestrator?: {
      name: string;
      listContainers(stack: StackConfig): Promise<ContainerInfo[]>;
      getLogs(containerId: string, tail?: number): Promise<string>;
      getStats(containerId: string): Promise<unknown>;
    };
  } | null;
  const adapter = plugin?.orchestrator;
  if (!adapter) {
    return { configured: false, stacks: [] };
  }

  let stacks: StackConfig[] = [];
  try {
    const rows = await getRepository(app, 'orchestratorStacks').find({ sort: ['name'], limit: 10 });
    stacks = rows.map((row) => modelToJSON(row) as unknown as StackConfig).filter((stack) => stack?.enabled !== false);
  } catch (error) {
    return { configured: true, adapter: adapter.name, error: getErrorMessage(error), stacks: [] };
  }

  // Stats are only requested for running containers.
  const readStats = async (container: ContainerInfo): Promise<unknown> => {
    try {
      if (container.status !== 'running') {
        return null;
      }
      return await adapter.getStats(container.id);
    } catch (error) {
      return { error: getErrorMessage(error) };
    }
  };

  const readLogs = async (
    container: ContainerInfo,
  ): Promise<{ lineCount: number; analysis: LogAnalysis; error?: string } | null> => {
    try {
      const rawLogs = await adapter.getLogs(container.id, MAX_CONTAINER_LOG_LINES);
      const source = `container:${container.name}`;
      const parsed = rawLogs
        .split(/\r?\n/)
        .filter((text) => text.trim())
        .map((text) => parseDiagnosticLine(source, text))
        .filter((entry) => {
          const timestamp = entry.timestamp ? Date.parse(entry.timestamp) : parseLogTimestamp(entry.line);
          if (!timestamp) {
            // No usable timestamp: keep the line rather than lose it.
            return true;
          }
          const lowerBound = options.sinceMs || 0;
          const upperBound = options.untilMs || Date.now();
          return timestamp >= lowerBound && timestamp <= upperBound;
        });
      return { lineCount: parsed.length, analysis: analyzeLogLines(parsed) };
    } catch (error) {
      return { lineCount: 0, analysis: analyzeLogLines([]), error: getErrorMessage(error) };
    }
  };

  const results = [];
  for (const stack of stacks) {
    try {
      const containers = await adapter.listContainers(stack);
      const enriched = [];
      for (const container of containers.slice(0, 10)) {
        const stats = await readStats(container);
        const logs = await readLogs(container);
        enriched.push({ ...container, stats, logs });
      }
      results.push({ stack: { id: stack.id, name: stack.name, adapter: stack.adapter }, containers: enriched });
    } catch (error) {
      results.push({
        stack: { id: stack.id, name: stack.name, adapter: stack.adapter },
        error: getErrorMessage(error),
      });
    }
  }

  return {
    configured: true,
    adapter: adapter.name,
    stacks: results,
  };
}
|
|
1204
|
+
|
|
1205
|
+
/**
 * Turns the aggregated diagnostic figures into a list of findings.
 *
 * Each finding has a `level` (`'critical'` or `'warning'`), a stable `code`
 * and a human-readable `message`. Findings are emitted in a fixed order:
 * Redis, database, snapshot collection, cluster version/runtime drift,
 * plugin drift, package drift, log errors.
 *
 * @param params Counters and flags computed while building the report.
 * @returns The findings that apply; an empty array when nothing is wrong.
 */
function buildRecommendations(params: {
  snapshotErrors: number;
  topErrors: number;
  versionDrift: boolean;
  runtimeDrift: boolean;
  pluginVersionDrifts: number;
  pluginLoadDrifts: number;
  packageDrifts: number;
  redisAvailable: boolean;
  databaseOk: boolean;
}) {
  const rules = [
    {
      applies: !params.redisAvailable,
      level: 'critical',
      code: 'redis_unavailable',
      message: 'Redis is unavailable, so cluster-wide diagnostic collection and locking are degraded.',
    },
    {
      applies: !params.databaseOk,
      level: 'critical',
      code: 'database_unhealthy',
      message: 'Database ping failed during the diagnostic session.',
    },
    {
      applies: params.snapshotErrors > 0,
      level: 'warning',
      code: 'snapshot_collection_failed',
      message: `${params.snapshotErrors} node(s) did not return a diagnostic snapshot.`,
    },
    {
      applies: params.versionDrift || params.runtimeDrift,
      level: 'warning',
      code: 'cluster_runtime_drift',
      message: 'Cluster nodes are not running the same application/runtime version.',
    },
    {
      applies: params.pluginVersionDrifts > 0 || params.pluginLoadDrifts > 0,
      level: 'warning',
      code: 'plugin_drift',
      message: 'Installed or loaded plugin state is inconsistent across diagnostic snapshots.',
    },
    {
      applies: params.packageDrifts > 0,
      level: 'warning',
      code: 'package_drift',
      message: 'One or more worker nodes are missing configured packages or have failed package initialization.',
    },
    {
      applies: params.topErrors > 0,
      level: 'warning',
      code: 'log_errors_detected',
      message: 'Error or warning signatures were found in node/container logs during the diagnostic window.',
    },
  ];

  return rules
    .filter((rule) => rule.applies)
    .map(({ level, code, message }) => ({ level, code, message }));
}
|
|
1268
|
+
|
|
1269
|
+
/**
 * Assembles the full diagnostic report for a run.
 *
 * The observation window runs from the run's `startedAt` to the moment this
 * function is called. Within it the function gathers, in sequence: node
 * snapshots, plugin rows, version/plugin/package diagnostics, database,
 * Redis, queue and orchestrator diagnostics, then ranks the log signatures
 * and derives recommendations and an overall status (`'critical'` if any
 * critical finding exists, else `'warning'` if any warning exists, else
 * `'healthy'`).
 *
 * @param app          Application the diagnostics are collected from.
 * @param run          Plain-object form of the run record.
 * @param finishReason Reason recorded in the report.
 */
async function buildDoctorReport(app: Application, run: Record<string, unknown>, finishReason: string) {
  const runId = String(run.runId);
  const startedAt = new Date(String(run.startedAt));
  const finishedAt = new Date();
  const sinceMs = startedAt.getTime();
  const untilMs = finishedAt.getTime();
  const timeWindow = { sinceMs, untilMs };

  const nodes = await getClusterNodes(app);
  const snapshots = await collectNodeSnapshots(app, nodes, { runId, ...timeWindow, maxLines: MAX_NODE_LOG_LINES });
  const pluginRows = await getApplicationPluginRows(app);
  const versionDiagnostics = buildVersionDiagnostics(nodes, snapshots);
  const pluginDiagnostics = buildPluginDiagnostics(pluginRows, snapshots);
  const packageDiagnostics = await getPackageDiagnostics(app, nodes);
  const databaseDiagnostics = await getDatabaseDiagnostics(app);
  const redisDiagnostics = await getRedisDiagnostics(app);
  const queueDiagnostics = await getQueueDiagnostics(app);
  const orchestratorDiagnostics = await getOrchestratorDiagnostics(app, timeWindow);
  const topSignatures = aggregateTopSignatures(snapshots, orchestratorDiagnostics);

  const snapshotErrors = snapshots.filter((snapshot) => snapshot.error).length;
  const pluginVersionDrifts = pluginDiagnostics.versionDrifts.length;
  const pluginLoadDrifts = pluginDiagnostics.loadDrifts.length;
  const packageDrifts = packageDiagnostics.packageDrifts.length;

  const recommendations = buildRecommendations({
    snapshotErrors,
    topErrors: topSignatures.length,
    versionDrift: Boolean(versionDiagnostics.versionDrift),
    runtimeDrift: Boolean(versionDiagnostics.runtimeDrift),
    pluginVersionDrifts,
    pluginLoadDrifts,
    packageDrifts,
    redisAvailable: Boolean(redisDiagnostics.available),
    databaseOk: Boolean(databaseDiagnostics.ping.ok),
  });

  const findingsAt = (level: string) => recommendations.filter((item) => item.level === level).length;
  let healthStatus = 'healthy';
  if (findingsAt('critical') > 0) {
    healthStatus = 'critical';
  } else if (findingsAt('warning') > 0) {
    healthStatus = 'warning';
  }

  // Occurrence totals are taken from the ranked signatures only.
  const occurrencesAt = (level: string) =>
    topSignatures.reduce((sum, item) => (item.level === level ? sum + item.count : sum), 0);

  const summary = {
    status: healthStatus,
    nodes: snapshots.length,
    snapshotErrors,
    errors: occurrencesAt('error'),
    warnings: occurrencesAt('warn'),
    versionDrift: versionDiagnostics.versionDrift,
    runtimeDrift: versionDiagnostics.runtimeDrift,
    pluginVersionDrifts,
    pluginLoadDrifts,
    packageDrifts,
    failedTasks: databaseDiagnostics.asyncTasks.failed,
    failedWorkflows: databaseDiagnostics.workflowExecutions.failed,
  };

  const byNode = snapshots.map((snapshot) => ({
    nodeId: snapshot.nodeId,
    hostname: snapshot.node.hostname,
    role: snapshot.node.role,
    files: snapshot.logs.files,
    levels: snapshot.logs.analysis.levels,
    signatures: snapshot.logs.analysis.signatures.slice(0, 10),
    error: snapshot.error,
  }));

  return {
    runId,
    startedAt: startedAt.toISOString(),
    finishedAt: finishedAt.toISOString(),
    durationMs: untilMs - sinceMs,
    requestedDurationMs: Number(run.durationMs || 0),
    finishReason,
    summary,
    nodes: snapshots,
    versionDiagnostics,
    pluginDiagnostics,
    packageDiagnostics,
    databaseDiagnostics,
    redisDiagnostics,
    queueDiagnostics,
    orchestratorDiagnostics,
    logAnalysis: {
      topSignatures,
      byNode,
    },
    recommendations,
  };
}
|
|
1352
|
+
|
|
1353
|
+
/**
 * Finishes a diagnostic run: builds its report, stores the result and
 * releases the active-run marker.
 *
 * Only the caller that obtains the per-run finish lock does the work; any
 * other caller simply gets the current stored run back. A run that is no
 * longer `'running'` is returned unchanged (after releasing the marker).
 *
 * On failure the run is stored as `'failed'` with the error text, the
 * active-run marker is released, and the original error is rethrown. Problems
 * while recording the failure or releasing the marker are logged and do not
 * replace the original error. The finish lock is always released.
 *
 * @param app    Application owning the run.
 * @param runId  Identifier of the run to finish.
 * @param reason Finish reason stored with the run (e.g. `'timeout'`).
 * @returns The stored run as a plain object.
 * @throws Error when the run does not exist or report generation/storage fails.
 */
async function finishDoctorRun(app: Application, runId: string, reason: string) {
  const hasFinishLock = await acquireFinishLock(app, runId);
  if (!hasFinishLock) {
    return modelToJSON(await getRunById(app, runId));
  }

  clearRunTimer(runId);
  try {
    const run = await getRunById(app, runId);
    const runJson = modelToJSON(run);
    if (!run || !runJson) {
      throw new Error(`Diagnostic run ${runId} was not found.`);
    }
    if (runJson.status !== 'running') {
      await releaseActiveRun(app, runId);
      return runJson;
    }

    await updateRun(app, runId, { progress: 40 });
    const report = await buildDoctorReport(app, runJson, reason);
    await updateRun(app, runId, {
      status: 'finished',
      progress: 100,
      finishedAt: new Date(),
      finishReason: reason,
      summary: report.summary,
      report,
      error: null,
    });
    await releaseActiveRun(app, runId);
    return modelToJSON(await getRunById(app, runId));
  } catch (error) {
    // Bookkeeping is best effort: a failure here must not skip the release
    // of the active-run marker or replace the error being reported.
    try {
      await updateRun(app, runId, {
        status: 'failed',
        progress: 100,
        finishedAt: new Date(),
        finishReason: reason,
        error: getErrorMessage(error),
      });
    } catch (updateError) {
      getApp(app).logger.error(
        `[ClusterDoctor] Failed to mark diagnostic run ${runId} as failed: ${getErrorMessage(updateError)}`,
      );
    }
    try {
      await releaseActiveRun(app, runId);
    } catch (releaseError) {
      getApp(app).logger.error(
        `[ClusterDoctor] Failed to release active diagnostic run ${runId}: ${getErrorMessage(releaseError)}`,
      );
    }
    throw error;
  } finally {
    await releaseFinishLock(app, runId);
  }
}
|
|
1398
|
+
|
|
1399
|
+
/**
 * Finalizes, with reason 'timeout', a run whose deadline has already passed.
 *
 * The active-run state is checked first. If it does not name an expired run,
 * the latest stored run is checked instead and finalized when it is still
 * marked 'running' with a deadline that is not in the future.
 *
 * @param app - Application passed through to the run helpers.
 */
async function finishExpiredActiveRun(app: Application) {
  const active = await getActiveRunState(app);
  if (active) {
    const activeDeadline = Date.parse(active.deadlineAt);
    if (activeDeadline <= Date.now()) {
      await finishDoctorRun(app, active.runId, 'timeout');
      return;
    }
  }

  const latest = modelToJSON(await getLatestRun(app));
  if (latest?.status !== 'running' || !latest.deadlineAt) {
    return;
  }

  const latestDeadline = Date.parse(String(latest.deadlineAt));
  if (latestDeadline <= Date.now()) {
    await finishDoctorRun(app, String(latest.runId), 'timeout');
  }
}
|
|
1415
|
+
|
|
1416
|
+
/**
 * Shapes a run record for an API response.
 *
 * @param run - JSON form of a run, or null when there is none.
 * @param includeReport - When true the record is returned untouched.
 * @returns null for a missing run; the same object when `includeReport` is
 *          true; otherwise a copy without the `report` field plus a
 *          `hasReport` flag telling whether a truthy report was present.
 */
function sanitizeRunForResponse(run: Record<string, unknown> | null, includeReport = false) {
  if (!run) {
    return null;
  }
  if (includeReport) {
    return run;
  }

  // Work on a shallow copy so the caller's record is never mutated.
  const withoutReport = { ...run };
  delete withoutReport.report;
  return { ...withoutReport, hasReport: Boolean(run.report) };
}
|
|
1425
|
+
|
|
1426
|
+
/**
 * Action handlers for diagnostic ("doctor") runs: start, stop, status, report
 * and download. Every handler first finalizes a run whose deadline has passed,
 * writes its result to the context, and then calls `next()`.
 */
export const doctorActions = {
  /**
   * Starts a new diagnostic run.
   *
   * Responds with 409 when another session is already active. If persisting
   * or scheduling the new run fails, the active-run claim taken for it is
   * released before the error is re-thrown.
   */
  async start(ctx: Context, next: () => Promise<void>) {
    const durationMs = clampDuration(getPayload(ctx).durationMs);
    const beganAt = new Date();
    const expiresAt = new Date(beganAt.getTime() + durationMs);
    const runId = crypto.randomBytes(8).toString('hex');
    const activeState: DoctorActiveState = {
      runId,
      startedAt: beganAt.toISOString(),
      deadlineAt: expiresAt.toISOString(),
      durationMs,
      startedBy: getUserLabel(ctx),
    };

    // Phase 1: clear out an expired run, then claim the single active slot.
    try {
      await finishExpiredActiveRun(ctx.app);
      const blocker = await getBlockingRunStateFromDb(ctx.app);
      if (blocker) {
        throw new ActiveDoctorRunError(blocker);
      }
      await acquireActiveRun(ctx.app, activeState);
    } catch (error) {
      const conflict = error instanceof ActiveDoctorRunError ? error : null;
      if (conflict) {
        ctx.throw(409, 'A diagnostic session is already running.', {
          activeRun: conflict.activeRun,
        });
      }
      throw error;
    }

    // Phase 2: persist the run, arm the auto-finish timer and answer the caller.
    try {
      await getRepository(ctx.app, 'clusterManagerDoctorRuns').create({
        values: {
          runId,
          status: 'running',
          durationMs,
          progress: 5,
          startedAt: beganAt,
          deadlineAt: expiresAt,
          startedBy: activeState.startedBy,
        },
      });
      scheduleAutoFinish(ctx.app, runId, activeState.deadlineAt);
      getApp(ctx.app).logger.info(`[ClusterDoctor] Diagnostic run ${runId} started by ${activeState.startedBy}`);
      ctx.body = {
        success: true,
        runId,
        status: 'running',
        startedAt: activeState.startedAt,
        deadlineAt: activeState.deadlineAt,
        durationMs,
      };
    } catch (error) {
      // Give the active slot back so a later start is not blocked by this attempt.
      await releaseActiveRun(ctx.app, runId);
      throw error;
    }

    await next();
  },

  /**
   * Stops a run with reason 'manual' and responds with the full run, report included.
   *
   * The run id is taken from the payload, then the active run, then the action
   * params; 404 is raised when none of them provides one.
   */
  async stop(ctx: Context, next: () => Promise<void>) {
    const body = getPayload(ctx);
    await finishExpiredActiveRun(ctx.app);
    const current = await getActiveRunState(ctx.app);
    const targetId = String(body.runId || current?.runId || ctx.action.params.runId || '');
    if (targetId === '') {
      ctx.throw(404, 'No active diagnostic session was found.');
    }

    ctx.body = sanitizeRunForResponse(await finishDoctorRun(ctx.app, targetId, 'manual'), true);
    await next();
  },

  /**
   * Responds with the active-run state and a run summary without the report payload.
   *
   * The run is looked up by the `runId` action param, else by the active run's
   * id, else the latest run is used.
   */
  async status(ctx: Context, next: () => Promise<void>) {
    await finishExpiredActiveRun(ctx.app);
    const activeRun = await getActiveRunState(ctx.app);
    const wantedId = String(ctx.action.params.runId || activeRun?.runId || '');
    const lookup = wantedId ? getRunById(ctx.app, wantedId) : getLatestRun(ctx.app);
    const record = await lookup;
    ctx.body = {
      activeRun,
      run: sanitizeRunForResponse(modelToJSON(record), false),
    };
    await next();
  },

  /**
   * Responds with a run including its report; 404 when no run is found.
   *
   * Uses the `runId` action param when given, otherwise the latest run.
   */
  async report(ctx: Context, next: () => Promise<void>) {
    await finishExpiredActiveRun(ctx.app);
    const wantedId = String(ctx.action.params.runId || '');
    const lookup = wantedId ? getRunById(ctx.app, wantedId) : getLatestRun(ctx.app);
    const runJson = modelToJSON(await lookup);
    if (!runJson) {
      ctx.throw(404, 'Diagnostic report not found.');
    }
    ctx.body = sanitizeRunForResponse(runJson, true);
    await next();
  },

  /**
   * Sends the report of a run as a pretty-printed JSON attachment; 404 when
   * the run is missing or has no report.
   *
   * Uses the `runId` action param when given, otherwise the latest run.
   */
  async download(ctx: Context, next: () => Promise<void>) {
    await finishExpiredActiveRun(ctx.app);
    const wantedId = String(ctx.action.params.runId || '');
    const lookup = wantedId ? getRunById(ctx.app, wantedId) : getLatestRun(ctx.app);
    const runJson = modelToJSON(await lookup);
    if (!runJson?.report) {
      ctx.throw(404, 'Diagnostic report not found.');
    }

    // Headers are set before the body, in the same order as before.
    ctx.attachment(`doctor-report-${runJson.runId}.json`);
    ctx.set('Content-Type', 'application/json; charset=utf-8');
    ctx.body = JSON.stringify(runJson.report, null, 2);
    await next();
  },
};
|