postgresai 0.15.0 → 0.16.0-dev.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/bin/postgres-ai.ts +210 -31
- package/dist/bin/postgres-ai.js +7730 -7250
- package/lib/aas-onboard.ts +217 -0
- package/lib/checkup-api.ts +75 -0
- package/lib/checkup-summary.ts +30 -0
- package/lib/checkup.ts +227 -21
- package/lib/metrics-loader.ts +10 -8
- package/lib/util.ts +10 -3
- package/package.json +1 -1
- package/scripts/embed-metrics.ts +7 -6
- package/test/aas-onboard.test.ts +217 -0
- package/test/checkup.integration.test.ts +55 -0
- package/test/checkup.test.ts +471 -1
- package/test/mcp-server.test.ts +4 -0
- package/test/monitoring.test.ts +128 -49
- package/test/schema-validation.test.ts +29 -0
- package/test/test-utils.ts +8 -0
- package/test/util.test.ts +44 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hands-off AAS auto-onboarding for `mon local-install` (platform-all #338).
|
|
3
|
+
*
|
|
4
|
+
* After the monitoring stack is up and the instance is adopted, the CLI arms
|
|
5
|
+
* AAS collection without an operator step:
|
|
6
|
+
* 1. mint a `pgai-aas-collect` Grafana Viewer service-account token on the
|
|
7
|
+
* LOCAL Grafana (the CLI holds the admin password),
|
|
8
|
+
* 2. resolve the numeric Prometheus datasource id,
|
|
9
|
+
* 3. read the (cluster, node_name) labels straight from the pgwatch target
|
|
10
|
+
* config the CLI itself wrote (buildInstance's custom_tags) — no live
|
|
11
|
+
* series query, so no waiters>0 timing dependency,
|
|
12
|
+
* 4. hand all of it to the platform via the API-token RPC
|
|
13
|
+
* v1.monitoring_instance_aas_register, which encrypts the token and stores
|
|
14
|
+
* the AAS state keys (it makes no outbound Grafana call of its own).
|
|
15
|
+
*
|
|
16
|
+
* Best-effort, exactly like registerMonitoringInstance: never throws, returns a
|
|
17
|
+
* result the caller logs. The plaintext SA token only ever lives in locals.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { loadInstances } from "./instances";
|
|
21
|
+
import { resolveBaseUrls } from "./util";
|
|
22
|
+
|
|
23
|
+
// Name of the Grafana service account that is looked up (exact match) or created for AAS collection.
const SA_NAME = "pgai-aas-collect";
|
|
24
|
+
|
|
25
|
+
/**
 * Base URL of the local Grafana, without any trailing slashes.
 *
 * Reads PGAI_GRAFANA_LOCAL_URL when it is set to a non-empty value and falls
 * back to "http://localhost:3000" otherwise, so tests and unusual setups can
 * point the CLI at a different address.
 */
function grafanaBaseUrl(): string {
  const configured = process.env.PGAI_GRAFANA_LOCAL_URL;
  const base = configured ? configured : "http://localhost:3000";
  // Strip every trailing "/" so callers can append "/api/..." paths directly.
  return base.replace(/\/+$/, "");
}
|
|
29
|
+
|
|
30
|
+
/**
 * Grafana admin login used for Basic auth against the local Grafana.
 *
 * Taken from GF_SECURITY_ADMIN_USER when that variable is set and non-empty;
 * otherwise "admin".
 */
function grafanaAdminUser(): string {
  const fromEnv = process.env.GF_SECURITY_ADMIN_USER;
  if (fromEnv) {
    return fromEnv;
  }
  return "admin";
}
|
|
33
|
+
|
|
34
|
+
/**
 * Normalize a vcpus value (from a flag or environment variable) to a
 * non-negative integer.
 *
 * @param raw - Number, numeric string, or nothing at all.
 * @returns The value floored to an integer when it is finite and positive;
 *   0 (meaning "unknown") for missing, empty, non-numeric, zero, negative or
 *   non-finite input. Strings are parsed with base-10 `parseInt` after
 *   trimming, so a leading integer prefix is accepted.
 */
export function parseVcpus(raw: string | number | undefined | null): number {
  if (raw == null || raw === "") {
    return 0;
  }

  let parsed: number;
  if (typeof raw === "number") {
    parsed = raw;
  } else {
    parsed = parseInt(String(raw).trim(), 10);
  }

  if (!Number.isFinite(parsed) || parsed <= 0) {
    return 0;
  }
  return Math.floor(parsed);
}
|
|
40
|
+
|
|
41
|
+
/**
 * Derive the (cluster, node) label pair for AAS from the pgwatch instances
 * file.
 *
 * Exactly one target must be enabled (an entry counts as enabled unless its
 * `is_enabled` is literally `false`). The labels come from that target's
 * `custom_tags`: `cluster` falls back to "default" and `node_name` falls back
 * to the target's `name` when the tag is missing, empty, or not a string.
 *
 * @param instancesPath - Path handed to `loadInstances`.
 * @returns The label pair, or null when the file cannot be loaded, when zero
 *   or several targets are enabled, or when a label ends up empty.
 */
export function resolveAasLabels(instancesPath: string): { cluster: string; node: string } | null {
  let targets;
  try {
    targets = loadInstances(instancesPath);
  } catch {
    // Unreadable or unparsable instances file: nothing to onboard.
    return null;
  }

  const active = targets.filter((entry) => entry.is_enabled !== false);
  if (active.length !== 1) {
    return null;
  }

  const target = active[0];
  const labels = (target.custom_tags || {}) as Record<string, unknown>;
  const clusterTag = labels.cluster;
  const nodeTag = labels.node_name;

  const cluster = typeof clusterTag === "string" && clusterTag ? clusterTag : "default";
  const node = typeof nodeTag === "string" && nodeTag ? nodeTag : target.name;

  return cluster && node ? { cluster, node } : null;
}
|
|
61
|
+
|
|
62
|
+
/**
 * Upper bound, in milliseconds, for a single local Grafana API request.
 * Without it an unresponsive Grafana would stall the otherwise best-effort
 * onboarding flow indefinitely.
 */
const GRAFANA_API_TIMEOUT_MS = 10_000;

/**
 * Issue one admin-authenticated (HTTP Basic) JSON request to the local Grafana.
 *
 * @param method - HTTP method, e.g. "GET" or "POST".
 * @param pathPart - Path (and query) appended verbatim to the Grafana base URL.
 * @param adminPassword - Password paired with the configured admin user.
 * @param body - Optional payload; JSON-encoded when provided, omitted otherwise.
 * @returns The raw `Response`; callers inspect `ok`/`status` and parse the body.
 * @throws Whatever `fetch` rejects with, including an abort error when no
 *   response arrives within GRAFANA_API_TIMEOUT_MS.
 */
async function grafanaApi(
  method: string,
  pathPart: string,
  adminPassword: string,
  body?: unknown
): Promise<Response> {
  const auth = Buffer.from(`${grafanaAdminUser()}:${adminPassword}`).toString("base64");

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), GRAFANA_API_TIMEOUT_MS);
  try {
    return await fetch(`${grafanaBaseUrl()}${pathPart}`, {
      method,
      headers: { "Content-Type": "application/json", Authorization: `Basic ${auth}` },
      body: body === undefined ? undefined : JSON.stringify(body),
      signal: controller.signal,
    });
  } finally {
    // The timer only guards the wait for the response; clear it so it can
    // neither abort a later body read nor keep the process alive.
    clearTimeout(timer);
  }
}
|
|
75
|
+
|
|
76
|
+
/**
 * Obtain a fresh token for the `pgai-aas-collect` Viewer service account on
 * the local Grafana, creating the service account first if the search does not
 * return one with exactly that name.
 *
 * Existing tokens are intentionally left in place. Pruning here would be racy:
 * a concurrent or repeated install could remove the token the platform is
 * currently using, breaking collection until the next registration. Each mint
 * uses a timestamp-based name, so name collisions are avoided without pruning.
 *
 * @param adminPassword - Grafana admin password used for Basic auth.
 * @param debug - When true, failure details are written to stderr.
 * @returns The token string, or null on any failure (this function never throws).
 */
export async function mintAasServiceAccountToken(
  adminPassword: string,
  debug = false
): Promise<string | null> {
  const trace = (message: string): void => {
    if (debug) console.error(`Debug: AAS SA mint: ${message}`);
  };

  try {
    // Step 1: look for an already existing service account by exact name.
    let accountId: number | null = null;
    const lookup = await grafanaApi("GET", `/api/serviceaccounts/search?query=${SA_NAME}`, adminPassword);
    if (lookup.ok) {
      const payload = (await lookup.json().catch(() => null)) as {
        serviceAccounts?: Array<{ id?: unknown; name?: unknown }>;
      } | null;
      const candidates = payload?.serviceAccounts || [];
      const existing = candidates.find((candidate) => candidate.name === SA_NAME);
      if (existing && typeof existing.id === "number") {
        accountId = existing.id;
      }
    }

    // Step 2: none found (or the search failed) - create it with the Viewer role.
    if (accountId == null) {
      const creation = await grafanaApi("POST", "/api/serviceaccounts", adminPassword, {
        name: SA_NAME,
        role: "Viewer",
      });
      if (!creation.ok) {
        trace(`create SA failed: HTTP ${creation.status}`);
        return null;
      }
      const createdBody = (await creation.json().catch(() => null)) as { id?: unknown } | null;
      if (typeof createdBody?.id !== "number") {
        return null;
      }
      accountId = createdBody.id;
    }

    // Step 3: mint a token under a unique, timestamp-based name.
    const tokenRequest = {
      name: `aas-collect-${Date.now()}`,
      role: "Viewer",
    };
    const minted = await grafanaApi("POST", `/api/serviceaccounts/${accountId}/tokens`, adminPassword, tokenRequest);
    if (!minted.ok) {
      trace(`mint token failed: HTTP ${minted.status}`);
      return null;
    }
    const mintedBody = (await minted.json().catch(() => null)) as { key?: unknown } | null;
    if (typeof mintedBody?.key === "string") {
      return mintedBody.key;
    }
    return null;
  } catch (err) {
    trace((err as Error).message);
    return null;
  }
}
|
|
128
|
+
|
|
129
|
+
/**
 * Look up the numeric id of the one datasource of type "prometheus" on the
 * local Grafana.
 *
 * The lookup is deliberately strict: anything other than exactly one matching
 * datasource is treated as "cannot decide" and yields null.
 *
 * @param adminPassword - Grafana admin password used for Basic auth.
 * @param debug - When true, an unexpected match count is reported on stderr.
 * @returns The datasource id, or null when the request fails, the match count
 *   is not exactly one, or the id is not a number (this function never throws).
 */
export async function resolveDatasourceId(adminPassword: string, debug = false): Promise<number | null> {
  try {
    const response = await grafanaApi("GET", "/api/datasources", adminPassword);
    if (!response.ok) {
      return null;
    }

    const datasources = (await response.json().catch(() => [])) as Array<{ id?: unknown; type?: unknown }>;
    const matches = datasources.filter((ds) => ds.type === "prometheus");

    if (matches.length === 1) {
      const { id } = matches[0];
      return typeof id === "number" ? id : null;
    }

    if (debug) console.error(`Debug: AAS: expected 1 prometheus datasource, found ${matches.length}`);
    return null;
  } catch {
    return null;
  }
}
|
|
151
|
+
|
|
152
|
+
/** Outcome reported by registerAasCollection. */
export interface AasRegisterResult {
  /** True when registration succeeded; false on any failure. */
  ok: boolean;

  /** Human-readable explanation of the failure; absent on success. */
  reason?: string;
}
|
|
156
|
+
|
|
157
|
+
/**
 * Register AAS collection for an adopted monitoring instance with the platform.
 *
 * Steps, in order, each aborting with `{ ok: false, reason }` on failure:
 *   1. require a non-empty API key and instance id,
 *   2. resolve the single (cluster, node) label pair from the instances file,
 *   3. resolve the Prometheus datasource id on the local Grafana,
 *   4. mint a Grafana service-account token (done only after steps 2-3 succeed),
 *   5. POST everything to `<apiBaseUrl>/rpc/monitoring_instance_aas_register`.
 *
 * Best-effort: every exception is caught and converted into a failure result,
 * so the caller can log a warning and carry on.
 *
 * @param apiKey - Platform API token, sent in the request body as `api_token`.
 * @param instanceId - Identifier of the monitoring instance being onboarded.
 * @param opts - Grafana password, instances file path and vcpus, plus optional
 *   API base URL override, debug flag, and an injectable `fetch` for the
 *   platform call.
 * @returns `{ ok: true }` on an HTTP success response, otherwise
 *   `{ ok: false, reason }`.
 */
export async function registerAasCollection(
  apiKey: string,
  instanceId: string,
  opts: {
    grafanaPassword: string;
    instancesPath: string;
    vcpus: number;
    apiBaseUrl?: string;
    debug?: boolean;
    fetchImpl?: typeof fetch;
  }
): Promise<AasRegisterResult> {
  const debug = Boolean(opts.debug);
  const fail = (reason: string): AasRegisterResult => ({ ok: false, reason });

  try {
    if (!apiKey || !instanceId) {
      return fail("missing api key or instance id");
    }

    const labels = resolveAasLabels(opts.instancesPath);
    if (!labels) {
      return fail("could not determine a single (cluster, node_name) target");
    }

    const datasourceId = await resolveDatasourceId(opts.grafanaPassword, debug);
    if (datasourceId == null) {
      return fail("could not resolve the Prometheus datasource id");
    }

    const saToken = await mintAasServiceAccountToken(opts.grafanaPassword, debug);
    if (!saToken) {
      return fail("could not mint a Grafana service-account token");
    }

    const { apiBaseUrl } = resolveBaseUrls({ apiBaseUrl: opts.apiBaseUrl });
    const endpoint = `${apiBaseUrl}/rpc/monitoring_instance_aas_register`;
    const send = opts.fetchImpl || fetch;
    if (debug) {
      // The token is deliberately absent from this line.
      console.error(`Debug: AAS: POST ${endpoint} (cluster=${labels.cluster}, node=${labels.node}, vcpus=${opts.vcpus}, ds=${datasourceId})`);
    }

    const payload = JSON.stringify({
      api_token: apiKey,
      instance_id: instanceId,
      sa_token: saToken,
      cluster_name: labels.cluster,
      node_name: labels.node,
      vcpus: opts.vcpus,
      datasource_id: datasourceId,
    });
    const response = await send(endpoint, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: payload,
    });

    if (response.ok) {
      return { ok: true };
    }

    // Report the status only, never the response body: an error body could
    // echo the request payload (including sa_token) into the debug log.
    if (debug) console.error(`Debug: AAS register failed: HTTP ${response.status}`);
    return fail(`platform returned HTTP ${response.status}`);
  } catch (err) {
    const message = (err as Error).message;
    if (debug) console.error(`Debug: AAS register error: ${message}`);
    return fail(message);
  }
}
|
package/lib/checkup-api.ts
CHANGED
|
@@ -320,6 +320,81 @@ async function postRpc<T>(params: {
|
|
|
320
320
|
});
|
|
321
321
|
}
|
|
322
322
|
|
|
323
|
+
/**
 * Outcome of an API key pre-flight check, discriminated by `status`.
 *
 * Callers are expected to stop only on "invalid"; "unknown" should produce a
 * warning and let the run continue.
 */
export type ApiKeyVerification =
  // The API accepted the key.
  | { status: "valid" }
  // The API definitively rejected the key; `statusCode` is the HTTP 401 or 403 received.
  | { status: "invalid"; statusCode: number }
  // The check could not be completed (network error, timeout, unexpected
  // status); `detail` describes what happened.
  | { status: "unknown"; detail: string };
|
|
334
|
+
|
|
335
|
+
// Timeout for the auth pre-flight (shorter than the regular RPC timeout: this
// is an optional fast check and must not noticeably delay the run when the API
// is slow or unreachable).
const VERIFY_API_KEY_TIMEOUT_MS = 10_000;

/**
 * Verify an API key with a cheap authenticated call
 * (GET <base>/checkup_reports?limit=1) so expensive work can fail fast on bad
 * credentials.
 *
 * Only a definitive HTTP 401/403 is reported as "invalid". Everything else
 * that prevents a clear answer - a malformed base URL, a refused plaintext
 * HTTP target, network errors, timeouts, unexpected statuses - is reported as
 * "unknown", so a pre-flight problem never blocks a run that might otherwise
 * succeed. This function does not throw.
 *
 * @param params.apiKey - Key sent in the `access-token` header.
 * @param params.apiBaseUrl - API base URL; normalized before use.
 * @param params.timeoutMs - Abort the request after this many milliseconds
 *   (defaults to VERIFY_API_KEY_TIMEOUT_MS).
 * @returns The verification outcome.
 */
export async function verifyApiKey(params: {
  apiKey: string;
  apiBaseUrl: string;
  timeoutMs?: number;
}): Promise<ApiKeyVerification> {
  const { apiKey, apiBaseUrl, timeoutMs = VERIFY_API_KEY_TIMEOUT_MS } = params;

  // Building the URL can throw on a malformed base URL. Keep that inside the
  // "unknown" contract instead of letting the pre-flight abort the run; the
  // upload path is the one that raises the definitive, actionable error.
  let url: URL;
  try {
    const base = normalizeBaseUrl(apiBaseUrl);
    url = new URL(`${base}/checkup_reports`);
    url.searchParams.set("limit", "1");
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { status: "unknown", detail: `invalid API base URL: ${message}` };
  }

  // Plaintext-HTTP guard: never send the API key over plain HTTP to a
  // non-loopback host unless CHECKUP_ALLOW_HTTP=1. Report "unknown" rather
  // than aborting the run.
  if (url.protocol === "http:") {
    // URL.hostname keeps the brackets around IPv6 literals; strip them first.
    const hostname = url.hostname.replace(/^\[|\]$/g, "");
    const isLoopback = ["localhost", "127.0.0.1", "::1"].includes(hostname);
    if (!isLoopback && process.env.CHECKUP_ALLOW_HTTP !== "1") {
      return {
        status: "unknown",
        detail: `refusing to send API key over plaintext HTTP to '${url.host}'`,
      };
    }
  }

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url.toString(), {
      method: "GET",
      headers: { "access-token": apiKey },
      signal: controller.signal,
    });
    // Drain the body so the connection is released cleanly.
    await response.text().catch(() => "");
    if (response.status === 401 || response.status === 403) {
      return { status: "invalid", statusCode: response.status };
    }
    if (response.ok) {
      return { status: "valid" };
    }
    return { status: "unknown", detail: `HTTP ${response.status}` };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { status: "unknown", detail: message };
  } finally {
    clearTimeout(timer);
  }
}
|
|
397
|
+
|
|
323
398
|
/**
|
|
324
399
|
* Create a new checkup report in the PostgresAI backend.
|
|
325
400
|
* This creates the parent report container; individual check results
|
package/lib/checkup-summary.ts
CHANGED
|
@@ -40,6 +40,7 @@ export function generateCheckSummary(checkId: string, report: any): CheckSummary
|
|
|
40
40
|
case 'D001': return summarizeD001(nodeData);
|
|
41
41
|
case 'D004': return summarizeD004(nodeData);
|
|
42
42
|
case 'F001': return summarizeF001(nodeData);
|
|
43
|
+
case 'F003': return summarizeF003(nodeData);
|
|
43
44
|
case 'G001': return summarizeG001(nodeData);
|
|
44
45
|
case 'G003': return summarizeG003(nodeData);
|
|
45
46
|
default:
|
|
@@ -243,6 +244,35 @@ function summarizeF001(nodeData: any): CheckSummary {
|
|
|
243
244
|
};
|
|
244
245
|
}
|
|
245
246
|
|
|
247
|
+
/**
 * Summarize check F003 by aggregating two per-database counters across every
 * entry of `nodeData.data`:
 *   - `flagged_count`: tables with excessive dead tuples,
 *   - `autovacuum_disabled_flagged_count`: non-tiny tables with autovacuum
 *     disabled (tiny tables with autovacuum off are common and not worth
 *     alerting on, so only this flagged counter is used).
 *
 * Input is treated defensively: missing data, null or non-object database
 * entries, and missing or non-numeric counters contribute 0; counters that
 * arrive as numeric strings are converted to numbers before summing.
 *
 * @param nodeData - Report node whose `data` maps a database key to its entry.
 * @returns `ok` when both totals are zero, otherwise `warning` with a
 *   comma-separated description of the non-zero totals.
 */
function summarizeF003(nodeData: any): CheckSummary {
  const data = nodeData?.data || {};

  // Accept numbers and numeric strings; anything else (or a non-positive or
  // non-finite value) counts as zero so one bad entry cannot skew the totals.
  const toCount = (value: unknown): number => {
    const parsed =
      typeof value === "number" ? value : typeof value === "string" ? Number(value) : Number.NaN;
    return Number.isFinite(parsed) && parsed > 0 ? parsed : 0;
  };

  let flaggedCount = 0;
  let disabledCount = 0;
  for (const dbEntry of Object.values(data) as unknown[]) {
    if (dbEntry === null || typeof dbEntry !== "object") {
      continue; // malformed database entry - nothing to aggregate
    }
    const counters = dbEntry as Record<string, unknown>;
    flaggedCount += toCount(counters.flagged_count);
    disabledCount += toCount(counters.autovacuum_disabled_flagged_count);
  }

  if (flaggedCount === 0 && disabledCount === 0) {
    return { status: 'ok', message: 'No significant dead tuple accumulation' };
  }

  const parts: string[] = [];
  if (flaggedCount > 0) {
    parts.push(`${flaggedCount} table${flaggedCount > 1 ? 's' : ''} with excessive dead tuples`);
  }
  if (disabledCount > 0) {
    parts.push(`${disabledCount} table${disabledCount > 1 ? 's' : ''} with autovacuum disabled`);
  }

  return { status: 'warning', message: parts.join(', ') };
}
|
|
275
|
+
|
|
246
276
|
function summarizeG001(nodeData: any): CheckSummary {
|
|
247
277
|
const data = nodeData?.data || {};
|
|
248
278
|
const settingsCount = Object.keys(data).length;
|