postgresai 0.15.0 → 0.16.0-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,251 @@
1
+ /**
2
+ * Hands-off AAS auto-onboarding for `mon local-install` (platform-all #338).
3
+ *
4
+ * After the monitoring stack is up and the instance is adopted, the CLI arms
5
+ * AAS collection without an operator step:
6
+ * 1. mint a `pgai-aas-collect` Grafana Viewer service-account token on the
7
+ * LOCAL Grafana (the CLI holds the admin password),
8
+ * 2. resolve the numeric Prometheus datasource id,
9
+ * 3. read the (cluster, node_name) labels straight from the pgwatch target
10
+ * config the CLI itself wrote (buildInstance's custom_tags) — no live
11
+ * series query, so no waiters>0 timing dependency,
12
+ * 4. hand all of it to the platform via the API-token RPC
13
+ * v1.monitoring_instance_aas_register, which encrypts the token and stores
14
+ * the AAS state keys (it makes no outbound Grafana call of its own).
15
+ *
16
+ * Best-effort, exactly like registerMonitoringInstance: never throws, returns a
17
+ * result the caller logs. The plaintext SA token only ever lives in locals.
18
+ */
19
+
20
+ import { loadInstances } from "./instances";
21
+ import { resolveBaseUrls } from "./util";
22
+
23
/** Name of the Grafana service account this module finds or creates for AAS collection. */
const SA_NAME = "pgai-aas-collect";
24
+
25
/**
 * Base URL of the local Grafana (published on the monitoring host), with any
 * trailing slashes removed.
 *
 * A non-empty PGAI_GRAFANA_LOCAL_URL overrides the default
 * http://localhost:3000 (intended for tests and unusual setups).
 */
function grafanaBaseUrl(): string {
  const configured = process.env.PGAI_GRAFANA_LOCAL_URL;
  const base = configured ? configured : "http://localhost:3000";
  return base.replace(/\/+$/, "");
}
29
+
30
/**
 * Admin username used for basic auth against the local Grafana.
 *
 * The monitoring stack's compose file hardcodes the Grafana admin user to
 * "monitor" (GF_SECURITY_ADMIN_USER: monitor), so that is the default here
 * instead of Grafana's stock "admin" — with the wrong user, AAS arming would
 * log in incorrectly and every datasource lookup would 401. A non-empty
 * GF_SECURITY_ADMIN_USER in the environment still takes precedence.
 */
function grafanaAdminUser(): string {
  const fromEnv = process.env.GF_SECURITY_ADMIN_USER;
  if (fromEnv) {
    return fromEnv;
  }
  return "monitor";
}
37
+
38
/**
 * Normalize a vcpus input (CLI flag or env var) to a whole number.
 *
 * Returns 0 — meaning "unknown" — for undefined, null, the empty string,
 * anything that does not parse to a finite number, and any value <= 0.
 * Strings are trimmed and parsed with parseInt (base 10); numeric inputs are
 * rounded down.
 */
export function parseVcpus(raw: string | number | undefined | null): number {
  if (raw == null || raw === "") {
    return 0;
  }
  const parsed = typeof raw === "number" ? raw : parseInt(String(raw).trim(), 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    return 0;
  }
  return Math.floor(parsed);
}
44
+
45
/**
 * Determine the (cluster, node) label pair for AAS from the pgwatch
 * instances file.
 *
 * AAS onboards exactly one (cluster, node) pair, so the result is null unless
 * exactly one target is enabled (a target counts as enabled unless
 * `is_enabled` is explicitly false). Null is also returned when the file
 * cannot be loaded or no node name can be derived.
 *
 * Labels come from the target's `custom_tags`: `cluster` falls back to
 * "default", `node_name` falls back to the target's `name`.
 */
export function resolveAasLabels(instancesPath: string): { cluster: string; node: string } | null {
  let loaded;
  try {
    loaded = loadInstances(instancesPath);
  } catch {
    return null;
  }

  const active = loaded.filter((inst) => inst.is_enabled !== false);
  if (active.length !== 1) {
    return null;
  }

  const target = active[0];
  const tags = (target.custom_tags || {}) as Record<string, unknown>;

  const taggedCluster = tags.cluster;
  const cluster = typeof taggedCluster === "string" && taggedCluster ? taggedCluster : "default";

  const taggedNode = tags.node_name;
  const node = typeof taggedNode === "string" && taggedNode ? taggedNode : target.name;

  // `cluster` is always non-empty here (it defaults to "default"), so only
  // the node name can still be missing.
  if (!node) {
    return null;
  }
  return { cluster, node };
}
65
+
66
/**
 * Send one basic-auth JSON request to the local Grafana HTTP API.
 *
 * The request is aborted if Grafana has not produced a response within a
 * fixed timeout. Without this, a Grafana that accepts the connection but
 * never answers (e.g. while still starting) would stall the caller's capped
 * retry loop — and the install — indefinitely. An aborted request rejects
 * like any other fetch failure; the callers in this module treat rejections
 * as "not available" and return null.
 *
 * @param method - HTTP method, e.g. "GET" or "POST".
 * @param pathPart - Path (and query) appended to the Grafana base URL.
 * @param adminPassword - Password of the Grafana admin user.
 * @param body - Optional payload, JSON-encoded when provided.
 * @returns The raw fetch Response (status is not checked here).
 */
async function grafanaApi(
  method: string,
  pathPart: string,
  adminPassword: string,
  body?: unknown
): Promise<Response> {
  // Bounds the wait for response headers only; reading the (small) body
  // afterwards is not covered by this timer.
  const requestTimeoutMs = 10_000;

  const auth = Buffer.from(`${grafanaAdminUser()}:${adminPassword}`).toString("base64");
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), requestTimeoutMs);
  try {
    return await fetch(`${grafanaBaseUrl()}${pathPart}`, {
      method,
      headers: { "Content-Type": "application/json", Authorization: `Basic ${auth}` },
      body: body === undefined ? undefined : JSON.stringify(body),
      signal: controller.signal,
    });
  } finally {
    clearTimeout(timer);
  }
}
79
+
80
/**
 * Find-or-create the pgai-aas-collect Viewer service account on the local
 * Grafana and mint a fresh token for it.
 *
 * Prior tokens are deliberately NOT pruned: deleting them here is racy — a
 * concurrent or repeated install could delete the token the platform
 * currently holds (stored encrypted), silently 401-ing collection until the
 * next register. The unique mint name already avoids 409s, and orphaned
 * Viewer tokens are benign; token hygiene is left to a separate, non-racy
 * mechanism.
 *
 * Never throws. Every failure path emits a debug line (when `debug` is set)
 * so a null result can be diagnosed; response bodies are never logged because
 * they may contain the token.
 *
 * @param adminPassword - Password of the Grafana admin user.
 * @param debug - When true, failure details are written to stderr.
 * @returns The minted token, or null on any failure.
 */
export async function mintAasServiceAccountToken(
  adminPassword: string,
  debug = false
): Promise<string | null> {
  const log = (m: string): void => {
    if (debug) console.error(`Debug: AAS SA mint: ${m}`);
  };
  try {
    let saId: number | null = null;

    const search = await grafanaApi(
      "GET",
      `/api/serviceaccounts/search?query=${encodeURIComponent(SA_NAME)}`,
      adminPassword
    );
    if (search.ok) {
      const data = (await search.json().catch(() => null)) as { serviceAccounts?: Array<{ id?: unknown; name?: unknown }> } | null;
      // The search is a substring match, so require the exact name.
      const found = (data?.serviceAccounts || []).find((s) => s.name === SA_NAME);
      if (found && typeof found.id === "number") saId = found.id;
    } else {
      // Not fatal: fall through and try to create the account.
      log(`search SA failed: HTTP ${search.status}`);
    }

    if (saId == null) {
      const created = await grafanaApi("POST", "/api/serviceaccounts", adminPassword, { name: SA_NAME, role: "Viewer" });
      if (!created.ok) {
        log(`create SA failed: HTTP ${created.status}`);
        return null;
      }
      const cj = (await created.json().catch(() => null)) as { id?: unknown } | null;
      if (typeof cj?.id !== "number") {
        log("create SA response did not contain a numeric id");
        return null;
      }
      saId = cj.id;
    }

    // Unique token name avoids a 409 on a pre-existing name (no prune needed).
    const mint = await grafanaApi("POST", `/api/serviceaccounts/${saId}/tokens`, adminPassword, {
      name: `aas-collect-${Date.now()}`,
      role: "Viewer",
    });
    if (!mint.ok) {
      log(`mint token failed: HTTP ${mint.status}`);
      return null;
    }
    const mj = (await mint.json().catch(() => null)) as { key?: unknown } | null;
    if (typeof mj?.key !== "string") {
      log("mint token response did not contain a key");
      return null;
    }
    return mj.key;
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances.
    log(err instanceof Error ? err.message : String(err));
    return null;
  }
}
132
+
133
/**
 * Look up the numeric id of the single "prometheus"-typed datasource on the
 * local Grafana. The monitoring stack's VictoriaMetrics datasource has that
 * type (VM speaks PromQL), and the stack registers exactly one such
 * datasource — the same one the collector queries.
 *
 * Outcomes:
 * - number: exactly one prometheus datasource with a numeric id.
 * - null: the request failed or was not OK, no prometheus datasource exists
 *   yet, or its id is not a number — treated as transient by the caller's
 *   readiness loop, which retries.
 * - "ambiguous": more than one prometheus datasource. This is a permanent
 *   misconfiguration (the count only grows as Grafana provisions), so the
 *   caller stops retrying at once, matching v1.aas_onboard's >1 skip.
 */
export async function resolveDatasourceId(adminPassword: string, debug = false): Promise<number | "ambiguous" | null> {
  try {
    const response = await grafanaApi("GET", "/api/datasources", adminPassword);
    if (!response.ok) {
      return null;
    }

    const datasources = (await response.json().catch(() => [])) as Array<{ id?: unknown; type?: unknown }>;
    const candidates = datasources.filter((ds) => ds.type === "prometheus");

    switch (candidates.length) {
      case 0:
        if (debug) console.error(`Debug: AAS: no prometheus datasource resolvable yet`);
        return null;
      case 1: {
        const id = candidates[0].id;
        return typeof id === "number" ? id : null;
      }
      default:
        // Retrying can never reduce the count, so signal a definitive skip.
        if (debug) console.error(`Debug: AAS: ${candidates.length} prometheus datasources (ambiguous); not retrying`);
        return "ambiguous";
    }
  } catch {
    return null;
  }
}
164
+
165
/** Outcome of a best-effort AAS registration attempt. */
export interface AasRegisterResult {
  /** True only when the platform accepted the registration. */
  ok: boolean;
  /** Short explanation of the failure, for the caller to log; set when `ok` is false. */
  reason?: string;
}
169
+
170
/**
 * Arm hands-off AAS collection for an adopted monitoring instance.
 *
 * Steps, in order:
 *   1. resolve the single (cluster, node_name) target from the instances file,
 *   2. poll the local Grafana until the Prometheus datasource id resolves,
 *   3. mint a Grafana service-account token,
 *   4. POST everything to the platform RPC `monitoring_instance_aas_register`.
 *
 * Best-effort: never throws. Any failure is returned as {ok: false, reason}
 * so the caller can log a non-fatal warning. Mirrors
 * registerMonitoringInstance's API-call shape.
 *
 * @param apiKey - Platform API token, sent as `api_token` in the RPC body.
 * @param instanceId - Id of the adopted monitoring instance.
 * @param opts - Grafana admin password, instances file path and vcpus, plus
 *   optional API base URL, debug flag, fetch override and polling settings.
 * @returns {ok: true} when the platform responds with a success status;
 *   otherwise {ok: false} with a reason naming the step that failed.
 */
export async function registerAasCollection(
  apiKey: string,
  instanceId: string,
  opts: {
    grafanaPassword: string;
    instancesPath: string;
    vcpus: number;
    apiBaseUrl?: string;
    debug?: boolean;
    fetchImpl?: typeof fetch;
    // Grafana-readiness polling for the datasource lookup (Grafana has just
    // been started by `compose up`). Defaults: 20 attempts × 3s.
    datasourceMaxAttempts?: number;
    datasourceRetryDelayMs?: number;
  }
): Promise<AasRegisterResult> {
  const debug = !!opts.debug;
  try {
    if (!apiKey || !instanceId) return { ok: false, reason: "missing api key or instance id" };

    const labels = resolveAasLabels(opts.instancesPath);
    if (!labels) return { ok: false, reason: "could not determine a single (cluster, node_name) target" };

    // Grafana was just started by `compose up`; it needs time to create its
    // admin user, provision datasources, and serve its API. Querying too early
    // makes the datasource lookup fail transiently, so poll until it resolves
    // (best-effort, capped — the install never blocks on this).
    const maxAttempts = opts.datasourceMaxAttempts ?? 20;
    const retryDelayMs = opts.datasourceRetryDelayMs ?? 3000;
    let datasourceId: number | null = null;
    let ambiguous = false;
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      const resolved = await resolveDatasourceId(opts.grafanaPassword, debug);
      if (typeof resolved === "number") { datasourceId = resolved; break; }
      // "ambiguous" (>1 prometheus datasource) is permanent — retrying can't fix
      // it, so stop polling immediately instead of waiting out the whole budget.
      if (resolved === "ambiguous") { ambiguous = true; break; }
      if (attempt < maxAttempts) {
        if (debug) console.error(`Debug: AAS: datasource not resolvable yet (attempt ${attempt}/${maxAttempts}); waiting for Grafana…`);
        await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
      }
    }
    // Report the permanent misconfiguration separately from a transient
    // "Grafana not ready" failure so the operator knows retrying won't help.
    if (ambiguous) {
      return { ok: false, reason: "more than one Prometheus datasource found in Grafana (ambiguous); exactly one is required" };
    }
    if (datasourceId == null) return { ok: false, reason: "could not resolve the Prometheus datasource id" };

    const saToken = await mintAasServiceAccountToken(opts.grafanaPassword, debug);
    if (!saToken) return { ok: false, reason: "could not mint a Grafana service-account token" };

    const { apiBaseUrl } = resolveBaseUrls({ apiBaseUrl: opts.apiBaseUrl });
    const url = `${apiBaseUrl}/rpc/monitoring_instance_aas_register`;
    const doFetch = opts.fetchImpl || fetch;
    // The debug line intentionally omits api_token and sa_token.
    if (debug) console.error(`Debug: AAS: POST ${url} (cluster=${labels.cluster}, node=${labels.node}, vcpus=${opts.vcpus}, ds=${datasourceId})`);

    const res = await doFetch(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        api_token: apiKey,
        instance_id: instanceId,
        sa_token: saToken,
        cluster_name: labels.cluster,
        node_name: labels.node,
        vcpus: opts.vcpus,
        datasource_id: datasourceId,
      }),
    });
    if (!res.ok) {
      // Log status only — never the response body: a platform could echo the
      // request payload (incl. sa_token) in an error body, which must not reach
      // the user's debug log.
      if (debug) console.error(`Debug: AAS register failed: HTTP ${res.status}`);
      return { ok: false, reason: `platform returned HTTP ${res.status}` };
    }
    return { ok: true };
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances; always produce
    // a usable reason string.
    const message = err instanceof Error ? err.message : String(err);
    if (debug) console.error(`Debug: AAS register error: ${message}`);
    return { ok: false, reason: message };
  }
}
@@ -320,6 +320,81 @@ async function postRpc<T>(params: {
320
320
  });
321
321
  }
322
322
 
323
/**
 * Outcome of an API key pre-flight check, discriminated by `status`:
 * - "valid": the API accepted the key.
 * - "invalid": the API definitively rejected the key (HTTP 401/403);
 *   `statusCode` carries the rejecting status.
 * - "unknown": the check could not be completed (network error, timeout,
 *   unexpected status); `detail` says why. Callers should warn and continue
 *   rather than block the run.
 */
export type ApiKeyVerification =
  | { status: "valid" }
  | { status: "invalid"; statusCode: number }
  | { status: "unknown"; detail: string };

// Default timeout for the auth pre-flight, in milliseconds. Kept shorter than
// the regular RPC timeout: the pre-flight is an optional fast check and must
// not noticeably delay the run when the API is slow or unreachable.
const VERIFY_API_KEY_TIMEOUT_MS = 10 * 1000;
339
+
340
/**
 * Verify an API key with a cheap, side-effect-free authenticated call
 * (GET /checkup_reports?limit=1 — the same endpoint the `reports` command
 * uses) so expensive work can fail fast on bad credentials.
 *
 * Only a definitive HTTP 401/403 is reported as "invalid". Everything else
 * that prevents a verdict — a malformed base URL, the plaintext-HTTP guard,
 * network errors, timeouts, unexpected statuses — is reported as "unknown",
 * so a pre-flight failure never blocks a run that might otherwise succeed.
 * This function does not reject.
 *
 * @param params.apiKey - Key to verify; sent in the `access-token` header.
 * @param params.apiBaseUrl - API base URL the request is built from.
 * @param params.timeoutMs - Abort the request after this many milliseconds
 *   (defaults to VERIFY_API_KEY_TIMEOUT_MS).
 * @returns The verification outcome.
 */
export async function verifyApiKey(params: {
  apiKey: string;
  apiBaseUrl: string;
  timeoutMs?: number;
}): Promise<ApiKeyVerification> {
  const { apiKey, apiBaseUrl, timeoutMs = VERIFY_API_KEY_TIMEOUT_MS } = params;

  // Building the URL can throw on a malformed base URL. Report that as
  // "unknown" instead of rejecting, in line with the "never block the run"
  // contract — the upload path raises the definitive, actionable error.
  let url: URL;
  try {
    const base = normalizeBaseUrl(apiBaseUrl);
    url = new URL(`${base}/checkup_reports`);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { status: "unknown", detail: `invalid API base URL: ${message}` };
  }
  url.searchParams.set("limit", "1");

  // Same plaintext-HTTP guard as postRpc: never send the API key over plain
  // HTTP to a non-loopback host. Report "unknown" rather than aborting the
  // run — the upload path raises the definitive, actionable error.
  if (url.protocol === "http:") {
    // URL.hostname keeps the brackets around IPv6 literals; strip them.
    const hostname = url.hostname.replace(/^\[|\]$/g, "");
    const isLoopback = ["localhost", "127.0.0.1", "::1"].includes(hostname);
    if (!isLoopback && process.env.CHECKUP_ALLOW_HTTP !== "1") {
      return {
        status: "unknown",
        detail: `refusing to send API key over plaintext HTTP to '${url.host}'`,
      };
    }
  }

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(url.toString(), {
      method: "GET",
      headers: { "access-token": apiKey },
      signal: controller.signal,
    });
    // Drain the body so the connection is released cleanly.
    await response.text().catch(() => "");
    if (response.status === 401 || response.status === 403) {
      return { status: "invalid", statusCode: response.status };
    }
    if (response.ok) {
      return { status: "valid" };
    }
    return { status: "unknown", detail: `HTTP ${response.status}` };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { status: "unknown", detail: message };
  } finally {
    clearTimeout(timer);
  }
}
397
+
323
398
  /**
324
399
  * Create a new checkup report in the PostgresAI backend.
325
400
  * This creates the parent report container; individual check results
@@ -40,6 +40,7 @@ export function generateCheckSummary(checkId: string, report: any): CheckSummary
40
40
  case 'D001': return summarizeD001(nodeData);
41
41
  case 'D004': return summarizeD004(nodeData);
42
42
  case 'F001': return summarizeF001(nodeData);
43
+ case 'F003': return summarizeF003(nodeData);
43
44
  case 'G001': return summarizeG001(nodeData);
44
45
  case 'G003': return summarizeG003(nodeData);
45
46
  default:
@@ -243,6 +244,35 @@ function summarizeF001(nodeData: any): CheckSummary {
243
244
  };
244
245
  }
245
246
 
247
/**
 * Summarize check F003 across all databases in the node's report.
 *
 * Sums each database entry's `flagged_count` and
 * `autovacuum_disabled_flagged_count`. Returns "ok" when both totals are
 * zero, otherwise a "warning" whose message lists the non-zero totals.
 *
 * Malformed input is tolerated: a missing `data`, a null or non-object
 * database entry, or a missing/non-numeric/non-positive count contributes 0
 * instead of throwing or corrupting the totals. Numeric strings are accepted.
 */
function summarizeF003(nodeData: any): CheckSummary {
  // Coerce one raw count to a positive number, or 0 when unusable. Without
  // this a string count would turn `+=` into string concatenation.
  const countOrZero = (value: unknown): number => {
    let n = NaN;
    if (typeof value === "number") {
      n = value;
    } else if (typeof value === "string" && value.trim() !== "") {
      n = Number(value);
    }
    return Number.isFinite(n) && n > 0 ? n : 0;
  };

  const data = nodeData?.data || {};
  let flaggedCount = 0;
  let disabledCount = 0;

  // Aggregate across all databases. Only non-tiny disabled-autovacuum tables
  // (autovacuum_disabled_flagged_count) trigger a warning - tiny tables with
  // autovacuum off are common and not worth alerting on.
  for (const dbData of Object.values(data)) {
    // Skip entries that are not objects (e.g. null) rather than crashing.
    if (typeof dbData !== "object" || dbData === null) continue;
    const dbEntry = dbData as Record<string, unknown>;
    flaggedCount += countOrZero(dbEntry.flagged_count);
    disabledCount += countOrZero(dbEntry.autovacuum_disabled_flagged_count);
  }

  if (flaggedCount === 0 && disabledCount === 0) {
    return { status: 'ok', message: 'No significant dead tuple accumulation' };
  }

  const parts: string[] = [];
  if (flaggedCount > 0) {
    parts.push(`${flaggedCount} table${flaggedCount > 1 ? 's' : ''} with excessive dead tuples`);
  }
  if (disabledCount > 0) {
    parts.push(`${disabledCount} table${disabledCount > 1 ? 's' : ''} with autovacuum disabled`);
  }

  return { status: 'warning', message: parts.join(', ') };
}
275
+
246
276
  function summarizeG001(nodeData: any): CheckSummary {
247
277
  const data = nodeData?.data || {};
248
278
  const settingsCount = Object.keys(data).length;