npm - postgresai - Versions diffs - 0.15.0 → 0.16.0-dev.1 - Mend

postgresai 0.15.0 → 0.16.0-dev.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/README.md +3 -0
package/bin/postgres-ai.ts +210 -31
package/dist/bin/postgres-ai.js +7749 -7248
package/lib/aas-onboard.ts +251 -0
package/lib/checkup-api.ts +75 -0
package/lib/checkup-summary.ts +30 -0
package/lib/checkup.ts +227 -21
package/lib/metrics-loader.ts +10 -8
package/lib/util.ts +10 -3
package/package.json +1 -1
package/scripts/embed-metrics.ts +7 -6
package/test/aas-onboard.test.ts +301 -0
package/test/checkup.integration.test.ts +55 -0
package/test/checkup.test.ts +471 -1
package/test/mcp-server.test.ts +4 -0
package/test/monitoring.test.ts +128 -49
package/test/schema-validation.test.ts +29 -0
package/test/test-utils.ts +8 -0
package/test/util.test.ts +44 -0

package/lib/aas-onboard.ts ADDED Viewed

@@ -0,0 +1,251 @@
+/**
+ * Hands-off AAS auto-onboarding for `mon local-install` (platform-all #338).
+ *
+ * After the monitoring stack is up and the instance is adopted, the CLI arms
+ * AAS collection without an operator step:
+ *   1. mint a `pgai-aas-collect` Grafana Viewer service-account token on the
+ *      LOCAL Grafana (the CLI holds the admin password),
+ *   2. resolve the numeric Prometheus datasource id,
+ *   3. read the (cluster, node_name) labels straight from the pgwatch target
+ *      config the CLI itself wrote (buildInstance's custom_tags) — no live
+ *      series query, so no waiters>0 timing dependency,
+ *   4. hand all of it to the platform via the API-token RPC
+ *      v1.monitoring_instance_aas_register, which encrypts the token and stores
+ *      the AAS state keys (it makes no outbound Grafana call of its own).
+ *
+ * Best-effort, exactly like registerMonitoringInstance: never throws, returns a
+ * result the caller logs. The plaintext SA token only ever lives in locals.
+ */
+import { loadInstances } from "./instances";
+import { resolveBaseUrls } from "./util";
+/** Name of the Grafana service account this module searches for, creates when missing, and mints tokens on. */
+const SA_NAME = "pgai-aas-collect";
+/**
+ * Base URL of the local Grafana (published on the monitoring host), with any
+ * trailing slashes removed. PGAI_GRAFANA_LOCAL_URL overrides the default for
+ * tests and unusual setups.
+ */
+function grafanaBaseUrl(): string {
+  const configured = process.env.PGAI_GRAFANA_LOCAL_URL || "http://localhost:3000";
+  return configured.replace(/\/+$/, "");
+}
+/**
+ * Grafana admin user name used for Basic auth against the local Grafana.
+ *
+ * The monitoring stack's compose file hardcodes the admin user to "monitor"
+ * (GF_SECURITY_ADMIN_USER: monitor), so that is the default instead of
+ * Grafana's stock "admin" — logging in as the wrong user makes every datasource
+ * lookup 401. A non-empty GF_SECURITY_ADMIN_USER in the environment wins.
+ */
+function grafanaAdminUser(): string {
+  const override = process.env.GF_SECURITY_ADMIN_USER;
+  if (override) return override;
+  return "monitor";
+}
+/**
+ * Normalize a vcpus input (CLI flag or env value) to a whole number.
+ *
+ * Strings are trimmed and read with parseInt (base 10); numbers are taken as
+ * given. Anything missing, empty, non-finite or not greater than zero yields 0,
+ * which stands for "unknown". Fractions are rounded down.
+ */
+export function parseVcpus(raw: string | number | undefined | null): number {
+  if (raw == null || raw === "") return 0;
+  const parsed = typeof raw === "string" ? parseInt(raw.trim(), 10) : raw;
+  if (!Number.isFinite(parsed) || parsed <= 0) return 0;
+  return Math.floor(parsed);
+}
+/**
+ * Determine the (cluster, node_name) pair AAS should onboard from the pgwatch
+ * instances file. AAS onboards exactly one pair, so null is returned when
+ * loadInstances throws or when the number of enabled targets is not exactly one.
+ */
+export function resolveAasLabels(instancesPath: string): { cluster: string; node: string } | null {
+  let instances;
+  try {
+    instances = loadInstances(instancesPath);
+  } catch {
+    return null;
+  }
+  // A target counts as enabled unless is_enabled is explicitly false.
+  const active = instances.filter((inst) => inst.is_enabled !== false);
+  if (active.length !== 1) return null;
+  const target = active[0];
+  const tags = (target.custom_tags || {}) as Record<string, unknown>;
+  // Only non-empty string tags are used; otherwise fall back to the "default"
+  // cluster and to the target's own name as the node.
+  const cluster = typeof tags.cluster === "string" && tags.cluster !== "" ? tags.cluster : "default";
+  const node = typeof tags.node_name === "string" && tags.node_name !== "" ? tags.node_name : target.name;
+  return cluster && node ? { cluster, node } : null;
+}
+/**
+ * Issue one HTTP request against the local Grafana API, authenticated with
+ * HTTP Basic credentials built from grafanaAdminUser() and `adminPassword`.
+ *
+ * The request is aborted when no response arrives within `timeoutMs`, so a
+ * Grafana that accepts the connection but never answers cannot stall the
+ * best-effort AAS arming indefinitely. An abort surfaces as a rejected promise.
+ * The timeout covers the wait for the response only; reading the body is left
+ * to the caller.
+ *
+ * @param method HTTP method, e.g. "GET" or "POST".
+ * @param pathPart Path (and query string) appended verbatim to grafanaBaseUrl().
+ * @param adminPassword Password of the Grafana admin user.
+ * @param body Optional payload; JSON-encoded when provided, omitted otherwise.
+ * @param timeoutMs Maximum time to wait for the response, in milliseconds.
+ * @returns The fetch Response; the caller checks `ok` and reads the body.
+ */
+async function grafanaApi(
+  method: string,
+  pathPart: string,
+  adminPassword: string,
+  body?: unknown,
+  timeoutMs = 10_000
+): Promise<Response> {
+  const auth = Buffer.from(`${grafanaAdminUser()}:${adminPassword}`).toString("base64");
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), timeoutMs);
+  try {
+    return await fetch(`${grafanaBaseUrl()}${pathPart}`, {
+      method,
+      headers: { "Content-Type": "application/json", Authorization: `Basic ${auth}` },
+      body: body === undefined ? undefined : JSON.stringify(body),
+      signal: controller.signal,
+    });
+  } finally {
+    // Always release the timer so it cannot keep the CLI process alive.
+    clearTimeout(timer);
+  }
+}
+/**
+ * Find-or-create the pgai-aas-collect Viewer service account on the local
+ * Grafana and mint a fresh glsa_ token for it.
+ *
+ * Prior tokens are deliberately NOT pruned: deleting them here is racy — a
+ * concurrent or repeated install could delete the token the platform currently
+ * holds (stored encrypted), silently 401-ing collection until the next register.
+ * The unique mint name already avoids 409s, and orphaned Viewer tokens are
+ * benign; token hygiene is left to a separate, non-racy mechanism.
+ *
+ * @param adminPassword Password of the Grafana admin user.
+ * @param debug When true, every failure path is reported on stderr. The token
+ *   value itself is never logged.
+ * @returns The minted token, or null on any failure; errors are caught, not thrown.
+ */
+export async function mintAasServiceAccountToken(
+  adminPassword: string,
+  debug = false
+): Promise<string | null> {
+  const log = (m: string): void => {
+    if (debug) console.error(`Debug: AAS SA mint: ${m}`);
+  };
+  try {
+    let saId: number | null = null;
+    // Encode the name so the query string stays well-formed whatever it contains.
+    const search = await grafanaApi(
+      "GET",
+      `/api/serviceaccounts/search?query=${encodeURIComponent(SA_NAME)}`,
+      adminPassword
+    );
+    if (search.ok) {
+      const data = (await search.json().catch(() => null)) as { serviceAccounts?: Array<{ id?: unknown; name?: unknown }> } | null;
+      // Accept only an exact name match among the returned accounts.
+      const found = (data?.serviceAccounts || []).find((s) => s.name === SA_NAME);
+      if (found && typeof found.id === "number") saId = found.id;
+    } else {
+      // A failed search is not fatal: fall through and try to create the account.
+      log(`search SA failed: HTTP ${search.status}`);
+    }
+    if (saId == null) {
+      const created = await grafanaApi("POST", "/api/serviceaccounts", adminPassword, { name: SA_NAME, role: "Viewer" });
+      if (!created.ok) {
+        log(`create SA failed: HTTP ${created.status}`);
+        return null;
+      }
+      const cj = (await created.json().catch(() => null)) as { id?: unknown } | null;
+      if (typeof cj?.id !== "number") {
+        log("create SA response carried no numeric id");
+        return null;
+      }
+      saId = cj.id;
+    }
+    // Unique token name avoids a 409 on a pre-existing name (no prune needed).
+    const mint = await grafanaApi("POST", `/api/serviceaccounts/${saId}/tokens`, adminPassword, {
+      name: `aas-collect-${Date.now()}`,
+      role: "Viewer",
+    });
+    if (!mint.ok) {
+      log(`mint token failed: HTTP ${mint.status}`);
+      return null;
+    }
+    const mj = (await mint.json().catch(() => null)) as { key?: unknown } | null;
+    if (typeof mj?.key !== "string") {
+      log("mint token response carried no key");
+      return null;
+    }
+    return mj.key;
+  } catch (err) {
+    // Narrow instead of casting: a non-Error throw still yields a readable message.
+    log(err instanceof Error ? err.message : String(err));
+    return null;
+  }
+}
+/**
+ * Resolve the single Prometheus-typed datasource's numeric id on the local
+ * Grafana. The monitoring stack's VictoriaMetrics datasource is type
+ * "prometheus" (VM speaks PromQL), and the stack registers exactly one such
+ * datasource — the same one the collector queries.
+ *
+ * @param adminPassword Password of the Grafana admin user.
+ * @param debug When true, the reason for every non-numeric outcome is written
+ *   to stderr (including the HTTP status of a rejected lookup).
+ * @returns The datasource id; null when none is resolvable (API not ready,
+ *   non-OK status, request error, or zero matches — a provisioning transient
+ *   the readiness loop retries); "ambiguous" when more than one matches (a
+ *   permanent misconfiguration — the loop stops at once), matching
+ *   v1.aas_onboard's >1 skip.
+ */
+export async function resolveDatasourceId(adminPassword: string, debug = false): Promise<number | "ambiguous" | null> {
+  const log = (m: string): void => {
+    if (debug) console.error(`Debug: AAS: ${m}`);
+  };
+  try {
+    const res = await grafanaApi("GET", "/api/datasources", adminPassword);
+    if (!res.ok) {
+      // Surface the status (e.g. 401 for wrong credentials) instead of failing silently.
+      log(`datasource lookup failed: HTTP ${res.status}`);
+      return null;
+    }
+    const payload: unknown = await res.json().catch(() => []);
+    // Anything that is not a JSON array is treated as "no datasources yet".
+    const list = (Array.isArray(payload) ? payload : []) as Array<{ id?: unknown; type?: unknown }>;
+    const prom = list.filter((d) => d?.type === "prometheus");
+    if (prom.length > 1) {
+      // >1 is a permanent misconfiguration, not a provisioning transient: the
+      // datasource count only grows as Grafana provisions, so retrying can never
+      // resolve it. Signal a definitive skip so the readiness loop bails at once.
+      log(`${prom.length} prometheus datasources (ambiguous); not retrying`);
+      return "ambiguous";
+    }
+    if (prom.length === 0) {
+      log(`no prometheus datasource resolvable yet`);
+      return null;
+    }
+    const id = prom[0].id;
+    if (typeof id !== "number") {
+      log("prometheus datasource has no numeric id");
+      return null;
+    }
+    return id;
+  } catch (err) {
+    log(`datasource lookup error: ${err instanceof Error ? err.message : String(err)}`);
+    return null;
+  }
+}
+/** Outcome returned by registerAasCollection. */
+export interface AasRegisterResult {
+  /** True only when the platform register call answered with a successful HTTP status. */
+  ok: boolean;
+  /** Human-readable cause of the failure; set on the failure paths, omitted on success. */
+  reason?: string;
+}
+/**
+ * Arm hands-off AAS collection for an adopted monitoring instance. Best-effort:
+ * never throws; returns {ok:false, reason} on any failure so the caller can log
+ * a non-fatal warning. Mirrors registerMonitoringInstance's API-call shape.
+ *
+ * Steps: read the (cluster, node) labels from the instances file, poll the
+ * local Grafana until the Prometheus datasource id resolves, mint a
+ * service-account token, then POST everything to the platform RPC
+ * monitoring_instance_aas_register.
+ *
+ * @param apiKey Platform API token, sent as `api_token` in the RPC body.
+ * @param instanceId Id of the adopted monitoring instance.
+ * @param opts Grafana password, instances file path, vcpus, and optional
+ *   overrides (API base URL, debug logging, fetch implementation for the
+ *   platform call, datasource polling budget).
+ * @returns {ok:true} on success, otherwise {ok:false, reason}.
+ */
+export async function registerAasCollection(
+  apiKey: string,
+  instanceId: string,
+  opts: {
+    grafanaPassword: string;
+    instancesPath: string;
+    vcpus: number;
+    apiBaseUrl?: string;
+    debug?: boolean;
+    fetchImpl?: typeof fetch;
+    // Grafana-readiness polling for the datasource lookup (Grafana has just
+    // been started by `compose up`). Defaults: 20 attempts × 3s.
+    datasourceMaxAttempts?: number;
+    datasourceRetryDelayMs?: number;
+  }
+): Promise<AasRegisterResult> {
+  const debug = !!opts.debug;
+  try {
+    if (!apiKey || !instanceId) return { ok: false, reason: "missing api key or instance id" };
+    const labels = resolveAasLabels(opts.instancesPath);
+    if (!labels) return { ok: false, reason: "could not determine a single (cluster, node_name) target" };
+    // Grafana was just started by `compose up`; it needs time to create its
+    // admin user, provision datasources, and serve its API. Querying too early
+    // makes the datasource lookup fail transiently, so poll until it resolves
+    // (best-effort, capped — the install never blocks on this).
+    const maxAttempts = opts.datasourceMaxAttempts ?? 20;
+    const retryDelayMs = opts.datasourceRetryDelayMs ?? 3000;
+    let datasourceId: number | null = null;
+    let ambiguous = false;
+    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+      const resolved = await resolveDatasourceId(opts.grafanaPassword, debug);
+      if (typeof resolved === "number") {
+        datasourceId = resolved;
+        break;
+      }
+      // "ambiguous" (>1 prometheus datasource) is permanent — retrying can't fix
+      // it, so stop polling immediately instead of waiting out the whole budget.
+      if (resolved === "ambiguous") {
+        ambiguous = true;
+        break;
+      }
+      if (attempt < maxAttempts) {
+        if (debug) console.error(`Debug: AAS: datasource not resolvable yet (attempt ${attempt}/${maxAttempts}); waiting for Grafana…`);
+        await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
+      }
+    }
+    if (datasourceId == null) {
+      // Tell a permanent misconfiguration apart from Grafana simply not being
+      // ready in time, so the logged warning points at the actual cause.
+      const base = "could not resolve the Prometheus datasource id";
+      return {
+        ok: false,
+        reason: ambiguous ? `${base} (more than one Prometheus datasource is configured)` : base,
+      };
+    }
+    const saToken = await mintAasServiceAccountToken(opts.grafanaPassword, debug);
+    if (!saToken) return { ok: false, reason: "could not mint a Grafana service-account token" };
+    const { apiBaseUrl } = resolveBaseUrls({ apiBaseUrl: opts.apiBaseUrl });
+    const url = `${apiBaseUrl}/rpc/monitoring_instance_aas_register`;
+    const doFetch = opts.fetchImpl || fetch;
+    // The debug line lists labels and ids only — never the tokens.
+    if (debug) console.error(`Debug: AAS: POST ${url} (cluster=${labels.cluster}, node=${labels.node}, vcpus=${opts.vcpus}, ds=${datasourceId})`);
+    const res = await doFetch(url, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify({
+        api_token: apiKey,
+        instance_id: instanceId,
+        sa_token: saToken,
+        cluster_name: labels.cluster,
+        node_name: labels.node,
+        vcpus: opts.vcpus,
+        datasource_id: datasourceId,
+      }),
+    });
+    if (!res.ok) {
+      // Log status only — never the response body: a platform could echo the
+      // request payload (incl. sa_token) in an error body, which must not reach
+      // the user's debug log.
+      if (debug) console.error(`Debug: AAS register failed: HTTP ${res.status}`);
+      return { ok: false, reason: `platform returned HTTP ${res.status}` };
+    }
+    return { ok: true };
+  } catch (err) {
+    // Narrow instead of casting so a non-Error throw never yields `reason: undefined`.
+    const message = err instanceof Error ? err.message : String(err);
+    if (debug) console.error(`Debug: AAS register error: ${message}`);
+    return { ok: false, reason: message };
+  }
+}

package/lib/checkup-api.ts CHANGED Viewed

@@ -320,6 +320,81 @@ async function postRpc<T>(params: {
   });
 }
+/**
+ * Result of an API key pre-flight verification.
+ * - "valid": the key was accepted by the API
+ * - "invalid": the API definitively rejected the key (HTTP 401/403)
+ * - "unknown": verification could not be completed (network error, timeout,
+ *   unexpected status) — callers should warn and continue, not block the run
+ */
+export type ApiKeyVerification =
+  | { status: "valid" }
+  // statusCode is the rejecting HTTP status (401 or 403).
+  | { status: "invalid"; statusCode: number }
+  // detail is a short explanation of why the check was inconclusive.
+  | { status: "unknown"; detail: string };
+// Default timeout for the auth pre-flight, in milliseconds (10 seconds).
+// Shorter than the regular RPC timeout: this is an optional fast check and
+// must not noticeably delay the run when the API is slow or unreachable.
+const VERIFY_API_KEY_TIMEOUT_MS = 10_000;
+/**
+ * Verify an API key with a cheap, side-effect-free authenticated call
+ * (GET /checkup_reports?limit=1 — the same endpoint the `reports` command
+ * uses) so expensive work can fail fast on bad credentials.
+ *
+ * Only a definitive HTTP 401/403 is reported as "invalid". Network errors,
+ * timeouts, and unexpected statuses are reported as "unknown" so a transient
+ * pre-flight failure never blocks a run that might otherwise succeed.
+ *
+ * @param params.apiKey Key to verify; sent in the `access-token` header.
+ * @param params.apiBaseUrl API base URL; passed through normalizeBaseUrl.
+ * @param params.timeoutMs Abort the request after this many milliseconds
+ *   (defaults to VERIFY_API_KEY_TIMEOUT_MS).
+ * @returns "valid", "invalid" (with the HTTP status), or "unknown" (with a
+ *   short detail; a timeout is reported as `timed out after <n>ms`).
+ */
+export async function verifyApiKey(params: {
+  apiKey: string;
+  apiBaseUrl: string;
+  timeoutMs?: number;
+}): Promise<ApiKeyVerification> {
+  const { apiKey, apiBaseUrl, timeoutMs = VERIFY_API_KEY_TIMEOUT_MS } = params;
+  const base = normalizeBaseUrl(apiBaseUrl);
+  const url = new URL(`${base}/checkup_reports`);
+  url.searchParams.set("limit", "1");
+  // Same plaintext-HTTP guard as postRpc: never send the API key over plain
+  // HTTP to a non-loopback host. Report "unknown" rather than aborting the
+  // run — the upload path raises the definitive, actionable error.
+  if (url.protocol === "http:") {
+    // URL.hostname keeps the brackets around IPv6 literals; strip them first.
+    const hostname = url.hostname.replace(/^\[|\]$/g, "");
+    const isLoopback = ["localhost", "127.0.0.1", "::1"].includes(hostname);
+    if (!isLoopback && process.env.CHECKUP_ALLOW_HTTP !== "1") {
+      return {
+        status: "unknown",
+        detail: `refusing to send API key over plaintext HTTP to '${url.host}'`,
+      };
+    }
+  }
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), timeoutMs);
+  try {
+    const response = await fetch(url.toString(), {
+      method: "GET",
+      headers: { "access-token": apiKey },
+      signal: controller.signal,
+    });
+    // Drain the body so the connection is released cleanly.
+    await response.text().catch(() => "");
+    if (response.status === 401 || response.status === 403) {
+      return { status: "invalid", statusCode: response.status };
+    }
+    if (response.ok) {
+      return { status: "valid" };
+    }
+    return { status: "unknown", detail: `HTTP ${response.status}` };
+  } catch (err) {
+    // The controller is aborted only by the timer above, so an aborted signal
+    // means the pre-flight timed out; say so instead of passing along the
+    // abort error's own message.
+    if (controller.signal.aborted) {
+      return { status: "unknown", detail: `timed out after ${timeoutMs}ms` };
+    }
+    const message = err instanceof Error ? err.message : String(err);
+    return { status: "unknown", detail: message };
+  } finally {
+    clearTimeout(timer);
+  }
+}
 /**
  * Create a new checkup report in the PostgresAI backend.
  * This creates the parent report container; individual check results

package/lib/checkup-summary.ts CHANGED Viewed

@@ -40,6 +40,7 @@ export function generateCheckSummary(checkId: string, report: any): CheckSummary
     case 'D001': return summarizeD001(nodeData);
     case 'D004': return summarizeD004(nodeData);
     case 'F001': return summarizeF001(nodeData);
+    case 'F003': return summarizeF003(nodeData);
     case 'G001': return summarizeG001(nodeData);
     case 'G003': return summarizeG003(nodeData);
     default:
@@ -243,6 +244,35 @@ function summarizeF001(nodeData: any): CheckSummary {
   };
 }
+/**
+ * Summarize check F003 from a node's report data.
+ *
+ * Sums `flagged_count` and `autovacuum_disabled_flagged_count` over every
+ * per-database entry under `nodeData.data`. Returns status 'ok' when both
+ * totals are zero, otherwise a 'warning' whose message lists each non-zero total.
+ *
+ * @param nodeData Report node; a missing node or missing `data` counts as empty.
+ * @returns The status and human-readable message for the check.
+ */
+function summarizeF003(nodeData: any): CheckSummary {
+  const data = nodeData?.data || {};
+  // Coerce a counter to a usable number: missing, non-numeric or non-positive
+  // values count as 0, and numeric strings are converted so that `+=` can never
+  // degrade into string concatenation.
+  const toCount = (value: unknown): number => {
+    const n = Number(value);
+    return Number.isFinite(n) && n > 0 ? n : 0;
+  };
+  let flaggedCount = 0;
+  let disabledCount = 0;
+  // Aggregate across all databases. Only non-tiny disabled-autovacuum tables
+  // (autovacuum_disabled_flagged_count) trigger a warning - tiny tables with
+  // autovacuum off are common and not worth alerting on.
+  for (const dbEntry of Object.values(data)) {
+    // `?.` tolerates a null/undefined per-database entry instead of throwing.
+    flaggedCount += toCount((dbEntry as any)?.flagged_count);
+    disabledCount += toCount((dbEntry as any)?.autovacuum_disabled_flagged_count);
+  }
+  if (flaggedCount === 0 && disabledCount === 0) {
+    return { status: 'ok', message: 'No significant dead tuple accumulation' };
+  }
+  const parts: string[] = [];
+  if (flaggedCount > 0) {
+    parts.push(`${flaggedCount} table${flaggedCount > 1 ? 's' : ''} with excessive dead tuples`);
+  }
+  if (disabledCount > 0) {
+    parts.push(`${disabledCount} table${disabledCount > 1 ? 's' : ''} with autovacuum disabled`);
+  }
+  return { status: 'warning', message: parts.join(', ') };
+}
 function summarizeG001(nodeData: any): CheckSummary {
   const data = nodeData?.data || {};
   const settingsCount = Object.keys(data).length;