npm - postgresai - Versions diffs - 0.15.0-rc.3 → 0.15.0-rc.5 - Mend

postgresai 0.15.0-rc.3 → 0.15.0-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/bin/postgres-ai.ts +127 -20
package/dist/bin/postgres-ai.js +368 -109
package/lib/checkup-dictionary.ts +0 -11
package/lib/checkup.ts +14 -14
package/lib/init.ts +1 -1
package/lib/metrics-loader.ts +3 -3
package/package.json +1 -1
package/test/upgrade.test.ts +123 -0

package/bin/postgres-ai.ts CHANGED Viewed

@@ -74,6 +74,71 @@ function stripMatchingQuotes(value: string): string {
   return trimmed;
 }
+/**
+ * Required env vars contract for the monitoring stack.
+ *
+ * Keys listed in `REQUIRED_ENV_KEYS` are required by the docker-compose stack
+ * and must exist in `.env` for the stack to start cleanly. Each entry carries a
+ * factory that mints a default when the key is missing. The list is consumed
+ * by `ensureRequiredEnvVars`, which only appends lines to `.env`.
+ *
+ * This is the spine of the in-place upgrade story: when a user upgrades from
+ * a version that didn't require a key (e.g. 0.14, pre-VM-auth) to one that
+ * does (0.15), `ensureRequiredEnvVars` appends what's missing so the next
+ * `docker compose up` doesn't fail with `missing "<KEY>" env var`.
+ */
+type EnvKeyDefault = {
+  key: string;
+  /** Factory for the default value; invoked only for a key that has to be added. */
+  defaultValue: () => string;
+  /** CLI version that introduced the key. Informational - not read by the code visible here. */
+  introducedIn: string;
+};
+const REQUIRED_ENV_KEYS: EnvKeyDefault[] = [
+  { key: "REPLICATOR_PASSWORD", defaultValue: () => crypto.randomBytes(32).toString("hex"), introducedIn: "0.13" },
+  { key: "VM_AUTH_USERNAME", defaultValue: () => "vmauth", introducedIn: "0.15" },
+  { key: "VM_AUTH_PASSWORD", defaultValue: () => crypto.randomBytes(18).toString("base64"), introducedIn: "0.15" },
+];
+/**
+ * Ensure every key in `REQUIRED_ENV_KEYS` is defined in `<projectDir>/.env`.
+ *
+ * Reads `.env` (treated as empty when the file is absent), appends a
+ * `KEY=<default>` line for each required key that has no definition yet,
+ * rewrites the file and returns the keys that were added. Existing content is
+ * kept verbatim and in its original order.
+ *
+ * A key counts as defined when any line assigns it, including the
+ * `export KEY=...` form and optional blanks around the key. Otherwise such a
+ * line would be followed by an appended duplicate that can shadow the value
+ * the user chose.
+ *
+ * Idempotent: once all keys are present nothing is written.
+ *
+ * Permissions: the `mode` option of `writeFileSync` only applies when the file
+ * is created, so the file is additionally chmod-ed to 0600 - it now holds
+ * generated secrets. The chmod is best-effort (a warning is printed if it
+ * fails). The write is a plain in-place rewrite, not an atomic replace.
+ *
+ * Called by `mon update` and `mon update-config` so the in-place upgrade path
+ * picks up newly-required env vars before the stack is started.
+ *
+ * @param projectDir Directory that contains (or will contain) the `.env` file.
+ * @returns Keys that were appended; empty when nothing had to change.
+ * @throws Propagates filesystem errors from reading or writing `.env`.
+ */
+function ensureRequiredEnvVars(projectDir: string): string[] {
+  const envFile = path.resolve(projectDir, ".env");
+  const existing = fs.existsSync(envFile) ? fs.readFileSync(envFile, "utf8") : "";
+  const added: string[] = [];
+  const appendLines: string[] = [];
+  for (const spec of REQUIRED_ENV_KEYS) {
+    // Keys are plain [A-Z_] identifiers, so they are safe to embed in the pattern.
+    // Accept leading blanks, an `export ` prefix and blanks before `=`.
+    const re = new RegExp(`^[ \\t]*(?:export[ \\t]+)?${spec.key}[ \\t]*=`, "m");
+    if (!re.test(existing)) {
+      appendLines.push(`${spec.key}=${spec.defaultValue()}`);
+      added.push(spec.key);
+    }
+  }
+  if (appendLines.length === 0) {
+    return added;
+  }
+  // Append (don't overwrite) so we preserve order and any comments the user
+  // may have added to their .env. Make sure we have a trailing newline first.
+  const needsTrailingNewline = existing.length > 0 && !existing.endsWith("\n");
+  const newContent = existing + (needsTrailingNewline ? "\n" : "") + appendLines.join("\n") + "\n";
+  fs.writeFileSync(envFile, newContent, { encoding: "utf8", mode: 0o600 });
+  try {
+    // `mode` above is ignored for a file that already existed; tighten it explicitly.
+    fs.chmodSync(envFile, 0o600);
+  } catch (error) {
+    // The keys are already written; failing the command over permissions would
+    // be worse than leaving them as they were, so only warn.
+    const message = error instanceof Error ? error.message : String(error);
+    console.warn(`Warning: could not restrict permissions on ${envFile}: ${message}`);
+  }
+  return added;
+}
 // Helper functions for spawning processes - use Node.js child_process for compatibility
 async function execFilePromise(file: string, args: string[]): Promise<{ stdout: string; stderr: string }> {
   return new Promise((resolve, reject) => {
@@ -2970,41 +3035,83 @@ mon
   });
 mon
   .command("update-config")
-  .description("apply monitoring services configuration (generate sources)")
+  .description("apply monitoring services configuration (generate sources, migrate .env)")
   .action(async () => {
+    let projectDir: string;
+    try {
+      ({ projectDir } = await resolveOrInitPaths());
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.error(message);
+      process.exitCode = 1;
+      return;
+    }
+    // Migrate .env first: append any required keys introduced by newer stack
+    // versions (e.g. VM_AUTH_* added in 0.15) so an in-place upgrade does not
+    // fail with `missing "VM_AUTH_USERNAME" env var` when sink-prometheus boots.
+    // NOTE(review): this call sits outside the try/catch above; fs errors are not handled here.
+    const added = ensureRequiredEnvVars(projectDir);
+    if (added.length > 0) {
+      console.log(`Added missing .env keys for this stack version: ${added.join(", ")}`);
+      console.log("(existing values were preserved; missing keys filled with safe defaults)\n");
+    }
     const code = await runCompose(["run", "--rm", "sources-generator"]);
     if (code !== 0) process.exitCode = code;
   });
 mon
   .command("update")
-  .description("update monitoring stack")
+  .description("update monitoring stack (migrate .env, pull images)")
   .action(async () => {
     console.log("Updating PostgresAI monitoring stack...\n");
     try {
-      // Check if we're in a git repo
-      const gitDir = path.resolve(process.cwd(), ".git");
-      if (!fs.existsSync(gitDir)) {
-        console.error("Not a git repository. Cannot update.");
+      let projectDir: string;
+      try {
+        ({ projectDir } = await resolveOrInitPaths());
+      } catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        console.error(message);
         process.exitCode = 1;
         return;
       }
-      // Fetch latest changes
-      console.log("Fetching latest changes...");
-      await execFilePromise("git", ["fetch", "origin"]);
-      // Check current branch
-      const { stdout: branch } = await execFilePromise("git", ["rev-parse", "--abbrev-ref", "HEAD"]);
-      const currentBranch = branch.trim();
-      console.log(`Current branch: ${currentBranch}`);
-      // Pull latest changes
-      console.log("Pulling latest changes...");
-      const { stdout: pullOut } = await execFilePromise("git", ["pull", "origin", currentBranch]);
-      console.log(pullOut);
+      // Step 1: migrate .env so newer stack versions that require additional
+      // env vars (e.g. VM_AUTH_USERNAME / VM_AUTH_PASSWORD introduced in 0.15)
+      // don't make `docker compose up` fail silently for users who installed
+      // before those vars existed. Purely additive: existing values are kept.
+      console.log("Checking .env for newly-required keys...");
+      const added = ensureRequiredEnvVars(projectDir);
+      if (added.length > 0) {
+        console.log(`✓ Added missing .env keys: ${added.join(", ")}`);
+        console.log("  (existing values preserved; missing keys filled with safe defaults)");
+      } else {
+        console.log("✓ .env is up to date");
+      }
+      console.log();
+      // Step 2: refresh repo if this is a git-based deployment. Some users
+      // upgrade purely via `npm install -g postgresai@latest` and don't have a
+      // git checkout - in that case we skip git operations and still do the
+      // env migration + docker pull.
+      const gitDir = path.resolve(projectDir, ".git");
+      if (fs.existsSync(gitDir)) {
+        console.log("Fetching latest changes...");
+        await execFilePromise("git", ["fetch", "origin"]);
+        const { stdout: branch } = await execFilePromise("git", ["rev-parse", "--abbrev-ref", "HEAD"]);
+        const currentBranch = branch.trim();
+        console.log(`Current branch: ${currentBranch}`);
+        console.log("Pulling latest changes...");
+        const { stdout: pullOut } = await execFilePromise("git", ["pull", "origin", currentBranch]);
+        console.log(pullOut);
+      } else {
+        console.log("(not a git checkout — skipping git fetch/pull and going straight to image pull)");
+      }
-      // Update Docker images
+      // Step 3: pull new images.
       console.log("\nUpdating Docker images...");
       const code = await runCompose(["pull"]);

package/dist/bin/postgres-ai.js CHANGED Viewed

@@ -13423,7 +13423,7 @@ var {
 // package.json
 var package_default = {
   name: "postgresai",
-  version: "0.15.0-rc.3",
+  version: "0.15.0-rc.5",
   description: "postgres_ai CLI",
   license: "Apache-2.0",
   private: false,
@@ -16254,7 +16254,7 @@ var Result = import_lib.default.Result;
 var TypeOverrides = import_lib.default.TypeOverrides;
 var defaults = import_lib.default.defaults;
 // package.json
-var version = "0.15.0-rc.3";
+var version = "0.15.0-rc.5";
 var package_default2 = {
   name: "postgresai",
   version,
@@ -27687,7 +27687,7 @@ where
     statement_timeout_seconds: 300
   },
   pg_invalid_indexes: {
-    description: "This metric identifies invalid indexes in the database with decision tree data for remediation. It provides insights into whether to DROP (if duplicate exists), RECREATE (if backs constraint), or flag as UNCERTAIN (if additional RCA is needed to check query plans). Decision tree: 1) Valid duplicate exists -> DROP, 2) Backs PK/UNIQUE constraint -> RECREATE, 3) Table < 10K rows -> RECREATE (small tables rebuild quickly, typically under 1 second), 4) Otherwise -> UNCERTAIN (need query plan analysis to assess impact).",
+    description: "This metric identifies invalid indexes in the database with decision tree data for remediation. It provides insights into whether to DROP (if duplicate exists), RECREATE (if backs constraint), or flag as UNCERTAIN (if additional RCA is needed to check query plans). Decision tree: 1) Valid duplicate exists -> DROP, 2) Backs PK/UNIQUE constraint -> RECREATE, 3) Table < 10K rows -> RECREATE (small tables rebuild quickly, typically under 1 second), 4) Otherwise -> UNCERTAIN (need query plan analysis to assess impact). Adapts the top-N + `'$other$'` bucket pattern from !262 to this metric: ranks invalid indexes by `index_size_bytes desc` (ties broken by schema, table, then index name for stability), keeps the top 100, and folds the tail into a single `'$other$'` row whose `index_size_bytes` / `table_row_estimate` are summed and whose tag columns carry the literal `'$other$'` sentinel. The `'$other$'` row is omitted entirely (via `HAVING count(*) > 0`) when all invalid indexes fit within the top-100 cap, so its absence on healthy clusters is normal.",
     sqls: {
       11: `with fk_indexes as ( /* pgwatch_generated */
   select
@@ -27755,25 +27755,65 @@ data as (
   left join valid_duplicates vd on vd.invalid_indexrelid = pidx.indexrelid
   where pidx.indisvalid = false
 ),
-num_data as (
+ranked as (
   select
-    row_number() over () as num,
+    row_number() over (
+      order by index_size_bytes desc nulls last,
+               tag_schema_name, tag_table_name, tag_index_name
+    ) as num,
     data.*
   from data
 )
 select
   (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
   current_database() as tag_datname,
-  num_data.*
-from num_data
-limit 1000;
+  num,
+  tag_index_name,
+  tag_schema_name,
+  tag_table_name,
+  tag_relation_name,
+  index_definition,
+  index_size_bytes,
+  is_pk,
+  is_unique,
+  constraint_name,
+  table_row_estimate,
+  has_valid_duplicate,
+  valid_index_name,
+  valid_index_definition,
+  supports_fk
+from ranked
+where num <= 100
+union all
+select
+  (extract(epoch from now()) * 1e9)::int8 as epoch_ns,
+  current_database() as tag_datname,
+  0::bigint as num,
+  '$other$'::text as tag_index_name,
+  '$other$'::text as tag_schema_name,
+  '$other$'::text as tag_table_name,
+  '$other$'::text as tag_relation_name,
+  '$other$'::text as index_definition,
+  coalesce(sum(index_size_bytes), 0)::int8 as index_size_bytes,
+  false as is_pk,
+  false as is_unique,
+  '$other$'::text as constraint_name,
+  coalesce(sum(table_row_estimate), 0)::bigint as table_row_estimate,
+  bool_or(has_valid_duplicate) as has_valid_duplicate,
+  '$other$'::text as valid_index_name,
+  '$other$'::text as valid_index_definition,
+  coalesce(max(supports_fk), 0)::int as supports_fk
+from ranked
+where num > 100
+group by ()
+having count(*) > 0;
 `
     },
     gauges: ["*"],
     statement_timeout_seconds: 15
   },
   unused_indexes: {
-    description: "This metric identifies unused indexes in the database. It provides insights into the number of unused indexes and their details. This metric helps administrators identify and fix unused indexes to improve database performance.",
+    description: "This metric identifies unused indexes in the database. It provides insights into the number of unused indexes and their details. This metric helps administrators identify and fix unused indexes to improve database performance. Adapts the top-N + `'$other$'` bucket pattern from !262 to this metric: within the `idx_scan = 0 AND idx_is_btree` filter, ranks indexes by `index_size_bytes desc` (ties broken by schema, table, index name), keeps the top 100, and folds the tail into a single `'$other$'` row. Counter columns (`idx_scan`, `all_scans`, `writes`, `index_size_bytes`, `table_size_bytes`, `relpages`) are summed across the tail; ratio columns (`index_scan_pct`, `scans_per_write`) and the `supports_fk` boolean are deliberately zeroed/false on the aggregate row because the tail-level average would mislead and the per-row FK relationship has no meaningful aggregate. Tag columns carry the literal `'$other$'` sentinel. The `'$other$'` row is omitted entirely (via `HAVING count(*) > 0`) when ≤100 indexes match the unused filter.",
     sqls: {
       11: `with fk_indexes as ( /* pgwatch_generated */
   select
@@ -27852,6 +27892,17 @@ limit 1000;
   from indexes i
   join table_scans ts on ts.relid = i.indrelid
 )
+, ranked as (
+  select
+    row_number() over (
+      order by index_size_bytes desc nulls last,
+               schema_name, table_name, index_name
+    ) as num,
+    *
+  from index_ratios
+  where idx_scan = 0
+    and idx_is_btree
+)
 select
   'Never Used Indexes' as tag_reason,
   current_database() as tag_datname,
@@ -27871,19 +27922,39 @@ select
   idx_is_btree,
   opclasses as tag_opclasses,
   supports_fk
-from index_ratios
-where
-  idx_scan = 0
-  and idx_is_btree
-order by index_size_bytes desc
-limit 1000;
+from ranked
+where num <= 100
+union all
+select
+  'Never Used Indexes' as tag_reason,
+  current_database() as tag_datname,
+  0::oid as index_id,
+  '$other$'::text as tag_schema_name,
+  '$other$'::text as tag_table_name,
+  '$other$'::text as tag_index_name,
+  '$other$'::text as index_definition,
+  coalesce(sum(idx_scan), 0)::int8 as idx_scan,
+  coalesce(sum(all_scans), 0)::int8 as all_scans,
+  0::numeric as index_scan_pct,
+  coalesce(sum(writes), 0)::int8 as writes,
+  0::numeric as scans_per_write,
+  coalesce(sum(index_size_bytes), 0)::int8 as index_size_bytes,
+  coalesce(sum(table_size_bytes), 0)::int8 as table_size_bytes,
+  coalesce(sum(relpages), 0)::int4 as relpages,
+  true as idx_is_btree,
+  '$other$'::text as tag_opclasses,
+  false as supports_fk
+from ranked
+where num > 100
+group by ()
+having count(*) > 0;
 `
     },
     gauges: ["*"],
     statement_timeout_seconds: 15
   },
   redundant_indexes: {
-    description: "This metric identifies redundant indexes that can potentially be dropped to save storage space and improve write performance. It analyzes index relationships and finds indexes that are covered by other indexes, considering column order, operator classes, and foreign key constraints. Uses the exact logic from tmp.sql with JSON aggregation and proper thresholds.",
+    description: "This metric identifies redundant indexes that can potentially be dropped to save storage space and improve write performance. It analyzes index relationships and finds indexes that are covered by other indexes, considering column order, operator classes, and foreign key constraints. Uses the exact logic from tmp.sql with JSON aggregation and proper thresholds. Adapts the top-N + `'$other$'` bucket pattern from !262 to this metric: ranks redundant indexes by `index_size_bytes desc` (ties broken by `table_name`), keeps the top 100, and folds the tail into a single `'$other$'` row whose `table_size_bytes`, `index_size_bytes` and `index_usage` columns are summed and whose tag columns carry the literal `'$other$'` sentinel. The `redundant_indexes_grouped` CTE intentionally preserves duplicate column aliases (`tag_schema_name` / `tag_index_name` appear twice — once from the raw name and once from the `formated_*` variant) because the dashboards rely on both spellings; the duplication is preserved on the `'$other$'` row for consistency. The `'$other$'` row is omitted entirely (via `HAVING count(*) > 0`) when there are ≤100 redundant pairs, so its absence on healthy clusters is normal.",
     sqls: {
       11: `with fk_indexes as ( /* pgwatch_generated */
   select
@@ -28035,9 +28106,43 @@ redundant_indexes_tmp_num as (
     formated_relation_name,
     supports_fk
   order by index_size_bytes desc
+),
+-- redundant_indexes_grouped intentionally exposes duplicate aliases
+-- (tag_schema_name / tag_index_name appear twice — once from the
+-- raw name and once from the formated_* variant). select * over it
+-- preserves both. Order by table_name (unique, non-duplicated).
+ranked as (
+  select
+    row_number() over (
+      order by index_size_bytes desc nulls last, table_name
+    ) as num,
+    redundant_indexes_grouped.*
+  from redundant_indexes_grouped
 )
-select * from redundant_indexes_grouped
-limit 1000;
+select * from ranked where num <= 100
+union all
+select
+  0::bigint as num,
+  0::oid as index_id,
+  '$other$'::text as tag_schema_name,
+  '$other$'::text as table_name,
+  coalesce(sum(table_size_bytes), 0)::int8 as table_size_bytes,
+  '$other$'::text as tag_index_name,
+  '$other$'::text as tag_access_method,
+  '$other$'::text as tag_reason,
+  coalesce(sum(index_size_bytes), 0)::int8 as index_size_bytes,
+  coalesce(sum(index_usage), 0)::int8 as index_usage,
+  '$other$'::text as index_definition,
+  '$other$'::text as tag_index_name,
+  '$other$'::text as tag_schema_name,
+  '$other$'::text as tag_table_name,
+  '$other$'::text as tag_relation_name,
+  coalesce(max(supports_fk), 0)::int as supports_fk,
+  '$other$'::text as redundant_to_json
+from ranked
+where num > 100
+group by ()
+having count(*) > 0;
 `
     },
     gauges: ["*"],
@@ -28059,93 +28164,139 @@ where datname = current_database()
     statement_timeout_seconds: 15
   },
   pg_table_bloat: {
-    description: "This metric analyzes estimated table bloat by calculating the estimated vs actual table pages and sizes. It provides insights into estimated bloat percentage, real size, extra size due to estimated bloat, and estimated bloat size considering fill factor. This metric helps administrators identify tables that may need maintenance like VACUUM FULL or table reorganization.",
+    description: "Estimated per-table bloat (heap pages allocated vs heap pages needed at perfect packing), bounded to the top 100 per database. Adapts the top-N + `'$other$'` bucket pattern from !262: everything below the cap is summed into a single `'$other$'` row so dashboard \"total bloat across the DB\" stays correct even when the tail is large. Ranks by `bloat_pct` descending (most-bloated tables first), with `is_na = 0` preferred (don't crowd top-N with tables whose estimate is unreliable) and stable schemaname/tblname tiebreakers. Preserves the existing >1 MiB filter (zero-byte and tiny tables aren't interesting for bloat). Aggregate semantics on the `'$other$'` row: sum for real_size_mib / extra_size / bloat_size (total wasted bytes in the tail); recompute extra_pct and bloat_pct from the summed numerator/denominator (weighted-avg effectively); avg(fillfactor); max(is_na) (any tail row with bad stats taints the aggregate). The `'$other$'` sentinel cannot collide with a real Postgres identifier.",
     sqls: {
-      11: `select current_database() as tag_datname, schemaname as tag_schemaname, tblname as tag_tblname, (bs*tblpages)/(1024*1024)::float as real_size_mib, /* pgwatch_generated */
-  (tblpages-est_tblpages)*bs as extra_size,
-  case when tblpages > 0 and tblpages - est_tblpages > 0
-    then 100 * (tblpages - est_tblpages)/tblpages::float
-    else 0
-  end as extra_pct, fillfactor,
-  case when tblpages - est_tblpages_ff > 0
-    then (tblpages-est_tblpages_ff)*bs
-    else 0
-  end as bloat_size,
-  case when tblpages > 0 and tblpages - est_tblpages_ff > 0
-    then 100 * (tblpages - est_tblpages_ff)/tblpages::float
-    else 0
-  end as bloat_pct, is_na
-  -- , tpl_hdr_size, tpl_data_size, (pst).free_percent + (pst).dead_tuple_percent as real_frag -- (DEBUG INFO)
-from (
-  select ceil( reltuples / ( (bs-page_hdr)/tpl_size ) ) + ceil( toasttuples / 4 ) as est_tblpages,
-    ceil( reltuples / ( (bs-page_hdr)*fillfactor/(tpl_size*100) ) ) + ceil( toasttuples / 4 ) as est_tblpages_ff,
-    tblpages, fillfactor, bs, tblid, schemaname, tblname, heappages, toastpages, is_na
-    -- , tpl_hdr_size, tpl_data_size, pgstattuple(tblid) as pst -- (DEBUG INFO)
+      11: `with bloat_data as ( /* pgwatch_generated */
+  select schemaname, tblname,
+    (bs*tblpages)/(1024*1024)::float as real_size_mib,
+    (tblpages-est_tblpages)*bs as extra_size,
+    case when tblpages > 0 and tblpages - est_tblpages > 0
+      then 100 * (tblpages - est_tblpages)/tblpages::float
+      else 0
+    end as extra_pct,
+    fillfactor,
+    case when tblpages - est_tblpages_ff > 0
+      then (tblpages-est_tblpages_ff)*bs
+      else 0
+    end as bloat_size,
+    case when tblpages > 0 and tblpages - est_tblpages_ff > 0
+      then 100 * (tblpages - est_tblpages_ff)/tblpages::float
+      else 0
+    end as bloat_pct,
+    is_na,
+    -- carried for the $other$ aggregate denominators
+    bs, tblpages, est_tblpages, est_tblpages_ff
   from (
-    select
-      ( 4 + tpl_hdr_size + tpl_data_size + (2*ma)
-        - case when tpl_hdr_size%ma = 0 then ma else tpl_hdr_size%ma end
-        - case when ceil(tpl_data_size)::int%ma = 0 then ma else ceil(tpl_data_size)::int%ma end
-      ) as tpl_size, bs - page_hdr as size_per_block, (heappages + toastpages) as tblpages, heappages,
-      toastpages, reltuples, toasttuples, bs, page_hdr, tblid, schemaname, tblname, fillfactor, is_na
-      -- , tpl_hdr_size, tpl_data_size
+    select ceil( reltuples / ( (bs-page_hdr)/tpl_size ) ) + ceil( toasttuples / 4 ) as est_tblpages,
+      ceil( reltuples / ( (bs-page_hdr)*fillfactor/(tpl_size*100) ) ) + ceil( toasttuples / 4 ) as est_tblpages_ff,
+      tblpages, fillfactor, bs, tblid, schemaname, tblname, heappages, toastpages, is_na
     from (
       select
-        tbl.oid as tblid, ns.nspname as schemaname, tbl.relname as tblname, tbl.reltuples,
-        tbl.relpages as heappages, coalesce(toast.relpages, 0) as toastpages,
-        coalesce(toast.reltuples, 0) as toasttuples,
-        coalesce(substring(
-          array_to_string(tbl.reloptions, ' ')
-          from 'fillfactor=([0-9]+)')::smallint, 100) as fillfactor,
-        current_setting('block_size')::numeric as bs,
-        case when version()~'mingw32' or version()~'64-bit|x86_64|ppc64|ia64|amd64' then 8 else 4 end as ma,
-        24 as page_hdr,
-        23 + case when max(coalesce(s.null_frac,0)) > 0 then ( 7 + count(s.attname) ) / 8 else 0::int end
-          + case when bool_or(att.attname = 'oid' and att.attnum < 0) then 4 else 0 end as tpl_hdr_size,
-        sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 0) ) as tpl_data_size,
-        (bool_or(att.atttypid = 'pg_catalog.name'::regtype)
-          or sum(case when att.attnum > 0 then 1 else 0 end) <> count(s.attname))::int as is_na
-      from pg_attribute as att
-        join pg_class as tbl on att.attrelid = tbl.oid
-        join pg_namespace as ns on ns.oid = tbl.relnamespace
-        left join postgres_ai.pg_statistic as s on s.schemaname=ns.nspname
-          and s.tablename = tbl.relname and s.inherited=false and s.attname=att.attname
-        left join pg_class as toast on tbl.reltoastrelid = toast.oid
-      where not att.attisdropped
-        and tbl.relkind in ('r','m')
-      group by 1,2,3,4,5,6,7,8,9,10
-      order by 2,3
-    ) as s
-  ) as s2
-) as s3
--- where not is_na
---   and tblpages*((pst).free_percent + (pst).dead_tuple_percent)::float4/100 >= 1
-where (bs * tblpages::float / (1024 * 1024)) > 1 /* exclude tables below 1 MiB */
-order by is_na = 0 desc, bloat_pct desc
-limit 1000
+        ( 4 + tpl_hdr_size + tpl_data_size + (2*ma)
+          - case when tpl_hdr_size%ma = 0 then ma else tpl_hdr_size%ma end
+          - case when ceil(tpl_data_size)::int%ma = 0 then ma else ceil(tpl_data_size)::int%ma end
+        ) as tpl_size, bs - page_hdr as size_per_block, (heappages + toastpages) as tblpages, heappages,
+        toastpages, reltuples, toasttuples, bs, page_hdr, tblid, schemaname, tblname, fillfactor, is_na
+      from (
+        select
+          tbl.oid as tblid, ns.nspname as schemaname, tbl.relname as tblname, tbl.reltuples,
+          tbl.relpages as heappages, coalesce(toast.relpages, 0) as toastpages,
+          coalesce(toast.reltuples, 0) as toasttuples,
+          coalesce(substring(
+            array_to_string(tbl.reloptions, ' ')
+            from 'fillfactor=([0-9]+)')::smallint, 100) as fillfactor,
+          current_setting('block_size')::numeric as bs,
+          case when version()~'mingw32' or version()~'64-bit|x86_64|ppc64|ia64|amd64' then 8 else 4 end as ma,
+          24 as page_hdr,
+          23 + case when max(coalesce(s.null_frac,0)) > 0 then ( 7 + count(s.attname) ) / 8 else 0::int end
+            + case when bool_or(att.attname = 'oid' and att.attnum < 0) then 4 else 0 end as tpl_hdr_size,
+          sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 0) ) as tpl_data_size,
+          (bool_or(att.atttypid = 'pg_catalog.name'::regtype)
+            or sum(case when att.attnum > 0 then 1 else 0 end) <> count(s.attname))::int as is_na
+        from pg_attribute as att
+          join pg_class as tbl on att.attrelid = tbl.oid
+          join pg_namespace as ns on ns.oid = tbl.relnamespace
+          left join postgres_ai.pg_statistic as s on s.schemaname=ns.nspname
+            and s.tablename = tbl.relname and s.inherited=false and s.attname=att.attname
+          left join pg_class as toast on tbl.reltoastrelid = toast.oid
+        where not att.attisdropped
+          and tbl.relkind in ('r','m')
+        group by 1,2,3,4,5,6,7,8,9,10
+        order by 2,3
+      ) as s
+    ) as s2
+  ) as s3
+  where (bs * tblpages::float / (1024 * 1024)) > 1 /* exclude tables below 1 MiB */
+),
+ranked as (
+  select
+    row_number() over (
+      order by is_na = 0 desc, bloat_pct desc nulls last,
+               schemaname, tblname
+    ) as rownum,
+    *
+  from bloat_data
+)
+select
+  current_database() as tag_datname,
+  schemaname as tag_schemaname,
+  tblname as tag_tblname,
+  real_size_mib,
+  extra_size,
+  extra_pct,
+  fillfactor,
+  bloat_size,
+  bloat_pct,
+  is_na
+from ranked
+where rownum <= 100
+union all
+select
+  current_database() as tag_datname,
+  '$other$'::text as tag_schemaname,
+  '$other$'::text as tag_tblname,
+  coalesce(sum(real_size_mib), 0)::float as real_size_mib,
+  coalesce(sum(extra_size), 0)::int8 as extra_size,
+  case when sum(tblpages) > 0
+    then 100 * sum(greatest(tblpages - est_tblpages, 0))::float / sum(tblpages)
+    else 0
+  end::float as extra_pct,
+  coalesce(avg(fillfactor), 100)::smallint as fillfactor,
+  coalesce(sum(bloat_size), 0)::int8 as bloat_size,
+  case when sum(tblpages) > 0
+    then 100 * sum(greatest(tblpages - est_tblpages_ff, 0))::float / sum(tblpages)
+    else 0
+  end::float as bloat_pct,
+  coalesce(max(is_na), 0)::int as is_na
+from ranked
+where rownum > 100
+group by ()
+having count(*) > 0
 `
     },
     gauges: ["real_size_mib", "extra_size", "extra_pct", "fillfactor", "bloat_size", "bloat_pct", "is_na", "reltuples"],
     statement_timeout_seconds: 300
   },
   pg_btree_bloat: {
-    description: "This metric analyzes estimated index bloat by calculating the estimated vs actual index pages and sizes. It provides insights into estimated bloat percentage, real size, extra size due to estimated bloat, and estimated bloat size considering fill factor. This metric helps administrators identify indexes that may need maintenance like VACUUM FULL or index reorganization.",
+    description: "Estimated per-btree-index bloat (index pages allocated vs index pages needed at perfect packing), bounded to the top 100 per database. Adapts the top-N + `'$other$'` bucket pattern from !262. Ranks by `bloat_pct` descending with `is_na = 0` preferred and stable schema/table/idx tiebreakers. Preserves the existing >1 MiB filter. Aggregate semantics on the `'$other$'` row: sum for real_size_mib / extra_size / bloat_size; recompute extra_pct and bloat_pct from sum(relpages-est_pages) / sum(relpages) (weighted avg over the tail); avg(fillfactor); max(is_na); table_size_mib doesn't aggregate meaningfully across indexes on different tables, so the `'$other$'` row reports 0. The `'$other$'` sentinel cannot collide with a real Postgres identifier.",
     sqls: {
-      11: `select /* pgwatch_generated */
-  current_database() as tag_datname, nspname as tag_schemaname, tblname as tag_tblname, idxname as tag_idxname,
-  (bs*(relpages)/(1024*1024))::float as real_size_mib,
-  (pg_relation_size(tbloid)/(1024*1024))::float as table_size_mib,
-  (bs*(relpages-est_pages))::float as extra_size,
-  100 * (relpages-est_pages)::float / relpages as extra_pct,
-  fillfactor,
-  case when relpages > est_pages_ff
-    then bs*(relpages-est_pages_ff)
-    else 0
-  end as bloat_size,
-  100 * (relpages-est_pages_ff)::float / relpages as bloat_pct,
-  is_na
-  -- , 100-(pst).avg_leaf_density as pst_avg_bloat, est_pages, index_tuple_hdr_bm, maxalign, pagehdr, nulldatawidth, nulldatahdrwidth, reltuples, relpages -- (DEBUG INFO)
-from (
+      11: `with bloat_data as ( /* pgwatch_generated */
+  select
+    nspname, tblname, idxname,
+    (bs*(relpages)/(1024*1024))::float as real_size_mib,
+    (pg_relation_size(tbloid)/(1024*1024))::float as table_size_mib,
+    (bs*(relpages-est_pages))::float as extra_size,
+    100 * (relpages-est_pages)::float / relpages as extra_pct,
+    fillfactor,
+    case when relpages > est_pages_ff
+      then bs*(relpages-est_pages_ff)
+      else 0
+    end as bloat_size,
+    100 * (relpages-est_pages_ff)::float / relpages as bloat_pct,
+    is_na,
+    -- carried for the $other$ aggregate denominators
+    bs, relpages, est_pages, est_pages_ff
+  from (
   select coalesce(1 +
         ceil(reltuples/floor((bs-pageopqdata-pagehdr)/(4+nulldatahdrwidth)::float)), 0 -- ItemIdData size + computed avg size of a tuple (nulldatahdrwidth)
       ) as est_pages,
@@ -28232,8 +28383,55 @@ from (
   ) as rows_hdr_pdg_stats
 ) as relation_stats
 where (bs * relpages::float / (1024 * 1024)) > 1 /* exclude indexes below 1 MiB */
-order by is_na = 0 desc, bloat_pct desc
-limit 1000
+),
+ranked as (
+  select
+    row_number() over (
+      order by is_na = 0 desc, bloat_pct desc nulls last,
+               nspname, tblname, idxname
+    ) as rownum,
+    *
+  from bloat_data
+)
+select
+  current_database() as tag_datname,
+  nspname as tag_schemaname,
+  tblname as tag_tblname,
+  idxname as tag_idxname,
+  real_size_mib,
+  table_size_mib,
+  extra_size,
+  extra_pct,
+  fillfactor,
+  bloat_size,
+  bloat_pct,
+  is_na
+from ranked
+where rownum <= 100
+union all
+select
+  current_database() as tag_datname,
+  '$other$'::text as tag_schemaname,
+  '$other$'::text as tag_tblname,
+  '$other$'::text as tag_idxname,
+  coalesce(sum(real_size_mib), 0)::float as real_size_mib,
+  0::float as table_size_mib,
+  coalesce(sum(extra_size), 0)::float as extra_size,
+  case when sum(relpages) > 0
+    then 100 * sum(greatest(relpages - est_pages, 0))::float / sum(relpages)
+    else 0
+  end::float as extra_pct,
+  coalesce(avg(fillfactor), 90)::smallint as fillfactor,
+  coalesce(sum(bloat_size), 0)::float as bloat_size,
+  case when sum(relpages) > 0
+    then 100 * sum(greatest(relpages - est_pages_ff, 0))::float / sum(relpages)
+    else 0
+  end::float as bloat_pct,
+  coalesce(max(is_na), 0)::int as is_na
+from ranked
+where rownum > 100
+group by ()
+having count(*) > 0
 `
     },
     gauges: ["real_size_mib", "table_size_mib", "extra_size", "extra_pct", "fillfactor", "bloat_size", "bloat_pct", "is_na", "reltuples"],
@@ -33353,6 +33551,35 @@ function stripMatchingQuotes(value) {
   }
   return trimmed;
 }
+var REQUIRED_ENV_KEYS = [
+  { key: "REPLICATOR_PASSWORD", defaultValue: () => crypto2.randomBytes(32).toString("hex"), introducedIn: "0.13" },
+  { key: "VM_AUTH_USERNAME", defaultValue: () => "vmauth", introducedIn: "0.15" },
+  { key: "VM_AUTH_PASSWORD", defaultValue: () => crypto2.randomBytes(18).toString("base64"), introducedIn: "0.15" }
+];
+function ensureRequiredEnvVars(projectDir) {
+  const envFile = path7.resolve(projectDir, ".env");
+  const existing = fs8.existsSync(envFile) ? fs8.readFileSync(envFile, "utf8") : "";
+  const added = [];
+  const appendLines = [];
+  for (const spec of REQUIRED_ENV_KEYS) {
+    const re = new RegExp(`^${spec.key}=`, "m");
+    if (!re.test(existing)) {
+      appendLines.push(`${spec.key}=${spec.defaultValue()}`);
+      added.push(spec.key);
+    }
+  }
+  if (appendLines.length === 0) {
+    return added;
+  }
+  const needsTrailingNewline = existing.length > 0 && !existing.endsWith(`
+`);
+  const newContent = existing + (needsTrailingNewline ? `
+` : "") + appendLines.join(`
+`) + `
+`;
+  fs8.writeFileSync(envFile, newContent, { encoding: "utf8", mode: 384 });
+  return added;
+}
 async function execFilePromise(file, args) {
   return new Promise((resolve8, reject) => {
     childProcess.execFile(file, args, (error2, stdout, stderr) => {
@@ -35627,29 +35854,61 @@ Instances configuration:
       console.log();
   }
 });
-mon.command("update-config").description("apply monitoring services configuration (generate sources)").action(async () => {
+mon.command("update-config").description("apply monitoring services configuration (generate sources, migrate .env)").action(async () => {
+  let projectDir;
+  try {
+    ({ projectDir } = await resolveOrInitPaths());
+  } catch (error2) {
+    const message = error2 instanceof Error ? error2.message : String(error2);
+    console.error(message);
+    process.exitCode = 1;
+    return;
+  }
+  const added = ensureRequiredEnvVars(projectDir);
+  if (added.length > 0) {
+    console.log(`Added missing .env keys for this stack version: ${added.join(", ")}`);
+    console.log(`(existing values were preserved; missing keys filled with safe defaults)
+`);
+  }
   const code = await runCompose(["run", "--rm", "sources-generator"]);
   if (code !== 0)
     process.exitCode = code;
 });
-mon.command("update").description("update monitoring stack").action(async () => {
+mon.command("update").description("update monitoring stack (migrate .env, pull images)").action(async () => {
   console.log(`Updating PostgresAI monitoring stack...
 `);
   try {
-    const gitDir = path7.resolve(process.cwd(), ".git");
-    if (!fs8.existsSync(gitDir)) {
-      console.error("Not a git repository. Cannot update.");
+    let projectDir;
+    try {
+      ({ projectDir } = await resolveOrInitPaths());
+    } catch (error2) {
+      const message = error2 instanceof Error ? error2.message : String(error2);
+      console.error(message);
       process.exitCode = 1;
       return;
     }
-    console.log("Fetching latest changes...");
-    await execFilePromise("git", ["fetch", "origin"]);
-    const { stdout: branch } = await execFilePromise("git", ["rev-parse", "--abbrev-ref", "HEAD"]);
-    const currentBranch = branch.trim();
-    console.log(`Current branch: ${currentBranch}`);
-    console.log("Pulling latest changes...");
-    const { stdout: pullOut } = await execFilePromise("git", ["pull", "origin", currentBranch]);
-    console.log(pullOut);
+    console.log("Checking .env for newly-required keys...");
+    const added = ensureRequiredEnvVars(projectDir);
+    if (added.length > 0) {
+      console.log(`\u2713 Added missing .env keys: ${added.join(", ")}`);
+      console.log("  (existing values preserved; missing keys filled with safe defaults)");
+    } else {
+      console.log("\u2713 .env is up to date");
+    }
+    console.log();
+    const gitDir = path7.resolve(projectDir, ".git");
+    if (fs8.existsSync(gitDir)) {
+      console.log("Fetching latest changes...");
+      await execFilePromise("git", ["fetch", "origin"]);
+      const { stdout: branch } = await execFilePromise("git", ["rev-parse", "--abbrev-ref", "HEAD"]);
+      const currentBranch = branch.trim();
+      console.log(`Current branch: ${currentBranch}`);
+      console.log("Pulling latest changes...");
+      const { stdout: pullOut } = await execFilePromise("git", ["pull", "origin", currentBranch]);
+      console.log(pullOut);
+    } else {
+      console.log("(not a git checkout \u2014 skipping git fetch/pull and going straight to image pull)");
+    }
     console.log(`
 Updating Docker images...`);
     const code = await runCompose(["pull"]);

package/lib/checkup-dictionary.ts CHANGED Viewed

@@ -57,17 +57,6 @@ export function getCheckupEntry(code: string): CheckupDictionaryEntry | null {
   return dictionaryByCode.get(code.toUpperCase()) ?? null;
 }
-/**
- * Get the title for a checkup code.
- *
- * @param code - The check code (e.g., "A001", "H002")
- * @returns The title or the code itself if not found
- */
-export function getCheckupTitle(code: string): string {
-  const entry = getCheckupEntry(code);
-  return entry?.title ?? code;
-}
 /**
  * Check if a code exists in the dictionary.
  *

package/lib/checkup.ts CHANGED Viewed

@@ -2,41 +2,41 @@
  * Express Checkup Module
  * ======================
  * Generates JSON health check reports directly from PostgreSQL without Prometheus.
- *
+ *
  * ARCHITECTURAL DECISIONS
  * -----------------------
- *
+ *
  * 1. SINGLE SOURCE OF TRUTH FOR SQL QUERIES
- *    Complex metrics (index health, settings, db_stats) are loaded from
+ *    Complex metrics (index health, settings, db_stats) are loaded from
  *    config/pgwatch-prometheus/metrics.yml via getMetricSql() from metrics-loader.ts.
- *
+ *
  *    Simple queries (version, database list, connection states, uptime) use
  *    inline SQL as they're trivial and CLI-specific.
- *
+ *
  * 2. JSON SCHEMA COMPLIANCE
  *    All generated reports MUST comply with JSON schemas in reporter/schemas/.
  *    These schemas define the expected format for both:
  *    - Full-fledged monitoring reporter output
  *    - Express checkup output
- *
+ *
  *    Before adding or modifying a report, verify the corresponding schema exists
  *    and ensure the output matches. Run schema validation tests to confirm.
- *
+ *
  * 3. ERROR HANDLING STRATEGY
  *    Functions follow two patterns based on criticality:
- *
+ *
  *    PROPAGATING (throws on error):
  *    - Core data functions: getPostgresVersion, getSettings, getAlteredSettings,
  *      getDatabaseSizes, getInvalidIndexes, getUnusedIndexes, getRedundantIndexes
  *    - If these fail, the entire report should fail (data is required)
  *    - Callers should handle errors at the report generation level
- *
+ *
  *    GRACEFUL DEGRADATION (catches errors, includes error in output):
  *    - Optional/supplementary queries: pg_stat_statements, pg_stat_kcache checks,
  *      memory calculations, postmaster startup time
  *    - These are nice-to-have; missing data shouldn't fail the whole report
  *    - Errors are logged and included in report output for visibility
- *
+ *
  * ADDING NEW REPORTS
  * ------------------
  * 1. Add/verify the metric exists in config/pgwatch-prometheus/metrics.yml
@@ -51,7 +51,7 @@ import * as fs from "fs";
 import * as path from "path";
 import * as pkg from "../package.json";
 import { getMetricSql, transformMetricRow, METRIC_NAMES } from "./metrics-loader";
-import { getCheckupTitle, buildCheckInfoMap } from "./checkup-dictionary";
+import { buildCheckInfoMap } from "./checkup-dictionary";
 // Time constants
 const SECONDS_PER_DAY = 86400;
@@ -336,7 +336,7 @@ export function parseVersionNum(versionNum: string): { major: string; minor: str
 /**
  * Format bytes to human readable string using binary units (1024-based).
  * Uses IEC standard: KiB, MiB, GiB, etc.
- *
+ *
  * Note: PostgreSQL's pg_size_pretty() uses kB/MB/GB with 1024 base (technically
  * incorrect SI usage), but we follow IEC binary units per project style guide.
  */
@@ -387,7 +387,7 @@ function formatSettingPrettyValue(
 /**
  * Get PostgreSQL version information.
  * Uses simple inline SQL (trivial query, CLI-specific).
- *
+ *
  * @throws {Error} If database query fails (propagating - critical data)
  */
 export async function getPostgresVersion(client: Client): Promise<PostgresVersion> {
@@ -1084,7 +1084,7 @@ export const generateH004 = (client: Client, nodeName = "node-01") =>
 /**
  * Generate D004 report - pg_stat_statements and pg_stat_kcache settings.
- *
+ *
  * Uses graceful degradation: extension queries are wrapped in try-catch
  * because extensions may not be installed. Errors are included in the
  * report output rather than failing the entire report.

package/lib/init.ts CHANGED Viewed

@@ -87,7 +87,7 @@ export type AdminConnection = {
 /**
  * Check if an error indicates SSL negotiation failed and fallback to non-SSL should be attempted.
  * This mimics libpq's sslmode=prefer behavior.
- *
+ *
  * IMPORTANT: This should NOT match certificate errors (expired, invalid, self-signed)
  * as those are real errors the user needs to fix, not negotiation failures.
  */

package/lib/metrics-loader.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 /**
  * Metrics loader for express checkup reports
- *
+ *
  * Loads SQL queries from embedded metrics data (generated from metrics.yml at build time).
  * Provides version-aware query selection and row transformation utilities.
  */
@@ -9,7 +9,7 @@ import { METRICS, MetricDefinition } from "./metrics-embedded";
 /**
  * Get SQL query for a specific metric, selecting the appropriate version.
- *
+ *
  * @param metricName - Name of the metric (e.g., "settings", "db_stats")
  * @param pgMajorVersion - PostgreSQL major version (default: 16)
  * @returns SQL query string
@@ -41,7 +41,7 @@ export function getMetricSql(metricName: string, pgMajorVersion: number = 16): s
 /**
  * Get metric definition including all metadata.
- *
+ *
  * @param metricName - Name of the metric
  * @returns MetricDefinition or undefined if not found
  */

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "postgresai",
-  "version": "0.15.0-rc.3",
+  "version": "0.15.0-rc.5",
   "description": "postgres_ai CLI",
   "license": "Apache-2.0",
   "private": false,

package/test/upgrade.test.ts CHANGED Viewed

@@ -420,3 +420,126 @@ describe("upgrade CLI commands", () => {
     expect(stdout).toMatch(/health/i);
   }, { timeout: TEST_TIMEOUT });
 });
+describe("in-place upgrade env migration (mon update / update-config)", () => {
+  /**
+   * Regression tests for the 0.14 -> 0.15 in-place upgrade gap (#203).
+   *
+   * Before this fix, a user who installed at 0.14 and ran the documented
+   * upgrade flow (`pgai mon update`) ended up with a .env file that lacked
+   * VM_AUTH_USERNAME / VM_AUTH_PASSWORD, so sink-prometheus exited with:
+   *
+   *   fatal cannot read "/postgres_ai_configs/prometheus/prometheus.yml":
+   *   cannot expand environment variables: missing "VM_AUTH_USERNAME" env var
+   *
+   * `mon update` and `mon update-config` now migrate .env additively before
+   * doing anything else.
+   */
+  let tempDir: string;
+  beforeAll(() => {
+    tempDir = fs.mkdtempSync(resolve(os.tmpdir(), "pgai-upgrade-env-migration-"));
+  });
+  afterAll(() => {
+    if (tempDir && fs.existsSync(tempDir)) {
+      fs.rmSync(tempDir, { recursive: true, force: true });
+    }
+  });
+  test("mon update-config appends missing VM_AUTH_USERNAME / VM_AUTH_PASSWORD to a 0.14-shaped .env", () => {
+    const testDir = resolve(tempDir, "update-config-0.14-env");
+    fs.mkdirSync(testDir, { recursive: true });
+    // 0.14-shaped .env: PGAI_TAG present, VM_AUTH_* absent.
+    fs.writeFileSync(resolve(testDir, ".env"), "PGAI_TAG=0.14.0\nGF_SECURITY_ADMIN_PASSWORD=user-set-grafana-pw\n");
+    fs.writeFileSync(resolve(testDir, "docker-compose.yml"), "version: '3'\nservices: {}\n");
+    fs.writeFileSync(resolve(testDir, "instances.yml"), "# instances\n");
+    // The compose run will fail (no Docker in CI), but env migration runs first.
+    runCliInDir(["mon", "update-config"], testDir, { PGAI_TAG: undefined });
+    const envContent = fs.readFileSync(resolve(testDir, ".env"), "utf8");
+    // Existing values must be preserved verbatim.
+    expect(envContent).toMatch(/^PGAI_TAG=0\.14\.0$/m);
+    expect(envContent).toMatch(/^GF_SECURITY_ADMIN_PASSWORD=user-set-grafana-pw$/m);
+    // New required keys must be appended (vmauth username + non-empty base64 password).
+    expect(envContent).toMatch(/^VM_AUTH_USERNAME=vmauth$/m);
+    expect(envContent).toMatch(/^VM_AUTH_PASSWORD=[A-Za-z0-9+/]+={0,2}$/m);
+    // REPLICATOR_PASSWORD was introduced earlier and is also part of the contract.
+    expect(envContent).toMatch(/^REPLICATOR_PASSWORD=[a-f0-9]{64}$/m);
+  }, { timeout: TEST_TIMEOUT });
+  test("mon update appends missing VM_AUTH_USERNAME / VM_AUTH_PASSWORD to a 0.14-shaped .env", () => {
+    const testDir = resolve(tempDir, "update-0.14-env");
+    fs.mkdirSync(testDir, { recursive: true });
+    fs.writeFileSync(resolve(testDir, ".env"), "PGAI_TAG=0.14.0\n");
+    fs.writeFileSync(resolve(testDir, "docker-compose.yml"), "version: '3'\nservices: {}\n");
+    fs.writeFileSync(resolve(testDir, "instances.yml"), "# instances\n");
+    // mon update will fail at the image pull (no Docker in CI); the missing git repo is merely skipped. Env migration runs first.
+    const result = runCliInDir(["mon", "update"], testDir, { PGAI_TAG: undefined });
+    const envContent = fs.readFileSync(resolve(testDir, ".env"), "utf8");
+    expect(envContent).toMatch(/^PGAI_TAG=0\.14\.0$/m);
+    expect(envContent).toMatch(/^VM_AUTH_USERNAME=vmauth$/m);
+    expect(envContent).toMatch(/^VM_AUTH_PASSWORD=[A-Za-z0-9+/]+={0,2}$/m);
+    // The migration step should print what it added so the user can see it.
+    expect(result.stdout).toMatch(/Added missing \.env keys/);
+    expect(result.stdout).toMatch(/VM_AUTH_USERNAME/);
+    expect(result.stdout).toMatch(/VM_AUTH_PASSWORD/);
+  }, { timeout: TEST_TIMEOUT });
+  test("mon update preserves existing VM_AUTH_* values (no rotation)", () => {
+    const testDir = resolve(tempDir, "update-preserve-vm-auth");
+    fs.mkdirSync(testDir, { recursive: true });
+    // User already has VM auth configured (e.g. set up via rotate-vm-auth.sh).
+    fs.writeFileSync(
+      resolve(testDir, ".env"),
+      "PGAI_TAG=0.15.0\nVM_AUTH_USERNAME=custom-user\nVM_AUTH_PASSWORD=custom-pw-do-not-rotate\nREPLICATOR_PASSWORD=" +
+        "a".repeat(64) +
+        "\n",
+    );
+    fs.writeFileSync(resolve(testDir, "docker-compose.yml"), "version: '3'\nservices: {}\n");
+    fs.writeFileSync(resolve(testDir, "instances.yml"), "# instances\n");
+    const result = runCliInDir(["mon", "update"], testDir, { PGAI_TAG: undefined });
+    const envContent = fs.readFileSync(resolve(testDir, ".env"), "utf8");
+    expect(envContent).toMatch(/^VM_AUTH_USERNAME=custom-user$/m);
+    expect(envContent).toMatch(/^VM_AUTH_PASSWORD=custom-pw-do-not-rotate$/m);
+    expect(envContent).toMatch(/^REPLICATOR_PASSWORD=a{64}$/m);
+    // When nothing is missing, the migration step should say so.
+    expect(result.stdout).toMatch(/\.env is up to date/);
+  }, { timeout: TEST_TIMEOUT });
+  test("mon update-config handles a .env that doesn't end with a newline", () => {
+    const testDir = resolve(tempDir, "update-config-no-trailing-newline");
+    fs.mkdirSync(testDir, { recursive: true });
+    // No trailing newline - migration must add one before appending new keys
+    // or we'd produce e.g. `PGAI_TAG=0.14.0VM_AUTH_USERNAME=vmauth`.
+    fs.writeFileSync(resolve(testDir, ".env"), "PGAI_TAG=0.14.0");
+    fs.writeFileSync(resolve(testDir, "docker-compose.yml"), "version: '3'\nservices: {}\n");
+    fs.writeFileSync(resolve(testDir, "instances.yml"), "# instances\n");
+    runCliInDir(["mon", "update-config"], testDir, { PGAI_TAG: undefined });
+    const envContent = fs.readFileSync(resolve(testDir, ".env"), "utf8");
+    expect(envContent).toMatch(/^PGAI_TAG=0\.14\.0$/m);
+    expect(envContent).toMatch(/^VM_AUTH_USERNAME=vmauth$/m);
+    // No key should be glued onto the previous line.
+    expect(envContent).not.toMatch(/PGAI_TAG=0\.14\.0VM_AUTH_USERNAME/);
+  }, { timeout: TEST_TIMEOUT });
+});