@synapta/skills 0.1.0 → 0.1.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (353) hide show
  1. package/dist/index.js +11 -4
  2. package/package.json +3 -4
  3. package/skills/ATTRIBUTION.md +80 -0
  4. package/skills/accessibility-audit/SKILL.md +325 -0
  5. package/skills/accessibility-audit/reference/wcag-checklist.md +103 -0
  6. package/skills/apns-notifier/SKILL.md +86 -0
  7. package/skills/approval-policy-enforcer/SKILL.md +66 -0
  8. package/skills/apps-sdk-builder/LICENSE.txt +201 -0
  9. package/skills/apps-sdk-builder/SKILL.md +328 -0
  10. package/skills/apps-sdk-builder/agents/openai.yaml +13 -0
  11. package/skills/apps-sdk-builder/references/app-archetypes.md +132 -0
  12. package/skills/apps-sdk-builder/references/apps-sdk-docs-workflow.md +135 -0
  13. package/skills/apps-sdk-builder/references/interactive-state-sync-patterns.md +113 -0
  14. package/skills/apps-sdk-builder/references/repo-contract-and-validation.md +93 -0
  15. package/skills/apps-sdk-builder/references/search-fetch-standard.md +67 -0
  16. package/skills/apps-sdk-builder/references/upstream-example-workflow.md +79 -0
  17. package/skills/apps-sdk-builder/references/window-openai-patterns.md +79 -0
  18. package/skills/apps-sdk-builder/scripts/scaffold_node_ext_apps.mjs +606 -0
  19. package/skills/architecture-selector/SKILL.md +64 -0
  20. package/skills/backlog-planner/SKILL.md +68 -0
  21. package/skills/carplay-entitlement-checker/SKILL.md +82 -0
  22. package/skills/concept-discovery/SKILL.md +517 -0
  23. package/skills/concept-discovery/assets/sample-analysis.json +81 -0
  24. package/skills/concept-discovery/expected_outputs/sample-enum-dictionary.md +25 -0
  25. package/skills/concept-discovery/expected_outputs/sample-page-user-list.md +83 -0
  26. package/skills/concept-discovery/expected_outputs/sample-prd-readme.md +43 -0
  27. package/skills/concept-discovery/references/framework-patterns.md +228 -0
  28. package/skills/concept-discovery/references/prd-quality-checklist.md +65 -0
  29. package/skills/concept-discovery/scripts/codebase_analyzer.py +732 -0
  30. package/skills/concept-discovery/scripts/prd_scaffolder.py +435 -0
  31. package/skills/dast-zap/SKILL.md +453 -0
  32. package/skills/dast-zap/assets/.gitkeep +9 -0
  33. package/skills/dast-zap/assets/github_action.yml +207 -0
  34. package/skills/dast-zap/assets/gitlab_ci.yml +226 -0
  35. package/skills/dast-zap/assets/zap_automation.yaml +196 -0
  36. package/skills/dast-zap/assets/zap_context.xml +192 -0
  37. package/skills/dast-zap/references/EXAMPLE.md +40 -0
  38. package/skills/dast-zap/references/api_testing_guide.md +475 -0
  39. package/skills/dast-zap/references/authentication_guide.md +431 -0
  40. package/skills/dast-zap/references/false_positive_handling.md +427 -0
  41. package/skills/dast-zap/references/owasp_mapping.md +255 -0
  42. package/skills/dep-sbom-scan/SKILL.md +466 -0
  43. package/skills/deploy-cloudflare/SKILL.md +930 -0
  44. package/skills/deploy-docker/SKILL.md +55 -0
  45. package/skills/deploy-fly/SKILL.md +228 -0
  46. package/skills/deploy-k8s/SKILL.md +108 -0
  47. package/skills/deploy-k8s/assets/logo.png +0 -0
  48. package/skills/deploy-k8s/docs/README.md +29 -0
  49. package/skills/deploy-k8s/docs/SUMMARY.md +56 -0
  50. package/skills/deploy-k8s/docs/advanced/token-efficiency.md +61 -0
  51. package/skills/deploy-k8s/docs/architecture/multi-tenancy.md +96 -0
  52. package/skills/deploy-k8s/docs/architecture/storage-and-state.md +102 -0
  53. package/skills/deploy-k8s/docs/architecture/workload-patterns.md +87 -0
  54. package/skills/deploy-k8s/docs/book.json +16 -0
  55. package/skills/deploy-k8s/docs/community/changelog.md +34 -0
  56. package/skills/deploy-k8s/docs/community/contributing.md +67 -0
  57. package/skills/deploy-k8s/docs/core-concepts/failure-modes.md +153 -0
  58. package/skills/deploy-k8s/docs/core-concepts/philosophy.md +83 -0
  59. package/skills/deploy-k8s/docs/core-concepts/workflow.md +124 -0
  60. package/skills/deploy-k8s/docs/examples/bad-patterns.md +47 -0
  61. package/skills/deploy-k8s/docs/examples/do-dont-checklist.md +37 -0
  62. package/skills/deploy-k8s/docs/examples/good-patterns.md +49 -0
  63. package/skills/deploy-k8s/docs/failure-modes/api-drift.md +104 -0
  64. package/skills/deploy-k8s/docs/failure-modes/fragile-rollouts.md +99 -0
  65. package/skills/deploy-k8s/docs/failure-modes/insecure-workload-defaults.md +80 -0
  66. package/skills/deploy-k8s/docs/failure-modes/network-exposure.md +98 -0
  67. package/skills/deploy-k8s/docs/failure-modes/privilege-sprawl.md +91 -0
  68. package/skills/deploy-k8s/docs/failure-modes/resource-starvation.md +85 -0
  69. package/skills/deploy-k8s/docs/getting-started/installation.md +152 -0
  70. package/skills/deploy-k8s/docs/getting-started/quick-start.md +115 -0
  71. package/skills/deploy-k8s/docs/guides/helm-patterns.md +71 -0
  72. package/skills/deploy-k8s/docs/guides/kustomize-patterns.md +65 -0
  73. package/skills/deploy-k8s/docs/guides/observability.md +67 -0
  74. package/skills/deploy-k8s/docs/guides/security-hardening.md +59 -0
  75. package/skills/deploy-k8s/docs/guides/validation-and-policy.md +66 -0
  76. package/skills/deploy-k8s/docs/integrations/mcp-integration.md +52 -0
  77. package/skills/deploy-k8s/docs/package-lock.json +2892 -0
  78. package/skills/deploy-k8s/docs/package.json +13 -0
  79. package/skills/deploy-k8s/references/api-drift.md +298 -0
  80. package/skills/deploy-k8s/references/conditional/aks-patterns.md +70 -0
  81. package/skills/deploy-k8s/references/conditional/eks-patterns.md +79 -0
  82. package/skills/deploy-k8s/references/conditional/gitops-controllers.md +71 -0
  83. package/skills/deploy-k8s/references/conditional/gke-patterns.md +74 -0
  84. package/skills/deploy-k8s/references/conditional/observability-stacks.md +80 -0
  85. package/skills/deploy-k8s/references/conditional/openshift-patterns.md +67 -0
  86. package/skills/deploy-k8s/references/daemonset-operator-patterns.md +155 -0
  87. package/skills/deploy-k8s/references/deployment-patterns.md +146 -0
  88. package/skills/deploy-k8s/references/do-dont-patterns.md +87 -0
  89. package/skills/deploy-k8s/references/examples-bad.md +282 -0
  90. package/skills/deploy-k8s/references/examples-good.md +440 -0
  91. package/skills/deploy-k8s/references/fragile-rollouts.md +303 -0
  92. package/skills/deploy-k8s/references/helm-patterns.md +203 -0
  93. package/skills/deploy-k8s/references/insecure-workload-defaults.md +300 -0
  94. package/skills/deploy-k8s/references/job-patterns.md +120 -0
  95. package/skills/deploy-k8s/references/kustomize-patterns.md +239 -0
  96. package/skills/deploy-k8s/references/multi-tenancy.md +343 -0
  97. package/skills/deploy-k8s/references/network-exposure.md +481 -0
  98. package/skills/deploy-k8s/references/observability.md +302 -0
  99. package/skills/deploy-k8s/references/privilege-sprawl.md +273 -0
  100. package/skills/deploy-k8s/references/resource-starvation.md +374 -0
  101. package/skills/deploy-k8s/references/security-hardening.md +209 -0
  102. package/skills/deploy-k8s/references/stateful-patterns.md +130 -0
  103. package/skills/deploy-k8s/references/storage-and-state.md +330 -0
  104. package/skills/deploy-k8s/references/validation-and-policy.md +242 -0
  105. package/skills/deploy-railway/SKILL.md +235 -0
  106. package/skills/deploy-railway/references/analyze-db-mongo.md +84 -0
  107. package/skills/deploy-railway/references/analyze-db-mysql.md +254 -0
  108. package/skills/deploy-railway/references/analyze-db-postgres.md +479 -0
  109. package/skills/deploy-railway/references/analyze-db-redis.md +208 -0
  110. package/skills/deploy-railway/references/analyze-db.md +344 -0
  111. package/skills/deploy-railway/references/configure.md +309 -0
  112. package/skills/deploy-railway/references/deploy.md +195 -0
  113. package/skills/deploy-railway/references/operate.md +214 -0
  114. package/skills/deploy-railway/references/request.md +248 -0
  115. package/skills/deploy-railway/references/setup.md +312 -0
  116. package/skills/deploy-railway/scripts/analyze-mongo.py +1549 -0
  117. package/skills/deploy-railway/scripts/analyze-mysql.py +1195 -0
  118. package/skills/deploy-railway/scripts/analyze-postgres.py +3058 -0
  119. package/skills/deploy-railway/scripts/analyze-redis.py +1090 -0
  120. package/skills/deploy-railway/scripts/dal.py +671 -0
  121. package/skills/deploy-railway/scripts/enable-pg-stats.py +170 -0
  122. package/skills/deploy-railway/scripts/pg-extensions.py +370 -0
  123. package/skills/deploy-railway/scripts/railway-api.sh +52 -0
  124. package/skills/deploy-ssh/SKILL.md +91 -0
  125. package/skills/deploy-vercel/SKILL.md +304 -0
  126. package/skills/deploy-vercel/resources/deploy-codex.sh +301 -0
  127. package/skills/deploy-vercel/resources/deploy.sh +301 -0
  128. package/skills/docs-runbooks/SKILL.md +399 -0
  129. package/skills/drive-status-renderer/SKILL.md +62 -0
  130. package/skills/iac-scan/SKILL.md +680 -0
  131. package/skills/iac-scan/assets/.gitkeep +9 -0
  132. package/skills/iac-scan/assets/checkov_config.yaml +94 -0
  133. package/skills/iac-scan/assets/github_actions.yml +199 -0
  134. package/skills/iac-scan/assets/gitlab_ci.yml +218 -0
  135. package/skills/iac-scan/assets/pre_commit_config.yaml +92 -0
  136. package/skills/iac-scan/references/EXAMPLE.md +40 -0
  137. package/skills/iac-scan/references/compliance_mapping.md +237 -0
  138. package/skills/iac-scan/references/custom_policies.md +460 -0
  139. package/skills/iac-scan/references/suppression_guide.md +431 -0
  140. package/skills/incident-briefing/SKILL.md +66 -0
  141. package/skills/incident-triage/SKILL.md +481 -0
  142. package/{LICENSE → skills/mcp-builder/LICENSE.txt} +15 -14
  143. package/skills/mcp-builder/SKILL.md +244 -0
  144. package/skills/mcp-builder/reference/evaluation.md +602 -0
  145. package/skills/mcp-builder/reference/mcp_best_practices.md +249 -0
  146. package/skills/mcp-builder/reference/node_mcp_server.md +970 -0
  147. package/skills/mcp-builder/reference/python_mcp_server.md +719 -0
  148. package/skills/mcp-builder/scripts/connections.py +151 -0
  149. package/skills/mcp-builder/scripts/evaluation.py +373 -0
  150. package/skills/mcp-builder/scripts/example_evaluation.xml +22 -0
  151. package/skills/mcp-builder/scripts/requirements.txt +2 -0
  152. package/skills/mobile-pairing/SKILL.md +52 -0
  153. package/skills/ops-sre/SKILL.md +297 -0
  154. package/skills/playwright-qa/LICENSE.txt +201 -0
  155. package/skills/playwright-qa/NOTICE.txt +14 -0
  156. package/skills/playwright-qa/SKILL.md +156 -0
  157. package/skills/playwright-qa/agents/openai.yaml +6 -0
  158. package/skills/playwright-qa/assets/playwright-small.svg +3 -0
  159. package/skills/playwright-qa/assets/playwright.png +0 -0
  160. package/skills/playwright-qa/references/cli.md +116 -0
  161. package/skills/playwright-qa/references/workflows.md +95 -0
  162. package/skills/playwright-qa/scripts/playwright_cli.sh +25 -0
  163. package/skills/release-publish/SKILL.md +85 -0
  164. package/skills/repo-bootstrap/SKILL.md +92 -0
  165. package/skills/repo-bootstrap/assets/example-workflows/validate-agents.yml +89 -0
  166. package/skills/repo-bootstrap/assets/root-thin.md +141 -0
  167. package/skills/repo-bootstrap/assets/root-verbose.md +149 -0
  168. package/skills/repo-bootstrap/assets/scoped/backend-go.md +107 -0
  169. package/skills/repo-bootstrap/assets/scoped/backend-php.md +94 -0
  170. package/skills/repo-bootstrap/assets/scoped/backend-python.md +84 -0
  171. package/skills/repo-bootstrap/assets/scoped/backend-typescript.md +89 -0
  172. package/skills/repo-bootstrap/assets/scoped/claude-code-skill.md +101 -0
  173. package/skills/repo-bootstrap/assets/scoped/cli.md +83 -0
  174. package/skills/repo-bootstrap/assets/scoped/concourse.md +196 -0
  175. package/skills/repo-bootstrap/assets/scoped/ddev.md +68 -0
  176. package/skills/repo-bootstrap/assets/scoped/docker.md +160 -0
  177. package/skills/repo-bootstrap/assets/scoped/documentation.md +98 -0
  178. package/skills/repo-bootstrap/assets/scoped/examples.md +96 -0
  179. package/skills/repo-bootstrap/assets/scoped/frontend-typescript.md +88 -0
  180. package/skills/repo-bootstrap/assets/scoped/github-actions.md +174 -0
  181. package/skills/repo-bootstrap/assets/scoped/gitlab-ci.md +174 -0
  182. package/skills/repo-bootstrap/assets/scoped/oro-bundle.md +209 -0
  183. package/skills/repo-bootstrap/assets/scoped/oro-project.md +170 -0
  184. package/skills/repo-bootstrap/assets/scoped/python-modern.md +170 -0
  185. package/skills/repo-bootstrap/assets/scoped/resources.md +96 -0
  186. package/skills/repo-bootstrap/assets/scoped/skill-repo.md +139 -0
  187. package/skills/repo-bootstrap/assets/scoped/symfony.md +168 -0
  188. package/skills/repo-bootstrap/assets/scoped/testing.md +87 -0
  189. package/skills/repo-bootstrap/assets/scoped/typo3-docs.md +103 -0
  190. package/skills/repo-bootstrap/assets/scoped/typo3-extension.md +133 -0
  191. package/skills/repo-bootstrap/assets/scoped/typo3-project.md +137 -0
  192. package/skills/repo-bootstrap/assets/scoped/typo3-testing.md +80 -0
  193. package/skills/repo-bootstrap/checkpoints.yaml +279 -0
  194. package/skills/repo-bootstrap/evals/evals.json +385 -0
  195. package/skills/repo-bootstrap/references/ai-contribution-guidelines.md +63 -0
  196. package/skills/repo-bootstrap/references/ai-tool-compatibility.md +223 -0
  197. package/skills/repo-bootstrap/references/directory-coverage.md +82 -0
  198. package/skills/repo-bootstrap/references/examples/coding-agent-cli/AGENTS.md +70 -0
  199. package/skills/repo-bootstrap/references/examples/coding-agent-cli/go.mod +3 -0
  200. package/skills/repo-bootstrap/references/examples/coding-agent-cli/scripts-AGENTS.md +389 -0
  201. package/skills/repo-bootstrap/references/examples/express-api-ts/.env.example +13 -0
  202. package/skills/repo-bootstrap/references/examples/express-api-ts/AGENTS.md +91 -0
  203. package/skills/repo-bootstrap/references/examples/express-api-ts/package.json +33 -0
  204. package/skills/repo-bootstrap/references/examples/express-api-ts/pnpm-lock.yaml +3 -0
  205. package/skills/repo-bootstrap/references/examples/express-api-ts/src/AGENTS.md +91 -0
  206. package/skills/repo-bootstrap/references/examples/express-api-ts/src/config.ts +28 -0
  207. package/skills/repo-bootstrap/references/examples/express-api-ts/src/controllers/userController.ts +74 -0
  208. package/skills/repo-bootstrap/references/examples/express-api-ts/src/index.ts +26 -0
  209. package/skills/repo-bootstrap/references/examples/express-api-ts/src/middleware/errorHandler.ts +45 -0
  210. package/skills/repo-bootstrap/references/examples/express-api-ts/src/middleware/requestLogger.ts +18 -0
  211. package/skills/repo-bootstrap/references/examples/express-api-ts/src/routes/health.ts +18 -0
  212. package/skills/repo-bootstrap/references/examples/express-api-ts/src/routes/users.ts +13 -0
  213. package/skills/repo-bootstrap/references/examples/express-api-ts/src/utils/errors.ts +40 -0
  214. package/skills/repo-bootstrap/references/examples/express-api-ts/src/utils/logger.ts +14 -0
  215. package/skills/repo-bootstrap/references/examples/express-api-ts/tsconfig.json +24 -0
  216. package/skills/repo-bootstrap/references/examples/fastapi-app/.env.example +19 -0
  217. package/skills/repo-bootstrap/references/examples/fastapi-app/AGENTS.md +92 -0
  218. package/skills/repo-bootstrap/references/examples/fastapi-app/pyproject.toml +88 -0
  219. package/skills/repo-bootstrap/references/examples/fastapi-app/src/AGENTS.md +85 -0
  220. package/skills/repo-bootstrap/references/examples/fastapi-app/src/__init__.py +3 -0
  221. package/skills/repo-bootstrap/references/examples/fastapi-app/src/config.py +49 -0
  222. package/skills/repo-bootstrap/references/examples/fastapi-app/src/main.py +66 -0
  223. package/skills/repo-bootstrap/references/examples/fastapi-app/src/models/__init__.py +13 -0
  224. package/skills/repo-bootstrap/references/examples/fastapi-app/src/models/item.py +43 -0
  225. package/skills/repo-bootstrap/references/examples/fastapi-app/src/models/user.py +40 -0
  226. package/skills/repo-bootstrap/references/examples/fastapi-app/src/routes/__init__.py +5 -0
  227. package/skills/repo-bootstrap/references/examples/fastapi-app/src/routes/health.py +20 -0
  228. package/skills/repo-bootstrap/references/examples/fastapi-app/src/routes/items.py +61 -0
  229. package/skills/repo-bootstrap/references/examples/fastapi-app/src/routes/users.py +55 -0
  230. package/skills/repo-bootstrap/references/examples/fastapi-app/src/services/__init__.py +6 -0
  231. package/skills/repo-bootstrap/references/examples/fastapi-app/src/services/item_service.py +77 -0
  232. package/skills/repo-bootstrap/references/examples/fastapi-app/src/services/user_service.py +69 -0
  233. package/skills/repo-bootstrap/references/examples/fastapi-app/uv.lock +4 -0
  234. package/skills/repo-bootstrap/references/examples/go-api-with-react-admin/.scopes +3 -0
  235. package/skills/repo-bootstrap/references/examples/go-api-with-react-admin/AGENTS.md +86 -0
  236. package/skills/repo-bootstrap/references/examples/go-api-with-react-admin/admin/package.json +20 -0
  237. package/skills/repo-bootstrap/references/examples/go-api-with-react-admin/admin/src/App.tsx +5 -0
  238. package/skills/repo-bootstrap/references/examples/go-api-with-react-admin/cmd/api/main.go +7 -0
  239. package/skills/repo-bootstrap/references/examples/go-api-with-react-admin/go.mod +2 -0
  240. package/skills/repo-bootstrap/references/examples/go-api-with-react-admin/main.go +7 -0
  241. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/.scopes +3 -0
  242. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/AGENTS.md +89 -0
  243. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/go.mod +2 -0
  244. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/internal/web/AGENTS.md +90 -0
  245. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/internal/web/package.json +17 -0
  246. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/internal/web/src/App.tsx +1 -0
  247. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/internal/web/src/Button.tsx +1 -0
  248. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/internal/web/src/Footer.tsx +1 -0
  249. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/internal/web/src/Header.tsx +1 -0
  250. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/internal/web/src/Sidebar.tsx +1 -0
  251. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/main.go +7 -0
  252. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/package-lock.json +0 -0
  253. package/skills/repo-bootstrap/references/examples/go-with-internal-web-tsx/package.json +12 -0
  254. package/skills/repo-bootstrap/references/examples/ldap-selfservice/AGENTS.md +70 -0
  255. package/skills/repo-bootstrap/references/examples/ldap-selfservice/go.mod +3 -0
  256. package/skills/repo-bootstrap/references/examples/ldap-selfservice/internal-AGENTS.md +371 -0
  257. package/skills/repo-bootstrap/references/examples/ldap-selfservice/internal-web-AGENTS.md +448 -0
  258. package/skills/repo-bootstrap/references/examples/php-with-frontend/.scopes +3 -0
  259. package/skills/repo-bootstrap/references/examples/php-with-frontend/AGENTS.md +91 -0
  260. package/skills/repo-bootstrap/references/examples/php-with-frontend/composer.json +8 -0
  261. package/skills/repo-bootstrap/references/examples/php-with-frontend/package.json +15 -0
  262. package/skills/repo-bootstrap/references/examples/php-with-frontend/pnpm-lock.yaml +0 -0
  263. package/skills/repo-bootstrap/references/examples/php-with-frontend/src/Controller.php +3 -0
  264. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/AGENTS.md +92 -0
  265. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/package.json +26 -0
  266. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/src/App.tsx +3 -0
  267. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/src/Button.tsx +10 -0
  268. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/src/Footer.tsx +9 -0
  269. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/src/Header.tsx +9 -0
  270. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/src/main.tsx +3 -0
  271. package/skills/repo-bootstrap/references/examples/php-with-frontend/web/tsconfig.json +13 -0
  272. package/skills/repo-bootstrap/references/examples/pnpm-workspace/AGENTS.md +75 -0
  273. package/skills/repo-bootstrap/references/examples/pnpm-workspace/package.json +7 -0
  274. package/skills/repo-bootstrap/references/examples/pnpm-workspace/packages/web/package.json +11 -0
  275. package/skills/repo-bootstrap/references/examples/pnpm-workspace/packages/web/src/index.ts +11 -0
  276. package/skills/repo-bootstrap/references/examples/pnpm-workspace/pnpm-lock.yaml +42 -0
  277. package/skills/repo-bootstrap/references/examples/pnpm-workspace/pnpm-workspace.yaml +2 -0
  278. package/skills/repo-bootstrap/references/examples/simple-ldap-go/AGENTS.md +70 -0
  279. package/skills/repo-bootstrap/references/examples/simple-ldap-go/examples-AGENTS.md +45 -0
  280. package/skills/repo-bootstrap/references/examples/simple-ldap-go/go.mod +3 -0
  281. package/skills/repo-bootstrap/references/examples/t3x-rte-ckeditor-image/AGENTS.md +70 -0
  282. package/skills/repo-bootstrap/references/examples/t3x-rte-ckeditor-image/Classes-AGENTS.md +392 -0
  283. package/skills/repo-bootstrap/references/examples/t3x-rte-ckeditor-image/composer.json +8 -0
  284. package/skills/repo-bootstrap/references/feedback-memory-schema.md +135 -0
  285. package/skills/repo-bootstrap/references/git-hooks-setup.md +79 -0
  286. package/skills/repo-bootstrap/references/output-structure.md +124 -0
  287. package/skills/repo-bootstrap/references/scripts-guide.md +175 -0
  288. package/skills/repo-bootstrap/references/verification-guide.md +137 -0
  289. package/skills/repo-bootstrap/scripts/analyze-git-history.sh +315 -0
  290. package/skills/repo-bootstrap/scripts/check-freshness.sh +230 -0
  291. package/skills/repo-bootstrap/scripts/detect-golden-samples.sh +161 -0
  292. package/skills/repo-bootstrap/scripts/detect-heuristics.sh +93 -0
  293. package/skills/repo-bootstrap/scripts/detect-project.sh +486 -0
  294. package/skills/repo-bootstrap/scripts/detect-scopes.sh +330 -0
  295. package/skills/repo-bootstrap/scripts/detect-utilities.sh +133 -0
  296. package/skills/repo-bootstrap/scripts/extract-adrs.sh +194 -0
  297. package/skills/repo-bootstrap/scripts/extract-agent-configs.sh +331 -0
  298. package/skills/repo-bootstrap/scripts/extract-architecture-rules.sh +522 -0
  299. package/skills/repo-bootstrap/scripts/extract-ci-commands.sh +385 -0
  300. package/skills/repo-bootstrap/scripts/extract-ci-rules.sh +384 -0
  301. package/skills/repo-bootstrap/scripts/extract-commands.sh +358 -0
  302. package/skills/repo-bootstrap/scripts/extract-documentation.sh +308 -0
  303. package/skills/repo-bootstrap/scripts/extract-github-rulesets.sh +96 -0
  304. package/skills/repo-bootstrap/scripts/extract-github-settings.sh +88 -0
  305. package/skills/repo-bootstrap/scripts/extract-ide-settings.sh +228 -0
  306. package/skills/repo-bootstrap/scripts/extract-platform-files.sh +290 -0
  307. package/skills/repo-bootstrap/scripts/extract-quality-configs.sh +442 -0
  308. package/skills/repo-bootstrap/scripts/generate-agents.sh +2424 -0
  309. package/skills/repo-bootstrap/scripts/generate-file-map.sh +153 -0
  310. package/skills/repo-bootstrap/scripts/lib/config-root.sh +211 -0
  311. package/skills/repo-bootstrap/scripts/lib/summary.sh +244 -0
  312. package/skills/repo-bootstrap/scripts/lib/template.sh +397 -0
  313. package/skills/repo-bootstrap/scripts/validate-structure.sh +324 -0
  314. package/skills/repo-bootstrap/scripts/verify-commands.sh +615 -0
  315. package/skills/repo-bootstrap/scripts/verify-content.sh +302 -0
  316. package/skills/schema-api-contracts/SKILL.md +56 -0
  317. package/skills/secret-hygiene/SKILL.md +511 -0
  318. package/skills/secret-hygiene/assets/.gitkeep +9 -0
  319. package/skills/secret-hygiene/assets/config-balanced.toml +81 -0
  320. package/skills/secret-hygiene/assets/config-custom.toml +178 -0
  321. package/skills/secret-hygiene/assets/config-strict.toml +48 -0
  322. package/skills/secret-hygiene/assets/github-action.yml +181 -0
  323. package/skills/secret-hygiene/assets/gitlab-ci.yml +257 -0
  324. package/skills/secret-hygiene/assets/precommit-config.yaml +70 -0
  325. package/skills/secret-hygiene/references/EXAMPLE.md +40 -0
  326. package/skills/secret-hygiene/references/compliance_mapping.md +538 -0
  327. package/skills/secret-hygiene/references/detection_rules.md +276 -0
  328. package/skills/secret-hygiene/references/false_positives.md +598 -0
  329. package/skills/secret-hygiene/references/remediation_guide.md +530 -0
  330. package/skills/stack-selector/SKILL.md +56 -0
  331. package/skills/telegram-control/SKILL.md +110 -0
  332. package/skills/telegram-control/references/architecture.md +184 -0
  333. package/skills/telegram-control/references/convex.md +173 -0
  334. package/skills/telegram-control/references/error_handling.md +212 -0
  335. package/skills/telegram-control/references/initial_setup.md +165 -0
  336. package/skills/telegram-control/references/telegram_api.md +156 -0
  337. package/skills/telegram-control/scripts/cancel_message.ts +53 -0
  338. package/skills/telegram-control/scripts/list_scheduled.ts +103 -0
  339. package/skills/telegram-control/scripts/logger.ts +121 -0
  340. package/skills/telegram-control/scripts/proxy-util.ts +11 -0
  341. package/skills/telegram-control/scripts/schedule_message.ts +216 -0
  342. package/skills/telegram-control/scripts/send_message.ts +115 -0
  343. package/skills/telegram-control/scripts/setup.ts +185 -0
  344. package/skills/telegram-control/scripts/types.ts +75 -0
  345. package/skills/telegram-control/scripts/view_history.ts +74 -0
  346. package/skills/test-strategy/SKILL.md +352 -0
  347. package/skills/threat-model/SKILL.md +303 -0
  348. package/skills/threat-model/examples/example-output.md +196 -0
  349. package/skills/threat-model/template.md +96 -0
  350. package/skills/ts-lint/SKILL.md +80 -0
  351. package/skills/ui-flow/SKILL.md +668 -0
  352. package/skills/voice-command-router/SKILL.md +51 -0
  353. package/skills/widget-live-activity-sync/SKILL.md +66 -0
@@ -0,0 +1,3058 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Complete database analysis for Railway deployments.
4
+
5
+ Produces a comprehensive report covering:
6
+ - Deployment status
7
+ - Resource overview (disk, connections)
8
+ - Memory configuration
9
+ - Cache efficiency (overall and per-table)
10
+ - Vacuum health
11
+ - Query performance (with --deep)
12
+ - Index health
13
+ - Recommendations
14
+
15
+ Usage:
16
+ analyze-postgres.py --service <name>
17
+ analyze-postgres.py --service <name> --deep
18
+ analyze-postgres.py --service <name> --json
19
+ """
20
+
21
+ import argparse
22
+ import base64
23
+ import json
24
+ import os
25
+ import subprocess
26
+ import sys
27
+ import re
28
+ from concurrent.futures import ThreadPoolExecutor, as_completed
29
+ from datetime import datetime, timezone
30
+ from typing import Dict, List, Optional, Any, Tuple
31
+ from dataclasses import dataclass, field, asdict
32
+
33
+ import dal
34
+ from dal import (
35
+ LOG_LINES_DEFAULT, ProgressTimer, RailwayContext,
36
+ _init_context, progress, run_railway_command, run_ssh_query, run_psql_query,
37
+ get_railway_status, get_deployment_status,
38
+ get_all_metrics_from_api, _analyze_window, _build_metrics_history,
39
+ get_recent_logs,
40
+ _trend_indicator,
41
+ )
42
+
43
+
44
@dataclass
class AnalysisResult:
    """Everything gathered for one database analysis run.

    Only ``service``, ``db_type`` and ``timestamp`` are required. Every other
    field starts out as ``None``, ``False``, ``"UNKNOWN"`` or an empty
    container and is filled in by whichever collection step produces it.
    """

    # --- Identity of the run -------------------------------------------
    service: str
    db_type: str
    timestamp: str
    deployment_status: str = "UNKNOWN"

    # --- Host-level resources ------------------------------------------
    disk_usage: Optional[Dict[str, Any]] = None
    cpu_memory: Optional[Dict[str, Any]] = None

    # --- Connections ---------------------------------------------------
    connections: Optional[Dict[str, Any]] = None
    connection_states: List[Dict[str, Any]] = field(default_factory=list)
    connections_by_app: List[Dict[str, Any]] = field(default_factory=list)
    connections_by_age: List[Dict[str, Any]] = field(default_factory=list)
    oldest_connection_sec: Optional[int] = None
    oldest_connections: List[Dict[str, Any]] = field(default_factory=list)

    # --- Memory settings, cache efficiency and sizes -------------------
    memory_config: Optional[Dict[str, Any]] = None
    cache_hit: Optional[Dict[str, Any]] = None
    cache_per_table: List[Dict[str, Any]] = field(default_factory=list)
    table_sizes: List[Dict[str, Any]] = field(default_factory=list)
    database_stats: Optional[Dict[str, Any]] = None
    size_breakdown: Optional[Dict[str, Any]] = None

    # --- Vacuum and transaction-id age ---------------------------------
    vacuum_health: List[Dict[str, Any]] = field(default_factory=list)
    xid_age: Optional[Dict[str, Any]] = None

    # --- Query activity and locking ------------------------------------
    pg_stat_statements_installed: bool = False
    top_queries: List[Dict[str, Any]] = field(default_factory=list)
    long_running_queries: List[Dict[str, Any]] = field(default_factory=list)
    idle_in_transaction: List[Dict[str, Any]] = field(default_factory=list)
    blocked_queries: List[Dict[str, Any]] = field(default_factory=list)
    locks: List[Dict[str, Any]] = field(default_factory=list)

    # --- Index and scan health -----------------------------------------
    unused_indexes: List[Dict[str, Any]] = field(default_factory=list)
    invalid_indexes: List[Dict[str, Any]] = field(default_factory=list)
    seq_scan_tables: List[Dict[str, Any]] = field(default_factory=list)

    # --- Replication, background writer, archiver, SSL, cluster --------
    replication: List[Dict[str, Any]] = field(default_factory=list)
    bgwriter: Optional[Dict[str, Any]] = None
    archiver: Optional[Dict[str, Any]] = None
    progress_vacuum: List[Dict[str, Any]] = field(default_factory=list)
    ssl_stats: Optional[Dict[str, Any]] = None
    ha_cluster: Optional[Dict[str, Any]] = None
    cluster_logs: List[Dict[str, Any]] = field(default_factory=list)

    # --- Logs and metrics ----------------------------------------------
    # Raw, unfiltered log lines kept for LLM analysis.
    recent_logs: List[str] = field(default_factory=list)
    # Legacy field: log lines already filtered down to errors.
    recent_errors: List[str] = field(default_factory=list)
    # Multi-window time series plus trend analysis for CPU, memory, disk
    # and network.
    metrics_history: Optional[Dict[str, Any]] = None

    # --- Bookkeeping ---------------------------------------------------
    # Status of each data source that was queried.
    collection_status: Dict[str, Dict[str, Any]] = field(default_factory=dict)
    errors: List[str] = field(default_factory=list)
    recommendations: List[Dict[str, str]] = field(default_factory=list)
89
+
90
+
91
+
92
def run_psql_query_safe(service: str, query: str, timeout: int = 60) -> Tuple[int, str, str]:
    """Execute *query* through psql on the service, shipping the SQL as base64.

    The SQL text is base64-encoded locally and decoded on the remote side, so
    none of its characters have to survive shell quoting.

    Args:
        service: Service name handed to ``run_ssh_query``.
        query: SQL text to execute.
        timeout: Timeout forwarded unchanged to ``run_ssh_query``.

    Returns:
        Whatever ``run_ssh_query`` returns for the assembled command.
    """
    payload = base64.b64encode(query.encode()).decode()
    pipeline = " | ".join((
        f"echo '{payload}'",
        "base64 -d",
        # stderr is discarded so psql warnings (for example a collation
        # version mismatch) do not pollute the captured output.
        "psql $DATABASE_URL -P pager=off -t -A 2>/dev/null",
    ))
    return run_ssh_query(service, pipeline, timeout)
98
+
99
+
100
def build_analysis_query() -> str:
    """Build the single SQL statement that returns all analysis data as JSON.

    The statement is one ``SELECT json_build_object(...)::text`` whose keys
    (``connections``, ``memory_config``, ``cache_hit``, ``top_queries``, ...)
    each hold the result of an independent sub-select. List-valued sections
    are wrapped in ``COALESCE(..., '[]'::json)`` so they are never null.

    Two details differ from a naive formulation:

    * ``system_bytes`` only sums ordinary tables (``relkind = 'r'``).
      ``pg_total_relation_size`` of a table already includes its indexes, so
      summing over every ``pg_class`` row would count catalog indexes twice.
    * ``age_days`` casts to ``numeric`` before ``ROUND(..., 1)``. Before
      PostgreSQL 14, ``EXTRACT`` returns double precision and there is no
      two-argument ``round`` for that type.

    NOTE(review): the ``top_queries`` section reads ``pg_stat_statements``
    unconditionally; presumably the statement fails when that extension is
    absent -- confirm the caller handles that case.

    Returns:
        The SQL text, starting and ending with a newline.
    """
    return """
    SELECT json_build_object(
        'connections', (
            SELECT json_build_object(
                'current', (SELECT count(*) FROM pg_stat_activity WHERE datname = current_database()),
                'max', (SELECT setting::int FROM pg_settings WHERE name = 'max_connections'),
                'reserved', (SELECT setting::int FROM pg_settings WHERE name = 'superuser_reserved_connections'),
                'active', (SELECT count(*) FROM pg_stat_activity WHERE datname = current_database() AND state = 'active'),
                'idle', (SELECT count(*) FROM pg_stat_activity WHERE datname = current_database() AND state = 'idle'),
                'idle_in_transaction', (SELECT count(*) FROM pg_stat_activity WHERE datname = current_database() AND state = 'idle in transaction')
            )
        ),
        'memory_config', (
            SELECT json_agg(json_build_object(
                'name', name,
                'setting', setting,
                'unit', unit
            ))
            FROM pg_settings
            WHERE name IN (
                'shared_buffers', 'effective_cache_size', 'work_mem', 'maintenance_work_mem',
                'wal_buffers', 'checkpoint_completion_target', 'min_wal_size', 'max_wal_size',
                'max_parallel_workers', 'max_parallel_workers_per_gather', 'random_page_cost',
                'default_statistics_target', 'synchronous_commit', 'max_connections',
                'autovacuum', 'autovacuum_vacuum_scale_factor', 'autovacuum_analyze_scale_factor',
                'track_activity_query_size', 'log_min_duration_statement',
                'idle_in_transaction_session_timeout', 'statement_timeout',
                'track_io_timing'
            )
        ),
        'cache_hit', (
            SELECT json_build_object(
                'table_hit_pct', ROUND(100.0 * sum(heap_blks_hit) / NULLIF(sum(heap_blks_hit) + sum(heap_blks_read), 0), 2),
                'index_hit_pct', ROUND(100.0 * sum(idx_blks_hit) / NULLIF(sum(idx_blks_hit) + sum(idx_blks_read), 0), 2)
            )
            FROM pg_statio_user_tables
        ),
        'database_stats', (
            SELECT json_build_object(
                'deadlocks', deadlocks,
                'temp_files', temp_files,
                'temp_bytes', temp_bytes,
                'stats_reset', COALESCE(stats_reset::text, 'never'),
                'blks_read', blks_read,
                'blks_hit', blks_hit,
                'tup_returned', tup_returned,
                'tup_fetched', tup_fetched,
                'tup_inserted', tup_inserted,
                'tup_updated', tup_updated,
                'tup_deleted', tup_deleted,
                'conflicts', conflicts,
                'checksum_failures', COALESCE(checksum_failures, 0)
            )
            FROM pg_stat_database
            WHERE datname = current_database()
        ),
        'cache_per_table', (
            SELECT COALESCE(json_agg(t ORDER BY t.disk_reads DESC), '[]'::json)
            FROM (
                SELECT
                    relname as table_name,
                    heap_blks_read as disk_reads,
                    heap_blks_hit as cache_hits,
                    ROUND(100.0 * heap_blks_hit / NULLIF(heap_blks_hit + heap_blks_read, 0), 2) as hit_pct
                FROM pg_statio_user_tables
                WHERE heap_blks_read > 10000
                ORDER BY heap_blks_read DESC LIMIT 1000
            ) t
        ),
        'table_sizes', (
            SELECT COALESCE(json_agg(t ORDER BY t.total_bytes DESC), '[]'::json)
            FROM (
                SELECT
                    schemaname as schema,
                    relname as table_name,
                    pg_size_pretty(pg_total_relation_size(relid)) as size,
                    pg_total_relation_size(relid) as total_bytes,
                    pg_table_size(relid) as data_bytes,
                    pg_indexes_size(relid) as index_bytes,
                    n_live_tup as row_count
                FROM pg_stat_user_tables
                ORDER BY pg_total_relation_size(relid) DESC LIMIT 1000
            ) t
        ),
        'size_breakdown', (
            SELECT json_build_object(
                'database_bytes', pg_database_size(current_database()),
                'wal_bytes', COALESCE((SELECT sum(size) FROM pg_ls_waldir()), 0),
                'user_tables_bytes', COALESCE((SELECT sum(pg_table_size(relid)) FROM pg_stat_user_tables), 0),
                'user_indexes_bytes', COALESCE((SELECT sum(pg_indexes_size(relid)) FROM pg_stat_user_tables), 0),
                'system_bytes', COALESCE((
                    SELECT sum(pg_total_relation_size(c.oid))
                    FROM pg_class c
                    JOIN pg_namespace n ON n.oid = c.relnamespace
                    WHERE n.nspname IN ('pg_catalog', 'information_schema') AND NOT c.relisshared
                        AND c.relkind = 'r'
                ), 0)
            )
        ),
        'vacuum_health', (
            SELECT COALESCE(json_agg(t ORDER BY t.dead_rows DESC), '[]'::json)
            FROM (
                SELECT
                    s.schemaname as schema,
                    s.relname as table_name,
                    n_live_tup as live_rows,
                    n_dead_tup as dead_rows,
                    CASE WHEN n_live_tup > 0 THEN ROUND(100.0 * n_dead_tup / n_live_tup, 2) ELSE 0 END as dead_pct,
                    vacuum_count,
                    autovacuum_count,
                    COALESCE(last_vacuum::text, 'never') as last_vacuum,
                    COALESCE(last_autovacuum::text, 'never') as last_autovacuum,
                    COALESCE(last_analyze::text, 'never') as last_analyze,
                    age(c.relfrozenxid) as xid_age,
                    CASE WHEN n_dead_tup > 1000 AND (n_live_tup = 0 OR n_dead_tup::float / NULLIF(n_live_tup, 0) > 0.1) THEN true ELSE false END as needs_vacuum,
                    CASE WHEN age(c.relfrozenxid) > 150000000 THEN true ELSE false END as needs_freeze
                FROM pg_stat_user_tables s
                JOIN pg_class c ON c.oid = s.relid
                WHERE n_dead_tup > 100
                ORDER BY n_dead_tup DESC LIMIT 1000
            ) t
        ),
        'xid_age', (
            SELECT json_build_object(
                'value', age(datfrozenxid)
            )
            FROM pg_database WHERE datname = current_database()
        ),
        'unused_indexes', (
            SELECT COALESCE(json_agg(t ORDER BY t.size_bytes DESC), '[]'::json)
            FROM (
                SELECT
                    s.schemaname as schema,
                    s.relname as table_name,
                    s.indexrelname as index_name,
                    pg_size_pretty(pg_relation_size(s.indexrelid)) as size,
                    pg_relation_size(s.indexrelid) as size_bytes,
                    s.idx_scan as scans,
                    t.seq_scan as table_seq_scans,
                    t.idx_scan as table_idx_scans,
                    t.n_live_tup as table_rows,
                    i.indisprimary as is_primary,
                    i.indisunique as is_unique,
                    CASE WHEN t.seq_scan > 0 AND s.idx_scan = 0 AND t.n_live_tup > 1000
                        THEN t.seq_scan ELSE 0 END as missing_index_score
                FROM pg_stat_user_indexes s
                JOIN pg_stat_user_tables t ON s.relid = t.relid
                JOIN pg_index i ON s.indexrelid = i.indexrelid
                WHERE s.idx_scan = 0 AND pg_relation_size(s.indexrelid) > 8192
                ORDER BY pg_relation_size(s.indexrelid) DESC LIMIT 1000
            ) t
        ),
        'connection_states', (
            SELECT COALESCE(json_agg(t ORDER BY t.count DESC), '[]'::json)
            FROM (
                SELECT state, count(*) as count
                FROM pg_stat_activity
                WHERE datname = current_database()
                GROUP BY state
                ORDER BY count DESC
            ) t
        ),
        'connections_by_app', (
            SELECT COALESCE(json_agg(t ORDER BY t.count DESC), '[]'::json)
            FROM (
                SELECT COALESCE(application_name, '') as app, COUNT(*) as count
                FROM pg_stat_activity
                WHERE datname = current_database()
                GROUP BY application_name
                ORDER BY count DESC LIMIT 100
            ) t
        ),
        'connections_by_age', (
            SELECT COALESCE(json_agg(t), '[]'::json)
            FROM (
                SELECT
                    CASE
                        WHEN age_seconds < 60 THEN '< 1 min'
                        WHEN age_seconds < 300 THEN '1-5 min'
                        WHEN age_seconds < 3600 THEN '5-60 min'
                        WHEN age_seconds < 86400 THEN '1-24 hr'
                        ELSE '> 24 hr'
                    END as range,
                    count(*) as count
                FROM (
                    SELECT EXTRACT(EPOCH FROM (now() - backend_start)) as age_seconds
                    FROM pg_stat_activity WHERE datname = current_database()
                ) sub
                GROUP BY 1
                ORDER BY MIN(age_seconds)
            ) t
        ),
        'oldest_connection_sec', (
            SELECT COALESCE(MAX(EXTRACT(EPOCH FROM (now() - backend_start)))::int, 0)
            FROM pg_stat_activity
            WHERE datname = current_database()
        ),
        'oldest_connections', (
            SELECT COALESCE(json_agg(t), '[]'::json)
            FROM (
                SELECT
                    COALESCE(application_name, '') as application_name,
                    state,
                    LEFT(query, 100) as query_preview,
                    ROUND(EXTRACT(EPOCH FROM (now() - backend_start)) / 3600)::int as age_hours,
                    ROUND((EXTRACT(EPOCH FROM (now() - backend_start)) / 86400)::numeric, 1) as age_days,
                    client_addr::text,
                    wait_event_type,
                    wait_event
                FROM pg_stat_activity
                WHERE datname = current_database()
                    AND EXTRACT(EPOCH FROM (now() - backend_start)) > 86400
                ORDER BY backend_start ASC
                LIMIT 5
            ) t
        ),
        'seq_scan_tables', (
            SELECT COALESCE(json_agg(t ORDER BY t.seq_scans DESC), '[]'::json)
            FROM (
                SELECT
                    relname as table_name,
                    seq_scan as seq_scans,
                    idx_scan as idx_scans,
                    n_live_tup as rows
                FROM pg_stat_user_tables
                WHERE seq_scan > 100 AND n_live_tup > 1000
                ORDER BY seq_scan DESC LIMIT 100
            ) t
        ),
        'pg_stat_statements_installed', (
            SELECT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements')
        ),
        'top_queries', (
            SELECT COALESCE(json_agg(t), '[]'::json)
            FROM (
                SELECT
                    query,
                    calls,
                    ROUND(total_exec_time::numeric/1000/60, 1) as total_min,
                    ROUND(mean_exec_time::numeric, 1) as mean_ms,
                    ROUND(min_exec_time::numeric, 1) as min_ms,
                    ROUND(max_exec_time::numeric, 1) as max_ms,
                    ROUND(stddev_exec_time::numeric, 1) as stddev_ms,
                    rows,
                    CASE WHEN calls > 0 THEN ROUND(rows::numeric / calls, 2) ELSE 0 END as rows_per_call,
                    total_exec_time,
                    ROUND(total_plan_time::numeric, 1) as total_plan_ms,
                    ROUND(mean_plan_time::numeric, 1) as mean_plan_ms,
                    shared_blks_hit,
                    shared_blks_read,
                    shared_blks_dirtied,
                    shared_blks_written,
                    ROUND(100.0 * shared_blks_hit / NULLIF(shared_blks_hit + shared_blks_read, 0), 2) as cache_hit_pct,
                    local_blks_hit,
                    local_blks_read,
                    temp_blks_read,
                    temp_blks_written,
                    ROUND(blk_read_time::numeric, 1) as blk_read_time_ms,
                    ROUND(blk_write_time::numeric, 1) as blk_write_time_ms,
                    wal_records,
                    wal_bytes
                FROM pg_stat_statements s
                JOIN pg_database d ON s.dbid = d.oid
                WHERE d.datname = current_database()
                ORDER BY total_exec_time DESC LIMIT 100
            ) t
        ),
        'long_running_queries', (
            SELECT COALESCE(json_agg(t ORDER BY t.duration_sec DESC), '[]'::json)
            FROM (
                SELECT
                    pid,
                    EXTRACT(EPOCH FROM (now() - query_start))::int as duration_sec,
                    query
                FROM pg_stat_activity
                WHERE state = 'active'
                    AND now() - query_start > interval '5 seconds'
                ORDER BY query_start LIMIT 100
            ) t
        ),
        'idle_in_transaction', (
            SELECT COALESCE(json_agg(t ORDER BY t.idle_sec DESC), '[]'::json)
            FROM (
                SELECT
                    pid,
                    EXTRACT(EPOCH FROM (now() - state_change))::int as idle_sec,
                    COALESCE(usename, '') as username,
                    COALESCE(application_name, '') as app,
                    query as last_query
                FROM pg_stat_activity
                WHERE state = 'idle in transaction'
                    AND now() - state_change > interval '30 seconds'
                ORDER BY state_change LIMIT 100
            ) t
        ),
        'blocked_queries', (
            SELECT COALESCE(json_agg(t ORDER BY t.wait_sec DESC), '[]'::json)
            FROM (
                SELECT
                    blocked.pid,
                    EXTRACT(EPOCH FROM (now() - blocked.query_start))::int as wait_sec,
                    COALESCE(blocked.usename, '') as username,
                    COALESCE(blocking.pid::text, '') as blocking_pid,
                    left(blocked.query, 60) as query
                FROM pg_stat_activity blocked
                JOIN pg_locks blocked_locks ON blocked.pid = blocked_locks.pid
                JOIN pg_locks blocking_locks ON blocked_locks.locktype = blocking_locks.locktype
                    AND blocked_locks.database IS NOT DISTINCT FROM blocking_locks.database
                    AND blocked_locks.relation IS NOT DISTINCT FROM blocking_locks.relation
                    AND blocked_locks.page IS NOT DISTINCT FROM blocking_locks.page
                    AND blocked_locks.tuple IS NOT DISTINCT FROM blocking_locks.tuple
                    AND blocked_locks.virtualxid IS NOT DISTINCT FROM blocking_locks.virtualxid
                    AND blocked_locks.transactionid IS NOT DISTINCT FROM blocking_locks.transactionid
                    AND blocked_locks.classid IS NOT DISTINCT FROM blocking_locks.classid
                    AND blocked_locks.objid IS NOT DISTINCT FROM blocking_locks.objid
                    AND blocked_locks.objsubid IS NOT DISTINCT FROM blocking_locks.objsubid
                    AND blocked_locks.pid != blocking_locks.pid
                JOIN pg_stat_activity blocking ON blocking_locks.pid = blocking.pid
                WHERE NOT blocked_locks.granted
                ORDER BY blocked.query_start LIMIT 100
            ) t
        ),
        'locks', (
            SELECT COALESCE(json_agg(t), '[]'::json)
            FROM (
                SELECT
                    l.locktype,
                    l.mode,
                    COALESCE(a.usename, '') as username,
                    COALESCE(a.application_name, '') as app,
                    left(COALESCE(a.query, ''), 50) as query
                FROM pg_locks l
                JOIN pg_stat_activity a ON l.pid = a.pid
                WHERE a.datname = current_database() AND NOT l.granted
                LIMIT 100
            ) t
        ),
        'replication', (
            SELECT COALESCE(json_agg(t), '[]'::json)
            FROM (
                SELECT
                    COALESCE(client_addr::text, 'local') as client,
                    state,
                    sent_lsn::text as sent_lsn,
                    replay_lsn::text as replay_lsn
                FROM pg_stat_replication
            ) t
        ),
        'bgwriter', (
            SELECT json_build_object(
                'checkpoints_timed', checkpoints_timed,
                'checkpoints_req', checkpoints_req,
                'buffers_checkpoint', buffers_checkpoint,
                'buffers_clean', buffers_clean,
                'buffers_backend', buffers_backend,
                'buffers_backend_fsync', buffers_backend_fsync,
                'maxwritten_clean', maxwritten_clean,
                'stats_reset', COALESCE(stats_reset::text, 'never')
            )
            FROM pg_stat_bgwriter
        ),
        'invalid_indexes', (
            SELECT COALESCE(json_agg(json_build_object(
                'schema', n.nspname,
                'table', c.relname,
                'index', i.relname
            )), '[]'::json)
            FROM pg_index x
            JOIN pg_class c ON c.oid = x.indrelid
            JOIN pg_class i ON i.oid = x.indexrelid
            JOIN pg_namespace n ON n.oid = c.relnamespace
            WHERE NOT x.indisvalid
        ),
        'archiver', (
            SELECT json_build_object(
                'archived_count', archived_count,
                'failed_count', failed_count,
                'last_archived_wal', last_archived_wal,
                'last_archived_time', COALESCE(last_archived_time::text, 'never'),
                'last_failed_wal', last_failed_wal,
                'last_failed_time', COALESCE(last_failed_time::text, 'never'),
                'stats_reset', COALESCE(stats_reset::text, 'never')
            )
            FROM pg_stat_archiver
        ),
        'progress_vacuum', (
            SELECT COALESCE(json_agg(json_build_object(
                'pid', p.pid,
                'datname', d.datname,
                'relname', c.relname,
                'phase', p.phase,
                'heap_blks_total', p.heap_blks_total,
                'heap_blks_scanned', p.heap_blks_scanned,
                'heap_blks_vacuumed', p.heap_blks_vacuumed,
                'index_vacuum_count', p.index_vacuum_count,
                'max_dead_tuples', p.max_dead_tuples,
                'num_dead_tuples', p.num_dead_tuples
            )), '[]'::json)
            FROM pg_stat_progress_vacuum p
            JOIN pg_database d ON p.datid = d.oid
            LEFT JOIN pg_class c ON p.relid = c.oid
        ),
        'ssl_stats', (
            SELECT json_build_object(
                'ssl_connections', (SELECT count(*) FROM pg_stat_ssl WHERE ssl = true),
                'non_ssl_connections', (SELECT count(*) FROM pg_stat_ssl WHERE ssl = false),
                'ssl_versions', (
                    SELECT COALESCE(json_agg(json_build_object('version', version, 'count', cnt)), '[]'::json)
                    FROM (SELECT version, count(*) as cnt FROM pg_stat_ssl WHERE ssl = true GROUP BY version) v
                )
            )
        )
    )::text;
    """
515
+
516
+
517
# pg_settings names whose value is a float with no unit.
_FLOAT_SETTINGS = (
    "random_page_cost",
    "checkpoint_completion_target",
    "autovacuum_vacuum_scale_factor",
    "autovacuum_analyze_scale_factor",
)

# pg_settings names whose value is kept verbatim as an on/off style string.
_ONOFF_SETTINGS = ("synchronous_commit", "autovacuum", "track_io_timing")


def _int_setting(setting: Any) -> int:
    """Return *setting* as an int, or 0 when it is not a plain run of digits."""
    text = str(setting)
    return int(text) if text.isdigit() else 0


def _value_or(mapping: Dict[str, Any], key: str, default: Any) -> Any:
    """Return mapping[key], falling back to *default* when absent or null."""
    value = mapping.get(key)
    return default if value is None else value


def _rows(data: Dict[str, Any], key: str) -> List[Dict[str, Any]]:
    """Return the list stored under *key*; a missing key or null yields []."""
    return data.get(key) or []


def _text(row: Dict[str, Any], key: str) -> str:
    """Return row[key] rendered with str(); an absent key renders as "0"."""
    return str(row.get(key, 0))


def _memory_entry(name: str, setting: Any, unit: Optional[str]) -> Dict[str, Any]:
    """Normalize one pg_settings row into the shape stored in memory_config."""
    if unit == "8kB":
        # Value is a count of 8 kB pages; also expose it in megabytes.
        pages = _int_setting(setting)
        return {"value": pages, "unit": unit, "mb": round(pages * 8 / 1024, 1)}
    if unit == "kB":
        kilobytes = _int_setting(setting)
        return {"value": kilobytes, "unit": unit, "mb": round(kilobytes / 1024, 1)}
    if unit == "MB":
        megabytes = _int_setting(setting)
        return {"value": megabytes, "unit": unit, "mb": megabytes}
    if unit in ("ms", "s", "min"):
        # Time-based settings keep the raw setting alongside the unit.
        return {"value": setting, "unit": unit}
    if name in _FLOAT_SETTINGS:
        return {"value": float(setting) if setting else 0}
    if name in _ONOFF_SETTINGS:
        return {"value": setting}
    # Integer settings (max_connections, max_parallel_workers, ...); anything
    # that is not a plain integer is passed through unchanged.
    return {"value": int(setting) if str(setting).isdigit() else setting}


def parse_batched_analysis(data: Dict[str, Any], result: "AnalysisResult") -> None:
    """Copy the batched JSON analysis payload onto *result*.

    Each top-level key of *data* corresponds to one section of the analysis
    query. Object-valued sections are only assigned when present and
    non-empty; list-valued sections are always assigned, and a missing key or
    a JSON null is treated as an empty list instead of raising. Most numeric
    columns inside list rows are stored as strings.

    Args:
        data: Decoded JSON object produced by the analysis query.
        result: Object whose attributes are overwritten in place.

    Returns:
        None. *result* is mutated.
    """
    # Connections
    conn = data.get("connections")
    if conn:
        # Null counts fall back to the same defaults used for absent keys so
        # the arithmetic below cannot fail on None.
        current = _value_or(conn, "current", 0)
        max_conn = _value_or(conn, "max", 1)
        reserved = _value_or(conn, "reserved", 3)
        result.connections = {
            "current": current,
            "max": max_conn,
            "reserved": reserved,
            "available": max_conn - current - reserved,
            "percent": round(current / max_conn * 100, 1) if max_conn > 0 else 0,
            "active": conn.get("active", 0),
            "idle": conn.get("idle", 0),
            "idle_in_transaction": conn.get("idle_in_transaction", 0),
        }

    # Memory config (expanded for tuning analysis)
    mem_config = data.get("memory_config")
    if mem_config:
        result.memory_config = {
            row["name"]: _memory_entry(row["name"], row.get("setting"), row.get("unit"))
            for row in mem_config
        }

    # Cache hit
    cache = data.get("cache_hit")
    if cache:
        result.cache_hit = {
            "table_hit_pct": cache.get("table_hit_pct"),
            "index_hit_pct": cache.get("index_hit_pct"),
        }

    # Database stats
    db_stats = data.get("database_stats")
    if db_stats:
        result.database_stats = {
            "deadlocks": db_stats.get("deadlocks", 0),
            "temp_files": db_stats.get("temp_files", 0),
            "temp_bytes": db_stats.get("temp_bytes", 0),
            "stats_reset": db_stats.get("stats_reset", "unknown"),
            "blks_read": db_stats.get("blks_read", 0),
            "blks_hit": db_stats.get("blks_hit", 0),
            "tup_returned": db_stats.get("tup_returned", 0),
            "tup_fetched": db_stats.get("tup_fetched", 0),
            "tup_inserted": db_stats.get("tup_inserted", 0),
            "tup_updated": db_stats.get("tup_updated", 0),
            "tup_deleted": db_stats.get("tup_deleted", 0),
            "conflicts": db_stats.get("conflicts", 0),
            "checksum_failures": db_stats.get("checksum_failures", 0),
        }

    # Cache per table
    result.cache_per_table = [
        {
            "table": t.get("table_name"),
            "disk_reads": _text(t, "disk_reads"),
            "cache_hits": _text(t, "cache_hits"),
            "hit_pct": _text(t, "hit_pct"),
        }
        for t in _rows(data, "cache_per_table")
    ]

    # Table sizes
    result.table_sizes = [
        {
            "schema": t.get("schema"),
            "table": t.get("table_name"),
            "size": t.get("size"),
            "total_bytes": _text(t, "total_bytes"),
            "data_bytes": _text(t, "data_bytes"),
            "index_bytes": _text(t, "index_bytes"),
            "row_count": _text(t, "row_count"),
        }
        for t in _rows(data, "table_sizes")
    ]

    # Size breakdown
    size = data.get("size_breakdown")
    if size:
        result.size_breakdown = {
            "database_bytes": size.get("database_bytes", 0),
            "wal_bytes": size.get("wal_bytes", 0),
            "user_tables_bytes": size.get("user_tables_bytes", 0),
            "user_indexes_bytes": size.get("user_indexes_bytes", 0),
            "system_bytes": size.get("system_bytes", 0),
        }

    # Vacuum health
    result.vacuum_health = [
        {
            "schema": t.get("schema"),
            "table": t.get("table_name"),
            "live_rows": _text(t, "live_rows"),
            "dead_rows": _text(t, "dead_rows"),
            "dead_pct": _text(t, "dead_pct"),
            "vacuum_count": _text(t, "vacuum_count"),
            "autovacuum_count": _text(t, "autovacuum_count"),
            "last_vacuum": t.get("last_vacuum", "never"),
            "last_autovacuum": t.get("last_autovacuum", "never"),
            "last_analyze": t.get("last_analyze", "never"),
            "xid_age": _text(t, "xid_age"),
            "needs_vacuum": "true" if t.get("needs_vacuum") else "false",
            "needs_freeze": "true" if t.get("needs_freeze") else "false",
        }
        for t in _rows(data, "vacuum_health")
    ]

    # XID age
    xid = data.get("xid_age")
    if xid and xid.get("value") is not None:
        xid_val = xid["value"]
        result.xid_age = {
            "value": xid_val,
            "millions": round(xid_val / 1_000_000, 1),
            # Percentage of the 2^31 - 1 transaction-id range already used.
            "pct_to_wraparound": round(xid_val / 2_147_483_647 * 100, 2),
        }

    # Unused indexes
    result.unused_indexes = [
        {
            "schema": t.get("schema"),
            "table": t.get("table_name"),
            "index": t.get("index_name"),
            "size": t.get("size"),
            "size_bytes": _text(t, "size_bytes"),
            "scans": _text(t, "scans"),
            "table_seq_scans": _text(t, "table_seq_scans"),
            "table_idx_scans": _text(t, "table_idx_scans"),
            "table_rows": _text(t, "table_rows"),
            "is_primary": t.get("is_primary", False),
            "is_unique": t.get("is_unique", False),
            "missing_index_score": _text(t, "missing_index_score"),
        }
        for t in _rows(data, "unused_indexes")
    ]

    # Connection states
    result.connection_states = [
        {"state": t.get("state"), "count": _text(t, "count")}
        for t in _rows(data, "connection_states")
    ]

    # Connections by app
    result.connections_by_app = [
        {"app": t.get("app", ""), "count": _text(t, "count")}
        for t in _rows(data, "connections_by_app")
    ]

    # Connections by age
    result.connections_by_age = [
        {"range": t.get("range"), "count": _text(t, "count")}
        for t in _rows(data, "connections_by_age")
    ]

    # Oldest connection
    oldest = data.get("oldest_connection_sec")
    if oldest is not None:
        result.oldest_connection_sec = oldest

    # Details of old connections (>24 hours)
    result.oldest_connections = [
        {
            "application_name": c.get("application_name", ""),
            "state": c.get("state"),
            "query_preview": c.get("query_preview"),
            "age_hours": c.get("age_hours"),
            "age_days": c.get("age_days"),
            "client_addr": c.get("client_addr"),
            "wait_event_type": c.get("wait_event_type"),
            "wait_event": c.get("wait_event"),
        }
        for c in _rows(data, "oldest_connections")
    ]

    # Seq scan tables
    result.seq_scan_tables = [
        {
            "table": t.get("table_name"),
            "seq_scans": _text(t, "seq_scans"),
            "idx_scans": _text(t, "idx_scans"),
            "rows": _text(t, "rows"),
        }
        for t in _rows(data, "seq_scan_tables")
    ]

    # Top queries
    result.top_queries = [
        {
            "query": t.get("query"),
            "calls": _text(t, "calls"),
            "total_min": _text(t, "total_min"),
            "mean_ms": _text(t, "mean_ms"),
            "min_ms": _text(t, "min_ms"),
            "max_ms": _text(t, "max_ms"),
            "stddev_ms": _text(t, "stddev_ms"),
            "rows": _text(t, "rows"),
            "rows_per_call": _text(t, "rows_per_call"),
            "total_plan_ms": _text(t, "total_plan_ms"),
            "mean_plan_ms": _text(t, "mean_plan_ms"),
            "shared_blks_hit": t.get("shared_blks_hit", 0),
            "shared_blks_read": t.get("shared_blks_read", 0),
            "shared_blks_dirtied": t.get("shared_blks_dirtied", 0),
            "shared_blks_written": t.get("shared_blks_written", 0),
            "cache_hit_pct": t.get("cache_hit_pct"),
            "local_blks_hit": t.get("local_blks_hit", 0),
            "local_blks_read": t.get("local_blks_read", 0),
            "temp_blks_read": t.get("temp_blks_read", 0),
            "temp_blks_written": t.get("temp_blks_written", 0),
            "blk_read_time_ms": _text(t, "blk_read_time_ms"),
            "blk_write_time_ms": _text(t, "blk_write_time_ms"),
            "wal_records": t.get("wal_records", 0),
            "wal_bytes": t.get("wal_bytes", 0),
        }
        for t in _rows(data, "top_queries")
    ]

    # Long running queries
    result.long_running_queries = [
        {
            "pid": str(t.get("pid")),
            "duration_sec": _text(t, "duration_sec"),
            "query": t.get("query"),
        }
        for t in _rows(data, "long_running_queries")
    ]

    # Idle in transaction
    result.idle_in_transaction = [
        {
            "pid": str(t.get("pid")),
            "idle_sec": _text(t, "idle_sec"),
            "user": t.get("username", ""),
            "app": t.get("app", ""),
            "last_query": t.get("last_query"),
        }
        for t in _rows(data, "idle_in_transaction")
    ]

    # Blocked queries
    result.blocked_queries = [
        {
            "pid": str(t.get("pid")),
            "wait_sec": _text(t, "wait_sec"),
            "user": t.get("username", ""),
            "blocking_pid": t.get("blocking_pid", ""),
            "query": t.get("query"),
        }
        for t in _rows(data, "blocked_queries")
    ]

    # Locks
    result.locks = [
        {
            "locktype": t.get("locktype"),
            "mode": t.get("mode"),
            "user": t.get("username", ""),
            "app": t.get("app", ""),
            "query": t.get("query"),
        }
        for t in _rows(data, "locks")
    ]

    # Replication
    result.replication = [
        {
            "client": t.get("client"),
            "state": t.get("state"),
            "sent_lsn": t.get("sent_lsn"),
            "replay_lsn": t.get("replay_lsn"),
        }
        for t in _rows(data, "replication")
    ]

    # pg_stat_statements installed flag (null is treated as "not installed")
    result.pg_stat_statements_installed = bool(data.get("pg_stat_statements_installed", False))

    # Background writer stats
    bgwriter = data.get("bgwriter")
    if bgwriter:
        result.bgwriter = {
            "checkpoints_timed": bgwriter.get("checkpoints_timed", 0),
            "checkpoints_req": bgwriter.get("checkpoints_req", 0),
            "buffers_checkpoint": bgwriter.get("buffers_checkpoint", 0),
            "buffers_clean": bgwriter.get("buffers_clean", 0),
            "buffers_backend": bgwriter.get("buffers_backend", 0),
            "buffers_backend_fsync": bgwriter.get("buffers_backend_fsync", 0),
            "maxwritten_clean": bgwriter.get("maxwritten_clean", 0),
            "stats_reset": bgwriter.get("stats_reset", "never"),
        }

    # Invalid indexes
    result.invalid_indexes = [
        {
            "schema": t.get("schema"),
            "table": t.get("table"),
            "index": t.get("index"),
        }
        for t in _rows(data, "invalid_indexes")
    ]

    # WAL archiver stats
    archiver = data.get("archiver")
    if archiver:
        result.archiver = {
            "archived_count": archiver.get("archived_count", 0),
            "failed_count": archiver.get("failed_count", 0),
            "last_archived_wal": archiver.get("last_archived_wal"),
            "last_archived_time": archiver.get("last_archived_time", "never"),
            "last_failed_wal": archiver.get("last_failed_wal"),
            "last_failed_time": archiver.get("last_failed_time", "never"),
            "stats_reset": archiver.get("stats_reset", "never"),
        }

    # Vacuum progress
    result.progress_vacuum = [
        {
            "pid": t.get("pid"),
            "datname": t.get("datname"),
            "relname": t.get("relname"),
            "phase": t.get("phase"),
            "heap_blks_total": t.get("heap_blks_total", 0),
            "heap_blks_scanned": t.get("heap_blks_scanned", 0),
            "heap_blks_vacuumed": t.get("heap_blks_vacuumed", 0),
            "index_vacuum_count": t.get("index_vacuum_count", 0),
            "max_dead_tuples": t.get("max_dead_tuples", 0),
            "num_dead_tuples": t.get("num_dead_tuples", 0),
        }
        for t in _rows(data, "progress_vacuum")
    ]

    # SSL connection stats
    ssl = data.get("ssl_stats")
    if ssl:
        result.ssl_stats = {
            "ssl_connections": ssl.get("ssl_connections", 0),
            "non_ssl_connections": ssl.get("non_ssl_connections", 0),
            "ssl_versions": ssl.get("ssl_versions", []),
        }
900
+
901
+
902
def parse_psql_output(output: str, columns: List[str]) -> List[Dict[str, str]]:
    """Turn pipe-separated ``psql -t -A`` output into a list of row dicts.

    Args:
        output: Raw text, one row per line with fields separated by ``|``.
        columns: Column names, in the order the fields appear on each line.

    Returns:
        One dict per accepted line, mapping column name to the stripped field
        value. Empty lines, lines starting with ``(`` and lines whose field
        count differs from ``len(columns)`` are dropped.
    """
    parsed: List[Dict[str, str]] = []
    for raw_line in output.strip().split("\n"):
        # Nothing to parse on an empty line or a line opening with "(".
        if raw_line == "" or raw_line.startswith("("):
            continue
        cells = raw_line.split("|")
        # A different field count means the line cannot be mapped to columns.
        if len(cells) != len(columns):
            continue
        parsed.append({name: cell.strip() for name, cell in zip(columns, cells)})
    return parsed
912
+
913
+
914
def get_disk_usage_from_api(environment_id: str, service_id: str) -> Optional[Dict[str, Any]]:
    """Get the latest disk usage sample from the Railway metrics API.

    Runs the ``railway-api.sh`` helper that sits next to this file with a
    GraphQL ``metrics`` query for the ``DISK_USAGE_GB`` measurement, starting
    one hour before now (UTC).

    Args:
        environment_id: Value sent as the ``environmentId`` query variable.
        service_id: Value sent as the ``serviceId`` query variable.

    Returns:
        ``{"used_gb": <float rounded to 2 places>, "used": "<x.x> GB"}`` built
        from the last sample of the first ``DISK_USAGE_GB`` series that has a
        numeric value, or ``None`` when the helper script is missing, cannot be
        executed, exits non-zero, times out, prints invalid JSON, or the
        response holds no usable sample.
    """
    import os
    from datetime import timedelta

    # Build the API query
    start_date = (datetime.now(timezone.utc) - timedelta(hours=1)).isoformat()

    # Use railway-api.sh script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    api_script = os.path.join(script_dir, "railway-api.sh")

    if not os.path.exists(api_script):
        return None

    query = '''query metrics($environmentId: String!, $serviceId: String, $startDate: DateTime!, $measurements: [MetricMeasurement!]!) {
        metrics(environmentId: $environmentId, serviceId: $serviceId, startDate: $startDate, measurements: $measurements) {
            measurement values { ts value }
        }
    }'''

    variables = json.dumps({
        "environmentId": environment_id,
        "serviceId": service_id,
        "startDate": start_date,
        "measurements": ["DISK_USAGE_GB"]
    })

    try:
        result = subprocess.run(
            [api_script, query, variables],
            capture_output=True,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return None
        data = json.loads(result.stdout)
    except (subprocess.TimeoutExpired, json.JSONDecodeError, OSError):
        # OSError covers a missing script (FileNotFoundError) as well as one
        # that exists but cannot be executed (PermissionError).
        return None

    if not isinstance(data, dict):
        return None

    # A GraphQL error response carries "data": null, so every level is
    # normalised with "or" instead of relying on .get() defaults.
    metrics = (data.get("data") or {}).get("metrics") or []

    for metric in metrics:
        if not isinstance(metric, dict) or metric.get("measurement") != "DISK_USAGE_GB":
            continue
        values = metric.get("values") or []
        if not values:
            continue
        # Get latest value; a sample without a "value" key counts as 0.
        latest = (values[-1] or {}).get("value", 0)
        if not isinstance(latest, (int, float)):
            # A null or non-numeric sample cannot be rounded or formatted.
            continue
        return {
            "used_gb": round(latest, 2),
            "used": f"{latest:.1f} GB",
        }

    return None
969
+
970
+
971
def get_disk_usage(service: str, environment_id: Optional[str] = None, service_id: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Report disk usage, preferring the metrics API over an SSH ``df`` call.

    Args:
        service: Service name handed to ``run_ssh_query`` for the SSH fallback.
        environment_id: Optional environment ID; the API is only tried when
            both this and ``service_id`` are truthy.
        service_id: Optional service ID, see ``environment_id``.

    Returns:
        The truthy API result when there is one. Otherwise a dict with
        ``total``, ``used``, ``available`` and ``use_percent`` taken from the
        first non-header ``df`` line that has at least five fields, or ``None``
        when the SSH command fails, prints nothing, or no such line exists.
    """
    # Preferred source: Railway API.
    if environment_id and service_id:
        from_api = get_disk_usage_from_api(environment_id, service_id)
        if from_api:
            return from_api

    # Fallback source: df over SSH.
    df_command = "df -h /var/lib/postgresql/data 2>/dev/null || df -h / | tail -1"
    exit_code, df_output, _ = run_ssh_query(service, df_command)
    if exit_code != 0 or not df_output:
        return None

    # df columns: Filesystem Size Used Avail Use% Mounted
    for row in df_output.strip().split("\n"):
        if not row or row.startswith("Filesystem"):
            continue
        fields = row.split()
        if len(fields) < 5:
            continue
        return {
            "total": fields[1],
            "used": fields[2],
            "available": fields[3],
            "use_percent": fields[4].rstrip("%"),
        }
    return None
998
+
999
+
1000
def get_cpu_memory_from_api(environment_id: str, service_id: str) -> Optional[Dict[str, Any]]:
    """Return the ``cpu_memory`` section of the combined metrics result.

    DEPRECATED: Use get_all_metrics_from_api() instead for combined disk/cpu/memory.

    Returns:
        The value stored under ``"cpu_memory"`` in the result of
        ``get_all_metrics_from_api``, or ``None`` when that result is falsy.
    """
    combined = get_all_metrics_from_api(environment_id, service_id)
    return combined.get("cpu_memory") if combined else None
1009
+
1010
+
1011
def get_recent_errors(service: str, limit: int = 10) -> List[str]:
    """Get recent error logs (legacy - kept for backwards compat).

    Runs ``railway logs`` for ``service`` with the ``@level:error`` filter,
    requesting 100 lines.

    Args:
        service: Service name passed to ``--service``.
        limit: Maximum number of log lines to return.

    Returns:
        Up to ``limit`` non-blank, stripped lines in the order the command
        printed them, or an empty list when the command exits non-zero.
    """
    code, stdout, _stderr = run_railway_command(
        ["logs", "--service", service, "--lines", "100", "--filter", "@level:error"],
        timeout=30
    )
    if code != 0:
        return []

    # Drop blank lines BEFORE applying the limit so they do not use up slots
    # and leave the caller with fewer than ``limit`` real entries.
    errors = [line.strip() for line in stdout.strip().split("\n") if line.strip()]
    return errors[:limit]
1025
+
1026
+
1027
def get_cluster_logs(
    ha_cluster: Optional[Dict[str, Any]],
    environment_id: Optional[str],
    limit: int = 100
) -> List[Dict[str, Any]]:
    """Get logs from all HA cluster members via Railway API.

    For HA clusters, each member may be a separate deployment. This function
    lists the deployments of the environment through the ``railway-api.sh``
    helper next to this file, then fetches logs for each selected deployment.

    Args:
        ha_cluster: Cluster description; only its ``"members"`` list (dicts
            with a ``"name"``) is read.
        environment_id: Environment whose deployments are listed.
        limit: Passed as the ``limit`` query variable; also caps the number of
            log entries kept per deployment (the last ``limit`` entries).

    Returns:
        One dict per deployment that returned logs, with the keys ``member``
        (lower-cased service name), ``deployment_id``, ``status`` and ``logs``.
        Empty when inputs are missing, the helper script is absent, or the
        deployment listing fails. If a later call times out, cannot run or
        prints invalid JSON, the entries collected so far are returned.
    """
    if not ha_cluster or not environment_id:
        return []

    members = ha_cluster.get("members", [])
    if not members:
        return []

    import os
    script_dir = os.path.dirname(os.path.abspath(__file__))
    api_script = os.path.join(script_dir, "railway-api.sh")

    if not os.path.exists(api_script):
        return []

    cluster_logs: List[Dict[str, Any]] = []

    # Query to get deployments for the environment
    deployment_query = '''query deployments($environmentId: String!) {
        deployments(input: { environmentId: $environmentId }) {
            edges { node { id status staticUrl service { id name } } }
        }
    }'''

    # Query to fetch logs for one deployment (identical on every iteration)
    log_query = '''query deploymentLogs($deploymentId: String!, $limit: Int) {
        deploymentLogs(deploymentId: $deploymentId, limit: $limit) {
            timestamp message severity
        }
    }'''

    # Missing or empty member names are skipped: an empty string is a substring
    # of every service name and would mark every deployment as a member.
    member_names = {
        str(member.get("name")).lower()
        for member in members
        if isinstance(member, dict) and member.get("name")
    }

    try:
        result = subprocess.run(
            [api_script, deployment_query, json.dumps({"environmentId": environment_id})],
            capture_output=True,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return []

        data = json.loads(result.stdout)
        if not isinstance(data, dict):
            return []

        # GraphQL error responses carry null at any of these levels.
        deployments = ((data.get("data") or {}).get("deployments") or {}).get("edges") or []

        for edge in deployments:
            deployment = (edge or {}).get("node") or {}
            deployment_id = deployment.get("id")
            service_name = ((deployment.get("service") or {}).get("name") or "").lower()
            status = deployment.get("status")

            # Check if this deployment corresponds to a cluster member
            # (substring match in either direction; never for an empty name).
            is_member = bool(service_name) and any(
                member_name in service_name or service_name in member_name
                for member_name in member_names
            )

            # NOTE(review): this keeps every member deployment AND every
            # non-member deployment whose status is "SUCCESS". If only
            # successful member deployments are wanted the condition should
            # use "or" - confirm the intent before changing it.
            if not is_member and status != "SUCCESS":
                continue

            if not deployment_id:
                continue

            # Fetch logs for this deployment
            log_result = subprocess.run(
                [api_script, log_query, json.dumps({
                    "deploymentId": deployment_id,
                    "limit": limit
                })],
                capture_output=True,
                text=True,
                timeout=30
            )

            if log_result.returncode == 0:
                log_data = json.loads(log_result.stdout)
                log_root = log_data.get("data") if isinstance(log_data, dict) else None
                logs = (log_root or {}).get("deploymentLogs") or []
                if logs:
                    cluster_logs.append({
                        "member": service_name,
                        "deployment_id": deployment_id,
                        "status": status,
                        "logs": logs[-limit:],  # Last N logs
                    })

    except (subprocess.TimeoutExpired, json.JSONDecodeError, OSError):
        # Best effort: keep whatever was collected before the failure.
        # OSError includes FileNotFoundError and PermissionError.
        pass

    return cluster_logs
1126
+
1127
+
1128
def is_postgres_ha_service(service_id: Optional[str]) -> bool:
    """Check if service is from postgres-ha template.

    Queries the service's source repo through the ``railway-api.sh`` helper
    next to this file.

    Args:
        service_id: Service ID to look up; a falsy value short-circuits to
            ``False`` without running the helper.

    Returns:
        True if the service source repo contains 'postgres-ha'
        (case-insensitive), indicating this is part of an HA cluster that uses
        Patroni. False when the ID or helper script is missing, the helper
        fails, times out, cannot be executed or prints invalid JSON, or the
        response has no repo.
    """
    if not service_id:
        return False

    # Imported locally like the sibling API helpers do, so this function does
    # not depend on a module-level ``import os`` being present.
    import os

    script_dir = os.path.dirname(os.path.abspath(__file__))
    api_script = os.path.join(script_dir, "railway-api.sh")

    if not os.path.exists(api_script):
        return False

    query = '''query service($id: String!) {
        service(id: $id) {
            source { repo }
        }
    }'''

    try:
        result = subprocess.run(
            [api_script, query, json.dumps({"id": service_id})],
            capture_output=True,
            text=True,
            timeout=10
        )
        if result.returncode != 0:
            return False
        data = json.loads(result.stdout)
    except (subprocess.TimeoutExpired, json.JSONDecodeError, OSError):
        # OSError includes FileNotFoundError and PermissionError.
        return False

    if not isinstance(data, dict):
        return False

    # "data", "service" and "source" may each be null in the response, so
    # every level is normalised before the next lookup.
    source = ((data.get("data") or {}).get("service") or {}).get("source") or {}
    repo = source.get("repo")
    return isinstance(repo, str) and "postgres-ha" in repo.lower()
1164
+
1165
+
1166
def analyze_postgres(service: str, timeout: int = 300, quiet: bool = False,
                     skip_logs: bool = False,
                     metrics_hours: int = 168,
                     project_id: Optional[str] = None,
                     environment_id: Optional[str] = None,
                     service_id: Optional[str] = None) -> AnalysisResult:
    """Run complete Postgres analysis with maximum data collection.

    Uses a single batched SQL query to collect all database metrics,
    minimizing SSH connections (~3 total instead of ~22). The metrics API
    call, the database query, the log fetch and the Patroni cluster check run
    concurrently in a thread pool; each source records its outcome in
    ``result.collection_status``.

    Args:
        service: Service name used for SSH, CLI and log calls.
        timeout: Timeout in seconds for the batched database query.
        quiet: Suppress progress output on stderr.
        skip_logs: Skip log fetching for faster analysis (~60s saved)
        metrics_hours: Hours of metrics history to fetch (default: 168, max: 168)
        project_id: Project ID (bypasses railway link config)
        environment_id: Environment ID (bypasses railway link config)
        service_id: Service ID (bypasses railway link config)

    Returns:
        The populated ``AnalysisResult``, including recommendations.
    """
    if not quiet:
        print(f"Analyzing postgres database: {service}", file=sys.stderr)

    result = AnalysisResult(
        service=service,
        db_type="postgres",
        timestamp=datetime.now(timezone.utc).isoformat(),
    )

    # === FAST CONTEXT LOADING ===
    # Use explicit IDs if provided, otherwise read from config file (instant)
    if not quiet:
        print(" [0/5] Getting Railway context...", file=sys.stderr, flush=True)
    dal._progress_timer.start()

    if environment_id and service_id:
        # IDs passed directly — no need to read config or link
        dal._ctx = RailwayContext(project_id=project_id, environment_id=environment_id, service_id=service_id)
        if not quiet:
            print(f" using explicit IDs (env={environment_id[:8]}..., svc={service_id[:8]}...)", file=sys.stderr, flush=True)
    else:
        # Fall back to reading railway context from local config (instant, no API call)
        railway_status = get_railway_status()
        if railway_status:
            dal._ctx = RailwayContext(
                project_id=railway_status.get("projectId"),
                environment_id=railway_status.get("environmentId"),
                service_id=railway_status.get("serviceId"),
            )
            # The nested task functions below read these names at call time,
            # so they pick up the IDs resolved here.
            environment_id = dal._ctx.environment_id
            service_id = dal._ctx.service_id

    # Check if this is an HA service - only call API if name suggests HA
    is_ha_service = False
    if any(hint in service.lower() for hint in ["postgres-ha", "patroni", "-ha"]):
        is_ha_service = is_postgres_ha_service(service_id)

    # Get deployment status via API (~1s) instead of CLI (~15s)
    progress(1, 5, "Fetching deployment status...", quiet)
    result.deployment_status = get_deployment_status(service, service_id=service_id)

    # === SSH PRE-CHECK WITH RETRY ===
    # SSH can be flaky — retry with increasing timeouts before giving up
    # (step total is 5, matching every other progress call in this function)
    progress(2, 5, "Testing SSH connectivity...", quiet)
    ssh_available = False
    ssh_stderr = ""
    ssh_attempts = [30, 60, 90]
    for attempt, attempt_timeout in enumerate(ssh_attempts, 1):
        ssh_code, ssh_stdout, ssh_stderr = run_ssh_query(service, "echo ok", timeout=attempt_timeout)
        if ssh_code == 0 and "ok" in ssh_stdout:
            ssh_available = True
            if not quiet:
                for line in ssh_stderr.splitlines():
                    if line.startswith("Using SSH key:"):
                        print(f" {line}", file=sys.stderr, flush=True)
                        break
            break
        if not quiet:
            remaining = len(ssh_attempts) - attempt
            if remaining > 0:
                # ``attempt`` is 1-based, so ssh_attempts[attempt] is the NEXT timeout.
                print(f" SSH attempt {attempt}/{len(ssh_attempts)} failed ({ssh_stderr or 'no response'}), retrying with {ssh_attempts[attempt]}s timeout...", file=sys.stderr, flush=True)
            else:
                print(f" SSH attempt {attempt}/{len(ssh_attempts)} failed ({ssh_stderr or 'no response'}), giving up", file=sys.stderr, flush=True)

    # === PARALLEL EXECUTION OF SLOW OPERATIONS ===
    # Run metrics API, database query, and logs in parallel (~17-27s down to ~max of the three)
    progress(3, 5, "Running analysis (metrics, query, logs in parallel)...", quiet)

    analysis_query = build_analysis_query()

    # Define parallel tasks
    def task_metrics():
        """Fetch all metrics (disk, CPU, memory) in one API call."""
        if environment_id and service_id:
            return get_all_metrics_from_api(environment_id, service_id, hours=metrics_hours)
        return None

    def task_database_query():
        """Run the batched database analysis query with retry."""
        if not ssh_available:
            return (1, "", f"SSH not available: {ssh_stderr or 'connection failed'}")
        code, stdout, stderr = run_psql_query_safe(service, analysis_query, timeout=timeout)
        if code != 0:
            # Retry once — SSH sessions can drop mid-query
            if not quiet:
                print(f" Database query failed ({stderr or 'unknown'}), retrying...", file=sys.stderr, flush=True)
            code, stdout, stderr = run_psql_query_safe(service, analysis_query, timeout=timeout)
        return (code, stdout, stderr)

    def task_logs():
        """Fetch recent logs via API (~3s)."""
        if skip_logs:
            return []
        return get_recent_logs(service, lines=LOG_LINES_DEFAULT,
                               environment_id=environment_id,
                               service_id=service_id)

    def task_ha_cluster():
        """Check HA cluster status (Patroni).

        Returns a sentinel string when the check is skipped, a dict with the
        member list on success, or None when no cluster data was obtained.
        """
        if not is_ha_service:
            return "skipped_not_ha"
        if not ssh_available:
            return "skipped_no_ssh"
        code, stdout, stderr = run_ssh_query(service, "curl -s localhost:8008/cluster 2>/dev/null || echo '{}'")
        if code == 0 and stdout and stdout.strip() != "{}":
            try:
                patroni_data = json.loads(stdout)
                members = patroni_data.get("members", [])
                if members:
                    return {
                        "members": [
                            {
                                "name": m.get("name"),
                                "role": m.get("role"),
                                "state": m.get("state"),
                                "timeline": m.get("timeline"),
                                "lag": m.get("lag"),
                            }
                            for m in members
                        ]
                    }
            except json.JSONDecodeError:
                pass
        return None

    # Run all tasks in parallel
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_metrics = executor.submit(task_metrics)
        future_db = executor.submit(task_database_query)
        future_logs = executor.submit(task_logs)
        future_ha = executor.submit(task_ha_cluster)

        # Collect results
        metrics_result = future_metrics.result()
        db_result = future_db.result()
        logs_result = future_logs.result()
        ha_result = future_ha.result()

    # Process metrics result (combined disk + cpu/memory + metrics_hours of history)
    if metrics_result:
        result.disk_usage = metrics_result.get("disk_usage")
        result.cpu_memory = metrics_result.get("cpu_memory")
        result.metrics_history = metrics_result.get("metrics_history")
        result.collection_status["metrics_api"] = {"status": "success"}
    else:
        result.collection_status["metrics_api"] = {
            "status": "error",
            "error": "Metrics API returned no data"
        }

    # Process database query result
    code, stdout, stderr = db_result
    if code == 0 and stdout:
        try:
            data = json.loads(stdout.strip())
            parse_batched_analysis(data, result)
            result.collection_status["database_query"] = {"status": "success"}
        except json.JSONDecodeError as e:
            result.errors.append(f"Failed to parse batched analysis JSON: {e}")
            result.collection_status["database_query"] = {
                "status": "error",
                "error": f"JSON parse error: {e}"
            }
    else:
        error_msg = stderr or stdout or "Unknown error"
        if not ssh_available:
            error_msg = f"SSH failed after {len(ssh_attempts)} attempts: {ssh_stderr or 'connection failed'}"
        result.errors.append(f"Batched analysis query failed: {error_msg}")
        result.collection_status["database_query"] = {
            "status": "error",
            "error": error_msg
        }

    # Process HA cluster result
    if ha_result == "skipped_not_ha":
        result.ha_cluster = None
        result.collection_status["ha_cluster"] = {"status": "skipped", "reason": "not an HA service"}
    elif ha_result == "skipped_no_ssh":
        result.ha_cluster = None
        result.collection_status["ha_cluster"] = {"status": "skipped", "reason": "SSH not available"}
    elif ha_result is not None:
        result.ha_cluster = ha_result
        result.collection_status["ha_cluster"] = {"status": "success"}
    else:
        result.ha_cluster = None
        result.collection_status["ha_cluster"] = {
            "status": "error" if is_ha_service else "skipped",
            "error": "Failed to retrieve Patroni cluster data" if is_ha_service else "not an HA service"
        }

    # Process logs result
    if skip_logs:
        result.collection_status["logs_api"] = {"status": "skipped", "reason": "skip_logs flag set"}
    elif logs_result:
        result.recent_logs = logs_result
        result.collection_status["logs_api"] = {
            "status": "success",
            "lines": len(logs_result)
        }

        # Extract error logs locally
        result.recent_errors = [
            line for line in result.recent_logs
            if 'ERROR' in line.upper() or 'FATAL' in line.upper() or 'PANIC' in line.upper()
        ][:100]

        # HA cluster logs (API call) - done after parallel since it depends on ha_cluster
        # NOTE(review): only reached when the regular log fetch returned data;
        # confirm cluster logs should be skipped when it did not.
        if result.ha_cluster and environment_id:
            progress(4, 5, "Fetching HA cluster logs...", quiet)
            result.cluster_logs = get_cluster_logs(result.ha_cluster, environment_id, limit=5000)
    else:
        result.recent_logs = []
        result.collection_status["logs_api"] = {
            "status": "error",
            "error": "Logs API returned no data"
        }

    # Generate recommendations
    progress(5, 5, "Generating recommendations...", quiet)
    result.recommendations = generate_recommendations(result)

    if not quiet:
        total = dal._progress_timer.total_elapsed()
        print(f"Done.{total}", file=sys.stderr)

    return result
1410
+
1411
+
1412
+ def generate_recommendations(result: AnalysisResult) -> List[Dict[str, str]]:
1413
+ """Generate recommendations based on analysis results."""
1414
+ recommendations = []
1415
+
1416
+ # Collection failures — surface critical issues when SSH/introspection failed
1417
+ if result.collection_status:
1418
+ failed = {k: v for k, v in result.collection_status.items()
1419
+ if v.get("status") in ("failed", "error")}
1420
+ ssh_sources = {"database_query", "ha_cluster"}
1421
+ ssh_failed = {k: v for k, v in failed.items() if k in ssh_sources}
1422
+ if ssh_failed:
1423
+ sources = ", ".join(ssh_failed.keys())
1424
+ errors = "; ".join(v.get("error", "unknown") for v in ssh_failed.values())
1425
+ recommendations.append({
1426
+ "severity": "critical",
1427
+ "category": "collection",
1428
+ "message": f"SSH introspection failed — unable to collect {sources}. "
1429
+ f"Error: {errors}. "
1430
+ f"Analysis is incomplete: connection stats, query performance, "
1431
+ f"table bloat, and tuning parameters could not be evaluated.",
1432
+ })
1433
+
1434
+ # === POSTGRESQL TUNING RECOMMENDATIONS ===
1435
+ # Based on best practices from PostgreSQL wiki and community
1436
+ if result.memory_config:
1437
+ mem = result.memory_config
1438
+
1439
+ # Get system memory from CPU/memory metrics if available
1440
+ system_memory_gb = None
1441
+ if result.cpu_memory and "memory_limit_gb" in result.cpu_memory:
1442
+ # Use actual memory limit from Railway API
1443
+ system_memory_gb = result.cpu_memory["memory_limit_gb"]
1444
+ elif result.cpu_memory and "memory_gb" in result.cpu_memory:
1445
+ # Fallback: estimate total as ~2x current usage
1446
+ system_memory_gb = result.cpu_memory["memory_gb"] * 2 # rough estimate
1447
+
1448
+ # shared_buffers check (should be ~25% of RAM, max ~40%)
1449
+ shared_buffers = mem.get("shared_buffers", {})
1450
+ if shared_buffers and shared_buffers.get("mb"):
1451
+ sb_mb = shared_buffers["mb"]
1452
+ # Flag if shared_buffers is very low (< 128MB) - likely default
1453
+ if sb_mb < 128:
1454
+ # Calculate recommended value based on system memory or default to 1GB
1455
+ rec_sb = "1GB"
1456
+ if system_memory_gb:
1457
+ rec_sb_mb = int(system_memory_gb * 1024 * 0.25)
1458
+ rec_sb = f"{rec_sb_mb}MB" if rec_sb_mb < 1024 else f"{round(rec_sb_mb/1024, 1)}GB"
1459
+ recommendations.append({
1460
+ "priority": "immediate",
1461
+ "issue": f"shared_buffers is only {sb_mb}MB (likely default)",
1462
+ "action": f"Increase shared_buffers to {rec_sb} (25% of RAM)",
1463
+ "explanation": "shared_buffers is PostgreSQL's main data cache - pages read from disk are stored here. "
1464
+ f"At {sb_mb}MB, your entire working set cannot fit in memory, forcing repeated disk reads. "
1465
+ "The rule of thumb is 25% of total RAM, up to 40% for read-heavy workloads.",
1466
+ "commands": [
1467
+ f"ALTER SYSTEM SET shared_buffers = '{rec_sb}';",
1468
+ "-- Requires database restart to take effect"
1469
+ ],
1470
+ "restart_required": True,
1471
+ })
1472
+ elif sb_mb < 256:
1473
+ rec_sb = "512MB"
1474
+ if system_memory_gb:
1475
+ rec_sb_mb = int(system_memory_gb * 1024 * 0.25)
1476
+ rec_sb = f"{rec_sb_mb}MB" if rec_sb_mb < 1024 else f"{round(rec_sb_mb/1024, 1)}GB"
1477
+ recommendations.append({
1478
+ "priority": "short-term",
1479
+ "issue": f"shared_buffers is {sb_mb}MB - may be undersized",
1480
+ "action": f"Consider increasing shared_buffers to {rec_sb} (25% of RAM)",
1481
+ "explanation": "shared_buffers holds cached data pages. A larger buffer pool means more data stays in memory, "
1482
+ "reducing disk I/O. Current size may be limiting cache hit ratio.",
1483
+ "commands": [
1484
+ f"ALTER SYSTEM SET shared_buffers = '{rec_sb}';",
1485
+ "-- Requires database restart to take effect"
1486
+ ],
1487
+ "restart_required": True,
1488
+ })
1489
+
1490
+ # effective_cache_size check (should be 50-75% of RAM)
1491
+ effective_cache = mem.get("effective_cache_size", {})
1492
+ if effective_cache and effective_cache.get("mb"):
1493
+ ec_mb = effective_cache["mb"]
1494
+ # Flag if effective_cache_size seems low
1495
+ if ec_mb < 512:
1496
+ rec_ec = "3GB"
1497
+ if system_memory_gb:
1498
+ rec_ec_mb = int(system_memory_gb * 1024 * 0.75)
1499
+ rec_ec = f"{rec_ec_mb}MB" if rec_ec_mb < 1024 else f"{round(rec_ec_mb/1024, 1)}GB"
1500
+ recommendations.append({
1501
+ "priority": "short-term",
1502
+ "issue": f"effective_cache_size is {ec_mb}MB - may cause poor query plans",
1503
+ "action": f"Set effective_cache_size to {rec_ec} (75% of RAM)",
1504
+ "explanation": "effective_cache_size is a hint to the query planner about how much memory is available for caching "
1505
+ "(shared_buffers + OS cache). It does NOT allocate memory - it just helps PostgreSQL estimate "
1506
+ "whether data is likely to be cached. A low value makes the planner pessimistic, avoiding efficient "
1507
+ "index scans in favor of sequential scans.",
1508
+ "commands": [
1509
+ f"ALTER SYSTEM SET effective_cache_size = '{rec_ec}';",
1510
+ "SELECT pg_reload_conf(); -- Takes effect immediately"
1511
+ ],
1512
+ "restart_required": False,
1513
+ })
1514
+
1515
+ # work_mem check (per-operation memory for sorts/hashes)
1516
+ work_mem = mem.get("work_mem", {})
1517
+ if work_mem and work_mem.get("mb"):
1518
+ wm_mb = work_mem["mb"]
1519
+ # Calculate recommended work_mem based on connections and RAM
1520
+ max_conns = result.connections.get("max", 100) if result.connections else 100
1521
+ rec_wm = "32MB"
1522
+ if system_memory_gb:
1523
+ # Formula: (RAM / max_connections) / 4
1524
+ rec_wm_mb = int((system_memory_gb * 1024 / max_conns) / 4)
1525
+ rec_wm_mb = max(16, min(rec_wm_mb, 128)) # Clamp between 16-128MB
1526
+ rec_wm = f"{rec_wm_mb}MB"
1527
+
1528
+ # Flag if work_mem is at default (4MB) with high temp file usage
1529
+ if result.database_stats:
1530
+ temp_files = result.database_stats.get("temp_files", 0)
1531
+ temp_bytes = result.database_stats.get("temp_bytes", 0)
1532
+ temp_gb = round(temp_bytes / 1024 / 1024 / 1024, 1) if temp_bytes > 0 else 0
1533
+ if wm_mb <= 4 and temp_files > 1000:
1534
+ recommendations.append({
1535
+ "priority": "immediate",
1536
+ "issue": f"work_mem is only {wm_mb}MB with {temp_files:,} temp files ({temp_gb} GB) spilled to disk",
1537
+ "action": f"Increase work_mem to {rec_wm}",
1538
+ "explanation": f"work_mem controls how much memory each sort, hash, or join operation can use BEFORE "
1539
+ f"spilling to disk (temp files). At {wm_mb}MB, your queries are constantly spilling. "
1540
+ f"The {temp_files:,} temp files mean disk I/O instead of fast memory operations. "
1541
+ f"CAUTION: A query can use multiple work_mem allocations (one per sort node), "
1542
+ f"so don't set this too high. Formula: (RAM / max_connections) / 4.",
1543
+ "commands": [
1544
+ f"ALTER SYSTEM SET work_mem = '{rec_wm}';",
1545
+ "SELECT pg_reload_conf(); -- Takes effect for new connections"
1546
+ ],
1547
+ "restart_required": False,
1548
+ })
1549
+ elif wm_mb <= 4:
1550
+ recommendations.append({
1551
+ "priority": "long-term",
1552
+ "issue": f"work_mem is at default ({wm_mb}MB)",
1553
+ "action": f"Consider increasing work_mem to {rec_wm} for complex queries",
1554
+ "explanation": "work_mem is memory per sort/hash operation. The default 4MB is conservative. "
1555
+ "Increasing it can speed up complex queries but uses more memory per operation.",
1556
+ "commands": [
1557
+ f"ALTER SYSTEM SET work_mem = '{rec_wm}';",
1558
+ "SELECT pg_reload_conf(); -- Takes effect for new connections"
1559
+ ],
1560
+ "restart_required": False,
1561
+ })
1562
+
1563
+ # maintenance_work_mem check
1564
+ maint_mem = mem.get("maintenance_work_mem", {})
1565
+ if maint_mem and maint_mem.get("mb"):
1566
+ mm_mb = maint_mem["mb"]
1567
+ if mm_mb < 64:
1568
+ rec_mm = "256MB"
1569
+ if system_memory_gb and system_memory_gb >= 8:
1570
+ rec_mm = "512MB"
1571
+ recommendations.append({
1572
+ "priority": "short-term",
1573
+ "issue": f"maintenance_work_mem is {mm_mb}MB - VACUUM and CREATE INDEX will be slow",
1574
+ "action": f"Increase maintenance_work_mem to {rec_mm}",
1575
+ "explanation": "maintenance_work_mem is used by VACUUM, CREATE INDEX, and ALTER TABLE operations. "
1576
+ f"At {mm_mb}MB, these maintenance operations process data in small batches, making them slow. "
1577
+ "Unlike work_mem, only one maintenance operation runs per session, so this can safely be higher.",
1578
+ "commands": [
1579
+ f"ALTER SYSTEM SET maintenance_work_mem = '{rec_mm}';",
1580
+ "SELECT pg_reload_conf(); -- Takes effect immediately"
1581
+ ],
1582
+ "restart_required": False,
1583
+ })
1584
+
1585
+ # random_page_cost check (default 4.0, should be 1.1-2.0 for SSD)
1586
+ rpc = mem.get("random_page_cost", {})
1587
+ if rpc and rpc.get("value"):
1588
+ rpc_val = float(rpc["value"])
1589
+ if rpc_val >= 4.0:
1590
+ recommendations.append({
1591
+ "priority": "short-term",
1592
+ "issue": f"random_page_cost is {rpc_val} (HDD default) - Railway uses SSDs",
1593
+ "action": "Set random_page_cost to 1.5 for SSD storage",
1594
+ "explanation": "random_page_cost tells the query planner how expensive random disk access is compared to "
1595
+ "sequential access. The default 4.0 assumes slow HDDs where random reads are 4x more expensive. "
1596
+ "Railway uses fast SSDs where random reads are almost as fast as sequential. At 4.0, "
1597
+ "the planner avoids index scans (random access) in favor of slower sequential scans.",
1598
+ "commands": [
1599
+ "ALTER SYSTEM SET random_page_cost = 1.5;",
1600
+ "SELECT pg_reload_conf(); -- Takes effect immediately"
1601
+ ],
1602
+ "restart_required": False,
1603
+ })
1604
+
1605
+ # checkpoint_completion_target check (should be 0.9)
1606
+ cct = mem.get("checkpoint_completion_target", {})
1607
+ if cct and cct.get("value"):
1608
+ cct_val = float(cct["value"])
1609
+ if cct_val < 0.9:
1610
+ recommendations.append({
1611
+ "priority": "long-term",
1612
+ "issue": f"checkpoint_completion_target is {cct_val} - I/O may be spiky",
1613
+ "action": "Set checkpoint_completion_target to 0.9",
1614
+ "explanation": f"PostgreSQL periodically writes dirty buffers to disk (checkpoints). At {cct_val}, "
1615
+ f"it tries to complete this in {int(cct_val*100)}% of the checkpoint interval, causing I/O spikes. "
1616
+ "At 0.9, writes spread over 90% of the interval, smoothing disk I/O. "
1617
+ "WHY: Spiky I/O can cause query latency spikes during checkpoints. "
1618
+ "SIDE EFFECT: Slightly more consistent (but spread out) disk writes. No downside in practice.",
1619
+ "commands": [
1620
+ "ALTER SYSTEM SET checkpoint_completion_target = 0.9;",
1621
+ "SELECT pg_reload_conf(); -- Takes effect immediately"
1622
+ ],
1623
+ "restart_required": False,
1624
+ })
1625
+
1626
+ # max_parallel_workers check
1627
+ mpw = mem.get("max_parallel_workers", {})
1628
+ mpwpg = mem.get("max_parallel_workers_per_gather", {})
1629
+ if mpw and mpw.get("value") == 0:
1630
+ recommendations.append({
1631
+ "priority": "short-term",
1632
+ "issue": "max_parallel_workers is 0 - parallel queries disabled",
1633
+ "action": "Set max_parallel_workers to number of CPU cores",
1634
+ "explanation": "PostgreSQL can use multiple CPU cores for large sequential scans, aggregates, and joins. "
1635
+ "With max_parallel_workers=0, all queries run single-threaded regardless of table size. "
1636
+ "WHY: Large analytical queries (COUNT, SUM, scans of big tables) could run 2-8x faster with parallelism. "
1637
+ "SIDE EFFECT: Parallel queries use more CPU and memory simultaneously. For OLTP workloads with many "
1638
+ "small queries, this rarely triggers. For analytical queries, it's a significant speedup. "
1639
+ "IF NOT CHANGED: Large table scans will always be slow, even with idle CPU cores.",
1640
+ "commands": [
1641
+ "ALTER SYSTEM SET max_parallel_workers = 4; -- Adjust to your CPU count",
1642
+ "ALTER SYSTEM SET max_parallel_workers_per_gather = 2;",
1643
+ "SELECT pg_reload_conf();"
1644
+ ],
1645
+ "restart_required": False,
1646
+ })
1647
+ elif mpwpg and mpwpg.get("value") == 0:
1648
+ recommendations.append({
1649
+ "priority": "long-term",
1650
+ "issue": "max_parallel_workers_per_gather is 0 - parallel queries won't use workers",
1651
+ "action": "Set max_parallel_workers_per_gather to 2-4",
1652
+ "explanation": "Even though max_parallel_workers allows parallel execution, max_parallel_workers_per_gather=0 "
1653
+ "means each query can use 0 parallel workers (i.e., none). "
1654
+ "WHY: This effectively disables parallelism for all queries. "
1655
+ "SIDE EFFECT: Each parallel query can use up to this many additional workers. "
1656
+ "Setting to 2 means a query could use 3 total processes (1 leader + 2 workers). "
1657
+ "IF NOT CHANGED: You have parallel infrastructure configured but no queries will use it.",
1658
+ "commands": [
1659
+ "ALTER SYSTEM SET max_parallel_workers_per_gather = 2;",
1660
+ "SELECT pg_reload_conf();"
1661
+ ],
1662
+ "restart_required": False,
1663
+ })
1664
+
1665
+ # autovacuum check
1666
+ autovac = mem.get("autovacuum", {})
1667
+ if autovac and autovac.get("value") == "off":
1668
+ recommendations.append({
1669
+ "priority": "immediate",
1670
+ "issue": "autovacuum is DISABLED - database will bloat and eventually fail",
1671
+ "action": "Enable autovacuum immediately",
1672
+ "explanation": "Autovacuum is PostgreSQL's background process that reclaims space from deleted/updated rows "
1673
+ "and prevents transaction ID wraparound. With autovacuum OFF: "
1674
+ "1) Tables bloat indefinitely - deleted rows waste space and slow queries. "
1675
+ "2) Transaction IDs (XIDs) are never frozen - the database WILL shut down when XIDs wrap (~2 billion transactions). "
1676
+ "3) Table statistics become stale - query planner makes bad decisions. "
1677
+ "WHY IT WAS DISABLED: Sometimes disabled for bulk loads, but must be re-enabled after. "
1678
+ "IF NOT CHANGED: Database will eventually refuse all writes to prevent corruption. This is not recoverable without maintenance.",
1679
+ "commands": [
1680
+ "ALTER SYSTEM SET autovacuum = on;",
1681
+ "SELECT pg_reload_conf();"
1682
+ ],
1683
+ "restart_required": False,
1684
+ })
1685
+
1686
+ # synchronous_commit info (not a warning, just info)
1687
+ sync = mem.get("synchronous_commit", {})
1688
+ if sync and sync.get("value") == "off":
1689
+ recommendations.append({
1690
+ "priority": "long-term",
1691
+ "issue": "synchronous_commit is off - faster writes but risk of data loss on crash",
1692
+ "action": "Evaluate if this is acceptable for your data",
1693
+ "explanation": "With synchronous_commit=off, PostgreSQL returns 'success' to clients BEFORE data is flushed to disk. "
1694
+ "BENEFIT: Write transactions are 2-10x faster because they don't wait for disk. "
1695
+ "RISK: If the server crashes, the last ~100-800ms of committed transactions may be lost. "
1696
+ "The database will NOT be corrupted - it will be consistent, just missing recent commits. "
1697
+ "ACCEPTABLE FOR: Session data, analytics, caches, logs - anything you can afford to lose. "
1698
+ "NOT ACCEPTABLE FOR: Financial transactions, user data, anything where 'committed' must mean 'durable'. "
1699
+ "IF NOT CHANGED: You keep the performance benefit but accept the crash-loss risk.",
1700
+ })
1701
+
1702
+ # pg_stat_statements not available
1703
+ if not result.top_queries:
1704
+ recommendations.append({
1705
+ "priority": "short-term",
1706
+ "issue": "pg_stat_statements extension not available - cannot analyze query performance",
1707
+ "action": "Enable pg_stat_statements extension",
1708
+ "explanation": "pg_stat_statements tracks execution statistics for all SQL queries: call count, total time, "
1709
+ "rows returned, cache hits, temp file usage. Without it, you cannot identify slow queries or optimization targets. "
1710
+ "WHY: This analysis found memory/vacuum issues but cannot pinpoint which QUERIES cause problems. "
1711
+ "SIDE EFFECT: Minor overhead (~1-5%) for tracking. Stores stats in shared memory. "
1712
+ "IF NOT ENABLED: You're flying blind - you can see symptoms (high I/O, temp files) but not the queries causing them. "
1713
+ "To enable, run: python3 scripts/enable-pg-stats.py --service <name> (may require brief restart).",
1714
+ })
1715
+
1716
+ # Cache hit ratio
1717
+ if result.cache_hit:
1718
+ table_hit = result.cache_hit.get("table_hit_pct")
1719
+ if table_hit is not None and table_hit < 95:
1720
+ priority = "immediate" if table_hit < 90 else "short-term"
1721
+ # Find the worst offending tables for context
1722
+ worst_tables = []
1723
+ for t in result.cache_per_table[:3]:
1724
+ if float(t.get("hit_pct") or 100) < 90:
1725
+ worst_tables.append(f"{t['table']} ({t['hit_pct']}%)")
1726
+ context = f" Worst tables: {', '.join(worst_tables)}." if worst_tables else ""
1727
+ recommendations.append({
1728
+ "priority": priority,
1729
+ "issue": f"Table cache hit ratio is {table_hit}% (should be >95%)",
1730
+ "action": "Increase shared_buffers - data is being read from disk instead of memory cache",
1731
+ "explanation": f"Cache hit ratio measures how often PostgreSQL finds requested data in memory (shared_buffers) "
1732
+ f"vs reading from disk. At {table_hit}%, roughly {100-table_hit}% of data requests hit disk.{context}",
1733
+ })
1734
+
1735
+ # Per-table cache - check for low hit rates with high disk reads
1736
+ for table in result.cache_per_table:
1737
+ try:
1738
+ hit_pct = float(table.get("hit_pct") or 100)
1739
+ disk_reads = int(table.get("disk_reads") or 0)
1740
+ table_size = table.get("size", "unknown")
1741
+ except (ValueError, TypeError):
1742
+ continue
1743
+
1744
+ if hit_pct < 50 and disk_reads > 1_000_000:
1745
+ recommendations.append({
1746
+ "priority": "immediate",
1747
+ "issue": f"Table '{table['table']}' has {hit_pct}% cache hit with {disk_reads:,} disk reads",
1748
+ "action": "Increase shared_buffers to fit this table in memory",
1749
+ "explanation": f"The '{table['table']}' table ({table_size}) is almost never found in cache. "
1750
+ f"With {disk_reads:,} disk reads, every query touching this table causes disk I/O. "
1751
+ f"This is likely because the table is larger than shared_buffers.",
1752
+ })
1753
+ elif hit_pct < 80 and disk_reads > 10_000_000:
1754
+ recommendations.append({
1755
+ "priority": "short-term",
1756
+ "issue": f"Table '{table['table']}' has {hit_pct}% cache hit with {disk_reads:,} disk reads",
1757
+ "action": "Consider increasing shared_buffers for better caching",
1758
+ "explanation": f"The '{table['table']}' table has a low cache hit rate, causing frequent disk reads. "
1759
+ f"Increasing shared_buffers would allow more of this table to stay in memory.",
1760
+ })
1761
+
1762
+ # Memory config
1763
+ if result.memory_config and result.table_sizes:
1764
+ shared_buffers_mb = result.memory_config.get("shared_buffers", {}).get("mb", 0)
1765
+ total_table_bytes = sum(int(t.get("bytes", 0)) for t in result.table_sizes)
1766
+ total_table_mb = total_table_bytes / 1024 / 1024
1767
+
1768
+ if shared_buffers_mb > 0 and total_table_mb > shared_buffers_mb * 4:
1769
+ largest_table = result.table_sizes[0] if result.table_sizes else None
1770
+ context = ""
1771
+ if largest_table:
1772
+ lt_mb = int(largest_table.get("bytes", 0)) / 1024 / 1024
1773
+ context = f" Your largest table ({largest_table['table']}) is {round(lt_mb)}MB alone."
1774
+ recommendations.append({
1775
+ "priority": "immediate",
1776
+ "issue": f"shared_buffers ({shared_buffers_mb}MB) is much smaller than working set (~{round(total_table_mb)}MB)",
1777
+ "action": f"Increase shared_buffers to at least {round(total_table_mb / 4)}MB",
1778
+ "explanation": f"Your database has ~{round(total_table_mb)}MB of table data but only {shared_buffers_mb}MB of buffer cache.{context} "
1779
+ f"PostgreSQL cannot keep frequently-accessed data in memory, causing constant disk I/O.",
1780
+ })
1781
+
1782
+ # Vacuum health (using enhanced flags)
1783
+ for table in result.vacuum_health:
1784
+ dead_pct = float(table.get("dead_pct", 0))
1785
+ dead_rows = int(table.get("dead_rows", 0))
1786
+ needs_vacuum = table.get("needs_vacuum") == "true"
1787
+ needs_freeze = table.get("needs_freeze") == "true"
1788
+ last_vacuum = table.get("last_vacuum", "never")
1789
+ last_analyze = table.get("last_analyze", "never")
1790
+
1791
+ # Check needs_freeze flag first (more urgent)
1792
+ if needs_freeze:
1793
+ recommendations.append({
1794
+ "priority": "immediate",
1795
+ "issue": f"Table '{table['table']}' needs FREEZE (XID age > 150M)",
1796
+ "action": f"Run: VACUUM FREEZE \"{table['table']}\";",
1797
+ "explanation": "PostgreSQL uses transaction IDs (XIDs) that can wrap around after ~2 billion transactions. "
1798
+ "VACUUM FREEZE marks old rows as 'frozen' so they don't need XID checking. "
1799
+ "If XIDs wrap around without freezing, the database will shut down to prevent data corruption.",
1800
+ "commands": [f"VACUUM FREEZE \"{table['table']}\";"],
1801
+ })
1802
+ elif needs_vacuum:
1803
+ recommendations.append({
1804
+ "priority": "immediate",
1805
+ "issue": f"Table '{table['table']}' needs VACUUM ({dead_pct}% dead rows, {dead_rows:,} rows)",
1806
+ "action": f"Run: VACUUM ANALYZE \"{table['table']}\";",
1807
+ "explanation": f"This table has {dead_rows:,} dead rows ({dead_pct}% of table) from UPDATE/DELETE operations. "
1808
+ "Dead rows waste disk space and slow down queries by making them scan more pages. "
1809
+ f"Last vacuum: {last_vacuum}. Last analyze: {last_analyze}. "
1810
+ "ANALYZE also updates statistics for better query plans.",
1811
+ "commands": [f"VACUUM ANALYZE \"{table['table']}\";"],
1812
+ })
1813
+ elif dead_pct > 20:
1814
+ recommendations.append({
1815
+ "priority": "immediate",
1816
+ "issue": f"Table '{table['table']}' has {dead_pct}% dead rows ({dead_rows:,} rows)",
1817
+ "action": f"Run: VACUUM ANALYZE \"{table['table']}\";",
1818
+ "explanation": f"Over 20% of this table is dead rows from UPDATEs and DELETEs. "
1819
+ f"This bloat forces queries to scan many useless rows. Last vacuum: {last_vacuum}.",
1820
+ "commands": [f"VACUUM ANALYZE \"{table['table']}\";"],
1821
+ })
1822
+ elif dead_pct > 10:
1823
+ recommendations.append({
1824
+ "priority": "short-term",
1825
+ "issue": f"Table '{table['table']}' has {dead_pct}% dead rows ({dead_rows:,} rows)",
1826
+ "action": f"Run: VACUUM ANALYZE \"{table['table']}\";",
1827
+ "explanation": f"This table has accumulated {dead_rows:,} dead rows. While autovacuum should handle this, "
1828
+ f"it may be falling behind. Last vacuum: {last_vacuum}.",
1829
+ "commands": [f"VACUUM ANALYZE \"{table['table']}\";"],
1830
+ })
1831
+
1832
+ # XID age
1833
+ if result.xid_age:
1834
+ xid_millions = result.xid_age.get("millions", 0)
1835
+ if xid_millions > 150:
1836
+ recommendations.append({
1837
+ "priority": "immediate",
1838
+ "issue": f"XID age is {xid_millions}M (wraparound risk at 2147M)",
1839
+ "action": "Run VACUUM FREEZE on all high-XID tables",
1840
+ "explanation": "PostgreSQL's transaction ID counter wraps around at ~2.1 billion. At 150M+, you're using ~7% of "
1841
+ "the available space. If this reaches 2 billion without VACUUM FREEZE, PostgreSQL will "
1842
+ "shut down to prevent data corruption. This is a critical issue requiring immediate action.",
1843
+ "commands": ["VACUUM FREEZE; -- Run on affected tables"],
1844
+ })
1845
+ elif xid_millions > 100:
1846
+ recommendations.append({
1847
+ "priority": "short-term",
1848
+ "issue": f"XID age is {xid_millions}M (approaching wraparound risk)",
1849
+ "action": "Monitor autovacuum and consider manual VACUUM FREEZE",
1850
+ "explanation": "XID age is elevated. Autovacuum should handle this, but verify it's running. "
1851
+ "If tables are being vacuumed but XID age stays high, long-running transactions may be blocking freezing.",
1852
+ })
1853
+
1854
+ # Database stats (deadlocks, temp files)
1855
+ if result.database_stats:
1856
+ deadlocks = result.database_stats.get("deadlocks", 0)
1857
+ if deadlocks > 0:
1858
+ recommendations.append({
1859
+ "priority": "short-term",
1860
+ "issue": f"{deadlocks} deadlock(s) detected since last stats reset",
1861
+ "action": "Review application transaction logic and lock ordering",
1862
+ "explanation": f"A deadlock occurs when two transactions each hold a lock the other needs, creating a cycle. "
1863
+ f"PostgreSQL detects this and kills one transaction (the 'victim') so the other can proceed. "
1864
+ f"WHY THIS MATTERS: {deadlocks} deadlocks means {deadlocks} transactions were aborted and had to retry. "
1865
+ f"COMMON CAUSES: 1) Transactions locking rows in different orders. 2) Long transactions holding locks. "
1866
+ f"3) Hot rows updated by many concurrent transactions. "
1867
+ f"FIX: Ensure all code paths lock tables/rows in the same order. Keep transactions short. "
1868
+ f"IF NOT FIXED: Deadlocks will continue, causing random transaction failures and retries.",
1869
+ })
1870
+
1871
+ # Temp files - flag with description based on daily rate
1872
+ temp_files = result.database_stats.get("temp_files", 0)
1873
+ temp_bytes = result.database_stats.get("temp_bytes", 0)
1874
+ temp_gb = round(temp_bytes / 1024 / 1024 / 1024, 1) if temp_bytes > 0 else 0
1875
+ stats_reset = result.database_stats.get("stats_reset", "unknown")
1876
+ # Calculate days since reset for rate-based thresholds
1877
+ days_since_reset = None
1878
+ if stats_reset and stats_reset not in ("unknown", "never"):
1879
+ try:
1880
+ reset_date = datetime.fromisoformat(stats_reset.replace('Z', '+00:00'))
1881
+ days_since_reset = (datetime.now(timezone.utc) - reset_date).days
1882
+ days_since_reset = max(days_since_reset, 1) # Avoid division by zero
1883
+ except (ValueError, TypeError):
1884
+ pass
1885
+ # Use rate-based threshold if we have time period data
1886
+ if days_since_reset and days_since_reset > 0:
1887
+ gb_per_day = temp_gb / days_since_reset
1888
+ files_per_day = round(temp_files / days_since_reset)
1889
+ if gb_per_day > 5: # More than 5GB/day is concerning
1890
+ # Get current work_mem for context
1891
+ wm_mb = result.memory_config.get("work_mem", {}).get("mb", 4) if result.memory_config else 4
1892
+ recommendations.append({
1893
+ "priority": "short-term",
1894
+ "issue": f"High temp file usage: ~{files_per_day:,} files/day ({round(gb_per_day, 1)} GB/day)",
1895
+ "action": "Increase work_mem from {wm_mb}MB to 32-64MB",
1896
+ "explanation": f"When a query needs to sort or hash more data than work_mem allows ({wm_mb}MB), "
1897
+ f"PostgreSQL spills to temp files on disk. Your queries are creating ~{files_per_day:,} temp files daily, "
1898
+ f"writing {round(gb_per_day, 1)}GB to disk. This is slower than in-memory operations.",
1899
+ "commands": [
1900
+ "ALTER SYSTEM SET work_mem = '32MB';",
1901
+ "SELECT pg_reload_conf(); -- Takes effect for new connections"
1902
+ ],
1903
+ "restart_required": False,
1904
+ })
1905
+ elif temp_files > 10000 or temp_gb > 10: # Fallback if no date
1906
+ wm_mb = result.memory_config.get("work_mem", {}).get("mb", 4) if result.memory_config else 4
1907
+ recommendations.append({
1908
+ "priority": "short-term",
1909
+ "issue": f"High temp file usage: {temp_files:,} files, {temp_gb} GB written since stats reset",
1910
+ "action": f"Increase work_mem from {wm_mb}MB to 32-64MB",
1911
+ "explanation": f"Queries are spilling to disk because work_mem ({wm_mb}MB) is too small for sort/hash operations. "
1912
+ f"Each temp file represents a query that couldn't fit its working data in memory.",
1913
+ "commands": [
1914
+ "ALTER SYSTEM SET work_mem = '32MB';",
1915
+ "SELECT pg_reload_conf(); -- Takes effect for new connections"
1916
+ ],
1917
+ "restart_required": False,
1918
+ })
1919
+
1920
+ # Connection usage
1921
+ if result.connections:
1922
+ pct = result.connections.get("percent", 0)
1923
+ current = result.connections.get("current", 0)
1924
+ max_conn = result.connections.get("max", 100)
1925
+ available = result.connections.get("available", max_conn - current)
1926
+ if pct > 90:
1927
+ recommendations.append({
1928
+ "priority": "immediate",
1929
+ "issue": f"Connection usage is {pct}% ({current}/{max_conn}, only {available} available)",
1930
+ "action": "Use connection pooling (PgBouncer) or increase max_connections",
1931
+ "explanation": f"You're using {current} of {max_conn} connections. Each PostgreSQL connection uses memory "
1932
+ f"(~10MB each). Rather than increasing max_connections, use connection pooling (PgBouncer) "
1933
+ f"to multiplex many app connections over fewer database connections.",
1934
+ })
1935
+ elif pct > 70:
1936
+ recommendations.append({
1937
+ "priority": "short-term",
1938
+ "issue": f"Connection usage is {pct}% ({current}/{max_conn})",
1939
+ "action": "Consider connection pooling for scalability",
1940
+ "explanation": "Connection usage is elevated. Connection pooling (PgBouncer) helps applications share "
1941
+ "database connections efficiently, especially during traffic spikes.",
1942
+ })
1943
+
1944
+ # Old connections
1945
+ if result.oldest_connection_sec is not None:
1946
+ age_hours = result.oldest_connection_sec / 3600
1947
+ age_days = round(age_hours / 24, 1)
1948
+ if age_hours > 48:
1949
+ # Include details about what the old connections are
1950
+ conn_details = ""
1951
+ if result.oldest_connections:
1952
+ details_list = []
1953
+ for c in result.oldest_connections[:3]:
1954
+ app = c.get("application_name") or "(unnamed)"
1955
+ state = c.get("state", "unknown")
1956
+ days = c.get("age_days", "?")
1957
+ details_list.append(f"{app} ({state}, {days} days)")
1958
+ conn_details = f" Old connections: {'; '.join(details_list)}."
1959
+
1960
+ recommendations.append({
1961
+ "priority": "short-term",
1962
+ "issue": f"Oldest connection is ~{age_days} days old ({round(age_hours)} hours)",
1963
+ "action": "Review connection pooling settings and application connection management",
1964
+ "explanation": f"Long-lived connections can indicate connection pool misconfiguration or connection leaks. "
1965
+ f"They can also hold locks or prevent autovacuum from cleaning up. "
1966
+ f"If using connection pooling, ensure idle connections are recycled.{conn_details}",
1967
+ })
1968
+
1969
+ # Disk usage
1970
+ if result.disk_usage:
1971
+ use_pct = int(result.disk_usage.get("use_percent", 0))
1972
+ used = result.disk_usage.get("used", "unknown")
1973
+ total = result.disk_usage.get("total", "unknown")
1974
+ if use_pct > 85:
1975
+ recommendations.append({
1976
+ "priority": "immediate",
1977
+ "issue": f"Disk usage is {use_pct}% ({used} / {total})",
1978
+ "action": "Increase volume size or clean up data",
1979
+ "explanation": "PostgreSQL needs free disk space for WAL files, temp files, and VACUUM operations. "
1980
+ "Running out of disk space can cause database crashes. Consider: "
1981
+ "1) Increasing volume size, 2) Dropping unused indexes, 3) VACUUM FULL on bloated tables, "
1982
+ "4) Archiving old data.",
1983
+ })
1984
+ elif use_pct > 70:
1985
+ recommendations.append({
1986
+ "priority": "short-term",
1987
+ "issue": f"Disk usage is {use_pct}% ({used} / {total})",
1988
+ "action": "Plan for volume expansion",
1989
+ "explanation": f"Disk is at {use_pct}%, approaching the danger zone. PostgreSQL needs free space for: "
1990
+ f"1) WAL files - write-ahead logs that ensure durability. "
1991
+ f"2) Temp files - sorts and hashes spill here when work_mem is exceeded. "
1992
+ f"3) VACUUM operations - need space to rewrite tables during VACUUM FULL. "
1993
+ f"IF NOT ADDRESSED: At 85%+ you risk write failures. At 100%, database crashes and may not restart. "
1994
+ f"ACTIONS: Increase volume size in Railway, or identify large unused tables/indexes to drop.",
1995
+ })
1996
+
1997
+ # Unused indexes - only flag non-PK, non-unique indexes >100MB
1998
+ droppable_indexes = [
1999
+ idx for idx in result.unused_indexes
2000
+ if not idx.get("is_primary") and not idx.get("is_unique")
2001
+ and int(idx.get("size_bytes", 0)) > 100 * 1024 * 1024
2002
+ ]
2003
+ if droppable_indexes:
2004
+ total_size = sum_index_sizes(droppable_indexes)
2005
+ index_names = [idx['index'] for idx in droppable_indexes[:3]]
2006
+ recommendations.append({
2007
+ "priority": "long-term",
2008
+ "issue": f"{len(droppable_indexes)} unused non-constraint indexes >100MB ({total_size})",
2009
+ "action": "Review and drop unused indexes to save space and improve write performance",
2010
+ "explanation": f"These indexes have 0 scans since stats reset, meaning no queries are using them. "
2011
+ f"Each index costs disk space AND slows down writes (INSERT/UPDATE/DELETE must update all indexes). "
2012
+ f"Examples: {', '.join(index_names)}{'...' if len(droppable_indexes) > 3 else ''}",
2013
+ "commands": [f"DROP INDEX IF EXISTS \"{idx['index']}\"; -- saves {idx['size']}" for idx in droppable_indexes[:3]],
2014
+ })
2015
+
2016
+ # Tables with high missing index score (lots of seq scans, no index usage)
2017
+ for idx in result.unused_indexes:
2018
+ try:
2019
+ missing_score = int(idx.get("missing_index_score", 0))
2020
+ if missing_score > 1000:
2021
+ table_rows = idx.get("table_rows", "unknown")
2022
+ recommendations.append({
2023
+ "priority": "short-term",
2024
+ "issue": f"Table '{idx['table']}' has {missing_score:,} sequential scans with no index usage",
2025
+ "action": f"Consider adding an index on commonly filtered columns of '{idx['table']}'",
2026
+ "explanation": f"Sequential scans read the entire table ({table_rows} rows) for each query. "
2027
+ f"With {missing_score:,} sequential scans, queries are repeatedly scanning all rows. "
2028
+ f"An index on commonly filtered columns (WHERE clauses) would dramatically speed this up.",
2029
+ })
2030
+ except (ValueError, TypeError):
2031
+ pass
2032
+
2033
+ # Long-running queries
2034
+ if result.long_running_queries:
2035
+ for q in result.long_running_queries[:3]:
2036
+ try:
2037
+ duration = int(q.get("duration_sec", 0))
2038
+ query_preview = q.get("query", "")[:80]
2039
+ if duration > 60:
2040
+ recommendations.append({
2041
+ "priority": "immediate",
2042
+ "issue": f"Query running for {duration}s (PID {q.get('pid')})",
2043
+ "action": "Investigate and potentially cancel",
2044
+ "explanation": f"This query has been running for {duration} seconds. "
2045
+ f"QUERY: {query_preview}... "
2046
+ f"WHY THIS MATTERS: Long queries hold locks, consume memory, and may indicate missing indexes or inefficient queries. "
2047
+ f"TO CANCEL (graceful): SELECT pg_cancel_backend({q.get('pid')}); "
2048
+ f"TO TERMINATE (force): SELECT pg_terminate_backend({q.get('pid')}); "
2049
+ f"SIDE EFFECT OF CANCEL: The query's transaction will be rolled back. The application will receive an error.",
2050
+ "commands": [f"SELECT pg_cancel_backend({q.get('pid')}); -- Graceful cancel"],
2051
+ })
2052
+ except (ValueError, TypeError):
2053
+ pass
2054
+
2055
+ # Idle in transaction (stuck transactions)
2056
+ if result.idle_in_transaction:
2057
+ for txn in result.idle_in_transaction[:3]:
2058
+ try:
2059
+ idle_sec = int(txn.get("idle_sec", 0))
2060
+ app_name = txn.get("application_name", "unknown app")
2061
+ if idle_sec > 300: # 5 minutes
2062
+ recommendations.append({
2063
+ "priority": "immediate",
2064
+ "issue": f"Transaction idle for {idle_sec}s (PID {txn.get('pid')}, user: {txn.get('user', 'unknown')}, app: {app_name})",
2065
+ "action": "Terminate the stuck transaction",
2066
+ "explanation": f"This connection started a transaction (BEGIN) but hasn't done anything for {idle_sec}s. "
2067
+ f"WHY THIS IS BAD: 1) Holds row-level locks that block other queries. "
2068
+ f"2) Prevents VACUUM from cleaning dead rows in any table it touched. "
2069
+ f"3) Holds a transaction ID slot, contributing to XID bloat. "
2070
+ f"COMMON CAUSES: Application bug, network timeout without cleanup, abandoned connection. "
2071
+ f"TO FIX: SELECT pg_terminate_backend({txn.get('pid')}); (terminates connection). "
2072
+ f"PREVENTION: Set idle_in_transaction_session_timeout to auto-kill stuck transactions.",
2073
+ "commands": [
2074
+ f"SELECT pg_terminate_backend({txn.get('pid')}); -- Kill this connection",
2075
+ "ALTER SYSTEM SET idle_in_transaction_session_timeout = '5min'; -- Auto-kill in future",
2076
+ ],
2077
+ })
2078
+ elif idle_sec > 60:
2079
+ recommendations.append({
2080
+ "priority": "short-term",
2081
+ "issue": f"Transaction idle for {idle_sec}s (PID {txn.get('pid')}, app: {app_name})",
2082
+ "action": "Review application transaction handling",
2083
+ "explanation": f"This transaction has been idle for {idle_sec}s. While not critical yet, "
2084
+ f"transactions should be short-lived. Long idle transactions hold locks and block VACUUM. "
2085
+ f"COMMON CAUSES: Missing COMMIT/ROLLBACK, waiting for user input inside transaction, connection pool issues. "
2086
+ f"PREVENTION: Use idle_in_transaction_session_timeout to auto-terminate stuck transactions.",
2087
+ })
2088
+ except (ValueError, TypeError):
2089
+ pass
2090
+
2091
+ # Blocked queries
2092
+ if result.blocked_queries:
2093
+ for q in result.blocked_queries[:3]:
2094
+ try:
2095
+ wait_sec = int(q.get("wait_sec", 0))
2096
+ if wait_sec > 30:
2097
+ recommendations.append({
2098
+ "priority": "immediate",
2099
+ "issue": f"Query waiting {wait_sec}s for lock (PID {q.get('pid')} blocked by {q.get('blocking_pid')})",
2100
+ "action": "Investigate the blocking query and terminate if appropriate",
2101
+ "explanation": f"PID {q.get('pid')} has been waiting {wait_sec}s for a lock held by PID {q.get('blocking_pid')}. "
2102
+ f"WHY: The blocking query/transaction is holding a lock (row, table, or advisory) that this query needs. "
2103
+ f"COMMON CAUSES: Long-running transaction, idle-in-transaction, DDL operations (ALTER TABLE). "
2104
+ f"TO INVESTIGATE: SELECT query FROM pg_stat_activity WHERE pid = {q.get('blocking_pid')}; "
2105
+ f"TO UNBLOCK: Cancel or terminate the blocking PID if it's stuck. "
2106
+ f"SIDE EFFECT: Terminating the blocker will rollback its transaction, but unblock waiting queries.",
2107
+ "commands": [
2108
+ f"-- See what {q.get('blocking_pid')} is doing:",
2109
+ f"SELECT pid, state, query FROM pg_stat_activity WHERE pid = {q.get('blocking_pid')};",
2110
+ f"-- To terminate (if stuck): SELECT pg_terminate_backend({q.get('blocking_pid')});",
2111
+ ],
2112
+ })
2113
+ except (ValueError, TypeError):
2114
+ pass
2115
+
2116
+ # Lock contention
2117
+ if result.locks:
2118
+ lock_types = set(lock.get("locktype", "unknown") for lock in result.locks)
2119
+ recommendations.append({
2120
+ "priority": "immediate",
2121
+ "issue": f"{len(result.locks)} blocked lock(s) detected ({', '.join(lock_types)})",
2122
+ "action": "Investigate lock contention - may indicate long transactions or deadlocks",
2123
+ "explanation": "Queries are waiting for locks held by other transactions. Common causes: "
2124
+ "1) Long-running transactions holding locks, 2) Deadlocks (PostgreSQL will resolve these automatically), "
2125
+ "3) DDL operations (ALTER TABLE) blocking normal queries. "
2126
+ "Check blocked_queries and idle_in_transaction sections for details.",
2127
+ })
2128
+
2129
+ # Sequential scans on large tables
2130
+ for table in result.seq_scan_tables:
2131
+ try:
2132
+ seq_scans = int(table.get("seq_scans", 0))
2133
+ idx_scans = int(table.get("idx_scans", 0))
2134
+ rows = int(table.get("rows", 0))
2135
+ if seq_scans > 1000 and idx_scans == 0 and rows > 10000:
2136
+ recommendations.append({
2137
+ "priority": "short-term",
2138
+ "issue": f"Table '{table['table']}' has {seq_scans:,} sequential scans with 0 index scans ({rows:,} rows)",
2139
+ "action": "Add indexes on columns used in WHERE, JOIN, and ORDER BY clauses",
2140
+ "explanation": f"Every query on '{table['table']}' scans all {rows:,} rows instead of using an index. "
2141
+ f"With {seq_scans:,} sequential scans, this table is a performance hotspot. "
2142
+ f"To find which columns to index, run: EXPLAIN ANALYZE on slow queries touching this table, "
2143
+ f"or check pg_stat_statements for common query patterns.",
2144
+ })
2145
+ except (ValueError, TypeError):
2146
+ pass
2147
+
2148
+ # HA cluster issues
2149
+ if result.ha_cluster:
2150
+ members = result.ha_cluster.get("members", [])
2151
+ for m in members:
2152
+ state = m.get("state", "")
2153
+ if state == "start failed":
2154
+ recommendations.append({
2155
+ "priority": "immediate",
2156
+ "issue": f"HA replica '{m.get('name')}' is in 'start failed' state",
2157
+ "action": "Resync the replica",
2158
+ "explanation": f"The replica '{m.get('name')}' failed to start, typically due to timeline divergence. "
2159
+ f"This happens when the replica's WAL history diverges from the primary (e.g., after failover). "
2160
+ f"WHY THIS MATTERS: This replica cannot be used for failover or read scaling until fixed. "
2161
+ f"FIX: The replica needs a fresh base backup (pg_basebackup) from the primary. "
2162
+ f"IF NOT FIXED: You're running without redundancy - if the primary fails, no automatic failover is possible.",
2163
+ })
2164
+ elif state not in ("running", "streaming"):
2165
+ recommendations.append({
2166
+ "priority": "short-term",
2167
+ "issue": f"HA replica '{m.get('name')}' is in '{state}' state",
2168
+ "action": "Investigate replica health",
2169
+ "explanation": f"Expected state is 'running' or 'streaming', but replica is '{state}'. "
2170
+ f"POSSIBLE STATES: 'creating' (initializing), 'stopped' (manually stopped), 'start failed' (broken). "
2171
+ f"WHY THIS MATTERS: Non-streaming replicas may have stale data and can't be used for failover. "
2172
+ f"CHECK: Replica logs for specific errors. Network connectivity to primary. WAL lag.",
2173
+ })
2174
+
2175
+ # Recent errors
2176
+ if result.recent_errors and len(result.recent_errors) > 5:
2177
+ # Summarize error types (recent_errors is a list of strings)
2178
+ error_samples = [e[:60] if isinstance(e, str) else str(e)[:60] for e in result.recent_errors[:3]]
2179
+ recommendations.append({
2180
+ "priority": "short-term",
2181
+ "issue": f"{len(result.recent_errors)} recent errors in logs",
2182
+ "action": "Review error logs for patterns",
2183
+ "explanation": f"Multiple errors detected in recent logs. Sample messages: {'; '.join(error_samples)}... "
2184
+ f"WHY THIS MATTERS: Frequent errors may indicate application bugs, configuration issues, or resource constraints. "
2185
+ f"CHECK: Look for patterns - are errors from one app? One query? Specific time periods? "
2186
+ f"COMMON TYPES: Connection errors (app/network issue), query errors (syntax/permissions), "
2187
+ f"out-of-memory errors (need more RAM or lower work_mem).",
2188
+ })
2189
+
2190
+ # Invalid indexes
2191
+ if result.invalid_indexes:
2192
+ for idx in result.invalid_indexes:
2193
+ recommendations.append({
2194
+ "priority": "immediate",
2195
+ "issue": f"Invalid index '{idx.get('index')}' on {idx.get('schema')}.{idx.get('table')}",
2196
+ "action": "Drop and recreate the index",
2197
+ "explanation": f"This index is marked as invalid - PostgreSQL will NOT use it for queries. "
2198
+ f"CAUSE: Usually a CREATE INDEX CONCURRENTLY that failed partway through (e.g., due to constraint violation, "
2199
+ f"out of disk space, or duplicate key). "
2200
+ f"WHY THIS MATTERS: The index takes up disk space and slows writes, but provides zero query benefit. "
2201
+ f"FIX: Drop it and recreate. Use CONCURRENTLY to avoid locking the table.",
2202
+ "commands": [
2203
+ f"DROP INDEX CONCURRENTLY IF EXISTS \"{idx.get('index')}\";",
2204
+ f"-- Then recreate with: CREATE INDEX CONCURRENTLY ...",
2205
+ ],
2206
+ })
2207
+
2208
+ # WAL archiver failures
2209
+ if result.archiver and result.archiver.get("failed_count", 0) > 0:
2210
+ last_failed_wal = result.archiver.get("last_failed_wal", "unknown")
2211
+ last_failed_time = result.archiver.get("last_failed_time", "unknown")
2212
+ recommendations.append({
2213
+ "priority": "immediate",
2214
+ "issue": f"WAL archiver has {result.archiver['failed_count']} failed archival(s)",
2215
+ "action": "Check archive_command configuration and destination storage",
2216
+ "explanation": f"WAL (Write-Ahead Log) archiving is failing. Last failed WAL: {last_failed_wal} at {last_failed_time}. "
2217
+ f"This affects point-in-time recovery capability. Common causes: "
2218
+ f"1) Archive destination full or unreachable, 2) Permissions issues, 3) archive_command misconfiguration.",
2219
+ })
2220
+
2221
+ # Background writer issues
2222
+ if result.bgwriter:
2223
+ bg = result.bgwriter
2224
+ # High backend fsync indicates shared_buffers pressure
2225
+ if bg.get("buffers_backend_fsync", 0) > 0:
2226
+ recommendations.append({
2227
+ "priority": "short-term",
2228
+ "issue": f"Backend processes forced {bg['buffers_backend_fsync']:,} fsync operations",
2229
+ "action": "Increase shared_buffers to reduce dirty buffer pressure",
2230
+ "explanation": "Normally, the background writer or checkpointer flushes dirty buffers to disk. "
2231
+ f"When shared_buffers is too small, backends must flush dirty buffers themselves "
2232
+ f"(buffers_backend_fsync > 0). This forces query processes to do I/O, causing latency spikes.",
2233
+ })
2234
+
2235
+ # Check if most checkpoints are requested (not timed)
2236
+ timed = bg.get("checkpoints_timed", 0)
2237
+ req = bg.get("checkpoints_req", 0)
2238
+ total = timed + req
2239
+ if total > 10 and req > timed:
2240
+ req_pct = round(100.0 * req / total, 1)
2241
+ recommendations.append({
2242
+ "priority": "short-term",
2243
+ "issue": f"{req_pct}% of checkpoints are requested (not timed) - WAL is filling up",
2244
+ "action": "Increase max_wal_size to 2-4GB",
2245
+ "explanation": f"Checkpoints should happen on a timer (checkpoint_timeout), not because WAL fills up. "
2246
+ f"With {req_pct}% requested checkpoints, WAL segments are filling faster than expected. "
2247
+ f"This causes I/O spikes. Increasing max_wal_size gives more headroom before forced checkpoints.",
2248
+ "commands": [
2249
+ "ALTER SYSTEM SET max_wal_size = '2GB';",
2250
+ "SELECT pg_reload_conf(); -- Takes effect immediately"
2251
+ ],
2252
+ "restart_required": False,
2253
+ })
2254
+
2255
+ # High maxwritten_clean indicates bgwriter can't keep up
2256
+ if bg.get("maxwritten_clean", 0) > 100:
2257
+ recommendations.append({
2258
+ "priority": "long-term",
2259
+ "issue": f"Background writer hit max write limit {bg['maxwritten_clean']:,} times",
2260
+ "action": "Increase bgwriter_lru_maxpages to let bgwriter flush more buffers per round",
2261
+ "explanation": "The background writer proactively flushes dirty buffers before they're needed. "
2262
+ f"It hit the per-round limit {bg['maxwritten_clean']:,} times, meaning it couldn't "
2263
+ f"keep up with the write rate. Increasing bgwriter_lru_maxpages allows more buffer "
2264
+ f"flushes per round.",
2265
+ })
2266
+
2267
+ return recommendations
2268
+
2269
+
2270
def sum_index_sizes(indexes: List[Dict[str, Any]]) -> str:
    """Sum up index sizes and return a human-readable string.

    Each entry's ``"size"`` value is expected to look like ``"23 MB"``,
    ``"8448 kB"`` or ``"512 bytes"``. Units are matched case-insensitively;
    a bare number is counted as bytes.

    Args:
        indexes: Index records; only the ``"size"`` key is read. Entries
            with a missing, ``None`` or unparseable size contribute nothing.

    Returns:
        The total formatted with one decimal as ``GB``, ``MB`` or ``KB``,
        or as a whole number of ``bytes`` when below 1 KiB.
    """
    # Binary multipliers keyed by upper-cased unit. TB/PB are accepted on
    # input so that very large indexes are not silently counted as a few bytes.
    multipliers = {
        "BYTE": 1,
        "BYTES": 1,
        "KB": 1024,
        "MB": 1024 ** 2,
        "GB": 1024 ** 3,
        "TB": 1024 ** 4,
        "PB": 1024 ** 5,
    }
    size_pattern = re.compile(
        r"(\d+(?:\.\d+)?)\s*(bytes?|kB|MB|GB|TB|PB)?", re.IGNORECASE
    )

    total_bytes = 0
    for idx in indexes:
        raw_size = idx.get("size", "0")
        if raw_size is None:
            # Key present but empty: nothing to add.
            continue
        # str() lets a numeric size be treated as a plain byte count instead
        # of raising TypeError inside the regex engine.
        match = size_pattern.match(str(raw_size).strip())
        if not match:
            continue
        number = match.group(1)
        # Keep integers exact; only go through float for fractional values.
        value = float(number) if "." in number else int(number)
        unit = (match.group(2) or "bytes").upper()
        total_bytes += int(value * multipliers.get(unit, 1))

    if total_bytes >= 1024 * 1024 * 1024:
        return f"{total_bytes / 1024 / 1024 / 1024:.1f} GB"
    elif total_bytes >= 1024 * 1024:
        return f"{total_bytes / 1024 / 1024:.1f} MB"
    elif total_bytes >= 1024:
        return f"{total_bytes / 1024:.1f} KB"
    return f"{total_bytes} bytes"
2296
+
2297
+
2298
+ def format_report(result: AnalysisResult) -> str:
2299
+ """Format analysis result as human-readable report."""
2300
+ lines = []
2301
+ lines.append("=" * 60)
2302
+ lines.append(f"Database Analysis: {result.service}")
2303
+ lines.append("=" * 60)
2304
+ lines.append(f"Type: {result.db_type}")
2305
+ lines.append(f"Generated: {result.timestamp}")
2306
+ lines.append(f"Status: {result.deployment_status}")
2307
+ lines.append("")
2308
+
2309
+ # Collection status table
2310
+ if result.collection_status:
2311
+ lines.append("## Data Collection Status")
2312
+ lines.append("")
2313
+ lines.append("| Source | Status | Details |")
2314
+ lines.append("|--------|--------|---------|")
2315
+ source_labels = {
2316
+ "database_query": "Database Query (SSH)",
2317
+ "metrics_api": "Metrics API",
2318
+ "logs_api": "Logs API",
2319
+ "ha_cluster": "HA Cluster (Patroni)",
2320
+ }
2321
+ for source in ["database_query", "metrics_api", "logs_api", "ha_cluster"]:
2322
+ if source in result.collection_status:
2323
+ info = result.collection_status[source]
2324
+ status = info["status"].upper()
2325
+ details = ""
2326
+ if info.get("error"):
2327
+ details = info["error"]
2328
+ elif info.get("reason"):
2329
+ details = info["reason"]
2330
+ elif info.get("lines"):
2331
+ details = f"{info['lines']} lines collected"
2332
+ elif status == "SUCCESS":
2333
+ details = "OK"
2334
+ label = source_labels.get(source, source)
2335
+ lines.append(f"| {label} | {status} | {details} |")
2336
+ lines.append("")
2337
+
2338
+ # Summary table
2339
+ lines.append("## Summary")
2340
+ lines.append("")
2341
+ lines.append("| Metric | Value | Status |")
2342
+ lines.append("|--------|-------|--------|")
2343
+
2344
+ # Deployment
2345
+ status_icon = "Healthy" if result.deployment_status == "SUCCESS" else "Warning"
2346
+ lines.append(f"| Deployment | {result.deployment_status} | {status_icon} |")
2347
+
2348
+ # Connections
2349
+ if result.connections:
2350
+ pct = result.connections["percent"]
2351
+ current = result.connections["current"]
2352
+ max_conn = result.connections["max"]
2353
+ reserved = result.connections.get("reserved", 3)
2354
+ available = result.connections.get("available", max_conn - current)
2355
+ active = result.connections.get("active", 0)
2356
+ idle = result.connections.get("idle", 0)
2357
+ idle_in_txn = result.connections.get("idle_in_transaction", 0)
2358
+ status = "Critical" if pct > 90 else "Warning" if pct > 70 else "Healthy"
2359
+ lines.append(f"| Connections | {current} / {max_conn} ({pct}%) | {status} |")
2360
+ lines.append(f"| - Active/Idle/IdleTxn | {active} / {idle} / {idle_in_txn} | {'Warning' if idle_in_txn > 5 else '-'} |")
2361
+ lines.append(f"| - Available | {available} (reserved: {reserved}) | - |")
2362
+
2363
+ # Database size
2364
+ if result.size_breakdown and result.size_breakdown.get("database_bytes"):
2365
+ db_bytes = result.size_breakdown["database_bytes"]
2366
+ db_gb = round(db_bytes / 1024 / 1024 / 1024, 2)
2367
+ lines.append(f"| Database Size | {db_gb} GB | - |")
2368
+
2369
+ # Disk
2370
+ if result.disk_usage:
2371
+ pct = int(result.disk_usage["use_percent"])
2372
+ status = "Critical" if pct > 85 else "Warning" if pct > 70 else "Healthy"
2373
+ lines.append(f"| Disk | {result.disk_usage['used']} / {result.disk_usage['total']} ({pct}%) | {status} |")
2374
+
2375
+ # Cache hit
2376
+ if result.cache_hit:
2377
+ table_hit = result.cache_hit.get("table_hit_pct")
2378
+ if table_hit is not None:
2379
+ status = "Healthy" if table_hit >= 99 else "OK" if table_hit >= 95 else "Warning" if table_hit >= 90 else "Critical"
2380
+ lines.append(f"| Table Cache Hit | {table_hit}% | {status} |")
2381
+
2382
+ index_hit = result.cache_hit.get("index_hit_pct")
2383
+ if index_hit is not None:
2384
+ status = "Healthy" if index_hit >= 99 else "OK" if index_hit >= 95 else "Warning"
2385
+ lines.append(f"| Index Cache Hit | {index_hit}% | {status} |")
2386
+
2387
+ # Memory config summary
2388
+ if result.memory_config:
2389
+ if "shared_buffers" in result.memory_config:
2390
+ sb = result.memory_config["shared_buffers"]
2391
+ mb = sb.get("mb", 0)
2392
+ status = "Warning" if mb < 128 else "OK" if mb < 256 else "Healthy"
2393
+ lines.append(f"| shared_buffers | {mb} MB | {status} |")
2394
+ if "work_mem" in result.memory_config:
2395
+ wm = result.memory_config["work_mem"]
2396
+ mb = wm.get("mb", 0)
2397
+ status = "Default" if mb <= 4 else "OK"
2398
+ lines.append(f"| work_mem | {mb} MB | {status} |")
2399
+
2400
+ # XID age
2401
+ if result.xid_age:
2402
+ millions = result.xid_age["millions"]
2403
+ status = "Critical" if millions > 150 else "Warning" if millions > 100 else "Healthy"
2404
+ lines.append(f"| XID Age | {millions}M | {status} |")
2405
+
2406
+ # CPU/Memory (with trend indicators from 24h history)
2407
+ if result.cpu_memory:
2408
+ if "cpu_percent" in result.cpu_memory:
2409
+ cpu = result.cpu_memory["cpu_percent"]
2410
+ status = "Critical" if cpu > 85 else "Warning" if cpu > 70 else "Healthy"
2411
+ trend_str = _trend_indicator(result.metrics_history, "cpu")
2412
+ lines.append(f"| CPU Usage | {cpu} vCPU{trend_str} | {status} |")
2413
+ if result.cpu_memory.get("cpu_limit"):
2414
+ lines.append(f"| CPU Limit | {result.cpu_memory['cpu_limit']} vCPU | - |")
2415
+ if "memory_gb" in result.cpu_memory:
2416
+ mem = result.cpu_memory["memory_gb"]
2417
+ trend_str = _trend_indicator(result.metrics_history, "memory")
2418
+ utilization = ""
2419
+ if result.cpu_memory.get("memory_limit_gb"):
2420
+ pct = round((mem / result.cpu_memory["memory_limit_gb"]) * 100, 1)
2421
+ status = "Critical" if pct > 90 else "Warning" if pct > 80 else "Healthy"
2422
+ utilization = f" ({pct}% of {result.cpu_memory['memory_limit_gb']} GB)"
2423
+ else:
2424
+ status = "-"
2425
+ lines.append(f"| Memory Usage | {mem} GB{utilization}{trend_str} | {status} |")
2426
+
2427
+ # Database stats
2428
+ if result.database_stats:
2429
+ stats_reset = result.database_stats.get("stats_reset", "unknown")
2430
+ # Calculate days since stats reset for rate calculations
2431
+ days_since_reset = None
2432
+ if stats_reset and stats_reset not in ("unknown", "never"):
2433
+ try:
2434
+ reset_date = datetime.fromisoformat(stats_reset.replace('Z', '+00:00'))
2435
+ days_since_reset = (datetime.now(timezone.utc) - reset_date).days
2436
+ days_since_reset = max(days_since_reset, 1) # Avoid division by zero
2437
+ except (ValueError, TypeError):
2438
+ pass
2439
+ # Shorten timestamp to just date if it's a full timestamp
2440
+ stats_reset_display = stats_reset
2441
+ if stats_reset and stats_reset != "unknown" and stats_reset != "never" and len(stats_reset) > 10:
2442
+ stats_reset_display = stats_reset[:10]
2443
+ lines.append(f"| Stats Reset | {stats_reset_display} | - |")
2444
+ deadlocks = result.database_stats.get("deadlocks", 0)
2445
+ temp_files = result.database_stats.get("temp_files", 0)
2446
+ temp_bytes = result.database_stats.get("temp_bytes", 0)
2447
+ temp_gb = round(temp_bytes / 1024 / 1024 / 1024, 2) if temp_bytes > 0 else 0
2448
+ status = "Warning" if deadlocks > 0 else "Healthy"
2449
+ lines.append(f"| Deadlocks | {deadlocks} (since reset) | {status} |")
2450
+ # Show temp files with daily rate if we have time period data
2451
+ if days_since_reset:
2452
+ files_per_day = round(temp_files / days_since_reset)
2453
+ gb_per_day = round(temp_gb / days_since_reset, 2)
2454
+ # Status based on daily rate, not totals
2455
+ if gb_per_day > 5:
2456
+ temp_status = "High"
2457
+ elif gb_per_day > 1:
2458
+ temp_status = "Moderate"
2459
+ else:
2460
+ temp_status = "OK"
2461
+ lines.append(f"| Temp Files | {temp_files:,} ({temp_gb} GB) over {days_since_reset}d (~{files_per_day}/day, {gb_per_day} GB/day) | {temp_status} |")
2462
+ else:
2463
+ lines.append(f"| Temp Files | {temp_files:,} ({temp_gb} GB since reset) | - |")
2464
+
2465
+ # Size breakdown
2466
+ if result.size_breakdown:
2467
+ wal_bytes = result.size_breakdown.get("wal_bytes", 0)
2468
+ wal_mb = round(wal_bytes / 1024 / 1024, 1)
2469
+ lines.append(f"| WAL Size | {wal_mb} MB | - |")
2470
+
2471
+ # Oldest connection
2472
+ if result.oldest_connection_sec is not None:
2473
+ age_hrs = round(result.oldest_connection_sec / 3600, 1)
2474
+ status = "Warning" if age_hrs > 24 else "Healthy"
2475
+ lines.append(f"| Oldest Connection | {age_hrs} hrs | {status} |")
2476
+
2477
+ # pg_stat_statements extension
2478
+ pss_status = "Installed" if result.pg_stat_statements_installed else "Not installed"
2479
+ pss_icon = "OK" if result.pg_stat_statements_installed else "Info"
2480
+ lines.append(f"| pg_stat_statements | {pss_status} | {pss_icon} |")
2481
+
2482
+ lines.append("")
2483
+
2484
+ # Infrastructure Trends (multi-window)
2485
+ if result.metrics_history and result.metrics_history.get("windows"):
2486
+ windows = result.metrics_history.get("windows", {})
2487
+ for window_label, window_data in windows.items():
2488
+ mh = window_data.get("metrics", {})
2489
+ if not mh:
2490
+ continue
2491
+ lines.append(f"## Infrastructure Trends ({window_label})")
2492
+ lines.append("")
2493
+ lines.append("| Metric | Current | Min | Max | Avg | Trend | Change |")
2494
+ lines.append("|--------|---------|-----|-----|-----|-------|--------|")
2495
+ display_order = [
2496
+ ("cpu", "CPU"),
2497
+ ("memory", "Memory"),
2498
+ ("disk", "Disk"),
2499
+ ("network_rx", "Network RX"),
2500
+ ("network_tx", "Network TX"),
2501
+ ]
2502
+ for key, label in display_order:
2503
+ if key in mh:
2504
+ m = mh[key]
2505
+ unit = m["unit"]
2506
+ trend = m.get("trend", {})
2507
+ direction = trend.get("direction", "?")
2508
+ change = trend.get("change_pct", 0)
2509
+ arrow = {"increasing": "^", "decreasing": "v", "stable": "~"}.get(direction, "?")
2510
+ spike_note = ""
2511
+ if m.get("spikes"):
2512
+ spike_note = f" ({m['spikes']['count']} spikes)"
2513
+ lines.append(
2514
+ f"| {label} | {m['current']} {unit} | {m['min']} | {m['max']} | {m['avg']} | "
2515
+ f"{arrow} {direction} | {change:+.1f}%{spike_note} |"
2516
+ )
2517
+ lines.append("")
2518
+
2519
+ # PostgreSQL Configuration (tuning parameters)
2520
+ if result.memory_config:
2521
+ lines.append("## PostgreSQL Configuration")
2522
+ lines.append("")
2523
+ lines.append("| Parameter | Value | Recommended | Status |")
2524
+ lines.append("|-----------|-------|-------------|--------|")
2525
+
2526
+ mem = result.memory_config
2527
+
2528
+ # Memory settings
2529
+ if "shared_buffers" in mem:
2530
+ sb = mem["shared_buffers"]
2531
+ mb = sb.get("mb", 0)
2532
+ status = "Low" if mb < 128 else "Default" if mb < 256 else "OK"
2533
+ lines.append(f"| shared_buffers | {mb} MB | 25% of RAM | {status} |")
2534
+
2535
+ if "effective_cache_size" in mem:
2536
+ ec = mem["effective_cache_size"]
2537
+ mb = ec.get("mb", 0)
2538
+ status = "Low" if mb < 512 else "OK"
2539
+ lines.append(f"| effective_cache_size | {mb} MB | 50-75% of RAM | {status} |")
2540
+
2541
+ if "work_mem" in mem:
2542
+ wm = mem["work_mem"]
2543
+ mb = wm.get("mb", 0)
2544
+ status = "Default" if mb <= 4 else "OK"
2545
+ lines.append(f"| work_mem | {mb} MB | 16-64 MB | {status} |")
2546
+
2547
+ if "maintenance_work_mem" in mem:
2548
+ mm = mem["maintenance_work_mem"]
2549
+ mb = mm.get("mb", 0)
2550
+ status = "Low" if mb < 64 else "OK"
2551
+ lines.append(f"| maintenance_work_mem | {mb} MB | 256-1024 MB | {status} |")
2552
+
2553
+ # WAL settings
2554
+ if "wal_buffers" in mem:
2555
+ wb = mem["wal_buffers"]
2556
+ mb = wb.get("mb", 0)
2557
+ lines.append(f"| wal_buffers | {mb} MB | 16 MB | OK |")
2558
+
2559
+ if "checkpoint_completion_target" in mem:
2560
+ cct = mem["checkpoint_completion_target"]
2561
+ val = cct.get("value", 0)
2562
+ status = "Low" if float(val) < 0.9 else "OK"
2563
+ lines.append(f"| checkpoint_completion_target | {val} | 0.9 | {status} |")
2564
+
2565
+ # Parallelism
2566
+ if "max_parallel_workers" in mem:
2567
+ mpw = mem["max_parallel_workers"]
2568
+ val = mpw.get("value", 0)
2569
+ status = "Disabled" if val == 0 else "OK"
2570
+ lines.append(f"| max_parallel_workers | {val} | CPU cores | {status} |")
2571
+
2572
+ if "max_parallel_workers_per_gather" in mem:
2573
+ mpwpg = mem["max_parallel_workers_per_gather"]
2574
+ val = mpwpg.get("value", 0)
2575
+ status = "Disabled" if val == 0 else "OK"
2576
+ lines.append(f"| max_parallel_workers_per_gather | {val} | 2-4 | {status} |")
2577
+
2578
+ # Planner
2579
+ if "random_page_cost" in mem:
2580
+ rpc = mem["random_page_cost"]
2581
+ val = rpc.get("value", 4.0)
2582
+ status = "HDD default" if float(val) >= 4.0 else "SSD optimized" if float(val) <= 2.0 else "OK"
2583
+ lines.append(f"| random_page_cost | {val} | 1.1-2.0 (SSD) | {status} |")
2584
+
2585
+ if "default_statistics_target" in mem:
2586
+ dst = mem["default_statistics_target"]
2587
+ val = dst.get("value", 100)
2588
+ lines.append(f"| default_statistics_target | {val} | 100-500 | OK |")
2589
+
2590
+ # Autovacuum
2591
+ if "autovacuum" in mem:
2592
+ av = mem["autovacuum"]
2593
+ val = av.get("value", "on")
2594
+ status = "CRITICAL" if val == "off" else "OK"
2595
+ lines.append(f"| autovacuum | {val} | on | {status} |")
2596
+
2597
+ # Durability
2598
+ if "synchronous_commit" in mem:
2599
+ sc = mem["synchronous_commit"]
2600
+ val = sc.get("value", "on")
2601
+ status = "Faster (risk)" if val == "off" else "Safe"
2602
+ lines.append(f"| synchronous_commit | {val} | on (safe) | {status} |")
2603
+
2604
+ lines.append("")
2605
+
2606
+ # Connection states
2607
+ if result.connection_states:
2608
+ lines.append("## Connection States")
2609
+ lines.append("")
2610
+ lines.append("| State | Count |")
2611
+ lines.append("|-------|-------|")
2612
+ for s in result.connection_states:
2613
+ lines.append(f"| {s.get('state', 'unknown')} | {s.get('count', 0)} |")
2614
+ lines.append("")
2615
+
2616
+ # Connections by application
2617
+ if result.connections_by_app:
2618
+ lines.append("## Connections by Application")
2619
+ lines.append("")
2620
+ lines.append("| Application | Count |")
2621
+ lines.append("|-------------|-------|")
2622
+ for c in result.connections_by_app[:10]:
2623
+ app = c.get('app', '') or '(empty)'
2624
+ lines.append(f"| {app} | {c.get('count', 0)} |")
2625
+ lines.append("")
2626
+
2627
+ # Connections by age
2628
+ if result.connections_by_age:
2629
+ lines.append("## Connections by Age")
2630
+ lines.append("")
2631
+ lines.append("| Age Range | Count |")
2632
+ lines.append("|-----------|-------|")
2633
+ for c in result.connections_by_age:
2634
+ lines.append(f"| {c.get('range', '')} | {c.get('count', 0)} |")
2635
+ lines.append("")
2636
+
2637
+ # Per-table cache
2638
+ if result.cache_per_table:
2639
+ lines.append("## Per-Table Cache Hit Rates")
2640
+ lines.append("")
2641
+ lines.append("| Table | Hit % | Disk Reads | Status |")
2642
+ lines.append("|-------|-------|------------|--------|")
2643
+ for t in result.cache_per_table[:10]:
2644
+ hit_pct = float(t.get("hit_pct", 0))
2645
+ status = "OK" if hit_pct >= 95 else "Warning" if hit_pct >= 80 else "Critical"
2646
+ lines.append(f"| {t['table']} | {t['hit_pct']}% | {int(t['disk_reads']):,} | {status} |")
2647
+ lines.append("")
2648
+
2649
+ # Table sizes
2650
+ if result.table_sizes:
2651
+ lines.append("## Table Sizes")
2652
+ lines.append("")
2653
+ lines.append("| Schema.Table | Total | Data | Indexes | Rows |")
2654
+ lines.append("|--------------|-------|------|---------|------|")
2655
+ for t in result.table_sizes[:10]:
2656
+ schema = t.get('schema', 'public')
2657
+ table = t.get('table', '')
2658
+ full_name = f"{schema}.{table}" if schema != 'public' else table
2659
+ data_bytes = int(t.get('data_bytes', 0))
2660
+ index_bytes = int(t.get('index_bytes', 0))
2661
+ data_mb = round(data_bytes / 1024 / 1024, 1)
2662
+ index_mb = round(index_bytes / 1024 / 1024, 1)
2663
+ row_count = t.get('row_count', '0')
2664
+ lines.append(f"| {full_name} | {t['size']} | {data_mb}MB | {index_mb}MB | {row_count} |")
2665
+ lines.append("")
2666
+
2667
+ # Vacuum health
2668
+ if result.vacuum_health:
2669
+ lines.append("## Vacuum Health")
2670
+ lines.append("")
2671
+ lines.append("| Schema.Table | Dead Rows | Dead % | V/AV Count | Last Analyze | XID Age | Flags |")
2672
+ lines.append("|--------------|-----------|--------|------------|--------------|---------|-------|")
2673
+ for t in result.vacuum_health[:10]:
2674
+ schema = t.get('schema', 'public')
2675
+ table = t.get('table', '')
2676
+ vacuum_count = t.get('vacuum_count', '0')
2677
+ autovacuum_count = t.get('autovacuum_count', '0')
2678
+ last_analyze = t.get('last_analyze', 'never')
2679
+ # Shorten timestamp to just date
2680
+ if last_analyze and last_analyze != 'never' and len(last_analyze) > 10:
2681
+ last_analyze = last_analyze[:10]
2682
+ xid_age = t.get('xid_age', '0')
2683
+ xid_millions = round(int(xid_age) / 1_000_000, 1) if xid_age.isdigit() else 0
2684
+ flags = []
2685
+ if t.get('needs_vacuum') == 'true':
2686
+ flags.append('VACUUM')
2687
+ if t.get('needs_freeze') == 'true':
2688
+ flags.append('FREEZE')
2689
+ flags_str = ', '.join(flags) if flags else '-'
2690
+ full_name = f"{schema}.{table}" if schema != 'public' else table
2691
+ lines.append(f"| {full_name} | {int(t['dead_rows']):,} | {t['dead_pct']}% | {vacuum_count}/{autovacuum_count} | {last_analyze} | {xid_millions}M | {flags_str} |")
2692
+ lines.append("")
2693
+
2694
+ # Unused indexes
2695
+ if result.unused_indexes:
2696
+ lines.append("## Unused Indexes (0 scans since stats reset)")
2697
+ lines.append("")
2698
+ lines.append("| Schema.Table | Index | Size | Type | Table Idx Scans |")
2699
+ lines.append("|--------------|-------|------|------|-----------------|")
2700
+ for t in result.unused_indexes[:20]:
2701
+ schema = t.get('schema', 'public')
2702
+ table = t.get('table', '')
2703
+ full_name = f"{schema}.{table}" if schema != 'public' else table
2704
+ table_idx_scans = t.get('table_idx_scans', '0')
2705
+ # Show index type
2706
+ idx_type = "PK" if t.get('is_primary') else "UNIQUE" if t.get('is_unique') else "idx"
2707
+ lines.append(f"| {full_name} | {t['index']} | {t['size']} | {idx_type} | {table_idx_scans} |")
2708
+ lines.append("")
2709
+
2710
+ # Invalid indexes (failed concurrent index builds)
2711
+ if result.invalid_indexes:
2712
+ lines.append("## Invalid Indexes (require rebuild)")
2713
+ lines.append("")
2714
+ lines.append("| Schema | Table | Index |")
2715
+ lines.append("|--------|-------|-------|")
2716
+ for t in result.invalid_indexes:
2717
+ lines.append(f"| {t.get('schema', '')} | {t.get('table', '')} | {t.get('index', '')} |")
2718
+ lines.append("")
2719
+
2720
+ # Top queries
2721
+ if result.top_queries:
2722
+ lines.append("## Top Queries by Execution Time")
2723
+ lines.append("")
2724
+ lines.append("| Query | Calls | Total (min) | Mean (ms) | Min/Max (ms) | Stddev | Rows/Call | Cache Hit % | Temp R/W | Plan (ms) | I/O Time (ms) |")
2725
+ lines.append("|-------|-------|-------------|-----------|--------------|--------|-----------|-------------|----------|-----------|---------------|")
2726
+ for t in result.top_queries[:15]:
2727
+ query = t.get('query', '')[:50]
2728
+ cache_pct = t.get('cache_hit_pct')
2729
+ cache_str = f"{cache_pct}%" if cache_pct is not None else "-"
2730
+ temp_r = t.get('temp_blks_read', 0)
2731
+ temp_w = t.get('temp_blks_written', 0)
2732
+ temp_str = f"{temp_r:,}/{temp_w:,}" if (temp_r or temp_w) else "-"
2733
+ min_max = f"{t.get('min_ms', '-')}/{t.get('max_ms', '-')}"
2734
+ stddev = t.get('stddev_ms', '-')
2735
+ rows_per_call = t.get('rows_per_call', '-')
2736
+ plan_ms = t.get('mean_plan_ms', '-')
2737
+ blk_read = float(t.get('blk_read_time_ms', 0) or 0)
2738
+ blk_write = float(t.get('blk_write_time_ms', 0) or 0)
2739
+ io_time = f"{blk_read + blk_write:.0f}" if (blk_read + blk_write) > 0 else "-"
2740
+ lines.append(f"| {query}... | {t.get('calls', '')} | {t.get('total_min', '')} | {t.get('mean_ms', '')} | {min_max} | {stddev} | {rows_per_call} | {cache_str} | {temp_str} | {plan_ms} | {io_time} |")
2741
+ lines.append("")
2742
+
2743
+ # Long-running queries
2744
+ if result.long_running_queries:
2745
+ lines.append("## Long-Running Queries (>5s)")
2746
+ lines.append("")
2747
+ lines.append("| PID | Duration (s) | Query |")
2748
+ lines.append("|-----|--------------|-------|")
2749
+ for q in result.long_running_queries:
2750
+ lines.append(f"| {q.get('pid', '')} | {q.get('duration_sec', '')} | {q.get('query', '')[:40]}... |")
2751
+ lines.append("")
2752
+
2753
+ # Idle in transaction (stuck transactions)
2754
+ if result.idle_in_transaction:
2755
+ lines.append("## Idle In Transaction (>30s)")
2756
+ lines.append("")
2757
+ lines.append("| PID | Idle (s) | User | App | Last Query |")
2758
+ lines.append("|-----|----------|------|-----|------------|")
2759
+ for txn in result.idle_in_transaction:
2760
+ lines.append(f"| {txn.get('pid', '')} | {txn.get('idle_sec', '')} | {txn.get('user', '')} | {txn.get('app', '')[:15]} | {txn.get('last_query', '')[:30]}... |")
2761
+ lines.append("")
2762
+
2763
+ # Blocked queries
2764
+ if result.blocked_queries:
2765
+ lines.append("## Blocked Queries (waiting on locks)")
2766
+ lines.append("")
2767
+ lines.append("| PID | Wait (s) | User | Blocked By | Query |")
2768
+ lines.append("|-----|----------|------|------------|-------|")
2769
+ for q in result.blocked_queries:
2770
+ lines.append(f"| {q.get('pid', '')} | {q.get('wait_sec', '')} | {q.get('user', '')} | PID {q.get('blocking_pid', '')} | {q.get('query', '')[:30]}... |")
2771
+ lines.append("")
2772
+
2773
+ # Lock contention
2774
+ if result.locks:
2775
+ lines.append("## Lock Contention")
2776
+ lines.append("")
2777
+ lines.append("| Lock Type | Mode | User | App | Query |")
2778
+ lines.append("|-----------|------|------|-----|-------|")
2779
+ for lock in result.locks:
2780
+ lines.append(f"| {lock.get('locktype', '')} | {lock.get('mode', '')} | {lock.get('user', '')} | {lock.get('app', '')[:15]} | {lock.get('query', '')[:25]}... |")
2781
+ lines.append("")
2782
+
2783
+ # Sequential scan patterns
2784
+ if result.seq_scan_tables:
2785
+ lines.append("## Tables with High Sequential Scans")
2786
+ lines.append("")
2787
+ lines.append("| Table | Seq Scans | Index Scans | Rows |")
2788
+ lines.append("|-------|-----------|-------------|------|")
2789
+ for t in result.seq_scan_tables[:10]:
2790
+ lines.append(f"| {t.get('table', '')} | {t.get('seq_scans', '')} | {t.get('idx_scans', '')} | {t.get('rows', '')} |")
2791
+ lines.append("")
2792
+
2793
+ # Replication
2794
+ if result.replication:
2795
+ lines.append("## Replication Status")
2796
+ lines.append("")
2797
+ lines.append("| Client | State | Sent LSN | Replay LSN |")
2798
+ lines.append("|--------|-------|----------|------------|")
2799
+ for r in result.replication:
2800
+ lines.append(f"| {r.get('client', '')} | {r.get('state', '')} | {r.get('sent_lsn', '')} | {r.get('replay_lsn', '')} |")
2801
+ lines.append("")
2802
+
2803
+ # HA Cluster
2804
+ if result.ha_cluster:
2805
+ lines.append("## HA Cluster (Patroni)")
2806
+ lines.append("")
2807
+ members = result.ha_cluster.get("members", [])
2808
+ if members:
2809
+ lines.append("| Name | Role | State | Timeline | Lag |")
2810
+ lines.append("|------|------|-------|----------|-----|")
2811
+ for m in members:
2812
+ lag = m.get('lag', 0) or 0
2813
+ lines.append(f"| {m.get('name', '')} | {m.get('role', '')} | {m.get('state', '')} | {m.get('timeline', '')} | {lag} |")
2814
+ lines.append("")
2815
+
2816
+ # Cluster logs (for HA clusters) - raw output for LLM analysis
2817
+ if result.cluster_logs:
2818
+ lines.append("## Cluster Member Logs")
2819
+ lines.append("")
2820
+ lines.append("(Use --json for full log data. LLM will analyze patterns.)")
2821
+ lines.append("")
2822
+ for member_log in result.cluster_logs:
2823
+ member = member_log.get('member', 'unknown')
2824
+ status = member_log.get('status', 'unknown')
2825
+ logs = member_log.get('logs', [])
2826
+ lines.append(f"### {member} ({status}) - {len(logs)} log entries collected")
2827
+ lines.append("")
2828
+
2829
+ # Background writer stats
2830
+ if result.bgwriter:
2831
+ lines.append("## Background Writer Stats")
2832
+ lines.append("")
2833
+ lines.append("| Metric | Value |")
2834
+ lines.append("|--------|-------|")
2835
+ bg = result.bgwriter
2836
+ total_checkpoints = bg.get('checkpoints_timed', 0) + bg.get('checkpoints_req', 0)
2837
+ timed_pct = round(100.0 * bg.get('checkpoints_timed', 0) / total_checkpoints, 1) if total_checkpoints > 0 else 0
2838
+ lines.append(f"| Checkpoints (timed/requested) | {bg.get('checkpoints_timed', 0):,} / {bg.get('checkpoints_req', 0):,} ({timed_pct}% timed) |")
2839
+ lines.append(f"| Buffers: checkpoint | {bg.get('buffers_checkpoint', 0):,} |")
2840
+ lines.append(f"| Buffers: bgwriter clean | {bg.get('buffers_clean', 0):,} |")
2841
+ lines.append(f"| Buffers: backend direct | {bg.get('buffers_backend', 0):,} |")
2842
+ lines.append(f"| Buffers: backend fsync | {bg.get('buffers_backend_fsync', 0):,} |")
2843
+ lines.append(f"| Max written clean | {bg.get('maxwritten_clean', 0):,} |")
2844
+ stats_reset = bg.get('stats_reset', 'never')
2845
+ if stats_reset and stats_reset != 'never' and len(stats_reset) > 10:
2846
+ stats_reset = stats_reset[:10]
2847
+ lines.append(f"| Stats reset | {stats_reset} |")
2848
+ lines.append("")
2849
+
2850
+ # WAL archiver stats
2851
+ if result.archiver:
2852
+ arch = result.archiver
2853
+ lines.append("## WAL Archiver Status")
2854
+ lines.append("")
2855
+ lines.append("| Metric | Value |")
2856
+ lines.append("|--------|-------|")
2857
+ lines.append(f"| Archived WAL count | {arch.get('archived_count', 0):,} |")
2858
+ lines.append(f"| Failed archival count | {arch.get('failed_count', 0):,} |")
2859
+ last_wal = arch.get('last_archived_wal') or 'none'
2860
+ last_time = arch.get('last_archived_time', 'never')
2861
+ if last_time and last_time != 'never' and len(last_time) > 19:
2862
+ last_time = last_time[:19]
2863
+ lines.append(f"| Last archived WAL | {last_wal} |")
2864
+ lines.append(f"| Last archived time | {last_time} |")
2865
+ if arch.get('failed_count', 0) > 0:
2866
+ failed_wal = arch.get('last_failed_wal') or 'none'
2867
+ failed_time = arch.get('last_failed_time', 'never')
2868
+ if failed_time and failed_time != 'never' and len(failed_time) > 19:
2869
+ failed_time = failed_time[:19]
2870
+ lines.append(f"| Last failed WAL | {failed_wal} |")
2871
+ lines.append(f"| Last failed time | {failed_time} |")
2872
+ lines.append("")
2873
+
2874
+ # Vacuum progress (ongoing vacuums)
2875
+ if result.progress_vacuum:
2876
+ lines.append("## Ongoing Vacuum Operations")
2877
+ lines.append("")
2878
+ lines.append("| PID | Table | Phase | Progress |")
2879
+ lines.append("|-----|-------|-------|----------|")
2880
+ for vac in result.progress_vacuum:
2881
+ total = vac.get('heap_blks_total', 0)
2882
+ scanned = vac.get('heap_blks_scanned', 0)
2883
+ pct = round(100.0 * scanned / total, 1) if total > 0 else 0
2884
+ lines.append(f"| {vac.get('pid', '')} | {vac.get('relname', '')} | {vac.get('phase', '')} | {pct}% ({scanned:,}/{total:,} blks) |")
2885
+ lines.append("")
2886
+
2887
+ # SSL connection stats
2888
+ if result.ssl_stats:
2889
+ ssl = result.ssl_stats
2890
+ lines.append("## SSL Connection Stats")
2891
+ lines.append("")
2892
+ total = ssl.get('ssl_connections', 0) + ssl.get('non_ssl_connections', 0)
2893
+ ssl_pct = round(100.0 * ssl.get('ssl_connections', 0) / total, 1) if total > 0 else 0
2894
+ lines.append(f"- SSL connections: {ssl.get('ssl_connections', 0)} ({ssl_pct}%)")
2895
+ lines.append(f"- Non-SSL connections: {ssl.get('non_ssl_connections', 0)}")
2896
+ versions = ssl.get('ssl_versions', [])
2897
+ if versions:
2898
+ lines.append("- SSL versions in use:")
2899
+ for v in versions:
2900
+ lines.append(f" - {v.get('version', 'unknown')}: {v.get('count', 0)} connections")
2901
+ lines.append("")
2902
+
2903
+ # Recent errors
2904
+ if result.recent_errors:
2905
+ lines.append("## Recent Errors")
2906
+ lines.append("")
2907
+ for error in result.recent_errors[:5]:
2908
+ lines.append(f"- {error[:100]}...")
2909
+ lines.append("")
2910
+
2911
+ # Recommendations
2912
+ if result.recommendations:
2913
+ lines.append("## Recommendations")
2914
+ lines.append("")
2915
+ for i, rec in enumerate(result.recommendations, 1):
2916
+ priority = rec["priority"].upper()
2917
+ lines.append(f"{i}. **[{priority}]** {rec['issue']}")
2918
+ lines.append(f" **Action:** {rec['action']}")
2919
+
2920
+ # Show explanation if available
2921
+ if rec.get("explanation"):
2922
+ lines.append(f" **Why:** {rec['explanation']}")
2923
+
2924
+ # Show commands if available
2925
+ if rec.get("commands"):
2926
+ lines.append(" **Commands:**")
2927
+ for cmd in rec["commands"]:
2928
+ lines.append(f" ```sql")
2929
+ lines.append(f" {cmd}")
2930
+ lines.append(f" ```")
2931
+
2932
+ # Note if restart is required
2933
+ if rec.get("restart_required"):
2934
+ lines.append(" ⚠️ *Requires database restart*")
2935
+
2936
+ lines.append("")
2937
+
2938
+ # Errors
2939
+ if result.errors:
2940
+ lines.append("## Errors")
2941
+ lines.append("")
2942
+ for error in result.errors:
2943
+ lines.append(f"- {error}")
2944
+ lines.append("")
2945
+
2946
+ lines.append("=" * 60)
2947
+ lines.append("END OF REPORT")
2948
+ lines.append("=" * 60)
2949
+
2950
+ return "\n".join(lines)
2951
+
2952
+
2953
def main():
    """Command-line entry point for the database analysis tool.

    Parses the command-line options, then either delegates to
    ``run_single_step`` (when ``--step`` is given) or runs the full analysis
    via ``analyze_postgres`` and prints the result as JSON or as a formatted
    report.

    Returns:
        The exit code from ``run_single_step`` in single-step mode,
        otherwise 0 once the analysis output has been printed.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Complete database analysis for Railway services.",
    )

    # (flags, keyword settings) pairs, registered in the order they should
    # appear in --help.
    option_table = (
        (("--service",), {"required": True, "help": "Service name"}),
        (("--json",), {"action": "store_true", "help": "Output as JSON"}),
        (("--timeout",), {
            "type": int,
            "default": 300,
            "help": "Timeout in seconds for analysis query (default: 300)",
        }),
        (("--quiet", "-q"), {
            "action": "store_true",
            "help": "Suppress progress messages",
        }),
        (("--skip-logs",), {
            "action": "store_true",
            "help": "Skip log fetching for faster analysis",
        }),
        (("--metrics-hours",), {
            "type": int,
            "default": 168,
            "help": "Hours of metrics history to fetch (default: 168, max: 168)",
        }),
        (("--step",), {
            "choices": ["ssh-test", "query", "logs", "metrics"],
            "help": "Run a single collection step for debugging",
        }),
        (("--project-id",), {"help": "Project ID (bypasses railway link)"}),
        (("--environment-id",), {"help": "Environment ID (bypasses railway link)"}),
        (("--service-id",), {"help": "Service ID (bypasses railway link)"}),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    opts = cli.parse_args()

    # Single-step debugging mode short-circuits the full analysis.
    if opts.step:
        return run_single_step(opts)

    # The requested metrics window is capped at 168 hours before use.
    capped_hours = min(opts.metrics_hours, 168)
    analysis = analyze_postgres(
        opts.service,
        timeout=opts.timeout,
        quiet=opts.quiet,
        skip_logs=opts.skip_logs,
        metrics_hours=capped_hours,
        project_id=opts.project_id,
        environment_id=opts.environment_id,
        service_id=opts.service_id,
    )

    # Emit either the machine-readable or the human-readable rendering.
    if opts.json:
        rendered = json.dumps(asdict(analysis), indent=2)
    else:
        rendered = format_report(analysis)
    print(rendered)

    return 0
2997
+
2998
+
2999
def run_single_step(args) -> int:
    """Execute exactly one data-collection step, for debugging.

    Args:
        args: Parsed command-line namespace. Reads ``service``, ``step`` and
            (for the ``query`` step) ``timeout``; the whole namespace is also
            handed to ``_init_context``.

    Returns:
        An exit code. ``ssh-test`` returns 0 only when the remote command
        exits 0 and its output contains "ok"; ``query`` returns the query's
        own exit code; ``logs`` returns 0; ``metrics`` returns 0 when data
        was printed and 1 otherwise. An unrecognised step returns 1.
    """
    service = args.service
    _init_context(args)
    environment_id = dal._ctx.environment_id
    service_id = dal._ctx.service_id
    step = args.step

    if step == "ssh-test":
        print(f"Testing SSH to service: {service}", file=sys.stderr)
        exit_code, out, err = run_ssh_query(service, "echo ok", timeout=45)
        print(f"Exit code: {exit_code}")
        print(f"Stdout: {out.strip()}")
        if err:
            print(f"Stderr: {err.strip()}")
        succeeded = exit_code == 0 and "ok" in out
        return 0 if succeeded else 1

    if step == "query":
        print(f"Running analysis query on: {service}", file=sys.stderr)
        sql = build_analysis_query()
        exit_code, out, err = run_psql_query_safe(service, sql, timeout=args.timeout)
        print(f"Exit code: {exit_code}")
        if exit_code != 0 or not out:
            print(f"Error: {err or out}")
            return exit_code
        try:
            parsed = json.loads(out.strip())
        except json.JSONDecodeError:
            # Output was not valid JSON; show it untouched.
            print(f"Raw output:\n{out}")
        else:
            print(json.dumps(parsed, indent=2))
        return exit_code

    if step == "logs":
        print(f"Fetching logs for: {service}", file=sys.stderr)
        fetched = get_recent_logs(
            service,
            lines=LOG_LINES_DEFAULT,
            environment_id=environment_id,
            service_id=service_id,
        )
        print(f"Lines fetched: {len(fetched)}")
        for entry in fetched:
            print(entry)
        return 0

    if step == "metrics":
        print(f"Fetching metrics for: {service}", file=sys.stderr)
        if not (environment_id and service_id):
            print("Missing environment_id or service_id from railway config")
            return 1
        metrics = get_all_metrics_from_api(environment_id, service_id)
        if not metrics:
            print("Metrics API returned no data")
            return 1
        print(json.dumps(metrics, indent=2))
        return 0

    # No branch matched the requested step.
    return 1
3055
+
3056
+
3057
# Script entry point: run the CLI and hand its return value to the shell as
# the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)