@umacloud/knowledge 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418)
  1. package/00-governance/governance-capabilities.md +557 -0
  2. package/00-governance/knowledge-map.md +39 -0
  3. package/00-governance/maintenance-policy.md +76 -0
  4. package/00-governance/review-checklist.md +81 -0
  5. package/README.md +13 -0
  6. package/ai/01-standards/agent-development-complete.md +691 -0
  7. package/ai/01-standards/llm-application-complete.md +488 -0
  8. package/ai/01-standards/mlops-complete.md +798 -0
  9. package/ai/01-standards/prompt-engineering-complete.md +646 -0
  10. package/ai/01-standards/rag-architecture-complete.md +649 -0
  11. package/ai/02-playbooks/llm-evaluation-playbook.md +847 -0
  12. package/ai/03-checklists/ai-project-checklist.md +215 -0
  13. package/ai/04-antipatterns/ai-antipatterns.md +661 -0
  14. package/ai/05-cases/case-rag-production.md +147 -0
  15. package/ai/06-glossary/ai-glossary.md +162 -0
  16. package/ai/agent-evaluation-benchmark.md +53 -0
  17. package/ai/ai-agent-memory-context-management.md +41 -0
  18. package/ai/ai-cost-capacity-optimization-playbook.md +42 -0
  19. package/ai/ai-data-security-and-compliance-playbook.md +37 -0
  20. package/ai/ai-domain-index-and-checklist.md +40 -0
  21. package/ai/ai-governance-maturity-model.md +50 -0
  22. package/ai/ai-model-selection-and-routing-strategy.md +47 -0
  23. package/ai/ai-observability-and-oncall-runbook.md +52 -0
  24. package/ai/ai-rag-engineering-playbook.md +42 -0
  25. package/ai/ai-red-team-and-safety-evaluation.md +42 -0
  26. package/ai/ai-release-readiness-and-rollback-gate.md +42 -0
  27. package/ai/llm-agent-engineering-deep-dive.md +57 -0
  28. package/ai/prompt-and-tool-guardrails.md +52 -0
  29. package/api/01-standards/enterprise-api-standards.md +198 -0
  30. package/api/01-standards/rest-api-design-guide.md +63 -0
  31. package/api/02-playbooks/api-pagination-playbook.md +93 -0
  32. package/api/02-playbooks/graphql-production-playbook.md +176 -0
  33. package/api/03-checklists/api-review-checklist.md +55 -0
  34. package/api/04-antipatterns/api-antipatterns.md +112 -0
  35. package/architecture/01-standards/api-gateway-patterns.md +496 -0
  36. package/architecture/01-standards/cloud-native-patterns.md +644 -0
  37. package/architecture/01-standards/distributed-systems-patterns.md +591 -0
  38. package/architecture/01-standards/event-driven-architecture.md +595 -0
  39. package/architecture/01-standards/microservices-patterns-complete.md +968 -0
  40. package/architecture/01-standards/microservices-patterns.md +495 -0
  41. package/architecture/01-standards/system-design-interview.md +664 -0
  42. package/architecture/02-playbooks/microservices-patterns-playbook.md +137 -0
  43. package/architecture/02-playbooks/migration-playbook.md +780 -0
  44. package/architecture/02-playbooks/system-design-playbook.md +779 -0
  45. package/architecture/03-checklists/architecture-decision-checklist.md +297 -0
  46. package/architecture/04-antipatterns/architecture-antipatterns.md +417 -0
  47. package/architecture/05-cases/case-netflix-microservices.md +413 -0
  48. package/architecture/06-glossary/architecture-glossary.md +164 -0
  49. package/architecture/adr-template-and-examples.md +38 -0
  50. package/architecture/api-gateway-deep-dive.md +1291 -0
  51. package/architecture/configuration-management.md +1162 -0
  52. package/architecture/distributed-transactions.md +1220 -0
  53. package/architecture/microservices-complete.md +735 -0
  54. package/architecture/resilience-and-disaster-patterns.md +37 -0
  55. package/architecture/service-governance.md +1198 -0
  56. package/architecture/system-architecture-deep-dive.md +37 -0
  57. package/backend/01-standards/analytics-and-growth.md +65 -0
  58. package/backend/01-standards/api-and-error-conventions.md +120 -0
  59. package/backend/01-standards/application-layering-and-packaging.md +160 -0
  60. package/backend/01-standards/auth-implementation.md +104 -0
  61. package/backend/01-standards/backend-framework-idioms.md +74 -0
  62. package/backend/01-standards/background-jobs-and-async.md +66 -0
  63. package/backend/01-standards/caching-strategies-complete.md +390 -0
  64. package/backend/01-standards/config-and-observability.md +77 -0
  65. package/backend/01-standards/data-modeling-and-persistence.md +94 -0
  66. package/backend/01-standards/django-complete.md +1765 -0
  67. package/backend/01-standards/email-and-notifications.md +64 -0
  68. package/backend/01-standards/fastapi-complete.md +925 -0
  69. package/backend/01-standards/file-upload-and-storage.md +66 -0
  70. package/backend/01-standards/graphql-api-complete.md +416 -0
  71. package/backend/01-standards/llm-application-standard.md +78 -0
  72. package/backend/01-standards/message-queue-patterns.md +379 -0
  73. package/backend/01-standards/microservices-and-distributed.md +78 -0
  74. package/backend/01-standards/nestjs-complete.md +2167 -0
  75. package/backend/01-standards/payment-integration.md +80 -0
  76. package/backend/01-standards/rate-limiting-complete.md +451 -0
  77. package/backend/01-standards/realtime-and-websocket.md +65 -0
  78. package/backend/01-standards/search-and-filtering.md +64 -0
  79. package/backend/01-standards/spring-boot-complete.md +445 -0
  80. package/backend/02-playbooks/api-design-playbook.md +718 -0
  81. package/backend/02-playbooks/email-send-playbook.md +130 -0
  82. package/backend/02-playbooks/file-upload-s3-playbook.md +153 -0
  83. package/backend/02-playbooks/typescript-enterprise-playbook.md +133 -0
  84. package/backend/02-playbooks/websocket-realtime-playbook.md +154 -0
  85. package/backend/03-checklists/api-launch-checklist.md +189 -0
  86. package/backend/04-antipatterns/backend-antipatterns.md +1051 -0
  87. package/blockchain/01-standards/blockchain-basics.md +557 -0
  88. package/blockchain/01-standards/smart-contract-development.md +1315 -0
  89. package/cicd/01-standards/deployment-and-delivery-standard.md +96 -0
  90. package/cicd/01-standards/github-actions-complete.md +473 -0
  91. package/cicd/01-standards/release-and-store-submission.md +75 -0
  92. package/cicd/02-playbooks/cicd-pipeline-playbook.md +144 -0
  93. package/cicd/02-playbooks/release-management-playbook.md +605 -0
  94. package/cicd/03-checklists/pipeline-security-checklist.md +168 -0
  95. package/cicd/04-antipatterns/cicd-antipatterns.md +589 -0
  96. package/cicd/05-cases/case-deployment-automation.md +221 -0
  97. package/cicd/05-cases/case-gitops-transformation.md +212 -0
  98. package/cicd/06-glossary/cicd-glossary.md +114 -0
  99. package/cicd/cicd-blueprint-deep-dive.md +38 -0
  100. package/cicd/release-readiness-gate.md +37 -0
  101. package/cloud-native/01-standards/container-security.md +741 -0
  102. package/cloud-native/01-standards/kubernetes-complete.md +812 -0
  103. package/cloud-native/02-playbooks/api-gateway-playbook.md +155 -0
  104. package/cloud-native/02-playbooks/gitops-with-argocd.md +760 -0
  105. package/cloud-native/02-playbooks/k8s-troubleshooting-playbook.md +1942 -0
  106. package/cloud-native/02-playbooks/message-queue-playbook.md +129 -0
  107. package/cloud-native/02-playbooks/multicloud-governance.md +726 -0
  108. package/cloud-native/02-playbooks/serverless-patterns.md +788 -0
  109. package/cloud-native/02-playbooks/service-mesh-playbook.md +612 -0
  110. package/cloud-native/02-playbooks/terraform-iac-playbook.md +143 -0
  111. package/cloud-native/03-checklists/container-security-checklist.md +431 -0
  112. package/cloud-native/03-checklists/k8s-production-readiness-checklist.md +460 -0
  113. package/cloud-native/04-antipatterns/container-antipatterns.md +660 -0
  114. package/cloud-native/04-antipatterns/k8s-antipatterns.md +743 -0
  115. package/cloud-native/05-cases/case-k8s-migration.md +478 -0
  116. package/cloud-native/05-cases/case-k8s-scaling.md +642 -0
  117. package/cloud-native/05-cases/case-k8s-security-incident.md +397 -0
  118. package/cloud-native/06-glossary/cloud-native-glossary.md +337 -0
  119. package/cross-platform/01-standards/cross-platform-frameworks.md +83 -0
  120. package/cross-platform/01-standards/platform-selection-and-architecture.md +77 -0
  121. package/data/01-standards/elasticsearch-complete.md +2098 -0
  122. package/data/01-standards/postgresql-complete.md +1613 -0
  123. package/data/01-standards/redis-complete.md +1527 -0
  124. package/data/02-playbooks/database-optimization-playbook.md +403 -0
  125. package/data/02-playbooks/elasticsearch-production-playbook.md +132 -0
  126. package/data/03-checklists/database-launch-checklist.md +187 -0
  127. package/data/04-antipatterns/database-antipatterns.md +873 -0
  128. package/data/05-cases/case-database-migration.md +310 -0
  129. package/data/06-glossary/database-glossary.md +440 -0
  130. package/data/data-governance-and-modeling-deep-dive.md +39 -0
  131. package/data-engineering/01-standards/airflow-complete.md +523 -0
  132. package/data-engineering/01-standards/kafka-complete.md +1521 -0
  133. package/data-engineering/02-playbooks/spark-etl-playbook.md +496 -0
  134. package/data-engineering/03-checklists/pipeline-launch-checklist.md +194 -0
  135. package/data-engineering/04-antipatterns/data-pipeline-antipatterns.md +684 -0
  136. package/data-engineering/05-cases/case-real-time-pipeline.md +355 -0
  137. package/data-engineering/06-glossary/data-engineering-glossary.md +429 -0
  138. package/database/01-standards/database-schema-standards.md +147 -0
  139. package/database/02-playbooks/postgresql-optimization-quick.md +52 -0
  140. package/database/02-playbooks/postgresql-performance-optimization.md +58 -0
  141. package/database/02-playbooks/postgresql-production-playbook.md +146 -0
  142. package/database/02-playbooks/redis-caching-playbook.md +117 -0
  143. package/database/03-checklists/database-review-checklist.md +50 -0
  144. package/database/04-antipatterns/database-antipatterns.md +112 -0
  145. package/design/01-standards/ui-design-system-complete.md +423 -0
  146. package/design/02-playbooks/design-handoff-playbook.md +254 -0
  147. package/design/02-playbooks/design-review-playbook.md +388 -0
  148. package/design/03-checklists/design-review-checklist.md +246 -0
  149. package/design/04-antipatterns/design-antipatterns.md +378 -0
  150. package/design/05-cases/case-design-system-adoption.md +328 -0
  151. package/design/06-glossary/design-glossary.md +329 -0
  152. package/design/ui-full-lifecycle-cross-platform-playbook.md +571 -0
  153. package/design/ux-system-deep-dive.md +38 -0
  154. package/design-systems/00-craft-rules.md +71 -0
  155. package/design-systems/aesthetic-families.md +43 -0
  156. package/design-systems/anti-ai-slop.md +162 -0
  157. package/design-systems/bold-geometric.md +120 -0
  158. package/design-systems/brutalist-bold.md +103 -0
  159. package/design-systems/editorial-clean.md +109 -0
  160. package/design-systems/glass-aurora.md +108 -0
  161. package/design-systems/modern-minimal.md +145 -0
  162. package/design-systems/premium-luxury.md +106 -0
  163. package/design-systems/product-type-design-map.md +48 -0
  164. package/design-systems/soft-warm.md +123 -0
  165. package/design-systems/tech-utility.md +113 -0
  166. package/desktop/01-standards/desktop-app-standard.md +72 -0
  167. package/desktop/01-standards/desktop-design.md +71 -0
  168. package/development/00-governance/document-template.md +41 -0
  169. package/development/01-standards/api-versioning-strategies.md +432 -0
  170. package/development/01-standards/authentication-patterns-complete.md +479 -0
  171. package/development/01-standards/css-architecture-complete.md +550 -0
  172. package/development/01-standards/database-migration-strategies.md +484 -0
  173. package/development/01-standards/elasticsearch-complete.md +347 -0
  174. package/development/01-standards/git-complete.md +371 -0
  175. package/development/01-standards/golang-complete.md +1565 -0
  176. package/development/01-standards/graphql-complete.md +298 -0
  177. package/development/01-standards/javascript-bundlers-complete.md +469 -0
  178. package/development/01-standards/javascript-typescript-complete.md +528 -0
  179. package/development/01-standards/jest-complete.md +275 -0
  180. package/development/01-standards/linux-complete.md +234 -0
  181. package/development/01-standards/logging-observability-complete.md +526 -0
  182. package/development/01-standards/microservices-communication.md +502 -0
  183. package/development/01-standards/mongodb-complete.md +406 -0
  184. package/development/01-standards/oauth2-complete.md +285 -0
  185. package/development/01-standards/performance-optimization-complete.md +289 -0
  186. package/development/01-standards/playwright-complete.md +247 -0
  187. package/development/01-standards/postgresql-complete.md +456 -0
  188. package/development/01-standards/pytest-complete.md +340 -0
  189. package/development/01-standards/python-async-programming.md +902 -0
  190. package/development/01-standards/python-complete.md +956 -0
  191. package/development/01-standards/python-decorators-complete.md +799 -0
  192. package/development/01-standards/python-design-patterns.md +2854 -0
  193. package/development/01-standards/python-packaging-distribution.md +420 -0
  194. package/development/01-standards/python-testing-strategies.md +607 -0
  195. package/development/01-standards/python-web-frameworks-comparison.md +471 -0
  196. package/development/01-standards/redis-complete.md +317 -0
  197. package/development/01-standards/rest-api-complete.md +316 -0
  198. package/development/01-standards/rust-complete.md +578 -0
  199. package/development/01-standards/typescript-advanced-types.md +1513 -0
  200. package/development/01-standards/web-security-complete.md +292 -0
  201. package/development/02-playbooks/api-design-playbook.md +810 -0
  202. package/development/02-playbooks/database-migration-playbook.md +580 -0
  203. package/development/02-playbooks/debugging-playbook.md +692 -0
  204. package/development/02-playbooks/feature-delivery-playbook.md +430 -0
  205. package/development/02-playbooks/incident-hotfix-playbook.md +387 -0
  206. package/development/02-playbooks/performance-optimization-playbook.md +531 -0
  207. package/development/02-playbooks/performance-tuning-playbook.md +652 -0
  208. package/development/02-playbooks/refactor-playbook.md +403 -0
  209. package/development/02-playbooks/release-playbook.md +469 -0
  210. package/development/03-checklists/architecture-review-checklist.md +168 -0
  211. package/development/03-checklists/data-migration-checklist.md +157 -0
  212. package/development/03-checklists/oncall-handover-checklist.md +173 -0
  213. package/development/03-checklists/pr-checklist.md +158 -0
  214. package/development/03-checklists/production-readiness-checklist.md +190 -0
  215. package/development/03-checklists/release-readiness-checklist.md +154 -0
  216. package/development/03-checklists/security-review-checklist.md +182 -0
  217. package/development/04-antipatterns/api-antipatterns.md +657 -0
  218. package/development/04-antipatterns/architecture-antipatterns.md +686 -0
  219. package/development/04-antipatterns/backend-antipatterns.md +648 -0
  220. package/development/04-antipatterns/cicd-antipatterns.md +540 -0
  221. package/development/04-antipatterns/code-smell-antipatterns.md +571 -0
  222. package/development/04-antipatterns/data-antipatterns.md +658 -0
  223. package/development/04-antipatterns/database-antipatterns.md +578 -0
  224. package/development/04-antipatterns/frontend-antipatterns.md +635 -0
  225. package/development/04-antipatterns/reliability-antipatterns.md +700 -0
  226. package/development/04-antipatterns/security-antipatterns.md +747 -0
  227. package/development/05-cases/case-api-version-migration.md +428 -0
  228. package/development/05-cases/case-authorization-hardening.md +383 -0
  229. package/development/05-cases/case-bluegreen-rollback.md +466 -0
  230. package/development/05-cases/case-cache-snowball-protection.md +485 -0
  231. package/development/05-cases/case-ci-cd-pipeline.md +544 -0
  232. package/development/05-cases/case-database-scaling.md +500 -0
  233. package/development/05-cases/case-db-hotspot-optimization.md +487 -0
  234. package/development/05-cases/case-incident-mttr-reduction.md +563 -0
  235. package/development/05-cases/case-microservice-migration.md +375 -0
  236. package/development/05-cases/case-performance-optimization.md +406 -0
  237. package/development/05-cases/case-security-incident-response.md +345 -0
  238. package/development/06-glossary/full-stack-glossary.md +166 -0
  239. package/development/09-maturity/quarterly-audit-template.md +35 -0
  240. package/development/11-ui-excellence/ui-aesthetic-system.md +41 -0
  241. package/development/11-ui-excellence/ui-engineering-excellence.md +435 -0
  242. package/development/12-scenarios/development-scenarios-guide.md +565 -0
  243. package/development/13-implementation-assets/implementation-toolkit.md +282 -0
  244. package/development/13-implementation-assets/knowledge-gates-execution.md +43 -0
  245. package/development/14-full-lifecycle/software-lifecycle-gates.md +511 -0
  246. package/development/15-lifecycle-templates/project-templates-collection.md +791 -0
  247. package/development/api-contract-and-versioning-guide.md +36 -0
  248. package/development/api-governance-complete.md +43 -0
  249. package/development/backend-engineering-complete.md +43 -0
  250. package/development/code-review-quality-complete.md +43 -0
  251. package/development/concurrency-reliability-complete.md +43 -0
  252. package/development/database-engineering-complete.md +43 -0
  253. package/development/engineering-effectiveness-complete.md +43 -0
  254. package/development/engineering-standards-deep-dive.md +38 -0
  255. package/development/frontend-engineering-complete.md +43 -0
  256. package/development/performance-capacity-complete.md +43 -0
  257. package/development/refactor-migration-complete.md +42 -0
  258. package/development/refactoring-and-techdebt-playbook.md +37 -0
  259. package/development/security-in-development-complete.md +43 -0
  260. package/devops/01-standards/cicd-pipeline-complete.md +262 -0
  261. package/devops/01-standards/docker-complete.md +1490 -0
  262. package/devops/01-standards/github-actions-complete.md +337 -0
  263. package/devops/01-standards/kubernetes-complete.md +638 -0
  264. package/devops/01-standards/terraform-complete.md +2117 -0
  265. package/devops/02-playbooks/docker-compose-playbook.md +233 -0
  266. package/devops/02-playbooks/docker-k8s-production-playbook.md +186 -0
  267. package/devops/02-playbooks/docker-production-playbook.md +952 -0
  268. package/edge-iot/01-standards/edge-iot-complete.md +473 -0
  269. package/experts/architect/api-design.md +178 -0
  270. package/experts/architect/methodology.md +124 -0
  271. package/experts/architect/security.md +75 -0
  272. package/experts/backend-lead/methodology.md +216 -0
  273. package/experts/devops/methodology.md +160 -0
  274. package/experts/frontend-lead/methodology.md +178 -0
  275. package/experts/product-manager/industry/ecommerce.md +43 -0
  276. package/experts/product-manager/industry/saas.md +40 -0
  277. package/experts/product-manager/methodology.md +97 -0
  278. package/experts/qa-lead/methodology.md +123 -0
  279. package/experts/qa-lead/test-strategy.md +128 -0
  280. package/experts/uiux-designer/methodology.md +125 -0
  281. package/frontend/01-standards/accessibility-complete.md +532 -0
  282. package/frontend/01-standards/accessibility-standard.md +74 -0
  283. package/frontend/01-standards/admin-dashboard-and-crud.md +72 -0
  284. package/frontend/01-standards/design-tokens-complete.md +444 -0
  285. package/frontend/01-standards/forms-and-validation.md +77 -0
  286. package/frontend/01-standards/frontend-architecture-and-layering.md +119 -0
  287. package/frontend/01-standards/i18n-and-localization.md +65 -0
  288. package/frontend/01-standards/nextjs-complete.md +451 -0
  289. package/frontend/01-standards/react-complete.md +713 -0
  290. package/frontend/01-standards/react-hooks-complete-guide.md +1100 -0
  291. package/frontend/01-standards/react-hooks-complete.md +1171 -0
  292. package/frontend/01-standards/seo-and-web-vitals.md +77 -0
  293. package/frontend/01-standards/state-management-complete.md +444 -0
  294. package/frontend/01-standards/vue-complete.md +499 -0
  295. package/frontend/01-standards/vue3-complete.md +2002 -0
  296. package/frontend/01-standards/web-framework-best-practices.md +64 -0
  297. package/frontend/01-standards/web-performance-complete.md +495 -0
  298. package/frontend/02-playbooks/accessibility-a11y-playbook.md +161 -0
  299. package/frontend/02-playbooks/frontend-performance-playbook.md +707 -0
  300. package/frontend/02-playbooks/i18n-internationalization-playbook.md +120 -0
  301. package/frontend/02-playbooks/performance-optimization-playbook.md +163 -0
  302. package/frontend/02-playbooks/react-nextjs-production-playbook.md +167 -0
  303. package/frontend/02-playbooks/react-state-management-playbook.md +173 -0
  304. package/frontend/03-checklists/component-quality-checklist.md +166 -0
  305. package/frontend/03-checklists/frontend-launch-checklist.md +299 -0
  306. package/frontend/04-antipatterns/frontend-antipatterns.md +886 -0
  307. package/frontend/05-cases/case-performance-optimization.md +274 -0
  308. package/harmony/01-standards/harmonyos-arkts-standard.md +75 -0
  309. package/harmony/01-standards/harmonyos-design.md +65 -0
  310. package/high-quality-engineering-playbook.md +54 -0
  311. package/incident/01-standards/incident-response-complete.md +303 -0
  312. package/incident/02-playbooks/chaos-engineering-playbook.md +883 -0
  313. package/incident/02-playbooks/postmortem-playbook.md +398 -0
  314. package/incident/03-checklists/incident-readiness-checklist.md +181 -0
  315. package/incident/04-antipatterns/incident-antipatterns.md +490 -0
  316. package/incident/05-cases/case-cascade-failure.md +176 -0
  317. package/incident/06-glossary/incident-glossary.md +114 -0
  318. package/incident/postmortem-and-response-deep-dive.md +39 -0
  319. package/industries/ecommerce/ecommerce-complete.md +631 -0
  320. package/industries/education/education-complete.md +555 -0
  321. package/industries/fintech/fintech-complete.md +501 -0
  322. package/industries/gaming/gaming-complete.md +587 -0
  323. package/industries/healthcare/healthcare-complete.md +452 -0
  324. package/low-code/01-standards/low-code-complete.md +944 -0
  325. package/miniprogram/01-standards/ai-common-mistakes.md +61 -0
  326. package/miniprogram/01-standards/miniprogram-custom-navbar-capsule.md +77 -0
  327. package/miniprogram/01-standards/miniprogram-design.md +61 -0
  328. package/miniprogram/01-standards/miniprogram-standard.md +81 -0
  329. package/mobile/01-standards/android-material-design.md +70 -0
  330. package/mobile/01-standards/flutter-complete.md +384 -0
  331. package/mobile/01-standards/ios-design-hig.md +78 -0
  332. package/mobile/01-standards/mobile-app-standard.md +85 -0
  333. package/mobile/01-standards/react-native-complete.md +352 -0
  334. package/mobile/02-playbooks/mobile-cross-platform-playbook.md +175 -0
  335. package/mobile/02-playbooks/mobile-performance.md +473 -0
  336. package/mobile/03-checklists/mobile-release-checklist.md +234 -0
  337. package/mobile/04-antipatterns/mobile-antipatterns.md +798 -0
  338. package/mobile/05-cases/case-app-performance.md +500 -0
  339. package/mobile/05-cases/case-app-startup-optimization.md +218 -0
  340. package/mobile/06-glossary/mobile-glossary.md +484 -0
  341. package/observability/01-standards/observability-standards.md +103 -0
  342. package/observability/02-playbooks/prometheus-grafana-playbook.md +135 -0
  343. package/observability/02-playbooks/structured-logging-playbook.md +73 -0
  344. package/observability/03-checklists/observability-checklist.md +54 -0
  345. package/observability/04-antipatterns/observability-antipatterns.md +106 -0
  346. package/operations/01-standards/prometheus-monitoring-complete.md +1578 -0
  347. package/operations/02-playbooks/capacity-planning-playbook.md +620 -0
  348. package/operations/03-checklists/production-launch-checklist.md +365 -0
  349. package/operations/04-antipatterns/operations-antipatterns.md +664 -0
  350. package/operations/05-cases/case-sre-practices.md +581 -0
  351. package/operations/06-glossary/operations-glossary.md +120 -0
  352. package/operations/aiops-anomaly-detection.md +758 -0
  353. package/operations/capacity-planning.md +1061 -0
  354. package/operations/chaos-engineering.md +659 -0
  355. package/operations/incident-command-system.md +38 -0
  356. package/operations/observability-complete.md +442 -0
  357. package/operations/slo-sli-playbook.md +517 -0
  358. package/operations/sre-operations-deep-dive.md +39 -0
  359. package/package.json +8 -0
  360. package/performance/01-standards/performance-and-scalability.md +80 -0
  361. package/performance/01-standards/performance-standards.md +156 -0
  362. package/performance/02-playbooks/query-optimization-playbook.md +103 -0
  363. package/performance/03-checklists/performance-checklist.md +56 -0
  364. package/performance/04-antipatterns/performance-antipatterns.md +146 -0
  365. package/product/01-standards/product-management-complete.md +285 -0
  366. package/product/02-playbooks/feature-launch-playbook.md +207 -0
  367. package/product/02-playbooks/user-research-playbook.md +532 -0
  368. package/product/03-checklists/feature-launch-checklist.md +275 -0
  369. package/product/04-antipatterns/product-antipatterns.md +355 -0
  370. package/product/05-cases/case-mvp-to-scale.md +384 -0
  371. package/product/06-glossary/product-glossary.md +462 -0
  372. package/product/feature-prioritization-framework.md +40 -0
  373. package/product/kpi-and-metric-tree.md +37 -0
  374. package/product/product-discovery-and-prd-deep-dive.md +41 -0
  375. package/quantum/01-standards/quantum-complete.md +1186 -0
  376. package/security/01-standards/api-security-complete.md +511 -0
  377. package/security/01-standards/container-runtime-security.md +574 -0
  378. package/security/01-standards/data-protection-gdpr.md +543 -0
  379. package/security/01-standards/owasp-top10-complete.md +1890 -0
  380. package/security/01-standards/secure-coding-baseline.md +90 -0
  381. package/security/01-standards/supply-chain-security.md +441 -0
  382. package/security/01-standards/web-security-checklist.md +108 -0
  383. package/security/01-standards/zero-trust-architecture.md +521 -0
  384. package/security/02-playbooks/auth-sso-playbook.md +166 -0
  385. package/security/02-playbooks/incident-response-security-playbook.md +588 -0
  386. package/security/02-playbooks/owasp-api-security-playbook.md +129 -0
  387. package/security/02-playbooks/payment-integration-playbook.md +119 -0
  388. package/security/02-playbooks/penetration-testing-playbook.md +517 -0
  389. package/security/03-checklists/security-audit-checklist.md +356 -0
  390. package/security/04-antipatterns/security-coding-antipatterns.md +580 -0
  391. package/security/05-cases/case-log4shell-incident.md +537 -0
  392. package/security/05-cases/case-major-breaches.md +468 -0
  393. package/security/06-glossary/security-glossary.md +212 -0
  394. package/security/compliance-automation.md +993 -0
  395. package/security/container-security.md +680 -0
  396. package/security/devsecops-complete.md +426 -0
  397. package/security/sast-dast-sca.md +775 -0
  398. package/security/secrets-management.md +594 -0
  399. package/security/security-architecture-deep-dive.md +37 -0
  400. package/security/threat-modeling-stride-playbook.md +40 -0
  401. package/seed-templates/auth-system.md +59 -0
  402. package/seed-templates/blog-content.md +94 -0
  403. package/seed-templates/dashboard.md +89 -0
  404. package/seed-templates/docs-site.md +73 -0
  405. package/seed-templates/e-commerce.md +50 -0
  406. package/seed-templates/saas-landing.md +92 -0
  407. package/seed-templates/settings-page.md +51 -0
  408. package/testing/01-standards/test-strategy-and-layering.md +83 -0
  409. package/testing/01-standards/testing-strategy-complete.md +422 -0
  410. package/testing/01-standards/unit-testing-best-practices.md +118 -0
  411. package/testing/02-playbooks/e2e-testing-playbook.md +988 -0
  412. package/testing/02-playbooks/testing-strategy-playbook.md +126 -0
  413. package/testing/03-checklists/test-strategy-checklist.md +208 -0
  414. package/testing/04-antipatterns/testing-antipatterns.md +718 -0
  415. package/testing/05-cases/case-testing-transformation.md +300 -0
  416. package/testing/06-glossary/testing-glossary.md +110 -0
  417. package/testing/risk-based-test-matrix.md +36 -0
  418. package/testing/testing-strategy-deep-dive.md +37 -0
@@ -0,0 +1,700 @@
1
+ ---
2
+ id: reliability-antipatterns
3
+ title: 稳定性反模式指南
4
+ domain: development
5
+ category: 04-antipatterns
6
+ difficulty: intermediate
7
+ tags: [antipatterns, breaker, circuit, development, failure, fatigue, incident, point]
8
+ quality_score: 70
9
+ last_updated: 2026-06-15
10
+ ---
11
+ # 稳定性反模式指南
12
+
13
+ > 适用范围:分布式系统 / 微服务 / 高可用架构
14
+ > 约束级别:SHALL(必须在架构评审和 SRE 审查阶段拦截)
15
+
16
+ ---
17
+
18
+ ## 1. 无超时与熔断(Missing Timeout and Circuit Breaker)
19
+
20
+ ### 描述
21
+ 调用外部依赖(HTTP API、数据库、Redis、消息队列、第三方服务)时不设超时,也没有熔断降级策略。一个下游服务的响应变慢或不可用时,上游服务的线程/连接被占满,导致级联故障(雪崩效应)。
22
+
23
+ ### 错误示例
24
+ ```python
25
+ # HTTP 无超时
26
+ def get_recommendations(user_id):
27
+ # 推荐服务变慢(30 秒响应),调用方所有线程被阻塞
28
+ response = requests.get(f"{RECOMMEND_SERVICE}/users/{user_id}/recs")
29
+ return response.json()
30
+
31
+ # 数据库无超时
32
+ def complex_report():
33
+ # 复杂查询执行 10 分钟,连接池耗尽
34
+ return db.execute("SELECT ... FROM huge_table JOIN ... WHERE ...")
35
+
36
+ # 无熔断 -- 明知服务已宕机仍继续请求
37
+ def get_user_avatar(user_id):
38
+ try:
39
+ return requests.get(f"{AVATAR_SERVICE}/avatars/{user_id}").content
40
+ except:
41
+ return DEFAULT_AVATAR
42
+ # 每个请求都尝试,即使服务已经连续超时 1000 次
43
+ ```
44
+
45
+ ### 正确示例
46
+ ```python
47
+ import httpx
48
+ from circuitbreaker import circuit, CircuitBreakerError
49
+
50
+ # 1. HTTP 超时
51
+ client = httpx.AsyncClient(
52
+ timeout=httpx.Timeout(connect=2.0, read=5.0, write=5.0, pool=10.0),
53
+ )
54
+
55
+ # 2. 熔断器
56
+ @circuit(failure_threshold=5, recovery_timeout=30)
57
+ async def get_recommendations(user_id: str) -> list[Product]:
58
+ response = await client.get(f"{RECOMMEND_SERVICE}/users/{user_id}/recs")
59
+ response.raise_for_status()
60
+ return [Product(**p) for p in response.json()]
61
+
62
+ # 3. 降级策略
63
+ async def get_recommendations_with_fallback(user_id: str) -> list[Product]:
64
+ try:
65
+ return await get_recommendations(user_id)
66
+ except CircuitBreakerError:
67
+ logger.warning("Recommendation circuit open, returning cached/popular items")
68
+ return await cache.get_popular_products()
69
+ except httpx.TimeoutException:
70
+ logger.warning("Recommendation service timeout")
71
+ return await cache.get_popular_products()
72
+
73
+ # 4. 数据库超时
74
+ engine = create_engine(
75
+ DATABASE_URL,
76
+ pool_size=10,
77
+ max_overflow=5,
78
+ pool_timeout=10,
79
+ connect_args={"connect_timeout": 5, "options": "-c statement_timeout=30000"},
80
+ )
81
+
82
+ # 5. 重试 + 退避
83
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
84
+
85
+ @retry(
86
+ stop=stop_after_attempt(3),
87
+ wait=wait_exponential(multiplier=1, min=1, max=10),
88
+ retry=retry_if_exception_type(httpx.TransportError),
89
+ )
90
+ async def idempotent_external_call(data: dict) -> dict:
91
+ response = await client.post(f"{EXTERNAL_API}/process", json=data)
92
+ response.raise_for_status()
93
+ return response.json()
94
+ ```
95
+
96
+ ### 检测方法
97
+ - HTTP 调用无 `timeout` 参数。
98
+ - 数据库连接无 `connect_timeout` / `statement_timeout`。
99
+ - 无 `circuitbreaker` / `pybreaker` 等熔断库。
100
+ - 外部调用失败时直接报错,无降级策略。
101
+ - 重试逻辑无退避策略(立即重试导致放大效应)。
102
+
103
+ ### 修复步骤
104
+ 1. 所有 HTTP 调用设置 connect + read + write timeout。
105
+ 2. 数据库设置 connection timeout + statement timeout。
106
+ 3. 关键外部依赖添加熔断器(阈值 5 次失败,恢复周期 30 秒)。
107
+ 4. 每个外部依赖定义降级策略(缓存 / 默认值 / 部分降级)。
108
+ 5. 幂等操作添加重试 + 指数退避。
109
+ 6. 建立超时和熔断的监控告警。
110
+
111
+ ### Agent Checklist
112
+ - [ ] 所有 HTTP 调用有 timeout
113
+ - [ ] 数据库有 connection + statement timeout
114
+ - [ ] 关键外部依赖有熔断器
115
+ - [ ] 每个外部调用有降级方案
116
+ - [ ] 重试使用指数退避
117
+
118
+ ---
119
+
120
+ ## 2. 无统一事故指挥流程(Missing Incident Response)
121
+
122
+ ### 描述
123
+ 事故发生时没有标准化的响应流程,导致多人同时操作互相冲突、关键决策无人拍板、恢复时间不可预测、事故信息在不同群组分散传播。
124
+
125
+ ### 错误示例
126
+ ```
127
+ # 典型混乱场景
128
+ 12:00 告警触发
129
+ 12:05 小明在微信群说 "谁看一下"
130
+ 12:10 小红 SSH 到生产机器查日志
131
+ 12:15 小李也 SSH 到同一台机器,两人操作冲突
132
+ 12:20 小明在另一个群问 "是不是发布导致的"
133
+ 12:25 没人知道最近发布了什么
134
+ 12:30 Leader 开始问 "什么情况"
135
+ 12:35 三个人同时尝试不同的修复方案
136
+ 12:45 修复方案 A 和方案 B 互相冲突,导致问题更严重
137
+ 13:00 终于有人想起来可以回滚
138
+ 13:15 回滚完成,但已经 75 分钟了
139
+ ```
140
+
141
+ ### 正确示例
142
+ ```yaml
143
+ # 事故响应流程(Runbook)
144
+ incident_response:
145
+ severity_levels:
146
+ P0: "全站不可用 / 数据丢失 / 安全事件"
147
+ P1: "核心功能不可用 / 部分用户受影响"
148
+ P2: "非核心功能异常 / 性能退化"
149
+ P3: "不影响用户的内部系统问题"
150
+
151
+ roles:
152
+ incident_commander: "统一决策,协调资源,对外沟通"
153
+ operations_lead: "执行诊断和修复操作"
154
+ communications_lead: "更新状态页,通知利益方"
155
+ scribe: "记录时间线和操作日志"
156
+
157
+ workflow:
158
+ 1_detect:
159
+ - "告警触发或用户报告"
160
+ - "值班人员 5 分钟内确认"
161
+ - "判定严重性等级"
162
+ 2_triage:
163
+ - "Incident Commander 开启事故频道"
164
+ - "创建事故文档,记录时间线"
165
+ - "召集相关人员"
166
+ 3_mitigate:
167
+ - "优先恢复服务(回滚 / 降级 / 扩容)"
168
+ - "修复可以在恢复之后"
169
+ - "所有操作在事故频道中发出,禁止私聊操作"
170
+ 4_resolve:
171
+ - "确认问题完全恢复"
172
+ - "监控 30 分钟无复发"
173
+ - "降级告警"
174
+ 5_postmortem:
175
+ - "48 小时内完成复盘"
176
+ - "产出 Action Items 并分配 Owner"
177
+ - "纳入发布门禁或监控"
178
+
179
+ escalation:
180
+ response_time:
181
+ P0: "5 min 确认, 15 min 召集"
182
+ P1: "15 min 确认, 30 min 召集"
183
+ P2: "1 hour 确认"
184
+ auto_escalation:
185
+ - "P0/P1 超过 30 分钟未缓解 -> 通知 VP Engineering"
186
+ - "P0 超过 1 小时未恢复 -> 通知 CTO"
187
+ ```
188
+
189
+ ```python
190
+ # 自动化事故流程
191
+ class IncidentManager:
192
+ async def create_incident(self, severity: str, title: str, reporter: str):
193
+ incident = Incident(severity=severity, title=title, reporter=reporter)
194
+
195
+ # 创建事故频道
196
+ channel = await slack.create_channel(f"inc-{incident.id}-{slug(title)}")
197
+
198
+ # 通知值班人员
199
+ oncall = await pagerduty.get_oncall(service="platform")
200
+ await slack.send(channel, f"@{oncall.name} Incident Commander assigned")
201
+
202
+ # 创建事故文档
203
+ doc = await google_docs.create(
204
+ template="incident_template",
205
+ title=f"[{severity}] {title}",
206
+ )
207
+
208
+ # 更新状态页
209
+ if severity in ("P0", "P1"):
210
+ await statuspage.create_incident(title=title, status="investigating")
211
+
212
+ return incident
213
+ ```
214
+
215
+ ### 检测方法
216
+ - 无事故响应 Runbook 文档。
217
+ - 事故恢复时间 (MTTR) > 1 小时。
218
+ - 相同故障在 3 个月内重复发生。
219
+ - 事故后无复盘文档和 Action Items。
220
+ - 告警触发后超过 15 分钟无人响应。
221
+
222
+ ### 修复步骤
223
+ 1. 编写事故响应 Runbook,定义严重性等级和响应流程。
224
+ 2. 明确事故角色(Incident Commander / Operations Lead / Communications Lead / Scribe)。
225
+ 3. 配置 PagerDuty / OpsGenie 值班轮换和自动升级。
226
+ 4. 建立事故频道模板和事故文档模板。
227
+ 5. 制度化事后复盘(48 小时内),Action Items 有 Owner 和 Deadline。
228
+
229
+ ### Agent Checklist
230
+ - [ ] 有事故响应 Runbook
231
+ - [ ] 事故角色分工明确
232
+ - [ ] 有值班轮换和自动升级机制
233
+ - [ ] 事后复盘有 Action Items 跟踪
234
+ - [ ] MTTR 目标:P0 < 30 min, P1 < 1 hour
235
+
236
+ ---
237
+
238
+ ## 3. 告警疲劳(Alert Fatigue)
239
+
240
+ ### 描述
241
+ 告警策略设置不当,导致大量无效告警(误报、低优先级、重复告警),值班人员被噪声淹没后开始忽略所有告警,当真正的故障发生时未能及时响应。
242
+
243
+ ### 错误示例
244
+ ```yaml
245
+ # 告警太多、太敏感
246
+ alerts:
247
+ - name: "CPU > 50%"
248
+ condition: cpu_usage > 50 # 阈值太低,经常触发
249
+ action: page_oncall # 所有告警都寻呼值班
250
+ severity: critical # 所有告警都是 critical
251
+
252
+ - name: "Any error in logs"
253
+ condition: error_count > 0 # 1 条错误就告警
254
+ action: page_oncall
255
+ severity: critical
256
+
257
+ - name: "Response time > 100ms"
258
+ condition: p99_latency > 100 # 阈值太低
259
+ action: page_oncall
260
+ severity: critical
261
+
262
+ # 结果:每天 200+ 条告警,值班人员直接静音
263
+ ```
264
+
265
+ ### 正确示例
266
+ ```yaml
267
+ # 分层告警策略
268
+ alerts:
269
+ # P0 -- 立即寻呼(每月 < 5 次)
270
+ critical:
271
+ - name: "Service down"
272
+ condition: health_check_failures >= 3 AND duration > 2m
273
+ action: page_oncall
274
+ runbook: "https://runbook.example.com/service-down"
275
+
276
+ - name: "Error rate spike"
277
+ condition: error_rate > 5% AND duration > 3m # 持续 3 分钟才告警
278
+ action: page_oncall
279
+ runbook: "https://runbook.example.com/error-rate"
280
+
281
+ # P1 -- Slack 通知(每天 < 10 次)
282
+ warning:
283
+ - name: "High latency"
284
+ condition: p99_latency > 500ms AND duration > 5m
285
+ action: slack_channel
286
+ auto_resolve: true
287
+
288
+ - name: "Disk space low"
289
+ condition: disk_usage > 80%
290
+ action: slack_channel
291
+
292
+ # P2 -- 仪表板(随时查看)
293
+ info:
294
+ - name: "Elevated CPU"
295
+ condition: cpu_usage > 70% AND duration > 10m
296
+ action: dashboard_only
297
+ ```
298
+
299
+ ```python
300
+ # 告警去重和聚合
301
+ class AlertManager:
302
+ def __init__(self, redis: Redis):
303
+ self._redis = redis
304
+
305
+ async def fire(self, alert: Alert):
306
+ # 去重:相同告警 15 分钟内不重复发送
307
+ dedup_key = f"alert:{alert.name}:{alert.source}"
308
+ if await self._redis.get(dedup_key):
309
+ return # 已存在,跳过
310
+
311
+ await self._redis.setex(dedup_key, 900, "1") # 15 分钟去重窗口
312
+
313
+ # 分级通知
314
+ if alert.severity == "critical":
315
+ await self._page_oncall(alert)
316
+ elif alert.severity == "warning":
317
+ await self._notify_slack(alert)
318
+ else:
319
+ await self._log_to_dashboard(alert)
320
+
321
+ async def auto_resolve(self, alert_name: str, source: str):
322
+ """条件恢复时自动关闭告警"""
323
+ dedup_key = f"alert:{alert_name}:{source}"
324
+ await self._redis.delete(dedup_key)
325
+ await self._notify_slack(Alert(
326
+ name=alert_name, severity="resolved", message="Auto-resolved"
327
+ ))
328
+ ```
329
+
330
+ ### 检测方法
331
+ - 每日告警数量 > 50。
332
+ - 值班人员确认告警的平均时间 > 15 分钟。
333
+ - 告警中误报率 > 30%。
334
+ - 所有告警使用相同的严重性级别。
335
+ - 告警无对应的 Runbook 链接。
336
+
337
+ ### 修复步骤
338
+ 1. 审计过去 30 天的告警数据,统计各告警的触发次数和有效处理率。
339
+ 2. 删除误报率 > 50% 的告警或调整阈值。
340
+ 3. 将告警分为 3 级:Critical(寻呼)/ Warning(Slack)/ Info(仪表板)。
341
+ 4. 为每条告警添加持续时间条件(避免瞬时抖动触发)。
342
+ 5. 实现告警去重(相同告警 15 分钟内不重复发送)。
343
+ 6. 每条 Critical/Warning 告警关联 Runbook。
344
+
345
+ ### Agent Checklist
346
+ - [ ] Critical 告警每月 < 5 次
347
+ - [ ] 告警分为 3 个严重性级别
348
+ - [ ] 所有告警有持续时间条件(不瞬时触发)
349
+ - [ ] 告警有 15 分钟去重窗口
350
+ - [ ] 每条告警关联 Runbook
351
+
352
+ ---
353
+
354
+ ## 4. 单点故障(Single Point of Failure)
355
+
356
+ ### 描述
357
+ 系统中存在单一的不可替代组件,该组件的故障导致整个系统不可用。常见单点:单实例数据库、单机部署、单一网络路径、单一外部依赖无降级。
358
+
359
+ ### 错误示例
360
+ ```yaml
361
+ # 单实例部署 -- 机器挂了全完了
362
+ services:
363
+ app:
364
+ image: myapp:latest
365
+ deploy:
366
+ replicas: 1 # 只有 1 个实例
367
+
368
+ postgres:
369
+ image: postgres:15
370
+ volumes:
371
+ - /data/postgres:/var/lib/postgresql/data # 单机本地存储,磁盘坏了数据全没
372
+ # 无从库、无备份
373
+ ```
374
+
375
+ ```python
376
+ # 唯一的外部依赖无降级
377
+ def process_payment(order):
378
+ # 只对接了一个支付渠道,它挂了就完全无法收款
379
+ result = stripe_client.charge(order.total)
380
+ return result
381
+ ```
382
+
383
+ ### 正确示例
384
+ ```yaml
385
+ # 多实例 + 多可用区
386
+ services:
387
+ app:
388
+ image: myapp:latest
389
+ deploy:
390
+ replicas: 3
391
+ placement:
392
+ preferences:
393
+ - spread: node.labels.zone # 按可用区标签均匀分布副本
394
+
395
+ postgres:
396
+ # 使用托管数据库服务(自带多可用区和自动故障转移)
397
+ # 或者自建:主 + 同步从 + 异步从
398
+ image: postgres:15
399
+ environment:
400
+ POSTGRES_REPLICATION_MODE: master
401
+ volumes:
402
+ - type: volume
403
+ source: pg-data
404
+ target: /var/lib/postgresql/data
405
+ volume:
406
+ driver: rbd # 分布式存储
407
+
408
+ postgres-replica:
409
+ image: postgres:15
410
+ environment:
411
+ POSTGRES_REPLICATION_MODE: replica
412
+ POSTGRES_MASTER_HOST: postgres
413
+ ```
414
+
415
+ ```python
416
+ # 多渠道降级
417
+ class PaymentService:
418
+ def __init__(self, primary: StripeClient, fallback: AlipayClient):
419
+ self._primary = primary
420
+ self._fallback = fallback
421
+
422
+ async def charge(self, order: Order) -> PaymentResult:
423
+ try:
424
+ return await self._primary.charge(order.total)
425
+ except PaymentGatewayError:
426
+ logger.warning("Primary payment failed, falling back to Alipay")
427
+ return await self._fallback.charge(order.total)
428
+
429
+ # 缓存降级
430
+ class ProductService:
431
+ async def get_product(self, product_id: str) -> Product:
432
+ # L1: 本地缓存
433
+ cached = self._local_cache.get(product_id)
434
+ if cached:
435
+ return cached
436
+
437
+ # L2: Redis 缓存
438
+ try:
439
+ cached = await self._redis.get(f"product:{product_id}")
440
+ if cached:
441
+ product = Product.model_validate_json(cached)
442
+ self._local_cache.set(product_id, product)
443
+ return product
444
+ except RedisError:
445
+ logger.warning("Redis unavailable, falling through to DB")
446
+
447
+ # L3: 数据库
448
+ product = await self._db.get_product(product_id)
449
+ return product
450
+ ```
451
+
452
+ ### 检测方法
453
+ - 部署 replicas = 1。
454
+ - 数据库无从库或备份。
455
+ - 应用部署在单一可用区。
456
+ - 关键外部依赖只有一个供应商且无降级。
457
+ - 负载均衡器后只有一台服务器。
458
+
459
+ ### 修复步骤
460
+ 1. 应用至少部署 2 个实例,分布在不同可用区。
461
+ 2. 数据库配置主从复制 + 自动故障转移。
462
+ 3. 关键外部依赖配置备用渠道。
463
+ 4. 实现多级缓存降级策略(本地 -> Redis -> DB)。
464
+ 5. 定期进行故障注入演练(Chaos Engineering)。
465
+
466
+ ### Agent Checklist
467
+ - [ ] 应用实例 >= 2,分布在不同可用区
468
+ - [ ] 数据库有主从复制和自动故障转移
469
+ - [ ] 关键外部依赖有降级方案
470
+ - [ ] 有定期故障演练
471
+ - [ ] 无单机本地存储(使用分布式存储)
472
+
473
+ ---
474
+
475
+ ## 5. 复盘无闭环(Postmortem Without Follow-through)
476
+
477
+ ### 描述
478
+ 事故后虽然做了复盘,但 Action Items 无人跟进、无截止日期、无验收机制,导致同类故障反复发生。复盘变成了走形式。
479
+
480
+ ### 错误示例
481
+ ```markdown
482
+ # 事故复盘文档
483
+ ## 2024-01-15 数据库宕机事故
484
+
485
+ ### 根因
486
+ 数据库磁盘满导致写入失败。
487
+
488
+ ### 改进措施
489
+ - 增加磁盘监控告警
490
+ - 优化数据归档策略
491
+ - 增加磁盘容量
492
+
493
+ ### 状态:已复盘 ✅
494
+
495
+ # 三个月后...同样的问题再次发生
496
+ # "上次复盘的改进措施做了吗?" "呃..."
497
+ ```
498
+
499
+ ### 正确示例
500
+ ```markdown
501
+ # 事故复盘文档
502
+ ## INC-2024-001: 数据库磁盘满导致订单服务不可用
503
+
504
+ ### 时间线
505
+ - 14:00 告警触发:PostgreSQL 磁盘使用率 > 90%
506
+ - 14:05 值班工程师确认
507
+ - 14:10 开始紧急清理临时表
508
+ - 14:25 磁盘使用率降至 70%,服务恢复
509
+ - 14:30 确认服务完全恢复
510
+
511
+ ### 影响范围
512
+ - 持续时间:25 分钟
513
+ - 影响用户:约 5,000 用户无法下单
514
+ - 数据丢失:无
515
+
516
+ ### 根因分析
517
+ 1. 审计日志表 `audit_log` 无数据归档策略,累计 500GB。
518
+ 2. 磁盘告警阈值设为 90%,触发时已无足够缓冲时间。
519
+ 3. 无定期数据清理计划任务。
520
+
521
+ ### Action Items
522
+
523
+ | # | 措施 | Owner | Deadline | 验收标准 | 状态 |
524
+ |---|------|-------|----------|----------|------|
525
+ | 1 | 磁盘告警阈值调整为 80% | SRE-张三 | 2024-01-17 | 告警规则已更新且触发测试通过 | ✅ Done |
526
+ | 2 | audit_log 表添加 30 天自动归档 | DBA-李四 | 2024-01-22 | Cron Job 运行正常,历史数据已归档 | ✅ Done |
527
+ | 3 | 所有数据库表制定保留策略 | DBA-李四 | 2024-02-01 | 保留策略文档已发布 | 🔄 In Progress |
528
+ | 4 | CI 流水线添加磁盘预算检查 | DevOps-王五 | 2024-02-15 | PR 中大表变更触发 DBA 审批 | ⬜ TODO |
529
+ ```
530
+
531
+ ```python
532
+ # 自动化跟踪 Action Items
533
+ class PostmortemTracker:
534
+ async def check_overdue_items(self):
535
+ """每日检查超期未完成的 Action Items"""
536
+ overdue = await self._repo.get_overdue_actions()
537
+ for item in overdue:
538
+ days_overdue = (datetime.now() - item.deadline).days
539
+ if days_overdue > 7:
540
+ # 超期 7 天以上,升级通知
541
+ await self._notify_manager(item)
542
+ else:
543
+ await self._notify_owner(item)
544
+
545
+ async def block_similar_changes(self, incident_id: str):
546
+ """将复盘改进措施纳入 CI 门禁"""
547
+ actions = await self._repo.get_actions(incident_id)
548
+ for action in actions:
549
+ if action.ci_rule and action.status != "done":
550
+ # 相关代码变更被阻断,直到改进措施完成
551
+ await self._ci.add_blocking_rule(action.ci_rule)
552
+ ```
553
+
554
+ ### 检测方法
555
+ - 复盘文档中 Action Items 无 Owner 或无 Deadline。
556
+ - 超过 Deadline 的 Action Items 占比 > 30%。
557
+ - 相同根因的事故在 6 个月内重复发生。
558
+ - 复盘会议后无跟踪机制。
559
+
560
+ ### 修复步骤
561
+ 1. 复盘文档模板强制包含:Owner + Deadline + 验收标准。
562
+ 2. Action Items 录入项目管理工具(Jira / Linear),设置到期提醒。
563
+ 3. 每周站会检查复盘 Action Items 进度。
564
+ 4. 将改进措施纳入 CI 门禁或告警规则。
565
+ 5. 季度审计:统计 Action Items 完成率和重复故障率。
566
+
567
+ ### Agent Checklist
568
+ - [ ] 复盘文档包含 Owner + Deadline + 验收标准
569
+ - [ ] Action Items 在项目管理工具中跟踪
570
+ - [ ] 有每周跟踪机制
571
+ - [ ] 关键改进纳入 CI 门禁
572
+ - [ ] 相同根因故障不重复发生
573
+
574
+ ---
575
+
576
+ ## 6. 缺乏可观测性(Poor Observability)
577
+
578
+ ### 描述
579
+ 系统缺少日志、指标、链路追踪三大支柱中的一个或多个,导致出问题时无法快速定位根因。只有日志没有指标(无法看全局趋势),只有指标没有链路追踪(无法定位单个请求的问题)。
580
+
581
+ ### 错误示例
582
+ ```python
583
+ # 无结构化日志 + 无指标 + 无追踪
584
+ @app.post("/orders")
585
+ def create_order(data):
586
+ print(f"Creating order: {data}") # 无结构化日志
587
+ order = order_service.create(data)
588
+ print(f"Order created: {order.id}")
589
+ return order
590
+ # 问题来了:
591
+ # - 无法知道 P99 延迟是多少
592
+ # - 无法知道某个慢请求经过了哪些服务
593
+ # - 无法知道错误率的趋势
594
+ ```
595
+
596
+ ### 正确示例
597
+ ```python
598
+ import structlog
599
+ from opentelemetry import trace
600
+ from prometheus_client import Counter, Histogram
601
+
602
+ # 1. 结构化日志
603
+ logger = structlog.get_logger()
604
+
605
+ # 2. 指标
606
+ order_created_total = Counter(
607
+ "order_created_total", "Total orders created", ["status"]
608
+ )
609
+ order_creation_duration = Histogram(
610
+ "order_creation_duration_seconds", "Order creation latency",
611
+ buckets=[0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
612
+ )
613
+
614
+ # 3. 链路追踪
615
+ tracer = trace.get_tracer(__name__)
616
+
617
+ @app.post("/orders")
618
+ async def create_order(data: CreateOrderRequest, request: Request):
619
+ with tracer.start_as_current_span("create_order") as span:
620
+ span.set_attribute("user_id", data.user_id)
621
+ span.set_attribute("item_count", len(data.items))
622
+
623
+ log = logger.bind(
624
+ request_id=request.state.request_id,
625
+ user_id=data.user_id,
626
+ trace_id=span.get_span_context().trace_id,
627
+ )
628
+
629
+ log.info("order_creation_started", item_count=len(data.items))
630
+
631
+ with order_creation_duration.time():
632
+ try:
633
+ order = await order_service.create(data)
634
+ order_created_total.labels(status="success").inc()
635
+ log.info("order_created", order_id=order.id, total=str(order.total))
636
+ return order
637
+ except InsufficientStockError as e:
638
+ order_created_total.labels(status="insufficient_stock").inc()
639
+ log.warning("order_creation_failed", reason="insufficient_stock")
640
+ raise
641
+ except Exception as e:
642
+ order_created_total.labels(status="error").inc()
643
+ log.error("order_creation_error", error=str(e))
644
+ raise
645
+
646
+ # 4. 中间件自动注入 request_id 和追踪上下文
647
+ @app.middleware("http")
648
+ async def observability_middleware(request: Request, call_next):
649
+ request_id = request.headers.get("X-Request-ID", str(uuid4()))
650
+ request.state.request_id = request_id
651
+
652
+ with tracer.start_as_current_span(
653
+ f"{request.method} {request.url.path}",
654
+ attributes={
655
+ "http.method": request.method,
656
+ "http.url": str(request.url),
657
+ "http.request_id": request_id,
658
+ },
659
+ ):
660
+ response = await call_next(request)
661
+ response.headers["X-Request-ID"] = request_id
662
+ return response
663
+ ```
664
+
665
+ ### 检测方法
666
+ - 无 Prometheus / Datadog / New Relic 等指标采集。
667
+ - 无 Jaeger / Zipkin / OpenTelemetry 链路追踪。
668
+ - 日志使用 `print()` 且无结构化。
669
+ - 出问题时需要 SSH 到服务器 `grep` 日志定位原因。
670
+ - 无法回答 "当前系统的 P99 延迟是多少" 这种基础问题。
671
+
672
+ ### 修复步骤
673
+ 1. 引入 OpenTelemetry SDK,统一日志 + 指标 + 追踪。
674
+ 2. 使用结构化日志(`structlog` / `python-json-logger`),输出 JSON 格式。
675
+ 3. 定义核心业务指标(RED: Rate / Errors / Duration)。
676
+ 4. 为关键链路添加 Span 追踪。
677
+ 5. 搭建可观测性平台(Grafana + Prometheus + Jaeger 或 Datadog)。
678
+ 6. 创建标准仪表板:系统总览 / 服务详情 / 错误率趋势 / 延迟分布。
679
+
680
+ ### Agent Checklist
681
+ - [ ] 有结构化日志(JSON 格式)
682
+ - [ ] 有指标采集(Prometheus / Datadog)
683
+ - [ ] 有链路追踪(OpenTelemetry / Jaeger)
684
+ - [ ] 核心 API 有 RED 指标仪表板
685
+ - [ ] 日志包含 request_id 和 trace_id
686
+
687
+ ---
688
+
689
+ ## 全局 Agent Checklist
690
+
691
+ | 检查项 | 阈值 | 工具 |
692
+ |--------|------|------|
693
+ | HTTP 调用无 timeout | 0 处 | Code Review |
694
+ | 熔断器覆盖 | 100% 外部依赖 | 架构审查 |
695
+ | 事故 MTTR | P0 < 30min, P1 < 1h | 事故跟踪 |
696
+ | 每日告警数 | < 20 条 | 告警平台统计 |
697
+ | Critical 告警/月 | < 5 条 | 告警平台统计 |
698
+ | 服务实例数 | >= 2 | 部署配置 |
699
+ | 复盘 Action 完成率 | > 90% | 项目管理工具 |
700
+ | 可观测性三支柱 | 日志 + 指标 + 追踪 | 平台审查 |