specweave 0.3.13 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. package/CLAUDE.md +506 -17
  2. package/README.md +100 -58
  3. package/bin/install-all.sh +9 -2
  4. package/bin/install-hooks.sh +57 -0
  5. package/bin/specweave.js +16 -0
  6. package/dist/adapters/adapter-base.d.ts +21 -0
  7. package/dist/adapters/adapter-base.d.ts.map +1 -1
  8. package/dist/adapters/adapter-base.js +28 -0
  9. package/dist/adapters/adapter-base.js.map +1 -1
  10. package/dist/adapters/adapter-interface.d.ts +41 -0
  11. package/dist/adapters/adapter-interface.d.ts.map +1 -1
  12. package/dist/adapters/claude/adapter.d.ts +36 -0
  13. package/dist/adapters/claude/adapter.d.ts.map +1 -1
  14. package/dist/adapters/claude/adapter.js +135 -0
  15. package/dist/adapters/claude/adapter.js.map +1 -1
  16. package/dist/adapters/copilot/adapter.d.ts +25 -0
  17. package/dist/adapters/copilot/adapter.d.ts.map +1 -1
  18. package/dist/adapters/copilot/adapter.js +112 -0
  19. package/dist/adapters/copilot/adapter.js.map +1 -1
  20. package/dist/adapters/cursor/adapter.d.ts +36 -0
  21. package/dist/adapters/cursor/adapter.d.ts.map +1 -1
  22. package/dist/adapters/cursor/adapter.js +140 -0
  23. package/dist/adapters/cursor/adapter.js.map +1 -1
  24. package/dist/adapters/generic/adapter.d.ts +25 -0
  25. package/dist/adapters/generic/adapter.d.ts.map +1 -1
  26. package/dist/adapters/generic/adapter.js +111 -0
  27. package/dist/adapters/generic/adapter.js.map +1 -1
  28. package/dist/cli/commands/init.d.ts.map +1 -1
  29. package/dist/cli/commands/init.js +103 -1
  30. package/dist/cli/commands/init.js.map +1 -1
  31. package/dist/cli/commands/plugin.d.ts +37 -0
  32. package/dist/cli/commands/plugin.d.ts.map +1 -0
  33. package/dist/cli/commands/plugin.js +296 -0
  34. package/dist/cli/commands/plugin.js.map +1 -0
  35. package/dist/core/agent-model-manager.d.ts +52 -0
  36. package/dist/core/agent-model-manager.d.ts.map +1 -0
  37. package/dist/core/agent-model-manager.js +120 -0
  38. package/dist/core/agent-model-manager.js.map +1 -0
  39. package/dist/core/cost-tracker.d.ts +108 -0
  40. package/dist/core/cost-tracker.d.ts.map +1 -0
  41. package/dist/core/cost-tracker.js +281 -0
  42. package/dist/core/cost-tracker.js.map +1 -0
  43. package/dist/core/model-selector.d.ts +57 -0
  44. package/dist/core/model-selector.d.ts.map +1 -0
  45. package/dist/core/model-selector.js +115 -0
  46. package/dist/core/model-selector.js.map +1 -0
  47. package/dist/core/phase-detector.d.ts +62 -0
  48. package/dist/core/phase-detector.d.ts.map +1 -0
  49. package/dist/core/phase-detector.js +229 -0
  50. package/dist/core/phase-detector.js.map +1 -0
  51. package/dist/core/plugin-detector.d.ts +96 -0
  52. package/dist/core/plugin-detector.d.ts.map +1 -0
  53. package/dist/core/plugin-detector.js +349 -0
  54. package/dist/core/plugin-detector.js.map +1 -0
  55. package/dist/core/plugin-loader.d.ts +111 -0
  56. package/dist/core/plugin-loader.d.ts.map +1 -0
  57. package/dist/core/plugin-loader.js +319 -0
  58. package/dist/core/plugin-loader.js.map +1 -0
  59. package/dist/core/plugin-manager.d.ts +144 -0
  60. package/dist/core/plugin-manager.d.ts.map +1 -0
  61. package/dist/core/plugin-manager.js +393 -0
  62. package/dist/core/plugin-manager.js.map +1 -0
  63. package/dist/core/schemas/plugin-manifest.schema.json +253 -0
  64. package/dist/core/types/plugin.d.ts +252 -0
  65. package/dist/core/types/plugin.d.ts.map +1 -0
  66. package/dist/core/types/plugin.js +48 -0
  67. package/dist/core/types/plugin.js.map +1 -0
  68. package/dist/integrations/jira/jira-mapper.d.ts +2 -2
  69. package/dist/integrations/jira/jira-mapper.js +2 -2
  70. package/dist/types/cost-tracking.d.ts +43 -0
  71. package/dist/types/cost-tracking.d.ts.map +1 -0
  72. package/dist/types/cost-tracking.js +8 -0
  73. package/dist/types/cost-tracking.js.map +1 -0
  74. package/dist/types/model-selection.d.ts +53 -0
  75. package/dist/types/model-selection.d.ts.map +1 -0
  76. package/dist/types/model-selection.js +12 -0
  77. package/dist/types/model-selection.js.map +1 -0
  78. package/dist/utils/cost-reporter.d.ts +58 -0
  79. package/dist/utils/cost-reporter.d.ts.map +1 -0
  80. package/dist/utils/cost-reporter.js +224 -0
  81. package/dist/utils/cost-reporter.js.map +1 -0
  82. package/dist/utils/pricing-constants.d.ts +70 -0
  83. package/dist/utils/pricing-constants.d.ts.map +1 -0
  84. package/dist/utils/pricing-constants.js +71 -0
  85. package/dist/utils/pricing-constants.js.map +1 -0
  86. package/package.json +13 -9
  87. package/src/adapters/adapter-base.ts +33 -0
  88. package/src/adapters/adapter-interface.ts +46 -0
  89. package/src/adapters/claude/adapter.ts +164 -0
  90. package/src/adapters/copilot/adapter.ts +138 -0
  91. package/src/adapters/cursor/adapter.ts +170 -0
  92. package/src/adapters/generic/adapter.ts +137 -0
  93. package/src/agents/architect/AGENT.md +3 -0
  94. package/src/agents/code-reviewer.md +156 -0
  95. package/src/agents/data-scientist/AGENT.md +181 -0
  96. package/src/agents/database-optimizer/AGENT.md +147 -0
  97. package/src/agents/devops/AGENT.md +3 -0
  98. package/src/agents/diagrams-architect/AGENT.md +3 -0
  99. package/src/agents/docs-writer/AGENT.md +3 -0
  100. package/src/agents/kubernetes-architect/AGENT.md +142 -0
  101. package/src/agents/ml-engineer/AGENT.md +150 -0
  102. package/src/agents/mlops-engineer/AGENT.md +201 -0
  103. package/src/agents/network-engineer/AGENT.md +149 -0
  104. package/src/agents/observability-engineer/AGENT.md +213 -0
  105. package/src/agents/payment-integration/AGENT.md +35 -0
  106. package/src/agents/performance/AGENT.md +3 -0
  107. package/src/agents/performance-engineer/AGENT.md +153 -0
  108. package/src/agents/pm/AGENT.md +3 -0
  109. package/src/agents/qa-lead/AGENT.md +3 -0
  110. package/src/agents/security/AGENT.md +3 -0
  111. package/src/agents/sre/AGENT.md +3 -0
  112. package/src/agents/tdd-orchestrator/AGENT.md +169 -0
  113. package/src/agents/tech-lead/AGENT.md +3 -0
  114. package/src/commands/specweave.costs.md +261 -0
  115. package/src/commands/specweave.increment.md +48 -4
  116. package/src/commands/specweave.ml-pipeline.md +292 -0
  117. package/src/commands/specweave.monitor-setup.md +501 -0
  118. package/src/commands/specweave.slo-implement.md +1055 -0
  119. package/src/commands/specweave.sync-github.md +1 -1
  120. package/src/commands/specweave.tdd-cycle.md +199 -0
  121. package/src/commands/specweave.tdd-green.md +842 -0
  122. package/src/commands/specweave.tdd-red.md +135 -0
  123. package/src/commands/specweave.tdd-refactor.md +165 -0
  124. package/src/hooks/post-increment-plugin-detect.sh +142 -0
  125. package/src/hooks/post-task-completion.sh +53 -11
  126. package/src/hooks/pre-task-plugin-detect.sh +96 -0
  127. package/src/skills/SKILLS-INDEX.md +18 -10
  128. package/src/skills/billing-automation/SKILL.md +559 -0
  129. package/src/skills/distributed-tracing/SKILL.md +438 -0
  130. package/src/skills/e2e-playwright/README.md +1 -1
  131. package/src/skills/e2e-playwright/package.json +1 -1
  132. package/src/skills/gitops-workflow/SKILL.md +285 -0
  133. package/src/skills/gitops-workflow/references/argocd-setup.md +134 -0
  134. package/src/skills/gitops-workflow/references/sync-policies.md +131 -0
  135. package/src/skills/grafana-dashboards/SKILL.md +369 -0
  136. package/src/skills/helm-chart-scaffolding/SKILL.md +544 -0
  137. package/src/skills/helm-chart-scaffolding/assets/Chart.yaml.template +42 -0
  138. package/src/skills/helm-chart-scaffolding/assets/values.yaml.template +185 -0
  139. package/src/skills/helm-chart-scaffolding/references/chart-structure.md +500 -0
  140. package/src/skills/helm-chart-scaffolding/scripts/validate-chart.sh +244 -0
  141. package/src/skills/k8s-manifest-generator/SKILL.md +511 -0
  142. package/src/skills/k8s-manifest-generator/assets/configmap-template.yaml +296 -0
  143. package/src/skills/k8s-manifest-generator/assets/deployment-template.yaml +203 -0
  144. package/src/skills/k8s-manifest-generator/assets/service-template.yaml +171 -0
  145. package/src/skills/k8s-manifest-generator/references/deployment-spec.md +753 -0
  146. package/src/skills/k8s-manifest-generator/references/service-spec.md +724 -0
  147. package/src/skills/k8s-security-policies/SKILL.md +334 -0
  148. package/src/skills/k8s-security-policies/assets/network-policy-template.yaml +177 -0
  149. package/src/skills/k8s-security-policies/references/rbac-patterns.md +187 -0
  150. package/src/skills/ml-pipeline-workflow/SKILL.md +245 -0
  151. package/src/skills/paypal-integration/SKILL.md +467 -0
  152. package/src/skills/pci-compliance/SKILL.md +466 -0
  153. package/src/skills/prometheus-configuration/SKILL.md +392 -0
  154. package/src/skills/slo-implementation/SKILL.md +329 -0
  155. package/src/skills/stripe-integration/SKILL.md +442 -0
  156. package/src/skills/tdd-workflow/SKILL.md +378 -0
  157. package/src/templates/README.md.template +1 -1
  158. package/src/skills/bmad-method-expert/SKILL.md +0 -626
  159. package/src/skills/bmad-method-expert/scripts/analyze-project.js +0 -318
  160. package/src/skills/bmad-method-expert/scripts/check-setup.js +0 -208
  161. package/src/skills/bmad-method-expert/scripts/generate-template.js +0 -1149
  162. package/src/skills/bmad-method-expert/scripts/validate-documents.js +0 -340
  163. package/src/skills/context-optimizer/SKILL.md +0 -588
  164. package/src/skills/figma-designer/SKILL.md +0 -149
  165. package/src/skills/figma-implementer/SKILL.md +0 -148
  166. package/src/skills/figma-mcp-connector/SKILL.md +0 -136
  167. package/src/skills/figma-to-code/SKILL.md +0 -128
  168. package/src/skills/spec-kit-expert/SKILL.md +0 -1010
@@ -0,0 +1,501 @@
1
+ # Monitoring and Observability Setup
2
+
3
+ You are a monitoring and observability expert specializing in implementing comprehensive monitoring solutions. Set up metrics collection, distributed tracing, and log aggregation, and create insightful dashboards that provide full visibility into system health and performance.
4
+
5
+ ## Context
6
+ The user needs to implement or improve monitoring and observability. Focus on the three pillars of observability (metrics, logs, traces), setting up monitoring infrastructure, creating actionable dashboards, and establishing effective alerting strategies.
7
+
8
+ ## Requirements
9
+ $ARGUMENTS
10
+
11
+ ## Instructions
12
+
13
+ ### 1. Prometheus & Metrics Setup
14
+
15
+ **Prometheus Configuration**
16
+ ```yaml
17
+ # prometheus.yml
18
+ global:
19
+ scrape_interval: 15s
20
+ evaluation_interval: 15s
21
+ external_labels:
22
+ cluster: 'production'
23
+ region: 'us-east-1'
24
+
25
+ alerting:
26
+ alertmanagers:
27
+ - static_configs:
28
+ - targets: ['alertmanager:9093']
29
+
30
+ rule_files:
31
+ - "alerts/*.yml"
32
+ - "recording_rules/*.yml"
33
+
34
+ scrape_configs:
35
+ - job_name: 'prometheus'
36
+ static_configs:
37
+ - targets: ['localhost:9090']
38
+
39
+ - job_name: 'node'
40
+ static_configs:
41
+ - targets: ['node-exporter:9100']
42
+
43
+ - job_name: 'application'
44
+ kubernetes_sd_configs:
45
+ - role: pod
46
+ relabel_configs:
47
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
48
+ action: keep
49
+ regex: true
50
+ ```
51
+
52
+ **Custom Metrics Implementation**
53
+ ```typescript
54
+ // metrics.ts
55
+ import { Counter, Histogram, Gauge, Registry } from 'prom-client';
56
+
57
+ export class MetricsCollector {
58
+ private registry: Registry;
59
+ private httpRequestDuration: Histogram<string>;
60
+ private httpRequestTotal: Counter<string>;
61
+
62
+ constructor() {
63
+ this.registry = new Registry();
64
+ this.initializeMetrics();
65
+ }
66
+
67
+ private initializeMetrics() {
68
+ this.httpRequestDuration = new Histogram({
69
+ name: 'http_request_duration_seconds',
70
+ help: 'Duration of HTTP requests in seconds',
71
+ labelNames: ['method', 'route', 'status_code'],
72
+ buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5]
73
+ });
74
+
75
+ this.httpRequestTotal = new Counter({
76
+ name: 'http_requests_total',
77
+ help: 'Total number of HTTP requests',
78
+ labelNames: ['method', 'route', 'status_code']
79
+ });
80
+
81
+ this.registry.registerMetric(this.httpRequestDuration);
82
+ this.registry.registerMetric(this.httpRequestTotal);
83
+ }
84
+
85
+ httpMetricsMiddleware() {
86
+ return (req: Request, res: Response, next: NextFunction) => {
87
+ const start = Date.now();
88
+ const route = req.route?.path || req.path;
89
+
90
+ res.on('finish', () => {
91
+ const duration = (Date.now() - start) / 1000;
92
+ const labels = {
93
+ method: req.method,
94
+ route,
95
+ status_code: res.statusCode.toString()
96
+ };
97
+
98
+ this.httpRequestDuration.observe(labels, duration);
99
+ this.httpRequestTotal.inc(labels);
100
+ });
101
+
102
+ next();
103
+ };
104
+ }
105
+
106
+ async getMetrics(): Promise<string> {
107
+ return this.registry.metrics();
108
+ }
109
+ }
110
+ ```
111
+
112
+ ### 2. Grafana Dashboard Setup
113
+
114
+ **Dashboard Configuration**
115
+ ```typescript
116
+ // dashboards/service-dashboard.ts
117
+ export const createServiceDashboard = (serviceName: string) => {
118
+ return {
119
+ title: `${serviceName} Service Dashboard`,
120
+ uid: `${serviceName}-overview`,
121
+ tags: ['service', serviceName],
122
+ time: { from: 'now-6h', to: 'now' },
123
+ refresh: '30s',
124
+
125
+ panels: [
126
+ // Golden Signals
127
+ {
128
+ title: 'Request Rate',
129
+ type: 'graph',
130
+ gridPos: { x: 0, y: 0, w: 6, h: 8 },
131
+ targets: [{
132
+ expr: `sum(rate(http_requests_total{service="${serviceName}"}[5m])) by (method)`,
133
+ legendFormat: '{{method}}'
134
+ }]
135
+ },
136
+ {
137
+ title: 'Error Rate',
138
+ type: 'graph',
139
+ gridPos: { x: 6, y: 0, w: 6, h: 8 },
140
+ targets: [{
141
+ expr: `sum(rate(http_requests_total{service="${serviceName}",status_code=~"5.."}[5m])) / sum(rate(http_requests_total{service="${serviceName}"}[5m]))`,
142
+ legendFormat: 'Error %'
143
+ }]
144
+ },
145
+ {
146
+ title: 'Latency Percentiles',
147
+ type: 'graph',
148
+ gridPos: { x: 12, y: 0, w: 12, h: 8 },
149
+ targets: [
150
+ {
151
+ expr: `histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{service="${serviceName}"}[5m])) by (le))`,
152
+ legendFormat: 'p50'
153
+ },
154
+ {
155
+ expr: `histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{service="${serviceName}"}[5m])) by (le))`,
156
+ legendFormat: 'p95'
157
+ },
158
+ {
159
+ expr: `histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{service="${serviceName}"}[5m])) by (le))`,
160
+ legendFormat: 'p99'
161
+ }
162
+ ]
163
+ }
164
+ ]
165
+ };
166
+ };
167
+ ```
168
+
169
+ ### 3. Distributed Tracing
170
+
171
+ **OpenTelemetry Configuration**
172
+ ```typescript
173
+ // tracing.ts
174
+ import { NodeSDK } from '@opentelemetry/sdk-node';
175
+ import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
176
+ import { Resource } from '@opentelemetry/resources';
177
+ import { SemanticResourceAttributes } from '@opentelemetry/semantic-conventions';
178
+ import { JaegerExporter } from '@opentelemetry/exporter-jaeger';
179
+ import { BatchSpanProcessor } from '@opentelemetry/sdk-trace-base';
180
+
181
+ export class TracingSetup {
182
+ private sdk: NodeSDK;
183
+
184
+ constructor(serviceName: string, environment: string) {
185
+ const jaegerExporter = new JaegerExporter({
186
+ endpoint: process.env.JAEGER_ENDPOINT || 'http://localhost:14268/api/traces',
187
+ });
188
+
189
+ this.sdk = new NodeSDK({
190
+ resource: new Resource({
191
+ [SemanticResourceAttributes.SERVICE_NAME]: serviceName,
192
+ [SemanticResourceAttributes.SERVICE_VERSION]: process.env.SERVICE_VERSION || '1.0.0',
193
+ [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: environment,
194
+ }),
195
+
196
+ traceExporter: jaegerExporter,
197
+ spanProcessor: new BatchSpanProcessor(jaegerExporter),
198
+
199
+ instrumentations: [
200
+ getNodeAutoInstrumentations({
201
+ '@opentelemetry/instrumentation-fs': { enabled: false },
202
+ }),
203
+ ],
204
+ });
205
+ }
206
+
207
+ start() {
208
+ this.sdk.start()
209
+ .then(() => console.log('Tracing initialized'))
210
+ .catch((error) => console.error('Error initializing tracing', error));
211
+ }
212
+
213
+ shutdown() {
214
+ return this.sdk.shutdown();
215
+ }
216
+ }
217
+ ```
218
+
219
+ ### 4. Log Aggregation
220
+
221
+ **Fluentd Configuration**
222
+ ```text
223
+ # fluent.conf
224
+ <source>
225
+ @type tail
226
+ path /var/log/containers/*.log
227
+ pos_file /var/log/fluentd-containers.log.pos
228
+ tag kubernetes.*
229
+ <parse>
230
+ @type json
231
+ time_format %Y-%m-%dT%H:%M:%S.%NZ
232
+ </parse>
233
+ </source>
234
+
235
+ <filter kubernetes.**>
236
+ @type kubernetes_metadata
237
+ kubernetes_url "#{ENV['KUBERNETES_SERVICE_HOST']}"
238
+ </filter>
239
+
240
+ <filter kubernetes.**>
241
+ @type record_transformer
+ enable_ruby true
242
+ <record>
243
+ cluster_name ${ENV['CLUSTER_NAME']}
244
+ environment ${ENV['ENVIRONMENT']}
245
+ @timestamp ${time.strftime('%Y-%m-%dT%H:%M:%S.%LZ')}
246
+ </record>
247
+ </filter>
248
+
249
+ <match kubernetes.**>
250
+ @type elasticsearch
251
+ host "#{ENV['FLUENT_ELASTICSEARCH_HOST']}"
252
+ port "#{ENV['FLUENT_ELASTICSEARCH_PORT']}"
253
+ index_name logstash
254
+ logstash_format true
255
+ <buffer>
256
+ @type file
257
+ path /var/log/fluentd-buffers/kubernetes.buffer
258
+ flush_interval 5s
259
+ chunk_limit_size 2M
260
+ </buffer>
261
+ </match>
262
+ ```
263
+
264
+ **Structured Logging Library**
265
+ ```python
266
+ # structured_logging.py
267
+ import json
268
+ import logging
+ import os
+ import traceback
269
+ from datetime import datetime
270
+ from typing import Any, Dict, Optional
271
+
272
+ class StructuredLogger:
273
+ def __init__(self, name: str, service: str, version: str):
274
+ self.logger = logging.getLogger(name)
275
+ self.service = service
276
+ self.version = version
277
+ self.default_context = {
278
+ 'service': service,
279
+ 'version': version,
280
+ 'environment': os.getenv('ENVIRONMENT', 'development')
281
+ }
282
+
283
+ def _format_log(self, level: str, message: str, context: Dict[str, Any]) -> str:
284
+ log_entry = {
285
+ '@timestamp': datetime.utcnow().isoformat() + 'Z',
286
+ 'level': level,
287
+ 'message': message,
288
+ **self.default_context,
289
+ **context
290
+ }
291
+
292
+ trace_context = self._get_trace_context()
293
+ if trace_context:
294
+ log_entry['trace'] = trace_context
295
+
296
+ return json.dumps(log_entry)
297
+
298
+ def info(self, message: str, **context):
299
+ log_msg = self._format_log('INFO', message, context)
300
+ self.logger.info(log_msg)
301
+
302
+ def error(self, message: str, error: Optional[Exception] = None, **context):
303
+ if error:
304
+ context['error'] = {
305
+ 'type': type(error).__name__,
306
+ 'message': str(error),
307
+ 'stacktrace': traceback.format_exc()
308
+ }
309
+
310
+ log_msg = self._format_log('ERROR', message, context)
311
+ self.logger.error(log_msg)
312
+ ```
313
+
314
+ ### 5. Alert Configuration
315
+
316
+ **Alert Rules**
317
+ ```yaml
318
+ # alerts/application.yml
319
+ groups:
320
+ - name: application
321
+ interval: 30s
322
+ rules:
323
+ - alert: HighErrorRate
324
+ expr: |
325
+ sum(rate(http_requests_total{status_code=~"5.."}[5m])) by (service)
326
+ / sum(rate(http_requests_total[5m])) by (service) > 0.05
327
+ for: 5m
328
+ labels:
329
+ severity: critical
330
+ annotations:
331
+ summary: "High error rate on {{ $labels.service }}"
332
+ description: "Error rate is {{ $value | humanizePercentage }}"
333
+
334
+ - alert: SlowResponseTime
335
+ expr: |
336
+ histogram_quantile(0.95,
337
+ sum(rate(http_request_duration_seconds_bucket[5m])) by (service, le)
338
+ ) > 1
339
+ for: 10m
340
+ labels:
341
+ severity: warning
342
+ annotations:
343
+ summary: "Slow response time on {{ $labels.service }}"
344
+
345
+ - name: infrastructure
346
+ rules:
347
+ - alert: HighCPUUsage
348
+ expr: avg(rate(container_cpu_usage_seconds_total[5m])) by (pod) > 0.8
349
+ for: 15m
350
+ labels:
351
+ severity: warning
352
+
353
+ - alert: HighMemoryUsage
354
+ expr: |
355
+ container_memory_working_set_bytes / container_spec_memory_limit_bytes > 0.9
356
+ for: 10m
357
+ labels:
358
+ severity: critical
359
+ ```
360
+
361
+ **Alertmanager Configuration**
362
+ ```yaml
363
+ # alertmanager.yml
364
+ global:
365
+ resolve_timeout: 5m
366
+ slack_api_url: '$SLACK_API_URL'
367
+
368
+ route:
369
+ group_by: ['alertname', 'cluster', 'service']
370
+ group_wait: 10s
371
+ group_interval: 10s
372
+ repeat_interval: 12h
373
+ receiver: 'slack'
374
+
375
+ routes:
376
+ - match:
377
+ severity: critical
378
+ receiver: pagerduty
379
+ continue: true
380
+
381
+ - match_re:
382
+ severity: critical|warning
383
+ receiver: slack
384
+
385
+ receivers:
386
+ - name: 'slack'
387
+ slack_configs:
388
+ - channel: '#alerts'
389
+ title: '{{ .GroupLabels.alertname }}'
390
+ text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
391
+ send_resolved: true
392
+
393
+ - name: 'pagerduty'
394
+ pagerduty_configs:
395
+ - service_key: '$PAGERDUTY_SERVICE_KEY'
396
+ description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}'
397
+ ```
398
+
399
+ ### 6. SLO Implementation
400
+
401
+ **SLO Configuration**
402
+ ```typescript
403
+ // slo-manager.ts
404
+ interface SLO {
405
+ name: string;
406
+ target: number; // e.g., 99.9
407
+ window: string; // e.g., '30d'
408
+ burnRates: BurnRate[];
409
+ }
410
+
411
+ export class SLOManager {
412
+ private slos: SLO[] = [
413
+ {
414
+ name: 'API Availability',
415
+ target: 99.9,
416
+ window: '30d',
417
+ burnRates: [
418
+ { window: '1h', threshold: 14.4, severity: 'critical' },
419
+ { window: '6h', threshold: 6, severity: 'critical' },
420
+ { window: '1d', threshold: 3, severity: 'warning' }
421
+ ]
422
+ }
423
+ ];
424
+
425
+ generateSLOQueries(): string {
426
+ return this.slos.map(slo => this.generateSLOQuery(slo)).join('\n\n');
427
+ }
428
+
429
+ private generateSLOQuery(slo: SLO): string {
430
+ const errorBudget = 1 - (slo.target / 100);
431
+
432
+ return `
433
+ # ${slo.name} SLO
434
+ - record: slo:${this.sanitizeName(slo.name)}:error_budget
435
+ expr: ${errorBudget}
436
+
437
+ - record: slo:${this.sanitizeName(slo.name)}:consumed_error_budget
438
+ expr: |
439
+ 1 - (sum(rate(successful_requests[${slo.window}])) / sum(rate(total_requests[${slo.window}])))
440
+ `;
441
+ }
442
+ }
443
+ ```
444
+
445
+ ### 7. Infrastructure as Code
446
+
447
+ **Terraform Configuration**
448
+ ```hcl
449
+ # monitoring.tf
450
+ module "prometheus" {
451
+ source = "./modules/prometheus"
452
+
453
+ namespace = "monitoring"
454
+ storage_size = "100Gi"
455
+ retention_days = 30
456
+
457
+ external_labels = {
458
+ cluster = var.cluster_name
459
+ region = var.region
460
+ }
461
+ }
462
+
463
+ module "grafana" {
464
+ source = "./modules/grafana"
465
+
466
+ namespace = "monitoring"
467
+ admin_password = var.grafana_admin_password
468
+
469
+ datasources = [
470
+ {
471
+ name = "Prometheus"
472
+ type = "prometheus"
473
+ url = "http://prometheus:9090"
474
+ }
475
+ ]
476
+ }
477
+
478
+ module "alertmanager" {
479
+ source = "./modules/alertmanager"
480
+
481
+ namespace = "monitoring"
482
+
483
+ config = templatefile("${path.module}/alertmanager.yml", {
484
+ slack_webhook = var.slack_webhook
485
+ pagerduty_key = var.pagerduty_service_key
486
+ })
487
+ }
488
+ ```
489
+
490
+ ## Output Format
491
+
492
+ 1. **Infrastructure Assessment**: Current monitoring capabilities analysis
493
+ 2. **Monitoring Architecture**: Complete monitoring stack design
494
+ 3. **Implementation Plan**: Step-by-step deployment guide
495
+ 4. **Metric Definitions**: Comprehensive metrics catalog
496
+ 5. **Dashboard Templates**: Ready-to-use Grafana dashboards
497
+ 6. **Alert Runbooks**: Detailed alert response procedures
498
+ 7. **SLO Definitions**: Service level objectives and error budgets
499
+ 8. **Integration Guide**: Service instrumentation instructions
500
+
501
+ Focus on creating a monitoring system that provides actionable insights, reduces MTTR, and enables proactive issue detection.