ecip-observability-stack 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CLAUDE.md +48 -0
  2. package/README.md +75 -0
  3. package/alerts/analysis-backlog.yaml +39 -0
  4. package/alerts/cache-degradation.yaml +44 -0
  5. package/alerts/dlq-depth.yaml +56 -0
  6. package/alerts/lsp-daemon.yaml +43 -0
  7. package/alerts/mcp-latency.yaml +46 -0
  8. package/alerts/security-anomaly.yaml +59 -0
  9. package/alerts/sla-latency.yaml +61 -0
  10. package/chaos/kafka-broker-restart.sh +168 -0
  11. package/chaos/kill-lsp-daemon.sh +148 -0
  12. package/chaos/redis-node-failure.sh +318 -0
  13. package/ci/check-observability-contract.js +285 -0
  14. package/ci/eslint-plugin-ecip/index.js +209 -0
  15. package/ci/eslint-plugin-ecip/package.json +12 -0
  16. package/ci/github-actions-observability-gate.yaml +180 -0
  17. package/ci/ruff-shared.toml +41 -0
  18. package/collector/otel-collector-config.yaml +226 -0
  19. package/collector/otel-collector-daemonset.yaml +168 -0
  20. package/collector/sampling-config.yaml +83 -0
  21. package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
  22. package/dashboards/analysis-throughput.json +166 -0
  23. package/dashboards/cache-performance.json +129 -0
  24. package/dashboards/cross-repo-fanout.json +93 -0
  25. package/dashboards/event-bus-dlq.json +129 -0
  26. package/dashboards/lsp-daemon-health.json +104 -0
  27. package/dashboards/mcp-call-graph.json +114 -0
  28. package/dashboards/query-latency.json +160 -0
  29. package/dashboards/security-events.json +131 -0
  30. package/docs/M08-Observability-Design.md +639 -0
  31. package/docs/PROGRESS.md +375 -0
  32. package/docs/module-documentation.md +64 -0
  33. package/elasticsearch/ilm-policy.json +57 -0
  34. package/elasticsearch/index-template.json +62 -0
  35. package/elasticsearch/kibana-space.yaml +53 -0
  36. package/helm/Chart.yaml +30 -0
  37. package/helm/templates/configmaps.yaml +25 -0
  38. package/helm/templates/elasticsearch.yaml +68 -0
  39. package/helm/templates/grafana-secret.yaml +22 -0
  40. package/helm/templates/grafana.yaml +19 -0
  41. package/helm/templates/loki.yaml +33 -0
  42. package/helm/templates/otel-collector.yaml +119 -0
  43. package/helm/templates/prometheus.yaml +43 -0
  44. package/helm/templates/tempo.yaml +16 -0
  45. package/helm/values.prod.yaml +159 -0
  46. package/helm/values.yaml +146 -0
  47. package/logging-lib/nodejs/package.json +57 -0
  48. package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
  49. package/logging-lib/python/pyproject.toml +45 -0
  50. package/logging-lib/python/src/__init__.py +19 -0
  51. package/logging-lib/python/src/logger.py +131 -0
  52. package/logging-lib/python/src/security_events.py +150 -0
  53. package/logging-lib/python/src/tracer.py +185 -0
  54. package/logging-lib/python/tests/test_logger.py +113 -0
  55. package/package.json +21 -0
  56. package/prometheus/prometheus-values.yaml +170 -0
  57. package/prometheus/recording-rules.yaml +97 -0
  58. package/prometheus/scrape-configs.yaml +122 -0
  59. package/runbooks/SDK-INTEGRATION.md +239 -0
  60. package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
  61. package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
  62. package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
  63. package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
  64. package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
  65. package/runbooks/dashboard-guide.md +169 -0
  66. package/scripts/lint-dashboards.js +184 -0
  67. package/tempo/tempo-datasource.yaml +46 -0
  68. package/tempo/tempo-values.yaml +94 -0
  69. package/tests/alert-threshold-config.test.ts +283 -0
  70. package/tests/log-schema-validation.test.ts +246 -0
  71. package/tests/metric-label-validation.test.ts +292 -0
  72. package/tests/otel-pipeline-integration.test.ts +420 -0
  73. package/tests/security-events.test.ts +417 -0
  74. package/tsconfig.json +17 -0
  75. package/vitest.config.ts +21 -0
  76. package/vitest.integration.config.ts +9 -0
@@ -0,0 +1,68 @@
1
+ {{- /*
2
+ ECIP M08 — Elasticsearch Security Event Index Setup
3
+ Provisions the index template and ILM policy for security events.
4
+ */ -}}
5
+ apiVersion: v1
6
+ kind: ConfigMap
7
+ metadata:
8
+ name: elasticsearch-security-config
9
+ namespace: {{ .Values.namespace | default "monitoring" }}
10
+ labels:
11
+ app: ecip
12
+ component: elasticsearch
13
+ data:
14
+ index-template.json: |-
15
+ {{ .Files.Get "elasticsearch/index-template.json" | nindent 4 }}
16
+ ilm-policy.json: |-
17
+ {{ .Files.Get "elasticsearch/ilm-policy.json" | nindent 4 }}
18
+ ---
19
+ {{- /* Job to apply index template and ILM policy on install/upgrade. Only the left side trims whitespace: a right-hand trim would also eat the following newline and render "---apiVersion: batch/v1" on one line. */}}
20
+ apiVersion: batch/v1
21
+ kind: Job
22
+ metadata:
23
+ name: elasticsearch-setup-{{ .Release.Revision }}
24
+ namespace: {{ .Values.namespace | default "monitoring" }}
25
+ labels:
26
+ app: ecip
27
+ component: elasticsearch-setup
28
+ annotations:
29
+ helm.sh/hook: post-install,post-upgrade
30
+ helm.sh/hook-weight: "10"
31
+ helm.sh/hook-delete-policy: before-hook-creation
32
+ spec:
33
+ backoffLimit: 3
34
+ template:
35
+ spec:
36
+ restartPolicy: Never
37
+ containers:
38
+ - name: elasticsearch-setup
39
+ image: curlimages/curl:8.5.0
40
+ command:
41
+ - /bin/sh
42
+ - -c
43
+ - |
44
+ # Abort on the first failed command so the Job fails (and is retried via backoffLimit) instead of always printing "Setup complete.".
45
+ set -eu
46
+ ES_URL="{{ .Values.elasticsearch.protocol | default "http" }}://{{ .Values.elasticsearch.host | default "elasticsearch.monitoring" }}:{{ .Values.elasticsearch.port | default 9200 }}"
47
+ echo "Applying ILM policy..."
48
+ curl -fsS -X PUT "$ES_URL/_ilm/policy/ecip-security-events-ilm" \
49
+ -H "Content-Type: application/json" \
50
+ -d @/config/ilm-policy.json
51
+ echo "Applying index template..."
52
+ curl -fsS -X PUT "$ES_URL/_index_template/ecip-security-events" \
53
+ -H "Content-Type: application/json" \
54
+ -d @/config/index-template.json
55
+ echo "Creating initial index..."
56
+ # Bootstrap only when the write alias is missing: this hook also runs on post-upgrade, and re-creating an existing index is an HTTP error that -f would turn into a Job failure.
57
+ curl -fs -o /dev/null -I "$ES_URL/_alias/ecip-security-events" || \
58
+ curl -fsS -X PUT "$ES_URL/ecip-security-events-000001" \
59
+ -H "Content-Type: application/json" \
60
+ -d '{"aliases":{"ecip-security-events":{"is_write_index":true}}}'
61
+ echo "Setup complete."
62
+ volumeMounts:
63
+ - name: config
64
+ mountPath: /config
65
+ volumes:
66
+ - name: config
67
+ configMap:
68
+ name: elasticsearch-security-config
@@ -0,0 +1,22 @@
1
+ {{- /*
2
+ ECIP M08 — Grafana Admin Secret
3
+ Creates the admin credentials secret if not using ExternalSecrets.
4
+ For production, replace with ExternalSecret pointing to AWS Secrets Manager / Vault.
5
+ */ -}}
6
+ {{- if not (lookup "v1" "Secret" (.Values.namespace | default "monitoring") (index .Values "kube-prometheus-stack" "grafana" "admin" "existingSecret")) }}
7
+ apiVersion: v1
8
+ kind: Secret
9
+ metadata:
10
+ name: {{ index .Values "kube-prometheus-stack" "grafana" "admin" "existingSecret" }}
11
+ namespace: {{ .Values.namespace | default "monitoring" }}
12
+ labels:
13
+ app: ecip
14
+ component: grafana
15
+ annotations:
16
+ helm.sh/resource-policy: keep # the lookup guard renders nothing once the Secret exists; without "keep" the next upgrade would delete it
17
+ type: Opaque
18
+ data:
19
+ admin-user: YWRtaW4= # base64 of 'admin' (echo -n 'admin' | base64). Override in your environment — NEVER commit real credentials.
20
+ # Values under data: must be valid base64, so a random password is generated and encoded at render time instead of shipping a non-base64 placeholder.
21
+ admin-password: {{ randAlphaNum 32 | b64enc | quote }}
22
+ {{- end }}
@@ -0,0 +1,19 @@
1
+ {{- /*
2
+ ECIP M08 — Grafana Dashboard Provisioning
3
+ Deploys dashboards as ConfigMaps with the grafana_dashboard label
4
+ so the Grafana sidecar picks them up automatically.
5
+ */ -}}
6
+ {{- range $path, $_ := .Files.Glob "dashboards/*.json" }}
7
+ apiVersion: v1
8
+ kind: ConfigMap
9
+ metadata:
10
+ name: ecip-dashboard-{{ base $path | trimSuffix ".json" | lower }}
11
+ namespace: {{ $.Values.namespace | default "monitoring" }}
12
+ labels:
13
+ grafana_dashboard: "1"
14
+ app: ecip
15
+ data:
16
+ {{ base $path }}: |-
17
+ {{ $.Files.Get $path | nindent 4 }}
18
+ ---
19
+ {{- end }}
@@ -0,0 +1,33 @@
1
+ {{- /*
2
+ ECIP M08 — Loki Datasource Provisioning (OD-01 Resolution)
3
+ Provisions Loki as a Grafana datasource for general application logs.
4
+ */ -}}
5
+ {{- if .Values.loki.enabled }}
6
+ apiVersion: v1
7
+ kind: ConfigMap
8
+ metadata:
9
+ name: grafana-datasource-loki
10
+ namespace: {{ .Values.namespace | default "monitoring" }}
11
+ labels:
12
+ grafana_datasource: "1"
13
+ app: ecip
14
+ component: loki
15
+ data:
16
+ loki-datasource.yaml: |-
17
+ apiVersion: 1
18
+ datasources:
19
+ - name: Loki
20
+ type: loki
21
+ access: proxy
22
+ url: http://loki-gateway.{{ .Values.namespace | default "monitoring" }}:3100
23
+ uid: loki
24
+ isDefault: false
25
+ editable: true
26
+ jsonData:
27
+ maxLines: 1000
28
+ derivedFields:
29
+ - name: TraceID
30
+ matcherRegex: '"trace_id":"(\w+)"'
31
+ url: "$${__value.raw}"
32
+ datasourceUid: tempo
33
+ {{- end }}
@@ -0,0 +1,119 @@
1
+ {{- /*
2
+ ECIP M08 — OTel Collector Helm Template
3
+ Deploys the OTel Collector as a DaemonSet with OTLP receivers.
4
+ */ -}}
5
+ apiVersion: apps/v1
6
+ kind: DaemonSet
7
+ metadata:
8
+ name: otel-collector
9
+ namespace: {{ .Values.namespace | default "monitoring" }}
10
+ labels:
11
+ app.kubernetes.io/name: otel-collector
12
+ app.kubernetes.io/component: observability
13
+ app.kubernetes.io/part-of: ecip
14
+ helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }}
15
+ spec:
16
+ selector:
17
+ matchLabels:
18
+ app.kubernetes.io/name: otel-collector
19
+ template:
20
+ metadata:
21
+ labels:
22
+ app.kubernetes.io/name: otel-collector
23
+ ecip.module: M08
24
+ annotations:
25
+ checksum/config: {{ include (print $.Template.BasePath "/configmaps.yaml") . | sha256sum }}
26
+ spec:
27
+ serviceAccountName: otel-collector
28
+ containers:
29
+ - name: otel-collector
30
+ image: "{{ .Values.otelCollector.image.repository }}:{{ .Values.otelCollector.image.tag }}"
31
+ args:
32
+ - --config=/etc/otel/otel-collector-config.yaml
33
+ ports:
34
+ - name: otlp-grpc
35
+ containerPort: {{ .Values.otelCollector.ports.otlpGrpc }}
36
+ hostPort: {{ .Values.otelCollector.ports.otlpGrpc }}
37
+ - name: otlp-http
38
+ containerPort: {{ .Values.otelCollector.ports.otlpHttp }}
39
+ hostPort: {{ .Values.otelCollector.ports.otlpHttp }}
40
+ - name: health
41
+ containerPort: {{ .Values.otelCollector.healthCheck.port }}
42
+ - name: metrics
43
+ containerPort: {{ .Values.otelCollector.ports.metrics }}
44
+ resources:
45
+ {{- toYaml .Values.otelCollector.resources | nindent 12 }}
46
+ volumeMounts:
47
+ - name: collector-config
48
+ mountPath: /etc/otel
49
+ readOnly: true
50
+ livenessProbe:
51
+ httpGet:
52
+ path: /
53
+ port: {{ .Values.otelCollector.healthCheck.port }}
54
+ initialDelaySeconds: 10
55
+ periodSeconds: 15
56
+ readinessProbe:
57
+ httpGet:
58
+ path: /
59
+ port: {{ .Values.otelCollector.healthCheck.port }}
60
+ initialDelaySeconds: 5
61
+ periodSeconds: 10
62
+ volumes:
63
+ - name: collector-config
64
+ configMap:
65
+ name: otel-collector-config
66
+ tolerations:
67
+ - effect: NoSchedule
68
+ operator: Exists
69
+ ---
70
+ apiVersion: v1
71
+ kind: Service
72
+ metadata:
73
+ name: otel-collector
74
+ namespace: {{ .Values.namespace | default "monitoring" }}
75
+ spec:
76
+ type: ClusterIP
77
+ ports:
78
+ - name: otlp-grpc
79
+ port: {{ .Values.otelCollector.ports.otlpGrpc }}
80
+ targetPort: {{ .Values.otelCollector.ports.otlpGrpc }}
81
+ - name: otlp-http
82
+ port: {{ .Values.otelCollector.ports.otlpHttp }}
83
+ targetPort: {{ .Values.otelCollector.ports.otlpHttp }}
84
+ - name: metrics
85
+ port: {{ .Values.otelCollector.ports.metrics }}
86
+ targetPort: {{ .Values.otelCollector.ports.metrics }}
87
+ selector:
88
+ app.kubernetes.io/name: otel-collector
89
+ ---
90
+ apiVersion: v1
91
+ kind: ServiceAccount
92
+ metadata:
93
+ name: otel-collector
94
+ namespace: {{ .Values.namespace | default "monitoring" }}
95
+ ---
96
+ apiVersion: rbac.authorization.k8s.io/v1
97
+ kind: ClusterRole
98
+ metadata:
99
+ name: otel-collector
100
+ rules:
101
+ - apiGroups: [""]
102
+ resources: ["pods", "namespaces", "nodes"]
103
+ verbs: ["get", "list", "watch"]
104
+ - apiGroups: ["apps"]
105
+ resources: ["deployments", "replicasets", "daemonsets", "statefulsets"]
106
+ verbs: ["get", "list", "watch"]
107
+ ---
108
+ apiVersion: rbac.authorization.k8s.io/v1
109
+ kind: ClusterRoleBinding
110
+ metadata:
111
+ name: otel-collector
112
+ subjects:
113
+ - kind: ServiceAccount
114
+ name: otel-collector
115
+ namespace: {{ .Values.namespace | default "monitoring" }}
116
+ roleRef:
117
+ kind: ClusterRole
118
+ name: otel-collector
119
+ apiGroup: rbac.authorization.k8s.io
@@ -0,0 +1,43 @@
1
+ {{- /*
2
+ ECIP M08 — Prometheus Helm Template
3
+ References the kube-prometheus-stack subchart values.
4
+ This template adds ECIP-specific alert rule ConfigMaps.
5
+ */ -}}
6
+ apiVersion: v1
7
+ kind: ConfigMap
8
+ metadata:
9
+ name: {{ .Values.alertRules.configMapName | default "ecip-alert-rules" }}
10
+ namespace: {{ .Values.namespace | default "monitoring" }}
11
+ labels:
12
+ role: alert-rules
13
+ app: ecip
14
+ grafana_alert: "1"
15
+ data:
16
+ {{- range $path, $_ := .Files.Glob "alerts/*.yaml" }}
17
+ {{ base $path }}: |-
18
+ {{ $.Files.Get $path | nindent 4 }}
19
+ {{- end }}
20
+ ---
21
+ apiVersion: v1
22
+ kind: ConfigMap
23
+ metadata:
24
+ name: ecip-recording-rules
25
+ namespace: {{ .Values.namespace | default "monitoring" }}
26
+ labels:
27
+ role: recording-rules
28
+ app: ecip
29
+ data:
30
+ recording-rules.yaml: |-
31
+ {{ .Files.Get "prometheus/recording-rules.yaml" | nindent 4 }}
32
+ ---
33
+ apiVersion: v1
34
+ kind: Secret
35
+ metadata:
36
+ name: ecip-scrape-configs
37
+ namespace: {{ .Values.namespace | default "monitoring" }}
38
+ labels:
39
+ app: ecip
40
+ type: Opaque
41
+ stringData:
42
+ scrape-configs.yaml: |-
43
+ {{ .Files.Get "prometheus/scrape-configs.yaml" | nindent 4 }}
@@ -0,0 +1,16 @@
1
+ {{- /*
2
+ ECIP M08 — Grafana Tempo Configuration
3
+ References the tempo-distributed subchart. This template adds
4
+ the Tempo datasource provisioning for Grafana.
5
+ */ -}}
6
+ apiVersion: v1
7
+ kind: ConfigMap
8
+ metadata:
9
+ name: grafana-datasource-tempo
10
+ namespace: {{ .Values.namespace | default "monitoring" }}
11
+ labels:
12
+ grafana_datasource: "1"
13
+ app: ecip
14
+ data:
15
+ tempo-datasource.yaml: |-
16
+ {{ .Files.Get "tempo/tempo-datasource.yaml" | nindent 4 }}
@@ -0,0 +1,159 @@
1
+ # =============================================================================
2
+ # ECIP M08 — Helm Values (Production Overrides)
3
+ # =============================================================================
4
+ # Overrides for production deployment. Applied on top of values.yaml.
5
+ # Usage: helm upgrade --install ecip-obs ./helm -f helm/values.yaml -f helm/values.prod.yaml
6
+ # =============================================================================
7
+
8
+ # --- OTel Collector ---
9
+ otelCollector:
10
+ resources:
11
+ requests:
12
+ cpu: 500m
13
+ memory: 512Mi
14
+ limits:
15
+ cpu: 2000m
16
+ memory: 1024Mi
17
+
18
+ # --- Prometheus ---
19
+ kube-prometheus-stack:
20
+ prometheus:
21
+ prometheusSpec:
22
+ retention: 30d
23
+ retentionSize: 50GB
24
+ resources:
25
+ requests:
26
+ cpu: 500m
27
+ memory: 2Gi
28
+ limits:
29
+ cpu: 2000m
30
+ memory: 8Gi
31
+ storageSpec:
32
+ volumeClaimTemplate:
33
+ spec:
34
+ storageClassName: gp3
35
+ accessModes: ["ReadWriteOnce"]
36
+ resources:
37
+ requests:
38
+ storage: 100Gi
39
+ alertmanager:
40
+ alertmanagerSpec:
41
+ resources:
42
+ requests:
43
+ cpu: 200m
44
+ memory: 256Mi
45
+ limits:
46
+ cpu: 500m
47
+ memory: 512Mi
48
+ grafana:
49
+ # Production: admin credentials from ExternalSecret (AWS Secrets Manager)
50
+ admin:
51
+ existingSecret: ecip-grafana-admin-prod
52
+ userKey: admin-user
53
+ passwordKey: admin-password
54
+ persistence:
55
+ enabled: true
56
+ size: 20Gi
57
+ resources:
58
+ requests:
59
+ cpu: 250m
60
+ memory: 512Mi
61
+ limits:
62
+ cpu: 1000m
63
+ memory: 1Gi
64
+
65
+ # --- Tempo ---
66
+ tempo-distributed:
67
+ storage:
68
+ trace:
69
+ backend: s3
70
+ s3:
71
+ bucket: ecip-tempo-traces-prod
72
+ endpoint: s3.amazonaws.com
73
+ region: us-east-1
74
+ retention:
75
+ compacted_block_retention: 336h # 14 days
76
+ ingester:
77
+ replicas: 3
78
+ resources:
79
+ requests:
80
+ cpu: 500m
81
+ memory: 1Gi
82
+ limits:
83
+ cpu: 1000m
84
+ memory: 2Gi
85
+ distributor:
86
+ replicas: 3
87
+ resources:
88
+ requests:
89
+ cpu: 500m
90
+ memory: 512Mi
91
+ limits:
92
+ cpu: 1000m
93
+ memory: 1Gi
94
+ compactor:
95
+ replicas: 2
96
+ resources:
97
+ requests:
98
+ cpu: 500m
99
+ memory: 1Gi
100
+ limits:
101
+ cpu: 1000m
102
+ memory: 2Gi
103
+
104
+ # --- Elasticsearch ---
105
+ elasticsearch:
106
+ replicas: 3
107
+ minimumMasterNodes: 2
108
+ resources:
109
+ requests:
110
+ cpu: 1000m
111
+ memory: 4Gi
112
+ limits:
113
+ cpu: 2000m
114
+ memory: 8Gi
115
+ volumeClaimTemplate:
116
+ resources:
117
+ requests:
118
+ storage: 100Gi
119
+ storageClassName: gp3
120
+
121
+ # --- Loki (production) ---
122
+ loki:
123
+ loki:
124
+ storage:
125
+ type: s3
126
+ s3:
127
+ s3: s3://ecip-loki-logs-prod
128
+ region: us-east-1
129
+ commonConfig:
130
+ replication_factor: 3
131
+ write:
132
+ replicas: 3
133
+ resources:
134
+ requests:
135
+ cpu: 500m
136
+ memory: 1Gi
137
+ limits:
138
+ cpu: 1000m
139
+ memory: 2Gi
140
+ read:
141
+ replicas: 3
142
+ resources:
143
+ requests:
144
+ cpu: 500m
145
+ memory: 1Gi
146
+ limits:
147
+ cpu: 1000m
148
+ memory: 2Gi
149
+ backend:
150
+ replicas: 2
151
+ resources:
152
+ requests:
153
+ cpu: 250m
154
+ memory: 512Mi
155
+ limits:
156
+ cpu: 500m
157
+ memory: 1Gi
158
+ gateway:
159
+ replicas: 2
@@ -0,0 +1,146 @@
1
+ # =============================================================================
2
+ # ECIP M08 — Helm Values (Dev/Staging Defaults)
3
+ # =============================================================================
4
+ # Production overrides: values.prod.yaml
5
+ # =============================================================================
6
+
7
+ # --- Feature toggles ---
8
+ prometheus:
9
+ enabled: true
10
+ tempo:
11
+ enabled: true
12
+ # elasticsearch and loki each have their own top-level section further down.
13
+ # Repeating those keys here would create duplicate YAML mapping keys: strict
14
+ # parsers reject the file and lenient ones keep only the later section, so
15
+ # anything declared under the earlier key is silently discarded.
16
+
17
+ # --- Namespace ---
18
+ namespace: monitoring
19
+
20
+ # --- OTel Collector ---
21
+ otelCollector:
22
+ image:
23
+ repository: otel/opentelemetry-collector-contrib
24
+ tag: "0.96.0"
25
+ resources:
26
+ requests:
27
+ cpu: 200m
28
+ memory: 256Mi
29
+ limits:
30
+ cpu: 1000m
31
+ memory: 512Mi
32
+ healthCheck:
33
+ port: 13133
34
+ ports:
35
+ otlpGrpc: 4317
36
+ otlpHttp: 4318
37
+ metrics: 8888
38
+ zpages: 55679
39
+
40
+ # --- Prometheus ---
41
+ kube-prometheus-stack:
42
+ prometheus:
43
+ prometheusSpec:
44
+ retention: 7d
45
+ retentionSize: 10GB
46
+ resources:
47
+ requests:
48
+ cpu: 250m
49
+ memory: 1Gi
50
+ limits:
51
+ cpu: 1000m
52
+ memory: 4Gi
53
+ storageSpec:
54
+ volumeClaimTemplate:
55
+ spec:
56
+ storageClassName: standard
57
+ accessModes: ["ReadWriteOnce"]
58
+ resources:
59
+ requests:
60
+ storage: 20Gi
61
+ alertmanager:
62
+ enabled: true
63
+ grafana:
64
+ enabled: true
65
+ # Admin password sourced from K8s Secret — NEVER hardcode in values files
66
+ admin:
67
+ existingSecret: ecip-grafana-admin
68
+ userKey: admin-user
69
+ passwordKey: admin-password
70
+ sidecar:
71
+ dashboards:
72
+ enabled: true
73
+ label: grafana_dashboard
74
+
75
+ # --- Tempo ---
76
+ tempo-distributed:
77
+ storage:
78
+ trace:
79
+ backend: local
80
+ retention:
81
+ compacted_block_retention: 168h # 7 days for dev/staging
82
+ resources:
83
+ requests:
84
+ cpu: 250m
85
+ memory: 512Mi
86
+ limits:
87
+ cpu: 500m
88
+ memory: 1Gi
89
+
90
+ # --- Elasticsearch ---
91
+ elasticsearch:
+ enabled: true # feature toggle kept in the same mapping as the connection settings so it survives YAML parsing
92
+ replicas: 1
93
+ minimumMasterNodes: 1
94
+ host: elasticsearch.monitoring
95
+ port: 9200
96
+ protocol: http
97
+ resources:
98
+ requests:
99
+ cpu: 250m
100
+ memory: 1Gi
101
+ limits:
102
+ cpu: 1000m
103
+ memory: 2Gi
104
+ volumeClaimTemplate:
105
+ resources:
106
+ requests:
107
+ storage: 20Gi
108
+
109
+ # --- Dashboard provisioning ---
110
+ dashboards:
111
+ configMapName: ecip-grafana-dashboards
112
+ folder: ECIP
113
+
114
+ # --- Alert rules ---
115
+ alertRules:
116
+ configMapName: ecip-alert-rules
117
+
118
+ # --- Loki (general application logs — OD-01 resolution) ---
119
+ loki:
120
+ enabled: true
121
+ loki:
122
+ auth_enabled: false
123
+ storage:
124
+ type: filesystem
125
+ commonConfig:
126
+ replication_factor: 1
127
+ singleBinary:
128
+ replicas: 1
129
+ resources:
130
+ requests:
131
+ cpu: 250m
132
+ memory: 512Mi
133
+ limits:
134
+ cpu: 1000m
135
+ memory: 1Gi
136
+ persistence:
137
+ enabled: true
138
+ size: 20Gi
139
+ storageClassName: standard
140
+ gateway:
141
+ enabled: true
142
+ monitoring:
143
+ selfMonitoring:
144
+ enabled: false
145
+ lokiCanary:
146
+ enabled: false
@@ -0,0 +1,57 @@
1
+ {
2
+ "name": "@ecip-platform/observability",
3
+ "version": "1.1.0",
4
+ "description": "OpenTelemetry instrumentation wrapper for ECIP modules",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "files": [
8
+ "dist/"
9
+ ],
10
+ "scripts": {
11
+ "build": "tsc",
12
+ "test": "vitest run",
13
+ "test:watch": "vitest watch",
14
+ "lint": "eslint src/ --ext .ts",
15
+ "prepublishOnly": "pnpm run build"
16
+ },
17
+ "keywords": [
18
+ "ecip",
19
+ "observability",
20
+ "opentelemetry",
21
+ "logging",
22
+ "tracing"
23
+ ],
24
+ "author": "ECIP Platform Team",
25
+ "license": "MIT",
26
+ "dependencies": {
27
+ "@opentelemetry/api": "^1.7.0",
28
+ "@opentelemetry/auto-instrumentations-node": "^0.43.0",
29
+ "@opentelemetry/context-async-hooks": "^1.22.0",
30
+ "@opentelemetry/exporter-metrics-otlp-http": "^0.48.0",
31
+ "@opentelemetry/exporter-trace-otlp-grpc": "^0.48.0",
32
+ "@opentelemetry/exporter-trace-otlp-http": "^0.48.0",
33
+ "@opentelemetry/instrumentation-grpc": "^0.48.0",
34
+ "@opentelemetry/instrumentation-http": "^0.48.0",
35
+ "@opentelemetry/resources": "^1.22.0",
36
+ "@opentelemetry/sdk-metrics": "^1.22.0",
37
+ "@opentelemetry/sdk-node": "^0.48.0",
38
+ "@opentelemetry/sdk-trace-node": "^1.22.0",
39
+ "@opentelemetry/semantic-conventions": "^1.22.0",
40
+ "pino": "^8.19.0"
41
+ },
42
+ "devDependencies": {
43
+ "@types/express": "^5.0.6",
44
+ "@types/node": "^20.11.0",
45
+ "@typescript-eslint/eslint-plugin": "^6.19.0",
46
+ "@typescript-eslint/parser": "^6.19.0",
47
+ "eslint": "^8.56.0",
48
+ "typescript": "^5.3.0",
49
+ "vitest": "^1.2.0"
50
+ },
51
+ "engines": {
52
+ "node": ">=18.0.0"
53
+ },
54
+ "publishConfig": {
55
+ "access": "public"
56
+ }
57
+ }