ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
{{- /*
ECIP M08 — Elasticsearch Security Event Index Setup

Renders two resources:
  1. ConfigMap "elasticsearch-security-config" holding the index template and
     ILM policy JSON documents read via .Files.Get.
  2. A post-install/post-upgrade hook Job that PUTs both documents to
     Elasticsearch and bootstraps the first write index.

The Job script runs with "set -eu" and "curl -f", so an HTTP error or an
unreachable cluster fails the pod (retried up to backoffLimit) instead of
being reported as a successful setup.

NOTE(review): .Files.Get only resolves paths inside the chart directory;
confirm elasticsearch/*.json is packaged with the chart.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: elasticsearch-security-config
  namespace: {{ .Values.namespace | default "monitoring" }}
  labels:
    app: ecip
    component: elasticsearch
data:
  index-template.json: |-
    {{ .Files.Get "elasticsearch/index-template.json" | nindent 4 }}
  ilm-policy.json: |-
    {{ .Files.Get "elasticsearch/ilm-policy.json" | nindent 4 }}
---
{{- /*
Job to apply index template and ILM policy on install/upgrade.
The closing delimiter deliberately does not trim whitespace: trimming on both
sides would join the "---" separator and "apiVersion" onto one line.
*/}}
apiVersion: batch/v1
kind: Job
metadata:
  # Revision suffix gives every install/upgrade its own Job name.
  name: elasticsearch-setup-{{ .Release.Revision }}
  namespace: {{ .Values.namespace | default "monitoring" }}
  labels:
    app: ecip
    component: elasticsearch-setup
  annotations:
    helm.sh/hook: post-install,post-upgrade
    helm.sh/hook-weight: "10"
    helm.sh/hook-delete-policy: before-hook-creation
spec:
  backoffLimit: 3
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: elasticsearch-setup
          image: curlimages/curl:8.5.0
          command:
            - /bin/sh
            - -c
            - |
              # Abort on the first failing command or unset variable.
              set -eu

              ES_URL="{{ .Values.elasticsearch.protocol | default "http" }}://{{ .Values.elasticsearch.host | default "elasticsearch.monitoring" }}:{{ .Values.elasticsearch.port | default 9200 }}"

              echo "Applying ILM policy..."
              curl -fsS -X PUT "$ES_URL/_ilm/policy/ecip-security-events-ilm" \
                -H "Content-Type: application/json" \
                -d @/config/ilm-policy.json

              echo "Applying index template..."
              curl -fsS -X PUT "$ES_URL/_index_template/ecip-security-events" \
                -H "Content-Type: application/json" \
                -d @/config/index-template.json

              echo "Creating initial index..."
              # The bootstrap index must only be created once: on upgrades the
              # write alias already exists (possibly on a rolled-over index).
              if curl -fs -o /dev/null -I "$ES_URL/_alias/ecip-security-events"; then
                echo "Write alias ecip-security-events already exists; skipping."
              else
                curl -fsS -X PUT "$ES_URL/ecip-security-events-000001" \
                  -H "Content-Type: application/json" \
                  -d '{"aliases":{"ecip-security-events":{"is_write_index":true}}}'
              fi

              echo "Setup complete."
          volumeMounts:
            - name: config
              mountPath: /config
      volumes:
        - name: config
          configMap:
            name: elasticsearch-security-config
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
{{- /*
ECIP M08 — Grafana Admin Secret
Creates the admin credentials secret if not using ExternalSecrets.
For production, replace with ExternalSecret pointing to AWS Secrets Manager / Vault.

The Secret is rendered only when no Secret with the configured name exists in
the target namespace. On first creation a random 32-character password is
generated; read it back with:
  kubectl get secret <name> -n <namespace> -o jsonpath='{.data.admin-password}' | base64 -d
*/ -}}
{{- $namespace := .Values.namespace | default "monitoring" }}
{{- $secretName := index .Values "kube-prometheus-stack" "grafana" "admin" "existingSecret" }}
{{- if not (lookup "v1" "Secret" $namespace $secretName) }}
apiVersion: v1
kind: Secret
metadata:
  name: {{ $secretName }}
  namespace: {{ $namespace }}
  labels:
    app: ecip
    component: grafana
  annotations:
    # Once the Secret exists, the lookup guard above stops rendering this
    # manifest; "keep" prevents Helm from deleting the Secret on that upgrade.
    helm.sh/resource-policy: keep
type: Opaque
data:
  # Values under "data" must be valid base64.
  # echo -n 'admin' | base64
  admin-user: YWRtaW4=
  # Generated per installation — NEVER commit real credentials.
  admin-password: {{ randAlphaNum 32 | b64enc | quote }}
{{- end }}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{{- /*
ECIP M08 — Grafana Dashboard Provisioning
Emits one ConfigMap per dashboards/*.json file. Every ConfigMap carries the
grafana_dashboard label that the Grafana dashboard sidecar is configured to
watch, and stores the dashboard JSON under its original file name.
*/ -}}
{{- $ns := .Values.namespace | default "monitoring" }}
{{- range $file, $_ := .Files.Glob "dashboards/*.json" }}
{{- $fileName := base $file }}
apiVersion: v1
kind: ConfigMap
metadata:
  # e.g. dashboards/Query-Latency.json -> ecip-dashboard-query-latency
  name: ecip-dashboard-{{ $fileName | trimSuffix ".json" | lower }}
  namespace: {{ $ns }}
  labels:
    grafana_dashboard: "1"
    app: ecip
data:
  {{ $fileName }}: |-
    {{ $.Files.Get $file | nindent 4 }}
---
{{- end }}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{{- /*
ECIP M08 — Loki Datasource Provisioning (OD-01 Resolution)
When loki.enabled is set, renders a ConfigMap labelled grafana_datasource
that registers Loki (uid "loki") as a non-default Grafana datasource for
general application logs. A derived field links "trace_id" values found in
log lines to the datasource with uid "tempo".

NOTE(review): the URL assumes a Service named "loki-gateway" listening on
port 3100 — confirm against the deployed Loki chart's gateway Service.
*/ -}}
{{- if .Values.loki.enabled }}
{{- $ns := .Values.namespace | default "monitoring" }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-loki
  namespace: {{ $ns }}
  labels:
    app: ecip
    component: loki
    grafana_datasource: "1"
data:
  loki-datasource.yaml: |-
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: http://loki-gateway.{{ $ns }}:3100
        uid: loki
        isDefault: false
        editable: true
        jsonData:
          maxLines: 1000
          derivedFields:
            - name: TraceID
              matcherRegex: '"trace_id":"(\w+)"'
              url: "$${__value.raw}"
              datasourceUid: tempo
{{- end }}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
{{- /*
ECIP M08 — OTel Collector Helm Template
Deploys the OTel Collector as a DaemonSet with OTLP receivers, together with
the ClusterIP Service, ServiceAccount and cluster-wide read-only RBAC it uses.

Values consumed: namespace, otelCollector.image.{repository,tag},
otelCollector.ports.{otlpGrpc,otlpHttp,metrics}, otelCollector.healthCheck.port
and otelCollector.resources.
*/ -}}
{{- $ns := .Values.namespace | default "monitoring" }}
{{- $collector := .Values.otelCollector }}
{{- $ports := $collector.ports -}}
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: otel-collector
  namespace: {{ $ns }}
  labels:
    app.kubernetes.io/name: otel-collector
    app.kubernetes.io/component: observability
    app.kubernetes.io/part-of: ecip
    helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }}
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: otel-collector
  template:
    metadata:
      labels:
        app.kubernetes.io/name: otel-collector
        ecip.module: M08
      annotations:
        # Hash of the rendered configmaps.yaml template: a config change
        # alters the pod template and therefore rolls the collector pods.
        checksum/config: {{ include (print $.Template.BasePath "/configmaps.yaml") . | sha256sum }}
    spec:
      serviceAccountName: otel-collector
      # Tolerate every NoSchedule taint.
      tolerations:
        - operator: Exists
          effect: NoSchedule
      volumes:
        - name: collector-config
          configMap:
            name: otel-collector-config
      containers:
        - name: otel-collector
          image: "{{ $collector.image.repository }}:{{ $collector.image.tag }}"
          args:
            - --config=/etc/otel/otel-collector-config.yaml
          # Both OTLP receiver ports are also bound on the node (hostPort).
          ports:
            - name: otlp-grpc
              containerPort: {{ $ports.otlpGrpc }}
              hostPort: {{ $ports.otlpGrpc }}
            - name: otlp-http
              containerPort: {{ $ports.otlpHttp }}
              hostPort: {{ $ports.otlpHttp }}
            - name: health
              containerPort: {{ $collector.healthCheck.port }}
            - name: metrics
              containerPort: {{ $ports.metrics }}
          resources:
            {{- toYaml $collector.resources | nindent 12 }}
          volumeMounts:
            - name: collector-config
              mountPath: /etc/otel
              readOnly: true
          livenessProbe:
            httpGet:
              path: /
              port: {{ $collector.healthCheck.port }}
            initialDelaySeconds: 10
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /
              port: {{ $collector.healthCheck.port }}
            initialDelaySeconds: 5
            periodSeconds: 10
---
apiVersion: v1
kind: Service
metadata:
  name: otel-collector
  namespace: {{ $ns }}
spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/name: otel-collector
  ports:
    - name: otlp-grpc
      port: {{ $ports.otlpGrpc }}
      targetPort: {{ $ports.otlpGrpc }}
    - name: otlp-http
      port: {{ $ports.otlpHttp }}
      targetPort: {{ $ports.otlpHttp }}
    - name: metrics
      port: {{ $ports.metrics }}
      targetPort: {{ $ports.metrics }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: otel-collector
  namespace: {{ $ns }}
---
# Read-only access to core and apps workload objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-collector
rules:
  - apiGroups:
      - ""
    resources:
      - pods
      - namespaces
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - apps
    resources:
      - deployments
      - replicasets
      - daemonsets
      - statefulsets
    verbs:
      - get
      - list
      - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-collector
subjects:
  - kind: ServiceAccount
    name: otel-collector
    namespace: {{ $ns }}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
{{- /*
ECIP M08 — Prometheus Helm Template
References the kube-prometheus-stack subchart values.
Renders three ECIP-specific objects:
  - a ConfigMap with one entry per alerts/*.yaml file,
  - a ConfigMap holding prometheus/recording-rules.yaml,
  - an Opaque Secret holding prometheus/scrape-configs.yaml.
*/ -}}
{{- $ns := .Values.namespace | default "monitoring" -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .Values.alertRules.configMapName | default "ecip-alert-rules" }}
  namespace: {{ $ns }}
  labels:
    app: ecip
    role: alert-rules
    grafana_alert: "1"
data:
  {{- range $ruleFile, $_ := .Files.Glob "alerts/*.yaml" }}
  # Key is the bare file name of the alert rule file.
  {{ base $ruleFile }}: |-
    {{ $.Files.Get $ruleFile | nindent 4 }}
  {{- end }}
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: ecip-recording-rules
  namespace: {{ $ns }}
  labels:
    app: ecip
    role: recording-rules
data:
  recording-rules.yaml: |-
    {{ .Files.Get "prometheus/recording-rules.yaml" | nindent 4 }}
---
apiVersion: v1
kind: Secret
metadata:
  name: ecip-scrape-configs
  namespace: {{ $ns }}
  labels:
    app: ecip
type: Opaque
# stringData takes the file content as plain text (no base64 step needed).
stringData:
  scrape-configs.yaml: |-
    {{ .Files.Get "prometheus/scrape-configs.yaml" | nindent 4 }}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{{- /*
ECIP M08 — Grafana Tempo Configuration
Companion to the tempo-distributed subchart: wraps tempo/tempo-datasource.yaml
in a ConfigMap labelled grafana_datasource so Grafana can provision the
Tempo datasource from it.
*/ -}}
{{- $ns := .Values.namespace | default "monitoring" -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-tempo
  namespace: {{ $ns }}
  labels:
    app: ecip
    grafana_datasource: "1"
data:
  tempo-datasource.yaml: |-
    {{ .Files.Get "tempo/tempo-datasource.yaml" | nindent 4 }}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# =============================================================================
# ECIP M08 — Helm Values (Production Overrides)
# =============================================================================
# Overrides for production deployment. Applied on top of values.yaml.
# Usage: helm upgrade --install ecip-obs ./helm -f helm/values.yaml -f helm/values.prod.yaml
#
# Helm deep-merges this file over values.yaml: keys not repeated here keep
# their values.yaml setting.
# =============================================================================

# --- OTel Collector ---
otelCollector:
  resources:
    requests:
      cpu: 500m
      memory: 512Mi
    limits:
      cpu: 2000m
      memory: 1024Mi

# --- Prometheus ---
kube-prometheus-stack:
  prometheus:
    prometheusSpec:
      retention: 30d
      retentionSize: 50GB
      resources:
        requests:
          cpu: 500m
          memory: 2Gi
        limits:
          cpu: 2000m
          memory: 8Gi
      storageSpec:
        volumeClaimTemplate:
          spec:
            storageClassName: gp3
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 100Gi
  alertmanager:
    alertmanagerSpec:
      resources:
        requests:
          cpu: 200m
          memory: 256Mi
        limits:
          cpu: 500m
          memory: 512Mi
  grafana:
    # Production: admin credentials from ExternalSecret (AWS Secrets Manager)
    admin:
      existingSecret: ecip-grafana-admin-prod
      userKey: admin-user
      passwordKey: admin-password
    persistence:
      enabled: true
      size: 20Gi
    resources:
      requests:
        cpu: 250m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 1Gi

# --- Tempo ---
tempo-distributed:
  storage:
    trace:
      backend: s3
      s3:
        bucket: ecip-tempo-traces-prod
        endpoint: s3.amazonaws.com
        region: us-east-1
  retention:
    compacted_block_retention: 336h  # 14 days
  ingester:
    replicas: 3
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
  distributor:
    replicas: 3
    resources:
      requests:
        cpu: 500m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 1Gi
  compactor:
    replicas: 2
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi

# --- Elasticsearch ---
elasticsearch:
  replicas: 3
  minimumMasterNodes: 2
  resources:
    requests:
      cpu: 1000m
      memory: 4Gi
    limits:
      cpu: 2000m
      memory: 8Gi
  volumeClaimTemplate:
    resources:
      requests:
        storage: 100Gi
    storageClassName: gp3

# --- Loki (production) ---
loki:
  loki:
    storage:
      type: s3
      s3:
        s3: s3://ecip-loki-logs-prod
        region: us-east-1
    commonConfig:
      replication_factor: 3
  # values.yaml runs Loki as a single binary (singleBinary.replicas: 1).
  # Because value files are deep-merged, that setting would survive next to
  # the write/read/backend targets below unless it is zeroed here.
  # NOTE(review): confirm against the pinned Loki chart version whether an
  # explicit deployment-mode setting is required as well.
  singleBinary:
    replicas: 0
  write:
    replicas: 3
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
  read:
    replicas: 3
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
  backend:
    replicas: 2
    resources:
      requests:
        cpu: 250m
        memory: 512Mi
      limits:
        cpu: 500m
        memory: 1Gi
  gateway:
    replicas: 2
|
package/helm/values.yaml
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# =============================================================================
# ECIP M08 — Helm Values (Dev/Staging Defaults)
# =============================================================================
# Production overrides: values.prod.yaml
# =============================================================================

# --- Feature toggles ---
# The elasticsearch and loki toggles are set inside their own sections further
# down: a YAML mapping may define each key only once, and parsers that accept
# duplicates silently keep only the last definition.
prometheus:
  enabled: true
tempo:
  enabled: true

# --- Namespace ---
# Namespace used by every template in this chart (templates fall back to
# "monitoring" when unset).
namespace: monitoring

# --- OTel Collector ---
otelCollector:
  image:
    repository: otel/opentelemetry-collector-contrib
    tag: "0.96.0"
  resources:
    requests:
      cpu: 200m
      memory: 256Mi
    limits:
      cpu: 1000m
      memory: 512Mi
  healthCheck:
    # Port probed by the DaemonSet liveness/readiness checks.
    port: 13133
  ports:
    otlpGrpc: 4317
    otlpHttp: 4318
    metrics: 8888
    zpages: 55679

# --- Prometheus ---
kube-prometheus-stack:
  prometheus:
    prometheusSpec:
      retention: 7d
      retentionSize: 10GB
      resources:
        requests:
          cpu: 250m
          memory: 1Gi
        limits:
          cpu: 1000m
          memory: 4Gi
      storageSpec:
        volumeClaimTemplate:
          spec:
            storageClassName: standard
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 20Gi
  alertmanager:
    enabled: true
  grafana:
    enabled: true
    # Admin password sourced from K8s Secret — NEVER hardcode in values files
    admin:
      # Also read by templates/grafana-secret.yaml to name the Secret it creates.
      existingSecret: ecip-grafana-admin
      userKey: admin-user
      passwordKey: admin-password
    sidecar:
      dashboards:
        enabled: true
        label: grafana_dashboard

# --- Tempo ---
tempo-distributed:
  storage:
    trace:
      backend: local
  retention:
    compacted_block_retention: 168h  # 7 days for dev/staging
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 500m
      memory: 1Gi

# --- Elasticsearch ---
elasticsearch:
  enabled: true
  replicas: 1
  minimumMasterNodes: 1
  # host/port/protocol build the URL used by the elasticsearch-setup Job.
  host: elasticsearch.monitoring
  port: 9200
  protocol: http
  resources:
    requests:
      cpu: 250m
      memory: 1Gi
    limits:
      cpu: 1000m
      memory: 2Gi
  volumeClaimTemplate:
    resources:
      requests:
        storage: 20Gi

# --- Dashboard provisioning ---
dashboards:
  configMapName: ecip-grafana-dashboards
  folder: ECIP

# --- Alert rules ---
alertRules:
  configMapName: ecip-alert-rules

# --- Loki (general application logs — OD-01 resolution) ---
loki:
  # Also gates the Loki datasource ConfigMap in templates/loki.yaml.
  enabled: true
  loki:
    auth_enabled: false
    storage:
      type: filesystem
    commonConfig:
      replication_factor: 1
  singleBinary:
    replicas: 1
    resources:
      requests:
        cpu: 250m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 1Gi
    persistence:
      enabled: true
      size: 20Gi
      storageClassName: standard
  gateway:
    enabled: true
  monitoring:
    selfMonitoring:
      enabled: false
    lokiCanary:
      enabled: false
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@ecip-platform/observability",
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "OpenTelemetry instrumentation wrapper for ECIP modules",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"files": [
|
|
8
|
+
"dist/"
|
|
9
|
+
],
|
|
10
|
+
"scripts": {
|
|
11
|
+
"build": "tsc",
|
|
12
|
+
"test": "vitest run",
|
|
13
|
+
"test:watch": "vitest watch",
|
|
14
|
+
"lint": "eslint src/ --ext .ts",
|
|
15
|
+
"prepublishOnly": "pnpm run build"
|
|
16
|
+
},
|
|
17
|
+
"keywords": [
|
|
18
|
+
"ecip",
|
|
19
|
+
"observability",
|
|
20
|
+
"opentelemetry",
|
|
21
|
+
"logging",
|
|
22
|
+
"tracing"
|
|
23
|
+
],
|
|
24
|
+
"author": "ECIP Platform Team",
|
|
25
|
+
"license": "MIT",
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"@opentelemetry/api": "^1.7.0",
|
|
28
|
+
"@opentelemetry/auto-instrumentations-node": "^0.43.0",
|
|
29
|
+
"@opentelemetry/context-async-hooks": "^1.22.0",
|
|
30
|
+
"@opentelemetry/exporter-metrics-otlp-http": "^0.48.0",
|
|
31
|
+
"@opentelemetry/exporter-trace-otlp-grpc": "^0.48.0",
|
|
32
|
+
"@opentelemetry/exporter-trace-otlp-http": "^0.48.0",
|
|
33
|
+
"@opentelemetry/instrumentation-grpc": "^0.48.0",
|
|
34
|
+
"@opentelemetry/instrumentation-http": "^0.48.0",
|
|
35
|
+
"@opentelemetry/resources": "^1.22.0",
|
|
36
|
+
"@opentelemetry/sdk-metrics": "^1.22.0",
|
|
37
|
+
"@opentelemetry/sdk-node": "^0.48.0",
|
|
38
|
+
"@opentelemetry/sdk-trace-node": "^1.22.0",
|
|
39
|
+
"@opentelemetry/semantic-conventions": "^1.22.0",
|
|
40
|
+
"pino": "^8.19.0"
|
|
41
|
+
},
|
|
42
|
+
"devDependencies": {
|
|
43
|
+
"@types/express": "^5.0.6",
|
|
44
|
+
"@types/node": "^20.11.0",
|
|
45
|
+
"@typescript-eslint/eslint-plugin": "^6.19.0",
|
|
46
|
+
"@typescript-eslint/parser": "^6.19.0",
|
|
47
|
+
"eslint": "^8.56.0",
|
|
48
|
+
"typescript": "^5.3.0",
|
|
49
|
+
"vitest": "^1.2.0"
|
|
50
|
+
},
|
|
51
|
+
"engines": {
|
|
52
|
+
"node": ">=18.0.0"
|
|
53
|
+
},
|
|
54
|
+
"publishConfig": {
|
|
55
|
+
"access": "public"
|
|
56
|
+
}
|
|
57
|
+
}
|