@pleri/olam-cli 0.1.160 → 0.1.162
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -0
- package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
- package/dist/commands/bootstrap.d.ts +15 -0
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +58 -5
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
- package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.js +29 -3
- package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
- package/dist/commands/skills-source.d.ts.map +1 -1
- package/dist/commands/skills-source.js +57 -2
- package/dist/commands/skills-source.js.map +1 -1
- package/dist/commands/skills.d.ts.map +1 -1
- package/dist/commands/skills.js +14 -0
- package/dist/commands/skills.js.map +1 -1
- package/dist/image-digests.json +7 -7
- package/dist/index.js +2424 -1781
- package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
- package/dist/lib/bootstrap-kubernetes.js +367 -0
- package/dist/lib/bootstrap-kubernetes.js.map +1 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +6 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/mcp-server.js +568 -368
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/observability/grafana-port-forward.sh +283 -0
- package/host-cp/observability/kyverno-cardinality-mutate.sh +462 -0
- package/host-cp/observability/loki-ingest.sh +253 -0
- package/host-cp/observability/prom-no-double-grafana.sh +311 -0
- package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
- package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
- package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
- package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
- package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
- package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
- package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
- package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
- package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
- package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
- package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
- package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
- package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
- package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
- package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
- package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
- package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
- package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
- package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
- package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
- package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
- package/host-cp/src/plan-chat-service.mjs +147 -1
- package/package.json +1 -1
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# Grafana Helm values — k3s-ingress-observability Phase B Task B2
|
|
2
|
+
#
|
|
3
|
+
# STANDALONE grafana/grafana chart per OQ-p3-4 + Decision 16.
|
|
4
|
+
# - This is NOT the Grafana bundled with kube-prometheus-stack.
|
|
5
|
+
# - Phase C kube-prometheus-stack MUST set `grafana.enabled: false`
|
|
6
|
+
# explicitly to prevent a second Grafana Deployment from landing.
|
|
7
|
+
# - Port-forward only — NEVER expose via Traefik IngressRoute.
|
|
8
|
+
# See T7 in DESIGN.md: secret exfil mitigated by no ingress surface.
|
|
9
|
+
#
|
|
10
|
+
# Chart: grafana/grafana; pinned to 8.5.2 (latest stable as of 2026-05-20).
|
|
11
|
+
# Upgrade discipline: chart version is embedded in the e2e script comment.
|
|
12
|
+
|
|
13
|
+
# -------------------------------------------------------------------------
|
|
14
|
+
# Admin credentials — loaded from a pre-existing Secret, NOT from chart
|
|
15
|
+
# values. Secret is created by scripts/e2e/grafana-port-forward.sh before
|
|
16
|
+
# helm install, or by the operator following the procedure in
|
|
17
|
+
# packages/peripheral-services/manifests/README.md (§ "Grafana admin secret").
|
|
18
|
+
# The placeholder manifest (70-grafana-secret.yaml) was removed 2026-05-21
|
|
19
|
+
# (dogfood finding #4) because `kubectl apply` would overwrite the operator's
|
|
20
|
+
# pre-created Secret with the placeholder value.
|
|
21
|
+
# -------------------------------------------------------------------------
|
|
22
|
+
admin:
|
|
23
|
+
existingSecret: olam-grafana-admin
|
|
24
|
+
userKey: admin-user
|
|
25
|
+
passwordKey: admin-password
|
|
26
|
+
|
|
27
|
+
# -------------------------------------------------------------------------
|
|
28
|
+
# Service: ClusterIP only.
|
|
29
|
+
# Decision 16: port-forward only; never ingress-routed.
|
|
30
|
+
# Access: `kubectl port-forward -n monitoring svc/olam-grafana 3000:80`
|
|
31
|
+
# -------------------------------------------------------------------------
|
|
32
|
+
service:
|
|
33
|
+
type: ClusterIP
|
|
34
|
+
port: 80
|
|
35
|
+
|
|
36
|
+
# -------------------------------------------------------------------------
|
|
37
|
+
# Ingress: disabled.
|
|
38
|
+
# Decision 16 + OQ-p3-4: Grafana is never exposed via Traefik IngressRoute.
|
|
39
|
+
# Port-forward is the sole operator access path. Enabling ingress here would
|
|
40
|
+
# silently violate the access-control intent even if no IngressRoute manifest
|
|
41
|
+
# is committed.
|
|
42
|
+
# -------------------------------------------------------------------------
|
|
43
|
+
ingress:
|
|
44
|
+
enabled: false # Decision 16: port-forward only; never ingress-routed
|
|
45
|
+
|
|
46
|
+
# -------------------------------------------------------------------------
|
|
47
|
+
# Datasources: Loki (default) + Prometheus (added in Phase C Task C1).
|
|
48
|
+
#
|
|
49
|
+
# Dual-chart pattern:
|
|
50
|
+
# - kube-prometheus-stack (C1) provides Prometheus. Its bundled Grafana
|
|
51
|
+
# sub-chart is disabled (grafana.enabled: false in kube-prom-stack-values.yaml).
|
|
52
|
+
# - This standalone grafana/grafana chart (Phase B) is the only Grafana.
|
|
53
|
+
# - The Prometheus datasource URL points at `prometheus-operated`, which is
|
|
54
|
+
# the in-cluster Service that kube-prometheus-stack's Prometheus Operator
|
|
55
|
+
# creates for the managed Prometheus StatefulSet.
|
|
56
|
+
# - timeInterval: 15s matches the scrape interval in kube-prom-stack-values.yaml
|
|
57
|
+
# so Grafana's step calculation aligns with actual data granularity.
|
|
58
|
+
# - exemplarTraceIdDestinations.datasourceUid: tempo is harmless until Phase D
|
|
59
|
+
# adds Tempo; Grafana silently ignores unknown datasource UIDs.
|
|
60
|
+
#
|
|
61
|
+
# editable: false prevents accidental operator drift across sessions.
|
|
62
|
+
# -------------------------------------------------------------------------
|
|
63
|
+
datasources:
|
|
64
|
+
datasources.yaml:
|
|
65
|
+
apiVersion: 1
|
|
66
|
+
datasources:
|
|
67
|
+
- name: Loki
|
|
68
|
+
type: loki
|
|
69
|
+
access: proxy
|
|
70
|
+
url: http://olam-loki.monitoring.svc.cluster.local:3100
|
|
71
|
+
isDefault: true
|
|
72
|
+
editable: false
|
|
73
|
+
- name: Prometheus
|
|
74
|
+
type: prometheus
|
|
75
|
+
access: proxy
|
|
76
|
+
url: http://prometheus-operated.monitoring.svc.cluster.local:9090
|
|
77
|
+
isDefault: false
|
|
78
|
+
editable: false
|
|
79
|
+
jsonData:
|
|
80
|
+
timeInterval: 15s # matches scrape interval in kube-prom-stack-values.yaml
|
|
81
|
+
exemplarTraceIdDestinations:
|
|
82
|
+
- name: trace_id
|
|
83
|
+
datasourceUid: tempo # Phase D may add Tempo; harmless until then
|
|
84
|
+
|
|
85
|
+
# -------------------------------------------------------------------------
|
|
86
|
+
# Dashboard provisioner: file-based ConfigMap mount.
|
|
87
|
+
# B3 lands the olam-dashboards ConfigMap and the actual JSON files.
|
|
88
|
+
# B2 wires the loader so B3's ConfigMap is picked up automatically.
|
|
89
|
+
# -------------------------------------------------------------------------
|
|
90
|
+
dashboardProviders:
|
|
91
|
+
dashboardproviders.yaml:
|
|
92
|
+
apiVersion: 1
|
|
93
|
+
providers:
|
|
94
|
+
- name: olam-default
|
|
95
|
+
orgId: 1
|
|
96
|
+
folder: 'Olam'
|
|
97
|
+
type: file
|
|
98
|
+
disableDeletion: true
|
|
99
|
+
updateIntervalSeconds: 30
|
|
100
|
+
allowUiUpdates: false
|
|
101
|
+
options:
|
|
102
|
+
path: /var/lib/grafana/dashboards/olam-default
|
|
103
|
+
|
|
104
|
+
# Wire the volume mount — B3 creates this ConfigMap with the actual JSON.
|
|
105
|
+
# Grafana will warn "ConfigMap olam-dashboards not found" until B3 lands;
|
|
106
|
+
# this is benign and does not block Grafana startup.
|
|
107
|
+
dashboardsConfigMaps:
|
|
108
|
+
olam-default: olam-dashboards # B3 creates this ConfigMap
|
|
109
|
+
|
|
110
|
+
# -------------------------------------------------------------------------
|
|
111
|
+
# Resources: tuned for single-operator k3s (<256Mi idle typical).
|
|
112
|
+
# P2 acceptance criterion: <500MB idle / <1GB typical across full LGTM stack.
|
|
113
|
+
# -------------------------------------------------------------------------
|
|
114
|
+
resources:
|
|
115
|
+
requests:
|
|
116
|
+
cpu: 50m
|
|
117
|
+
memory: 128Mi
|
|
118
|
+
limits:
|
|
119
|
+
cpu: 200m
|
|
120
|
+
memory: 256Mi # P2: keeps Grafana within its share of the LGTM RAM budget
|
|
121
|
+
|
|
122
|
+
# -------------------------------------------------------------------------
|
|
123
|
+
# Persistence: disabled for Phase B.
|
|
124
|
+
# Grafana state (dashboards, users) lives in ConfigMaps / values files.
|
|
125
|
+
# Phase C may enable a PV if fine-grained alert state or annotations
|
|
126
|
+
# accumulate. For now, stateless Grafana is simpler and matches S2.
|
|
127
|
+
# -------------------------------------------------------------------------
|
|
128
|
+
persistence:
|
|
129
|
+
enabled: false # S2: ConfigMap-mounted dashboards; no PV needed in Phase B
|
|
130
|
+
|
|
131
|
+
# -------------------------------------------------------------------------
|
|
132
|
+
# ServiceMonitor: Phase C Prometheus scrapes Grafana's /metrics endpoint.
|
|
133
|
+
# Disabled in Phase B: the ServiceMonitor CRD (monitoring.coreos.com/v1) is
|
|
134
|
+
# shipped by kube-prometheus-stack in Phase C. The earlier "enable now to
|
|
135
|
+
# avoid a Phase C helm upgrade" rationale was wrong — Phase C will need a
|
|
136
|
+
# helm upgrade anyway to wire Prometheus scrape targets. Flipping this on
|
|
137
|
+
# pre-CRD breaks the install on chart versions that hard-validate.
|
|
138
|
+
# -------------------------------------------------------------------------
|
|
139
|
+
serviceMonitor:
|
|
140
|
+
# Disabled in the source-of-truth values file so a standalone Phase B install
|
|
141
|
+
# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
|
|
142
|
+
# The C1 e2e script flips this on at RUNTIME via
|
|
143
|
+
# helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
|
|
144
|
+
# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
|
|
145
|
+
enabled: false
|
|
146
|
+
|
|
147
|
+
# -------------------------------------------------------------------------
|
|
148
|
+
# Grafana.ini overrides: anonymous access disabled (default); only
|
|
149
|
+
# setting the server root_url so port-forward URLs render correctly
|
|
150
|
+
# in email / share links (cosmetic; not a security seam).
|
|
151
|
+
# -------------------------------------------------------------------------
|
|
152
|
+
grafana.ini:
|
|
153
|
+
server:
|
|
154
|
+
root_url: "%(protocol)s://%(domain)s:%(http_port)s/"
|
|
155
|
+
analytics:
|
|
156
|
+
reporting_enabled: false # no telemetry to grafana.com
|
|
157
|
+
check_for_updates: false
|
|
158
|
+
security:
|
|
159
|
+
allow_embedding: false
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
# kube-prometheus-stack Helm values — k3s-ingress-observability Phase C Task C1
|
|
2
|
+
#
|
|
3
|
+
# Chart: prometheus-community/kube-prometheus-stack; pinned to 85.2.0
|
|
4
|
+
# (latest stable as of 2026-05-21).
|
|
5
|
+
# Upgrade discipline: pin in this file + e2e script comment must stay in sync.
|
|
6
|
+
#
|
|
7
|
+
# CRITICAL: grafana.enabled MUST stay false.
|
|
8
|
+
# Phase B ships a standalone grafana/grafana chart (olam-grafana release).
|
|
9
|
+
# kube-prometheus-stack's bundled Grafana sub-chart is disabled to prevent
|
|
10
|
+
# a second Grafana Deployment from landing in the cluster.
|
|
11
|
+
# Decision 16 + OQ-p3-4: Phase B's standalone Grafana is canonical.
|
|
12
|
+
# Enabling the sub-chart here would violate that decision and create two
|
|
13
|
+
# Grafana instances — caught by prom-no-double-grafana.sh's single-Grafana
|
|
14
|
+
# assertion.
|
|
15
|
+
#
|
|
16
|
+
# Resource budget summary (Phase C contribution to P2 target <500MB idle / <1GB typical):
|
|
17
|
+
# prometheus-operator: 128Mi req / 512Mi limit
|
|
18
|
+
# prometheus: 512Mi req / 2Gi limit
|
|
19
|
+
# node-exporter: 64Mi req / 128Mi limit
|
|
20
|
+
# kube-state-metrics: 128Mi req / 256Mi limit
|
|
21
|
+
# Total C1 addition: ~832Mi req / ~3Gi limit (spread across nodes)
|
|
22
|
+
#
|
|
23
|
+
# Retention policy (Decision 14): scrape 15s / retention 15d / size cap 10GiB.
|
|
24
|
+
# The size cap (T10 TSDB corruption mitigation) is the hard guard; retention 15d
|
|
25
|
+
# is advisory — the size cap enforces first.
|
|
26
|
+
#
|
|
27
|
+
# Alertmanager: disabled for C1. C2 lands the first alert rule (cardinality 80k).
|
|
28
|
+
# As of C2, alertmanager.enabled is true (see below); no receivers are configured in this file yet.
|
|
29
|
+
# Comment: "C1 ships without alertmanager; C2 enables when first alert rule lands."
|
|
30
|
+
|
|
31
|
+
# -------------------------------------------------------------------------
|
|
32
|
+
# CARDINALITY ENFORCEMENT — Task C2 (T1 cardinality bomb / P4 <100k active series)
|
|
33
|
+
#
|
|
34
|
+
# Goal: strip high-cardinality labels (world_id, trace_id, user_id,
|
|
35
|
+
# request_id, operator_id) from every scraped series BEFORE TSDB ingest.
|
|
36
|
+
#
|
|
37
|
+
# Architecture finding (helm template verified, 2026-05-21):
|
|
38
|
+
# The prometheus-operator Prometheus CR has NO global metricRelabelConfigs
|
|
39
|
+
# field. The Prometheus CR spec exposes only per-ServiceMonitor endpoint
|
|
40
|
+
# metricRelabelings. There is no chart-level "apply to all scrapes" slot.
|
|
41
|
+
#
|
|
42
|
+
# Enforcement strategy (two-layer):
|
|
43
|
+
# Layer 1 — chart-managed ServiceMonitors: set metricRelabelings on every
|
|
44
|
+
# ServiceMonitor the chart controls (alertmanager, coreDns, prometheusOperator,
|
|
45
|
+
# prometheus self-scrape, node-exporter). Belt-and-suspenders; these
|
|
46
|
+
# services don't emit world_id etc. in practice, but the rule is free.
|
|
47
|
+
# Note: kube-state-metrics sub-chart has no metricRelabelings slot in
|
|
48
|
+
# its prometheus.monitor section at chart version 85.2.0 — omitted.
|
|
49
|
+
# Layer 2 — user-deployed ServiceMonitors: the cardinality-drop.sh e2e
|
|
50
|
+
# script's synthetic violator ServiceMonitor carries the same labeldrop
|
|
51
|
+
# rule (release: olam-prom label + metricRelabelings). New services
|
|
52
|
+
# MUST include the same block — enforced by docs + code review.
|
|
53
|
+
#
|
|
54
|
+
# Why labeldrop is the right action:
|
|
55
|
+
# action: labeldrop removes the matched labels from ALL series that carry
|
|
56
|
+
# them, regardless of metric name. This is the same semantic as Promtail's
|
|
57
|
+
# pipeline drop stages (promtail-values.yaml) — both layers stay in sync.
|
|
58
|
+
# world_id surfaces in dashboards via EXEMPLARS (Decision 9), not labels.
|
|
59
|
+
#
|
|
60
|
+
# Regex covers all five taxonomy labels from observability-label-taxonomy:
|
|
61
|
+
# world_id, trace_id, user_id, request_id, operator_id
|
|
62
|
+
# -------------------------------------------------------------------------
|
|
63
|
+
_cardinalityLabeldrop: &cardinality-labeldrop
|
|
64
|
+
- action: labeldrop
|
|
65
|
+
regex: 'world_id|trace_id|user_id|request_id|operator_id'
|
|
66
|
+
|
|
67
|
+
# -------------------------------------------------------------------------
|
|
68
|
+
# HARD REQUIREMENT: grafana sub-chart is off.
|
|
69
|
+
# See top-of-file comment for rationale.
|
|
70
|
+
# -------------------------------------------------------------------------
|
|
71
|
+
grafana:
|
|
72
|
+
enabled: false # HARD: Decision 16 + OQ-p3-4 — standalone Grafana (olam-grafana) is canonical
|
|
73
|
+
|
|
74
|
+
# -------------------------------------------------------------------------
|
|
75
|
+
# Alertmanager: enabled as of C2, which lands the first alert rule (was off in C1).
|
|
76
|
+
# C2 comment: "C1 ships without alertmanager; C2 enables when first alert rule lands."
|
|
77
|
+
# -------------------------------------------------------------------------
|
|
78
|
+
alertmanager:
|
|
79
|
+
enabled: true # C2: first alert rule (OlamActiveSeriesHigh) lands; alertmanager enabled
|
|
80
|
+
serviceMonitor:
|
|
81
|
+
metricRelabelings: *cardinality-labeldrop
|
|
82
|
+
|
|
83
|
+
# -------------------------------------------------------------------------
|
|
84
|
+
# Default kube-controller-manager / scheduler / proxy / etcd monitors.
|
|
85
|
+
# These ServiceMonitors don't work on k3d/k3s because the endpoints are not
|
|
86
|
+
# exposed via the usual ports. Disabling avoids noisy "endpoint not found"
|
|
87
|
+
# warnings and scrape failures on every Prometheus eval cycle.
|
|
88
|
+
# -------------------------------------------------------------------------
|
|
89
|
+
kubeControllerManager:
|
|
90
|
+
enabled: false
|
|
91
|
+
|
|
92
|
+
kubeScheduler:
|
|
93
|
+
enabled: false
|
|
94
|
+
|
|
95
|
+
kubeProxy:
|
|
96
|
+
enabled: false
|
|
97
|
+
|
|
98
|
+
kubeEtcd:
|
|
99
|
+
enabled: false
|
|
100
|
+
|
|
101
|
+
# kube-apiserver and kubelet DO work on k3d but generate high-cardinality
|
|
102
|
+
# label combinations. Disable for now; re-evaluate when per-service /metrics
|
|
103
|
+
# (C3) and cardinality enforcement (C2) are in place.
|
|
104
|
+
kubeApiServer:
|
|
105
|
+
enabled: false
|
|
106
|
+
|
|
107
|
+
kubelet:
|
|
108
|
+
enabled: false
|
|
109
|
+
|
|
110
|
+
# -------------------------------------------------------------------------
|
|
111
|
+
# Default alerting rules: off.
|
|
112
|
+
# The bundled default rules generate Alertmanager receivers and PrometheusRule
|
|
113
|
+
# objects for kubelet, etcd, apiserver, etc. — most don't fire on k3d anyway
|
|
114
|
+
# and add noise before C2's focused cardinality rule lands.
|
|
115
|
+
# C2 will add targeted PrometheusRule objects separately.
|
|
116
|
+
# -------------------------------------------------------------------------
|
|
117
|
+
defaultRules:
|
|
118
|
+
create: false
|
|
119
|
+
|
|
120
|
+
# -------------------------------------------------------------------------
|
|
121
|
+
# coreDns — ServiceMonitor with labeldrop (Layer 1 cardinality enforcement)
|
|
122
|
+
# -------------------------------------------------------------------------
|
|
123
|
+
coreDns:
|
|
124
|
+
serviceMonitor:
|
|
125
|
+
metricRelabelings: *cardinality-labeldrop
|
|
126
|
+
|
|
127
|
+
# -------------------------------------------------------------------------
|
|
128
|
+
# CRDs: install via chart (default: true, explicit for clarity).
|
|
129
|
+
# These CRDs (ServiceMonitor, PodMonitor, PrometheusRule, etc.) are required
|
|
130
|
+
# before Phase B's loki/promtail/grafana charts can have serviceMonitor.enabled:true.
|
|
131
|
+
# Phase C's e2e script waits for servicemonitors.monitoring.coreos.com to be
|
|
132
|
+
# Established before helm-upgrading the Phase B charts.
|
|
133
|
+
# -------------------------------------------------------------------------
|
|
134
|
+
crds:
|
|
135
|
+
enabled: true
|
|
136
|
+
|
|
137
|
+
# -------------------------------------------------------------------------
|
|
138
|
+
# Prometheus Operator
|
|
139
|
+
# -------------------------------------------------------------------------
|
|
140
|
+
prometheusOperator:
|
|
141
|
+
enabled: true
|
|
142
|
+
serviceMonitor:
|
|
143
|
+
metricRelabelings: *cardinality-labeldrop
|
|
144
|
+
resources:
|
|
145
|
+
requests:
|
|
146
|
+
cpu: 100m
|
|
147
|
+
memory: 128Mi
|
|
148
|
+
limits:
|
|
149
|
+
cpu: 500m
|
|
150
|
+
memory: 512Mi
|
|
151
|
+
|
|
152
|
+
# -------------------------------------------------------------------------
|
|
153
|
+
# Prometheus core — Decision 14: scrape 15s / retention 15d / 10GiB cap
|
|
154
|
+
# -------------------------------------------------------------------------
|
|
155
|
+
prometheus:
|
|
156
|
+
serviceMonitor:
|
|
157
|
+
metricRelabelings: *cardinality-labeldrop
|
|
158
|
+
prometheusSpec:
|
|
159
|
+
scrapeInterval: 15s # Decision 14
|
|
160
|
+
evaluationInterval: 15s
|
|
161
|
+
retention: 15d # Decision 14 — advisory; size cap enforces first
|
|
162
|
+
retentionSize: 10GiB # Decision 14 — T10 TSDB corruption prevention
|
|
163
|
+
walCompression: true
|
|
164
|
+
enableAdminAPI: false # security: admin API allows snapshot deletion + series deletion
|
|
165
|
+
enableRemoteWriteReceiver: false # not a remote-write target; no inbound writes
|
|
166
|
+
logLevel: warn # info is noisy at 15s scrape cycle
|
|
167
|
+
|
|
168
|
+
resources:
|
|
169
|
+
requests:
|
|
170
|
+
cpu: 200m
|
|
171
|
+
memory: 512Mi
|
|
172
|
+
limits:
|
|
173
|
+
cpu: 1000m
|
|
174
|
+
memory: 2Gi
|
|
175
|
+
|
|
176
|
+
# PersistentVolume for TSDB. 12Gi = 10GiB retention cap + ~20% headroom.
|
|
177
|
+
# local-path provisioner is used on k3d; cloud providers use their default SC.
|
|
178
|
+
storageSpec:
|
|
179
|
+
volumeClaimTemplate:
|
|
180
|
+
spec:
|
|
181
|
+
accessModes:
|
|
182
|
+
- ReadWriteOnce
|
|
183
|
+
resources:
|
|
184
|
+
requests:
|
|
185
|
+
storage: 12Gi # 10GiB retention + 20% headroom for in-flight segments
|
|
186
|
+
|
|
187
|
+
# -------------------------------------------------------------------------
|
|
188
|
+
# Node exporter — keep enabled (host-level metrics: CPU, memory, disk, net).
|
|
189
|
+
# -------------------------------------------------------------------------
|
|
190
|
+
nodeExporter:
|
|
191
|
+
enabled: true
|
|
192
|
+
|
|
193
|
+
prometheus-node-exporter:
|
|
194
|
+
prometheus:
|
|
195
|
+
monitor:
|
|
196
|
+
metricRelabelings: *cardinality-labeldrop
|
|
197
|
+
resources:
|
|
198
|
+
requests:
|
|
199
|
+
cpu: 30m
|
|
200
|
+
memory: 64Mi
|
|
201
|
+
limits:
|
|
202
|
+
cpu: 100m
|
|
203
|
+
memory: 128Mi
|
|
204
|
+
|
|
205
|
+
# -------------------------------------------------------------------------
|
|
206
|
+
# kube-state-metrics — keep enabled (k8s-level metrics: pod phases, deployments).
|
|
207
|
+
# -------------------------------------------------------------------------
|
|
208
|
+
kubeStateMetrics:
|
|
209
|
+
enabled: true
|
|
210
|
+
|
|
211
|
+
kube-state-metrics:
|
|
212
|
+
resources:
|
|
213
|
+
requests:
|
|
214
|
+
cpu: 50m
|
|
215
|
+
memory: 128Mi
|
|
216
|
+
limits:
|
|
217
|
+
cpu: 200m
|
|
218
|
+
memory: 256Mi
|
|
219
|
+
|
|
220
|
+
# -------------------------------------------------------------------------
|
|
221
|
+
# Datasource auto-discovery note:
|
|
222
|
+
# kube-prometheus-stack's grafana.sidecar.datasources is N/A (grafana sub-chart
|
|
223
|
+
# is off). Phase B's standalone Grafana (grafana-values.yaml) has been updated
|
|
224
|
+
# in this same C1 PR to include a Prometheus datasource entry pointing at:
|
|
225
|
+
# http://prometheus-operated.monitoring.svc.cluster.local:9090
|
|
226
|
+
# This is the in-cluster Service that kube-prometheus-stack creates for the
|
|
227
|
+
# Prometheus StatefulSet (created by the Prometheus Operator from the
|
|
228
|
+
# Prometheus CR above).
|
|
229
|
+
# -------------------------------------------------------------------------
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Kyverno Helm values — k3s-ingress-observability Phase C C8 follow-up.
|
|
2
|
+
#
|
|
3
|
+
# Kyverno is the policy-as-code layer for cluster-wide cardinality
|
|
4
|
+
# enforcement (closes codex's C2 concern on PR #783). The companion
|
|
5
|
+
# ClusterPolicy in
|
|
6
|
+
# `packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml`
|
|
7
|
+
# mutates every incoming ServiceMonitor and PodMonitor to inject the
|
|
8
|
+
# labeldrop rule before the object is persisted — so a third-party
|
|
9
|
+
# chart (or hand-rolled object) cannot bypass the layer-2
|
|
10
|
+
# per-ServiceMonitor enforcement landed in C2.
|
|
11
|
+
#
|
|
12
|
+
# Chart: kyverno/kyverno; pinned to 3.8.1 (app v1.18.1, 2026-05-21 latest stable).
|
|
13
|
+
# Upgrade discipline: this pin AND the helm-install line in
|
|
14
|
+
# `scripts/e2e/kyverno-cardinality-mutate.sh` must stay in sync.
|
|
15
|
+
#
|
|
16
|
+
# Footprint posture (single-operator k3s scale):
|
|
17
|
+
# We only run admission-time mutation. The ClusterPolicy uses
|
|
18
|
+
# `spec.background: false`, so the background-scan controller is
|
|
19
|
+
# unused. Cleanup + reports controllers are also dead weight for
|
|
20
|
+
# a single ClusterPolicy with no PolicyExceptions — they're disabled
|
|
21
|
+
# so Kyverno's pod count stays minimal (1 pod, not 4).
|
|
22
|
+
#
|
|
23
|
+
# Footprint (Phase C C8 contribution to P2 target <500MB idle / <1GB typical):
|
|
24
|
+
# admissionController: 256Mi req / 512Mi limit (chart default 128Mi/384Mi)
|
|
25
|
+
# Total addition: ~256Mi req / ~512Mi limit
|
|
26
|
+
#
|
|
27
|
+
# If/when we want policy reports populated for observability dashboards,
|
|
28
|
+
# flip `reportsController.enabled: true` and the `features.policyReports`
|
|
29
|
+
# block below. Same for cleanup.
|
|
30
|
+
#
|
|
31
|
+
# Resource limits — tuned upward from chart default for admission webhook
|
|
32
|
+
# stability under burst churn (kube-prom-stack ships ~10 ServiceMonitors at
|
|
33
|
+
# once during `helm upgrade`, which arrives as a burst of AdmissionReviews).
|
|
34
|
+
|
|
35
|
+
# -------------------------------------------------------------------------
|
|
36
|
+
# Disable controllers we don't need
|
|
37
|
+
# -------------------------------------------------------------------------
|
|
38
|
+
backgroundController:
|
|
39
|
+
enabled: false # ClusterPolicy is admission-only (background: false)
|
|
40
|
+
|
|
41
|
+
cleanupController:
|
|
42
|
+
enabled: false # no CleanupPolicy objects in this repo
|
|
43
|
+
|
|
44
|
+
reportsController:
|
|
45
|
+
enabled: false # no policy-reports surface wired into Grafana yet
|
|
46
|
+
|
|
47
|
+
# -------------------------------------------------------------------------
|
|
48
|
+
# Features — admissionReports + policyReports remain ON inside the
|
|
49
|
+
# admission controller itself even when the standalone reports controller
|
|
50
|
+
# is disabled. This keeps `kubectl get clusterpolicyreport` queryable
|
|
51
|
+
# during dogfood; the reports controller would only AGGREGATE them
|
|
52
|
+
# cluster-wide, which we don't need yet.
|
|
53
|
+
# -------------------------------------------------------------------------
|
|
54
|
+
features:
|
|
55
|
+
admissionReports:
|
|
56
|
+
enabled: true
|
|
57
|
+
policyReports:
|
|
58
|
+
enabled: true
|
|
59
|
+
# Background scan is N/A — the policy uses background: false. Explicit
|
|
60
|
+
# off avoids the controller scheduling unnecessary scan workers even
|
|
61
|
+
# when the controller pod is disabled above.
|
|
62
|
+
backgroundScan:
|
|
63
|
+
enabled: false
|
|
64
|
+
# Logging volume defaults are fine; level 2 = info-ish.
|
|
65
|
+
logging:
|
|
66
|
+
format: text
|
|
67
|
+
verbosity: 2
|
|
68
|
+
|
|
69
|
+
# -------------------------------------------------------------------------
|
|
70
|
+
# Admission controller — the only pod we run.
|
|
71
|
+
# -------------------------------------------------------------------------
|
|
72
|
+
admissionController:
|
|
73
|
+
replicas: 1 # single-operator k3s scale; HA is N/A for dogfood
|
|
74
|
+
|
|
75
|
+
rbac:
|
|
76
|
+
create: true # ClusterPolicy needs cluster-wide watch on ServiceMonitor + PodMonitor
|
|
77
|
+
|
|
78
|
+
container:
|
|
79
|
+
resources:
|
|
80
|
+
requests:
|
|
81
|
+
cpu: 100m
|
|
82
|
+
memory: 256Mi
|
|
83
|
+
limits:
|
|
84
|
+
cpu: 500m
|
|
85
|
+
memory: 512Mi
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# Loki Helm values — k3s-ingress-observability Phase B Task B1
|
|
2
|
+
#
|
|
3
|
+
# Single-binary mode (Decision-16 + Phase B scope):
|
|
4
|
+
# Distributed mode (microservices) adds 5+ independent Deployments + a Minio
|
|
5
|
+
# or S3 backend for object storage — pure overhead for a single-operator
|
|
6
|
+
# k3s install where Loki's write throughput is bounded by one Promtail
|
|
7
|
+
# DaemonSet and a handful of containers. SingleBinary collapses all roles
|
|
8
|
+
# (ingester, querier, compactor) into one Pod, fits within the <500MB idle
|
|
9
|
+
# LGTM RAM target (P2), and is trivially replaceable if scale demands change.
|
|
10
|
+
#
|
|
11
|
+
# See: docs/plans/k3s-ingress-observability/DESIGN.md (P2, S2)
|
|
12
|
+
#
|
|
13
|
+
# Chart: grafana/loki; pinned to 6.7.4 (latest stable as of 2026-05-20).
|
|
14
|
+
# Upgrade discipline: chart version is embedded in the e2e script comment.
|
|
15
|
+
|
|
16
|
+
deploymentMode: SingleBinary
|
|
17
|
+
|
|
18
|
+
loki:
|
|
19
|
+
auth_enabled: false # single-tenant; multi-tenancy adds header overhead with no benefit here
|
|
20
|
+
|
|
21
|
+
commonConfig:
|
|
22
|
+
replication_factor: 1 # single-binary; no replicas = no cross-replica consistency needed
|
|
23
|
+
|
|
24
|
+
# -------------------------------------------------------------------------
|
|
25
|
+
# Storage backend: filesystem (tsdb index, schema v13; local PV).
|
|
26
|
+
# Object storage (S3/GCS/MinIO) deferred to fatbox multi-org Phase F+.
|
|
27
|
+
# For single-operator k3s, local PV is simpler and sufficient.
|
|
28
|
+
# -------------------------------------------------------------------------
|
|
29
|
+
storage:
|
|
30
|
+
type: filesystem
|
|
31
|
+
|
|
32
|
+
schemaConfig:
|
|
33
|
+
configs:
|
|
34
|
+
- from: "2024-01-01"
|
|
35
|
+
store: tsdb
|
|
36
|
+
object_store: filesystem
|
|
37
|
+
schema: v13
|
|
38
|
+
index:
|
|
39
|
+
prefix: loki_index_
|
|
40
|
+
period: 24h
|
|
41
|
+
|
|
42
|
+
# -------------------------------------------------------------------------
|
|
43
|
+
# Retention: 7 days (168h) per Performance budget acceptance criterion #6.
|
|
44
|
+
# compactor.retention_enabled enables deletion; ring config required for
|
|
45
|
+
# single-binary mode.
|
|
46
|
+
# -------------------------------------------------------------------------
|
|
47
|
+
limits_config:
|
|
48
|
+
retention_period: 168h # 7 days
|
|
49
|
+
ingestion_rate_mb: 4 # per-tenant ingestion cap (single tenant)
|
|
50
|
+
ingestion_burst_size_mb: 8
|
|
51
|
+
max_query_series: 5000 # cap log-derived queries from going wide (P3 <3s p95)
|
|
52
|
+
max_entries_limit_per_query: 5000
|
|
53
|
+
|
|
54
|
+
compactor:
|
|
55
|
+
retention_enabled: true
|
|
56
|
+
delete_request_store: filesystem
|
|
57
|
+
compaction_interval: 10m
|
|
58
|
+
working_directory: /var/loki/compactor
|
|
59
|
+
|
|
60
|
+
ingester:
|
|
61
|
+
chunk_idle_period: 30m # flush to storage; appropriate for low write rate
|
|
62
|
+
chunk_retain_period: 1m
|
|
63
|
+
max_chunk_age: 2h
|
|
64
|
+
|
|
65
|
+
# Self-metrics endpoint — Phase C Prometheus scrapes this.
|
|
66
|
+
# Server block exposed on port 3100 (default); /metrics is always available.
|
|
67
|
+
|
|
68
|
+
singleBinary:
|
|
69
|
+
replicas: 1
|
|
70
|
+
|
|
71
|
+
# -------------------------------------------------------------------------
|
|
72
|
+
# Persistence: 10Gi PV.
|
|
73
|
+
#
|
|
74
|
+
# Rationale: 7-day retention at olam scale (<500 containers, access logs
|
|
75
|
+
# estimated 1–2MB/day compressed) → ~100MB typical stored. 10Gi gives ~100x
|
|
76
|
+
# headroom for burst (failed deploy loops, chatty containers) and is well
|
|
77
|
+
# within the <1GB typical acceptance criterion #6. Cloud provider default SC
|
|
78
|
+
# is fine; on bare-metal k3s the local-path provisioner is used.
|
|
79
|
+
# -------------------------------------------------------------------------
|
|
80
|
+
persistence:
|
|
81
|
+
enabled: true
|
|
82
|
+
size: 10Gi # ~100× headroom over 7-day typical (~100MB); <1GB usage target per AC#6
|
|
83
|
+
|
|
84
|
+
# -------------------------------------------------------------------------
|
|
85
|
+
# Resources: memory limit 512Mi per task spec.
|
|
86
|
+
# Typical usage at olam scale: <200MB idle (tsdb index + block cache).
|
|
87
|
+
# 512Mi limit prevents compaction spikes from triggering OOM on the node.
|
|
88
|
+
# -------------------------------------------------------------------------
|
|
89
|
+
resources:
|
|
90
|
+
requests:
|
|
91
|
+
cpu: 100m
|
|
92
|
+
memory: 128Mi
|
|
93
|
+
limits:
|
|
94
|
+
cpu: 500m
|
|
95
|
+
memory: 512Mi # P2: <500MB idle / <1GB typical; limit prevents spike OOM
|
|
96
|
+
|
|
97
|
+
# -------------------------------------------------------------------------
|
|
98
|
+
# Self-metrics for Phase C Prometheus scrape.
|
|
99
|
+
# ServiceMonitor stays disabled here; the C1 e2e script enables it in Phase C.
|
|
100
|
+
# -------------------------------------------------------------------------
|
|
101
|
+
monitoring:
|
|
102
|
+
selfMonitoring:
|
|
103
|
+
enabled: false # disables the bundled GrafanaAgent sub-chart dependency
|
|
104
|
+
grafanaAgent:
|
|
105
|
+
installOperator: false
|
|
106
|
+
serviceMonitor:
|
|
107
|
+
# Disabled in the source-of-truth values file so a standalone Phase B install
|
|
108
|
+
# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
|
|
109
|
+
# The C1 e2e script flips this on at RUNTIME via
|
|
110
|
+
# helm upgrade ... --reuse-values --set monitoring.serviceMonitor.enabled=true
|
|
111
|
+
# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
|
|
112
|
+
# NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor).
|
|
113
|
+
enabled: false
|
|
114
|
+
|
|
115
|
+
# -------------------------------------------------------------------------
|
|
116
|
+
# Backend, read, and write targets: zero replicas for SingleBinary mode.
|
|
117
|
+
# These are simple-scalable-mode components and must be off or the chart
|
|
118
|
+
# emits validation errors when deploymentMode=SingleBinary.
|
|
119
|
+
# -------------------------------------------------------------------------
|
|
120
|
+
backend:
|
|
121
|
+
replicas: 0
|
|
122
|
+
read:
|
|
123
|
+
replicas: 0
|
|
124
|
+
write:
|
|
125
|
+
replicas: 0
|
|
126
|
+
|
|
127
|
+
# Grafana agent / canary: not needed; disable to keep resource footprint minimal.
|
|
128
|
+
lokiCanary:
|
|
129
|
+
enabled: false
|
|
130
|
+
|
|
131
|
+
test:
|
|
132
|
+
enabled: false
|
|
133
|
+
|
|
134
|
+
# -------------------------------------------------------------------------
|
|
135
|
+
# Sub-component slimming — chart 6.7.4 defaults include nginx gateway +
|
|
136
|
+
# two Memcached clusters + minio + sidecar watchers that single-binary
|
|
137
|
+
# mode doesn't need. Each adds image-pull and Ready-wait time. Disabling
|
|
138
|
+
# all of them brings the install Ready-time within the harness budget.
|
|
139
|
+
# If a future scenario needs query-result caching, re-evaluate
|
|
140
|
+
# resultsCache specifically.
|
|
141
|
+
# -------------------------------------------------------------------------
|
|
142
|
+
|
|
143
|
+
# nginx routing front; Promtail writes direct to single-binary :3100
|
|
144
|
+
gateway:
|
|
145
|
+
enabled: false
|
|
146
|
+
|
|
147
|
+
# Memcached cluster — overhead for single-binary
|
|
148
|
+
chunksCache:
|
|
149
|
+
enabled: false
|
|
150
|
+
|
|
151
|
+
# second Memcached cluster — overhead for single-binary
|
|
152
|
+
resultsCache:
|
|
153
|
+
enabled: false
|
|
154
|
+
|
|
155
|
+
# minio is off because storage.type=filesystem, but be explicit
|
|
156
|
+
minio:
|
|
157
|
+
enabled: false
|
|
158
|
+
|
|
159
|
+
# Sidecar that watches ConfigMaps for runtime config reloads — we don't ship one.
|
|
160
|
+
sidecar:
|
|
161
|
+
rules:
|
|
162
|
+
enabled: false
|
|
163
|
+
datasources:
|
|
164
|
+
enabled: false
|
|
165
|
+
configs:
|
|
166
|
+
enabled: false
|