@pleri/olam-cli 0.1.160 → 0.1.162

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +11 -0
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
  3. package/dist/commands/bootstrap.d.ts +15 -0
  4. package/dist/commands/bootstrap.d.ts.map +1 -1
  5. package/dist/commands/bootstrap.js +58 -5
  6. package/dist/commands/bootstrap.js.map +1 -1
  7. package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
  8. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
  9. package/dist/commands/flywheel/migrate-overlays.js +29 -3
  10. package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
  11. package/dist/commands/skills-source.d.ts.map +1 -1
  12. package/dist/commands/skills-source.js +57 -2
  13. package/dist/commands/skills-source.js.map +1 -1
  14. package/dist/commands/skills.d.ts.map +1 -1
  15. package/dist/commands/skills.js +14 -0
  16. package/dist/commands/skills.js.map +1 -1
  17. package/dist/image-digests.json +7 -7
  18. package/dist/index.js +2424 -1781
  19. package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
  20. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
  21. package/dist/lib/bootstrap-kubernetes.js +367 -0
  22. package/dist/lib/bootstrap-kubernetes.js.map +1 -0
  23. package/dist/lib/config.d.ts.map +1 -1
  24. package/dist/lib/config.js +6 -1
  25. package/dist/lib/config.js.map +1 -1
  26. package/dist/mcp-server.js +568 -368
  27. package/hermes-bundle/version.json +1 -1
  28. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  29. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  30. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  31. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  32. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  33. package/host-cp/observability/grafana-port-forward.sh +283 -0
  34. package/host-cp/observability/kyverno-cardinality-mutate.sh +462 -0
  35. package/host-cp/observability/loki-ingest.sh +253 -0
  36. package/host-cp/observability/prom-no-double-grafana.sh +311 -0
  37. package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
  38. package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
  39. package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
  40. package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
  41. package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
  42. package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
  43. package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
  44. package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
  45. package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
  46. package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
  47. package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
  48. package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
  49. package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
  50. package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
  51. package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
  52. package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
  53. package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
  54. package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
  55. package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
  56. package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
  57. package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
  58. package/host-cp/src/plan-chat-service.mjs +147 -1
  59. package/package.json +1 -1
@@ -0,0 +1,159 @@
1
+ # Grafana Helm values — k3s-ingress-observability Phase B Task B2
2
+ #
3
+ # STANDALONE grafana/grafana chart per OQ-p3-4 + Decision 16.
4
+ # - This is NOT the Grafana bundled with kube-prometheus-stack.
5
+ # - Phase C kube-prometheus-stack MUST set `grafana.enabled: false`
6
+ # explicitly to prevent a second Grafana Deployment from landing.
7
+ # - Port-forward only — NEVER expose via Traefik IngressRoute.
8
+ # See T7 in DESIGN.md: secret exfil mitigated by no ingress surface.
9
+ #
10
+ # Chart: grafana/grafana; pinned to 8.5.2 (latest stable as of 2026-05-20).
11
+ # Upgrade discipline: chart version is embedded in the e2e script comment.
12
+
13
+ # -------------------------------------------------------------------------
14
+ # Admin credentials — loaded from a pre-existing Secret, NOT from chart
15
+ # values. Secret is created by scripts/e2e/grafana-port-forward.sh before
16
+ # helm install, or by the operator following the procedure in
17
+ # packages/peripheral-services/manifests/README.md (§ "Grafana admin secret").
18
+ # The placeholder manifest (70-grafana-secret.yaml) was removed 2026-05-21
19
+ # (dogfood finding #4) because `kubectl apply` would overwrite the operator's
20
+ # pre-created Secret with the placeholder value.
21
+ # -------------------------------------------------------------------------
22
+ admin:
23
+ existingSecret: olam-grafana-admin
24
+ userKey: admin-user
25
+ passwordKey: admin-password
26
+
27
+ # -------------------------------------------------------------------------
28
+ # Service: ClusterIP only.
29
+ # Decision 16: port-forward only; never ingress-routed.
30
+ # Access: `kubectl port-forward -n monitoring svc/olam-grafana 3000:80`
31
+ # -------------------------------------------------------------------------
32
+ service:
33
+ type: ClusterIP
34
+ port: 80
35
+
36
+ # -------------------------------------------------------------------------
37
+ # Ingress: disabled.
38
+ # Decision 16 + OQ-p3-4: Grafana is never exposed via Traefik IngressRoute.
39
+ # Port-forward is the sole operator access path. Enabling ingress here would
40
+ # silently violate the access-control intent even if no IngressRoute manifest
41
+ # is committed.
42
+ # -------------------------------------------------------------------------
43
+ ingress:
44
+ enabled: false # Decision 16: port-forward only; never ingress-routed
45
+
46
+ # -------------------------------------------------------------------------
47
+ # Datasources: Loki (default) + Prometheus (added in Phase C Task C1).
48
+ #
49
+ # Dual-chart pattern:
50
+ # - kube-prometheus-stack (C1) provides Prometheus. Its bundled Grafana
51
+ # sub-chart is disabled (grafana.enabled: false in kube-prom-stack-values.yaml).
52
+ # - This standalone grafana/grafana chart (Phase B) is the only Grafana.
53
+ # - The Prometheus datasource URL points at `prometheus-operated`, which is
54
+ # the in-cluster Service that kube-prometheus-stack's Prometheus Operator
55
+ # creates for the managed Prometheus StatefulSet.
56
+ # - timeInterval: 15s matches the scrape interval in kube-prom-stack-values.yaml
57
+ # so Grafana's step calculation aligns with actual data granularity.
58
+ # - exemplarTraceIdDestinations.datasourceUid: tempo is harmless until Phase D
59
+ # adds Tempo; Grafana silently ignores unknown datasource UIDs.
60
+ #
61
+ # editable: false prevents accidental operator drift across sessions.
62
+ # -------------------------------------------------------------------------
63
+ datasources:
64
+ datasources.yaml:
65
+ apiVersion: 1
66
+ datasources:
67
+ - name: Loki
68
+ type: loki
69
+ access: proxy
70
+ url: http://olam-loki.monitoring.svc.cluster.local:3100
71
+ isDefault: true
72
+ editable: false
73
+ - name: Prometheus
74
+ type: prometheus
75
+ access: proxy
76
+ url: http://prometheus-operated.monitoring.svc.cluster.local:9090
77
+ isDefault: false
78
+ editable: false
79
+ jsonData:
80
+ timeInterval: 15s # matches scrape interval in kube-prom-stack-values.yaml
81
+ exemplarTraceIdDestinations:
82
+ - name: trace_id
83
+ datasourceUid: tempo # Phase D may add Tempo; harmless until then
84
+
85
+ # -------------------------------------------------------------------------
86
+ # Dashboard provisioner: file-based ConfigMap mount.
87
+ # B3 lands the olam-dashboards ConfigMap and the actual JSON files.
88
+ # B2 wires the loader so B3's ConfigMap is picked up automatically.
89
+ # -------------------------------------------------------------------------
90
+ dashboardProviders:
91
+ dashboardproviders.yaml:
92
+ apiVersion: 1
93
+ providers:
94
+ - name: olam-default
95
+ orgId: 1
96
+ folder: 'Olam'
97
+ type: file
98
+ disableDeletion: true
99
+ updateIntervalSeconds: 30
100
+ allowUiUpdates: false
101
+ options:
102
+ path: /var/lib/grafana/dashboards/olam-default
103
+
104
+ # Wire the volume mount — B3 creates this ConfigMap with the actual JSON.
105
+ # Grafana will warn "ConfigMap olam-dashboards not found" until B3 lands;
106
+ # this is benign and does not block Grafana startup.
107
+ dashboardsConfigMaps:
108
+ olam-default: olam-dashboards # B3 creates this ConfigMap
109
+
110
+ # -------------------------------------------------------------------------
111
+ # Resources: tuned for single-operator k3s (<256Mi idle typical).
112
+ # P2 acceptance criterion: <500MB idle / <1GB typical across full LGTM stack.
113
+ # -------------------------------------------------------------------------
114
+ resources:
115
+ requests:
116
+ cpu: 50m
117
+ memory: 128Mi
118
+ limits:
119
+ cpu: 200m
120
+ memory: 256Mi # P2: keeps Grafana within its share of the LGTM RAM budget
121
+
122
+ # -------------------------------------------------------------------------
123
+ # Persistence: disabled for Phase B.
124
+ # Grafana state (dashboards, users) lives in ConfigMaps / values files.
125
+ # Phase C may enable a PV if fine-grained alert state or annotations
126
+ # accumulate. For now, stateless Grafana is simpler and matches S2.
127
+ # -------------------------------------------------------------------------
128
+ persistence:
129
+ enabled: false # S2: ConfigMap-mounted dashboards; no PV needed in Phase B
130
+
131
+ # -------------------------------------------------------------------------
132
+ # ServiceMonitor: Phase C Prometheus scrapes Grafana's /metrics endpoint.
133
+ # Disabled in Phase B: the ServiceMonitor CRD (monitoring.coreos.com/v1) is
134
+ # shipped by kube-prometheus-stack in Phase C. The earlier "enable now to
135
+ # avoid a Phase C helm upgrade" rationale was wrong — Phase C will need a
136
+ # helm upgrade anyway to wire Prometheus scrape targets. Flipping this on
137
+ # pre-CRD breaks the install on chart versions that hard-validate.
138
+ # -------------------------------------------------------------------------
139
+ serviceMonitor:
140
+ # Disabled in the source-of-truth values file so a standalone Phase B install
141
+ # (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
142
+ # The C1 e2e script flips this on at RUNTIME via
143
+ # helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
144
+ # AFTER kube-prom-stack has installed the ServiceMonitor CRD.
145
+ enabled: false
146
+
147
+ # -------------------------------------------------------------------------
148
+ # Grafana.ini overrides: anonymous access disabled (default); only
149
+ # setting the server root_url so port-forward URLs render correctly
150
+ # in email / share links (cosmetic; not a security seam).
151
+ # -------------------------------------------------------------------------
152
+ grafana.ini:
153
+ server:
154
+ root_url: "%(protocol)s://%(domain)s:%(http_port)s/"
155
+ analytics:
156
+ reporting_enabled: false # no telemetry to grafana.com
157
+ check_for_updates: false
158
+ security:
159
+ allow_embedding: false
@@ -0,0 +1,229 @@
1
+ # kube-prometheus-stack Helm values — k3s-ingress-observability Phase C Task C1
2
+ #
3
+ # Chart: prometheus-community/kube-prometheus-stack; pinned to 85.2.0
4
+ # (latest stable as of 2026-05-21).
5
+ # Upgrade discipline: pin in this file + e2e script comment must stay in sync.
6
+ #
7
+ # CRITICAL: grafana.enabled MUST stay false.
8
+ # Phase B ships a standalone grafana/grafana chart (olam-grafana release).
9
+ # kube-prometheus-stack's bundled Grafana sub-chart is disabled to prevent
10
+ # a second Grafana Deployment from landing in the cluster.
11
+ # Decision 16 + OQ-p3-4: Phase B's standalone Grafana is canonical.
12
+ # Enabling the sub-chart here would violate that decision and create two
13
+ # Grafana instances — caught by prom-no-double-grafana.sh's single-Grafana
14
+ # assertion.
15
+ #
16
+ # Resource budget summary (Phase C contribution to P2 target <500MB idle / <1GB typical):
17
+ # prometheus-operator: 128Mi req / 512Mi limit
18
+ # prometheus: 512Mi req / 2Gi limit
19
+ # node-exporter: 64Mi req / 128Mi limit
20
+ # kube-state-metrics: 128Mi req / 256Mi limit
21
+ # Total C1 addition: ~832Mi req / ~3Gi limit (spread across nodes)
22
+ #
23
+ # Retention policy (Decision 14): scrape 15s / retention 15d / size cap 10GiB.
24
+ # The size cap (T10 TSDB corruption mitigation) is the hard guard; retention 15d
25
+ # is advisory — the size cap enforces first.
26
+ #
27
+ # Alertmanager: was disabled for C1; enabled as of C2, which lands the first
28
+ # alert rule (OlamActiveSeriesHigh, cardinality 80k). No receivers are configured in this file yet.
29
+ # History: "C1 ships without alertmanager; C2 enables when first alert rule lands."
30
+
31
+ # -------------------------------------------------------------------------
32
+ # CARDINALITY ENFORCEMENT — Task C2 (T1 cardinality bomb / P4 <100k active series)
33
+ #
34
+ # Goal: strip high-cardinality labels (world_id, trace_id, user_id,
35
+ # request_id, operator_id) from every scraped series BEFORE TSDB ingest.
36
+ #
37
+ # Architecture finding (helm template verified, 2026-05-21):
38
+ # The prometheus-operator Prometheus CR has NO global metricRelabelConfigs
39
+ # field. The Prometheus CR spec exposes only per-ServiceMonitor endpoint
40
+ # metricRelabelings. There is no chart-level "apply to all scrapes" slot.
41
+ #
42
+ # Enforcement strategy (two-layer):
43
+ # Layer 1 — chart-managed ServiceMonitors: set metricRelabelings on every
44
+ # ServiceMonitor the chart controls (alertmanager, coreDns, prometheusOperator,
45
+ # prometheus self-scrape, node-exporter). Belt-and-suspenders; these
46
+ # services don't emit world_id etc. in practice, but the rule is free.
47
+ # Note: kube-state-metrics sub-chart has no metricRelabelings slot in
48
+ # its prometheus.monitor section at chart version 85.2.0 — omitted.
49
+ # Layer 2 — user-deployed ServiceMonitors: the cardinality-drop.sh e2e
50
+ # script's synthetic violator ServiceMonitor carries the same labeldrop
51
+ # rule (release: olam-prom label + metricRelabelings). New services
52
+ # MUST include the same block — enforced by docs + code review.
53
+ #
54
+ # Why labeldrop is the right action:
55
+ # action: labeldrop removes the matched labels from ALL series that carry
56
+ # them, regardless of metric name. This is the same semantic as Promtail's
57
+ # pipeline drop stages (promtail-values.yaml) — both layers stay in sync.
58
+ # world_id surfaces in dashboards via EXEMPLARS (Decision 9), not labels.
59
+ #
60
+ # Regex covers all five taxonomy labels from observability-label-taxonomy:
61
+ # world_id, trace_id, user_id, request_id, operator_id
62
+ # -------------------------------------------------------------------------
63
+ _cardinalityLabeldrop: &cardinality-labeldrop
64
+ - action: labeldrop
65
+ regex: 'world_id|trace_id|user_id|request_id|operator_id'
66
+
67
+ # -------------------------------------------------------------------------
68
+ # HARD REQUIREMENT: grafana sub-chart is off.
69
+ # See top-of-file comment for rationale.
70
+ # -------------------------------------------------------------------------
71
+ grafana:
72
+ enabled: false # HARD: Decision 16 + OQ-p3-4 — standalone Grafana (olam-grafana) is canonical
73
+
74
+ # -------------------------------------------------------------------------
75
+ # Alertmanager: enabled as of C2, which lands the first alert rule (OlamActiveSeriesHigh).
75
+ # History: C1 shipped without alertmanager; C2 enabled it when the first alert rule landed.
77
+ # -------------------------------------------------------------------------
78
+ alertmanager:
79
+ enabled: true # C2: first alert rule (OlamActiveSeriesHigh) lands; alertmanager enabled
80
+ serviceMonitor:
81
+ metricRelabelings: *cardinality-labeldrop
82
+
83
+ # -------------------------------------------------------------------------
84
+ # Default kube-controller-manager / scheduler / proxy / etcd monitors.
85
+ # These ServiceMonitors don't work on k3d/k3s because the endpoints are not
86
+ # exposed via the usual ports. Disabling avoids noisy "endpoint not found"
87
+ # warnings and scrape failures on every Prometheus eval cycle.
88
+ # -------------------------------------------------------------------------
89
+ kubeControllerManager:
90
+ enabled: false
91
+
92
+ kubeScheduler:
93
+ enabled: false
94
+
95
+ kubeProxy:
96
+ enabled: false
97
+
98
+ kubeEtcd:
99
+ enabled: false
100
+
101
+ # kube-apiserver and kubelet DO work on k3d but generate high-cardinality
102
+ # label combinations. Disable for now; re-evaluate when per-service /metrics
103
+ # (C3) and cardinality enforcement (C2) are in place.
104
+ kubeApiServer:
105
+ enabled: false
106
+
107
+ kubelet:
108
+ enabled: false
109
+
110
+ # -------------------------------------------------------------------------
111
+ # Default alerting rules: off.
112
+ # The bundled default rules generate Alertmanager receivers and PrometheusRule
113
+ # objects for kubelet, etcd, apiserver, etc. — most don't fire on k3d anyway
114
+ # and add noise before C2's focused cardinality rule lands.
115
+ # C2 will add targeted PrometheusRule objects separately.
116
+ # -------------------------------------------------------------------------
117
+ defaultRules:
118
+ create: false
119
+
120
+ # -------------------------------------------------------------------------
121
+ # coreDns — ServiceMonitor with labeldrop (Layer 1 cardinality enforcement)
122
+ # -------------------------------------------------------------------------
123
+ coreDns:
124
+ serviceMonitor:
125
+ metricRelabelings: *cardinality-labeldrop
126
+
127
+ # -------------------------------------------------------------------------
128
+ # CRDs: install via chart (default: true, explicit for clarity).
129
+ # These CRDs (ServiceMonitor, PodMonitor, PrometheusRule, etc.) are required
130
+ # before Phase B's loki/promtail/grafana charts can have serviceMonitor.enabled:true.
131
+ # Phase C's e2e script waits for servicemonitors.monitoring.coreos.com to be
132
+ # Established before helm-upgrading the Phase B charts.
133
+ # -------------------------------------------------------------------------
134
+ crds:
135
+ enabled: true
136
+
137
+ # -------------------------------------------------------------------------
138
+ # Prometheus Operator
139
+ # -------------------------------------------------------------------------
140
+ prometheusOperator:
141
+ enabled: true
142
+ serviceMonitor:
143
+ metricRelabelings: *cardinality-labeldrop
144
+ resources:
145
+ requests:
146
+ cpu: 100m
147
+ memory: 128Mi
148
+ limits:
149
+ cpu: 500m
150
+ memory: 512Mi
151
+
152
+ # -------------------------------------------------------------------------
153
+ # Prometheus core — Decision 14: scrape 15s / retention 15d / 10GiB cap
154
+ # -------------------------------------------------------------------------
155
+ prometheus:
156
+ serviceMonitor:
157
+ metricRelabelings: *cardinality-labeldrop
158
+ prometheusSpec:
159
+ scrapeInterval: 15s # Decision 14
160
+ evaluationInterval: 15s
161
+ retention: 15d # Decision 14 — advisory; size cap enforces first
162
+ retentionSize: 10GiB # Decision 14 — T10 TSDB corruption prevention
163
+ walCompression: true
164
+ enableAdminAPI: false # security: admin API allows snapshot deletion + series deletion
165
+ enableRemoteWriteReceiver: false # not a remote-write target; no inbound writes
166
+ logLevel: warn # info is noisy at 15s scrape cycle
167
+
168
+ resources:
169
+ requests:
170
+ cpu: 200m
171
+ memory: 512Mi
172
+ limits:
173
+ cpu: 1000m
174
+ memory: 2Gi
175
+
176
+ # PersistentVolume for TSDB. 12Gi = 10GiB retention cap + ~20% headroom.
177
+ # local-path provisioner is used on k3d; cloud providers use their default SC.
178
+ storageSpec:
179
+ volumeClaimTemplate:
180
+ spec:
181
+ accessModes:
182
+ - ReadWriteOnce
183
+ resources:
184
+ requests:
185
+ storage: 12Gi # 10GiB retention + 20% headroom for in-flight segments
186
+
187
+ # -------------------------------------------------------------------------
188
+ # Node exporter — keep enabled (host-level metrics: CPU, memory, disk, net).
189
+ # -------------------------------------------------------------------------
190
+ nodeExporter:
191
+ enabled: true
192
+
193
+ prometheus-node-exporter:
194
+ prometheus:
195
+ monitor:
196
+ metricRelabelings: *cardinality-labeldrop
197
+ resources:
198
+ requests:
199
+ cpu: 30m
200
+ memory: 64Mi
201
+ limits:
202
+ cpu: 100m
203
+ memory: 128Mi
204
+
205
+ # -------------------------------------------------------------------------
206
+ # kube-state-metrics — keep enabled (k8s-level metrics: pod phases, deployments).
207
+ # -------------------------------------------------------------------------
208
+ kubeStateMetrics:
209
+ enabled: true
210
+
211
+ kube-state-metrics:
212
+ resources:
213
+ requests:
214
+ cpu: 50m
215
+ memory: 128Mi
216
+ limits:
217
+ cpu: 200m
218
+ memory: 256Mi
219
+
220
+ # -------------------------------------------------------------------------
221
+ # Datasource auto-discovery note:
222
+ # kube-prometheus-stack's grafana.sidecar.datasources is N/A (grafana sub-chart
223
+ # is off). Phase B's standalone Grafana (grafana-values.yaml) has been updated
224
+ # in this same C1 PR to include a Prometheus datasource entry pointing at:
225
+ # http://prometheus-operated.monitoring.svc.cluster.local:9090
226
+ # This is the in-cluster Service that kube-prometheus-stack creates for the
227
+ # Prometheus StatefulSet (created by the Prometheus Operator from the
228
+ # Prometheus CR above).
229
+ # -------------------------------------------------------------------------
@@ -0,0 +1,85 @@
1
+ # Kyverno Helm values — k3s-ingress-observability Phase C C8 follow-up.
2
+ #
3
+ # Kyverno is the policy-as-code layer for cluster-wide cardinality
4
+ # enforcement (closes codex's C2 concern on PR #783). The companion
5
+ # ClusterPolicy in
6
+ # `packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml`
7
+ # mutates every incoming ServiceMonitor and PodMonitor to inject the
8
+ # labeldrop rule before the object is persisted — so a third-party
9
+ # chart (or hand-rolled object) cannot bypass the layer-2
10
+ # per-ServiceMonitor enforcement landed in C2.
11
+ #
12
+ # Chart: kyverno/kyverno; pinned to 3.8.1 (app v1.18.1, 2026-05-21 latest stable).
13
+ # Upgrade discipline: this pin AND the helm-install line in
14
+ # `scripts/e2e/kyverno-cardinality-mutate.sh` must stay in sync.
15
+ #
16
+ # Footprint posture (single-operator k3s scale):
17
+ # We only run admission-time mutation. The ClusterPolicy uses
18
+ # `spec.background: false`, so the background-scan controller is
19
+ # unused. Cleanup + reports controllers are also dead weight for
20
+ # a single ClusterPolicy with no PolicyExceptions — they're disabled
21
+ # so Kyverno's pod count stays minimal (1 pod, not 4).
22
+ #
23
+ # Footprint (Phase C C8 contribution to P2 target <500MB idle / <1GB typical):
24
+ # admissionController: 256Mi req / 512Mi limit (chart default 128Mi/384Mi)
25
+ # Total addition: ~256Mi req / ~512Mi limit
26
+ #
27
+ # If/when we want policy reports populated for observability dashboards,
28
+ # flip `reportsController.enabled: true` and the `features.policyReports`
29
+ # block below. Same for cleanup.
30
+ #
31
+ # Resource limits — tuned upward from chart default for admission webhook
32
+ # stability under burst churn (kube-prom-stack ships ~10 ServiceMonitors at
33
+ # once during `helm upgrade`, which arrives as a burst of AdmissionReviews).
34
+
35
+ # -------------------------------------------------------------------------
36
+ # Disable controllers we don't need
37
+ # -------------------------------------------------------------------------
38
+ backgroundController:
39
+ enabled: false # ClusterPolicy is admission-only (background: false)
40
+
41
+ cleanupController:
42
+ enabled: false # no CleanupPolicy objects in this repo
43
+
44
+ reportsController:
45
+ enabled: false # no policy-reports surface wired into Grafana yet
46
+
47
+ # -------------------------------------------------------------------------
48
+ # Features — admissionReports + policyReports remain ON inside the
49
+ # admission controller itself even when the standalone reports controller
50
+ # is disabled. This keeps `kubectl get clusterpolicyreport` queryable
51
+ # during dogfood; the reports controller would only AGGREGATE them
52
+ # cluster-wide, which we don't need yet.
53
+ # -------------------------------------------------------------------------
54
+ features:
55
+ admissionReports:
56
+ enabled: true
57
+ policyReports:
58
+ enabled: true
59
+ # Background scan is N/A — the policy uses background: false. Explicit
60
+ # off avoids the controller scheduling unnecessary scan workers even
61
+ # when the controller pod is disabled above.
62
+ backgroundScan:
63
+ enabled: false
64
+ # Logging volume defaults are fine; level 2 = info-ish.
65
+ logging:
66
+ format: text
67
+ verbosity: 2
68
+
69
+ # -------------------------------------------------------------------------
70
+ # Admission controller — the only pod we run.
71
+ # -------------------------------------------------------------------------
72
+ admissionController:
73
+ replicas: 1 # single-operator k3s scale; HA is N/A for dogfood
74
+
75
+ rbac:
76
+ create: true # ClusterPolicy needs cluster-wide watch on ServiceMonitor + PodMonitor
77
+
78
+ container:
79
+ resources:
80
+ requests:
81
+ cpu: 100m
82
+ memory: 256Mi
83
+ limits:
84
+ cpu: 500m
85
+ memory: 512Mi
@@ -0,0 +1,166 @@
1
+ # Loki Helm values — k3s-ingress-observability Phase B Task B1
2
+ #
3
+ # Single-binary mode (Decision-16 + Phase B scope):
4
+ # Distributed mode (microservices) adds 5+ independent Deployments + a Minio
5
+ # or S3 backend for object storage — pure overhead for a single-operator
6
+ # k3s install where Loki's write throughput is bounded by one Promtail
7
+ # DaemonSet and a handful of containers. SingleBinary collapses all roles
8
+ # (ingester, querier, compactor) into one Pod, fits within the <500MB idle
9
+ # LGTM RAM target (P2), and is trivially replaceable if scale demands change.
10
+ #
11
+ # See: docs/plans/k3s-ingress-observability/DESIGN.md (P2, S2)
12
+ #
13
+ # Chart: grafana/loki; pinned to 6.7.4 (latest stable as of 2026-05-20).
14
+ # Upgrade discipline: chart version is embedded in the e2e script comment.
15
+
16
+ deploymentMode: SingleBinary
17
+
18
+ loki:
19
+ auth_enabled: false # single-tenant; multi-tenancy adds header overhead with no benefit here
20
+
21
+ commonConfig:
22
+ replication_factor: 1 # single-binary; no replicas = no cross-replica consistency needed
23
+
24
+ # -------------------------------------------------------------------------
25
+ # Storage backend: filesystem (tsdb index per schemaConfig below; local PV).
26
+ # Object storage (S3/GCS/MinIO) deferred to fatbox multi-org Phase F+.
27
+ # For single-operator k3s, local PV is simpler and sufficient.
28
+ # -------------------------------------------------------------------------
29
+ storage:
30
+ type: filesystem
31
+
32
+ schemaConfig:
33
+ configs:
34
+ - from: "2024-01-01"
35
+ store: tsdb
36
+ object_store: filesystem
37
+ schema: v13
38
+ index:
39
+ prefix: loki_index_
40
+ period: 24h
41
+
42
+ # -------------------------------------------------------------------------
43
+ # Retention: 7 days (168h) per Performance budget acceptance criterion #6.
44
+ # compactor.retention_enabled enables deletion; ring config required for
45
+ # single-binary mode.
46
+ # -------------------------------------------------------------------------
47
+ limits_config:
48
+ retention_period: 168h # 7 days
49
+ ingestion_rate_mb: 4 # per-tenant ingestion cap (single tenant)
50
+ ingestion_burst_size_mb: 8
51
+ max_query_series: 5000 # cap log-derived queries from going wide (P3 <3s p95)
52
+ max_entries_limit_per_query: 5000
53
+
54
+ compactor:
55
+ retention_enabled: true
56
+ delete_request_store: filesystem
57
+ compaction_interval: 10m
58
+ working_directory: /var/loki/compactor
59
+
60
+ ingester:
61
+ chunk_idle_period: 30m # flush to storage; appropriate for low write rate
62
+ chunk_retain_period: 1m
63
+ max_chunk_age: 2h
64
+
65
+ # Self-metrics endpoint — Phase C Prometheus scrapes this.
66
+ # Server block exposed on port 3100 (default); /metrics is always available.
67
+
68
+ singleBinary:
69
+ replicas: 1
70
+
71
+ # -------------------------------------------------------------------------
72
+ # Persistence: 10Gi PV.
73
+ #
74
+ # Rationale: 7-day retention at olam scale (<500 containers, access logs
75
+ # estimated 1–2MB/day compressed) → ~100MB typical stored. 10Gi gives ~100x
76
+ # headroom for burst (failed deploy loops, chatty containers); typical usage stays
77
+ # within the <1GB typical acceptance criterion #6. Cloud provider default SC
78
+ # is fine; on bare-metal k3s the local-path provisioner is used.
79
+ # -------------------------------------------------------------------------
80
+ persistence:
81
+ enabled: true
82
+ size: 10Gi # ~100× headroom over 7-day typical (~100MB); <1GB usage target per AC#6
83
+
84
+ # -------------------------------------------------------------------------
85
+ # Resources: memory limit 512Mi per task spec.
86
+ # Typical usage at olam scale: <200MB idle (boltdb index + block cache).
87
+ # 512Mi limit prevents compaction spikes from triggering OOM on the node.
88
+ # -------------------------------------------------------------------------
89
+ resources:
90
+ requests:
91
+ cpu: 100m
92
+ memory: 128Mi
93
+ limits:
94
+ cpu: 500m
95
+ memory: 512Mi # P2: <500MB idle / <1GB typical; limit prevents spike OOM
96
+
97
+ # -------------------------------------------------------------------------
98
+ # Self-metrics for Phase C Prometheus scrape.
99
+ # ServiceMonitor is disabled here; the C1 e2e script enables it in Phase C (see below).
100
+ # -------------------------------------------------------------------------
101
+ monitoring:
102
+ selfMonitoring:
103
+ enabled: false # disables the bundled GrafanaAgent sub-chart dependency
104
+ grafanaAgent:
105
+ installOperator: false
106
+ serviceMonitor:
107
+ # Disabled in the source-of-truth values file so a standalone Phase B install
108
+ # (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
109
+ # The C1 e2e script flips this on at RUNTIME via
110
+ # helm upgrade ... --reuse-values --set monitoring.serviceMonitor.enabled=true
111
+ # AFTER kube-prom-stack has installed the ServiceMonitor CRD.
112
+ # NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor).
113
+ enabled: false
114
+
115
+ # -------------------------------------------------------------------------
116
+ # Backend and read/write gateway: disabled for SingleBinary mode.
117
+ # These are microservices-mode components and must be off or the chart
118
+ # emits validation errors when deploymentMode=SingleBinary.
119
+ # -------------------------------------------------------------------------
120
+ backend:
121
+ replicas: 0
122
+ read:
123
+ replicas: 0
124
+ write:
125
+ replicas: 0
126
+
127
+ # Grafana agent / canary: not needed; disable to keep resource footprint minimal.
128
+ lokiCanary:
129
+ enabled: false
130
+
131
+ test:
132
+ enabled: false
133
+
134
+ # -------------------------------------------------------------------------
135
+ # Sub-component slimming — chart 6.7.4 defaults include nginx gateway +
136
+ # two Memcached clusters + minio + sidecar watchers that single-binary
137
+ # mode doesn't need. Each adds image-pull and Ready-wait time. Disabling
138
+ # all of them brings the install Ready-time within the harness budget.
139
+ # If a future scenario needs query-result caching, re-evaluate
140
+ # resultsCache specifically.
141
+ # -------------------------------------------------------------------------
142
+
143
+ # nginx routing front; Promtail writes direct to single-binary :3100
144
+ gateway:
145
+ enabled: false
146
+
147
+ # Memcached cluster — overhead for single-binary
148
+ chunksCache:
149
+ enabled: false
150
+
151
+ # second Memcached cluster — overhead for single-binary
152
+ resultsCache:
153
+ enabled: false
154
+
155
+ # minio is off because storage.type=filesystem, but be explicit
156
+ minio:
157
+ enabled: false
158
+
159
+ # Sidecar that watches ConfigMaps for runtime config reloads — we don't ship one.
160
+ sidecar:
161
+ rules:
162
+ enabled: false
163
+ datasources:
164
+ enabled: false
165
+ configs:
166
+ enabled: false