@pleri/olam-cli 0.1.161 → 0.1.162
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +35 -11
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
- package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.js +29 -3
- package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
- package/dist/commands/skills-source.d.ts.map +1 -1
- package/dist/commands/skills-source.js +57 -2
- package/dist/commands/skills-source.js.map +1 -1
- package/dist/commands/skills.d.ts.map +1 -1
- package/dist/commands/skills.js +14 -0
- package/dist/commands/skills.js.map +1 -1
- package/dist/image-digests.json +7 -7
- package/dist/index.js +996 -618
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +93 -13
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/mcp-server.js +568 -368
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/observability/grafana-port-forward.sh +12 -2
- package/host-cp/observability/kyverno-cardinality-mutate.sh +12 -2
- package/host-cp/observability/loki-ingest.sh +12 -2
- package/host-cp/observability/prom-no-double-grafana.sh +15 -5
- package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
- package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
- package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
- package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
- package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
- package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
- package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
- package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
- package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
- package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
- package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
- package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
- package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
- package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
- package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
- package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
- package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
- package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
- package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
- package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
- package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
- package/host-cp/src/plan-chat-service.mjs +147 -1
- package/package.json +1 -1
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Kyverno Helm values — k3s-ingress-observability Phase C C8 follow-up.
|
|
2
|
+
#
|
|
3
|
+
# Kyverno is the policy-as-code layer for cluster-wide cardinality
|
|
4
|
+
# enforcement (closes codex's C2 concern on PR #783). The companion
|
|
5
|
+
# ClusterPolicy in
|
|
6
|
+
# `packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml`
|
|
7
|
+
# mutates every incoming ServiceMonitor and PodMonitor to inject the
|
|
8
|
+
# labeldrop rule before the object is persisted — so a third-party
|
|
9
|
+
# chart (or hand-rolled object) cannot bypass the layer-2
|
|
10
|
+
# per-ServiceMonitor enforcement landed in C2.
|
|
11
|
+
#
|
|
12
|
+
# Chart: kyverno/kyverno; pinned to 3.8.1 (app v1.18.1, 2026-05-21 latest stable).
|
|
13
|
+
# Upgrade discipline: this pin AND the helm-install line in
|
|
14
|
+
# `scripts/e2e/kyverno-cardinality-mutate.sh` must stay in sync.
|
|
15
|
+
#
|
|
16
|
+
# Footprint posture (single-operator k3s scale):
|
|
17
|
+
# We only run admission-time mutation. The ClusterPolicy uses
|
|
18
|
+
# `spec.background: false`, so the background-scan controller is
|
|
19
|
+
# unused. Cleanup + reports controllers are also dead weight for
|
|
20
|
+
# a single ClusterPolicy with no PolicyExceptions — they're disabled
|
|
21
|
+
# so Kyverno's pod count stays minimal (1 pod, not 4).
|
|
22
|
+
#
|
|
23
|
+
# Footprint (Phase C C8 contribution to P2 target <500MB idle / <1GB typical):
|
|
24
|
+
# admissionController: 256Mi req / 512Mi limit (chart default 128Mi/384Mi)
|
|
25
|
+
# Total addition: ~256Mi req / ~512Mi limit
|
|
26
|
+
#
|
|
27
|
+
# If/when we want policy reports populated for observability dashboards,
|
|
28
|
+
# flip `reportsController.enabled: true` (`features.policyReports` in the
|
|
29
|
+
# block below is already enabled). Same for cleanup: flip `cleanupController.enabled`.
|
|
30
|
+
#
|
|
31
|
+
# Resource limits — tuned upward from chart default for admission webhook
|
|
32
|
+
# stability under burst churn (kube-prom-stack ships ~10 ServiceMonitors at
|
|
33
|
+
# once during `helm upgrade`, which arrives as a burst of AdmissionReviews).
|
|
34
|
+
|
|
35
|
+
# -------------------------------------------------------------------------
|
|
36
|
+
# Disable controllers we don't need
|
|
37
|
+
# -------------------------------------------------------------------------
|
|
38
|
+
backgroundController:
|
|
39
|
+
enabled: false # ClusterPolicy is admission-only (background: false)
|
|
40
|
+
|
|
41
|
+
cleanupController:
|
|
42
|
+
enabled: false # no CleanupPolicy objects in this repo
|
|
43
|
+
|
|
44
|
+
reportsController:
|
|
45
|
+
enabled: false # no policy-reports surface wired into Grafana yet
|
|
46
|
+
|
|
47
|
+
# -------------------------------------------------------------------------
|
|
48
|
+
# Features — admissionReports + policyReports remain ON inside the
|
|
49
|
+
# admission controller itself even when the standalone reports controller
|
|
50
|
+
# is disabled. This keeps `kubectl get clusterpolicyreport` queryable
|
|
51
|
+
# during dogfood; the reports controller would only AGGREGATE them
|
|
52
|
+
# cluster-wide, which we don't need yet.
|
|
53
|
+
# -------------------------------------------------------------------------
|
|
54
|
+
features:
|
|
55
|
+
admissionReports:
|
|
56
|
+
enabled: true
|
|
57
|
+
policyReports:
|
|
58
|
+
enabled: true
|
|
59
|
+
# Background scan is N/A — the policy uses background: false. Explicit
|
|
60
|
+
# off avoids the controller scheduling unnecessary scan workers even
|
|
61
|
+
# when the controller pod is disabled above.
|
|
62
|
+
backgroundScan:
|
|
63
|
+
enabled: false
|
|
64
|
+
# Logging volume defaults are fine; level 2 = info-ish.
|
|
65
|
+
logging:
|
|
66
|
+
format: text
|
|
67
|
+
verbosity: 2
|
|
68
|
+
|
|
69
|
+
# -------------------------------------------------------------------------
|
|
70
|
+
# Admission controller — the only pod we run.
|
|
71
|
+
# -------------------------------------------------------------------------
|
|
72
|
+
admissionController:
|
|
73
|
+
replicas: 1 # single-operator k3s scale; HA is N/A for dogfood
|
|
74
|
+
|
|
75
|
+
rbac:
|
|
76
|
+
create: true # ClusterPolicy needs cluster-wide watch on ServiceMonitor + PodMonitor
|
|
77
|
+
|
|
78
|
+
container:
|
|
79
|
+
resources:
|
|
80
|
+
requests:
|
|
81
|
+
cpu: 100m
|
|
82
|
+
memory: 256Mi
|
|
83
|
+
limits:
|
|
84
|
+
cpu: 500m
|
|
85
|
+
memory: 512Mi
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# Loki Helm values — k3s-ingress-observability Phase B Task B1
|
|
2
|
+
#
|
|
3
|
+
# Single-binary mode (Decision-16 + Phase B scope):
|
|
4
|
+
# Distributed mode (microservices) adds 5+ independent Deployments + a Minio
|
|
5
|
+
# or S3 backend for object storage — pure overhead for a single-operator
|
|
6
|
+
# k3s install where Loki's write throughput is bounded by one Promtail
|
|
7
|
+
# DaemonSet and a handful of containers. SingleBinary collapses all roles
|
|
8
|
+
# (ingester, querier, compactor) into one Pod, fits within the <500MB idle
|
|
9
|
+
# LGTM RAM target (P2), and is trivially replaceable if scale demands change.
|
|
10
|
+
#
|
|
11
|
+
# See: docs/plans/k3s-ingress-observability/DESIGN.md (P2, S2)
|
|
12
|
+
#
|
|
13
|
+
# Chart: grafana/loki; pinned to 6.7.4 (latest stable as of 2026-05-20).
|
|
14
|
+
# Upgrade discipline: chart version is embedded in the e2e script comment.
|
|
15
|
+
|
|
16
|
+
deploymentMode: SingleBinary
|
|
17
|
+
|
|
18
|
+
loki:
|
|
19
|
+
auth_enabled: false # single-tenant; multi-tenancy adds header overhead with no benefit here
|
|
20
|
+
|
|
21
|
+
commonConfig:
|
|
22
|
+
replication_factor: 1 # single-binary; no replicas = no cross-replica consistency needed
|
|
23
|
+
|
|
24
|
+
# -------------------------------------------------------------------------
|
|
25
|
+
# Storage backend: filesystem (TSDB index + chunks on a local PV).
|
|
26
|
+
# Object storage (S3/GCS/MinIO) deferred to fatbox multi-org Phase F+.
|
|
27
|
+
# For single-operator k3s, local PV is simpler and sufficient.
|
|
28
|
+
# -------------------------------------------------------------------------
|
|
29
|
+
storage:
|
|
30
|
+
type: filesystem
|
|
31
|
+
|
|
32
|
+
schemaConfig:
|
|
33
|
+
configs:
|
|
34
|
+
- from: "2024-01-01"
|
|
35
|
+
store: tsdb
|
|
36
|
+
object_store: filesystem
|
|
37
|
+
schema: v13
|
|
38
|
+
index:
|
|
39
|
+
prefix: loki_index_
|
|
40
|
+
period: 24h
|
|
41
|
+
|
|
42
|
+
# -------------------------------------------------------------------------
|
|
43
|
+
# Retention: 7 days (168h) per Performance budget acceptance criterion #6.
|
|
44
|
+
# compactor.retention_enabled enables deletion; ring config required for
|
|
45
|
+
# single-binary mode.
|
|
46
|
+
# -------------------------------------------------------------------------
|
|
47
|
+
limits_config:
|
|
48
|
+
retention_period: 168h # 7 days
|
|
49
|
+
ingestion_rate_mb: 4 # per-tenant ingestion cap (single tenant)
|
|
50
|
+
ingestion_burst_size_mb: 8
|
|
51
|
+
max_query_series: 5000 # cap log-derived queries from going wide (P3 <3s p95)
|
|
52
|
+
max_entries_limit_per_query: 5000
|
|
53
|
+
|
|
54
|
+
compactor:
|
|
55
|
+
retention_enabled: true
|
|
56
|
+
delete_request_store: filesystem
|
|
57
|
+
compaction_interval: 10m
|
|
58
|
+
working_directory: /var/loki/compactor
|
|
59
|
+
|
|
60
|
+
ingester:
|
|
61
|
+
chunk_idle_period: 30m # flush to storage; appropriate for low write rate
|
|
62
|
+
chunk_retain_period: 1m
|
|
63
|
+
max_chunk_age: 2h
|
|
64
|
+
|
|
65
|
+
# Self-metrics endpoint — Phase C Prometheus scrapes this.
|
|
66
|
+
# Server block exposed on port 3100 (default); /metrics is always available.
|
|
67
|
+
|
|
68
|
+
singleBinary:
|
|
69
|
+
replicas: 1
|
|
70
|
+
|
|
71
|
+
# -------------------------------------------------------------------------
|
|
72
|
+
# Persistence: 10Gi PV.
|
|
73
|
+
#
|
|
74
|
+
# Rationale: 7-day retention at olam scale (<500 containers, access logs
|
|
75
|
+
# estimated 1–2MB/day compressed) → ~100MB typical stored. 10Gi gives ~100x
|
|
76
|
+
# headroom for burst (failed deploy loops, chatty containers) and is well
|
|
77
|
+
# within the <1GB typical acceptance criterion #6. Cloud provider default SC
|
|
78
|
+
# is fine; on bare-metal k3s the local-path provisioner is used.
|
|
79
|
+
# -------------------------------------------------------------------------
|
|
80
|
+
persistence:
|
|
81
|
+
enabled: true
|
|
82
|
+
size: 10Gi # ~100× headroom over 7-day typical (~100MB); <1GB usage target per AC#6
|
|
83
|
+
|
|
84
|
+
# -------------------------------------------------------------------------
|
|
85
|
+
# Resources: memory limit 512Mi per task spec.
|
|
86
|
+
# Typical usage at olam scale: <200MB idle (TSDB index + block cache).
|
|
87
|
+
# 512Mi limit prevents compaction spikes from triggering OOM on the node.
|
|
88
|
+
# -------------------------------------------------------------------------
|
|
89
|
+
resources:
|
|
90
|
+
requests:
|
|
91
|
+
cpu: 100m
|
|
92
|
+
memory: 128Mi
|
|
93
|
+
limits:
|
|
94
|
+
cpu: 500m
|
|
95
|
+
memory: 512Mi # P2: <500MB idle / <1GB typical; limit prevents spike OOM
|
|
96
|
+
|
|
97
|
+
# -------------------------------------------------------------------------
|
|
98
|
+
# Self-metrics for Phase C Prometheus scrape.
|
|
99
|
+
# ServiceMonitor is configured here but ships disabled; the C1 e2e script enables it in Phase C.
|
|
100
|
+
# -------------------------------------------------------------------------
|
|
101
|
+
monitoring:
|
|
102
|
+
selfMonitoring:
|
|
103
|
+
enabled: false # disables the bundled GrafanaAgent sub-chart dependency
|
|
104
|
+
grafanaAgent:
|
|
105
|
+
installOperator: false
|
|
106
|
+
serviceMonitor:
|
|
107
|
+
# Disabled in the source-of-truth values file so a standalone Phase B install
|
|
108
|
+
# (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
|
|
109
|
+
# The C1 e2e script flips this on at RUNTIME via
|
|
110
|
+
# helm upgrade ... --reuse-values --set monitoring.serviceMonitor.enabled=true
|
|
111
|
+
# AFTER kube-prom-stack has installed the ServiceMonitor CRD.
|
|
112
|
+
# NOTE: Loki chart 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor).
|
|
113
|
+
enabled: false
|
|
114
|
+
|
|
115
|
+
# -------------------------------------------------------------------------
|
|
116
|
+
# Backend and read/write gateway: disabled for SingleBinary mode.
|
|
117
|
+
# These are microservices-mode components and must be off or the chart
|
|
118
|
+
# emits validation errors when deploymentMode=SingleBinary.
|
|
119
|
+
# -------------------------------------------------------------------------
|
|
120
|
+
backend:
|
|
121
|
+
replicas: 0
|
|
122
|
+
read:
|
|
123
|
+
replicas: 0
|
|
124
|
+
write:
|
|
125
|
+
replicas: 0
|
|
126
|
+
|
|
127
|
+
# Grafana agent / canary: not needed; disable to keep resource footprint minimal.
|
|
128
|
+
lokiCanary:
|
|
129
|
+
enabled: false
|
|
130
|
+
|
|
131
|
+
test:
|
|
132
|
+
enabled: false
|
|
133
|
+
|
|
134
|
+
# -------------------------------------------------------------------------
|
|
135
|
+
# Sub-component slimming — chart 6.7.4 defaults include nginx gateway +
|
|
136
|
+
# two Memcached clusters + minio + sidecar watchers that single-binary
|
|
137
|
+
# mode doesn't need. Each adds image-pull and Ready-wait time. Disabling
|
|
138
|
+
# all of them brings the install Ready-time within the harness budget.
|
|
139
|
+
# If a future scenario needs query-result caching, re-evaluate
|
|
140
|
+
# resultsCache specifically.
|
|
141
|
+
# -------------------------------------------------------------------------
|
|
142
|
+
|
|
143
|
+
# nginx routing front; Promtail writes direct to single-binary :3100
|
|
144
|
+
gateway:
|
|
145
|
+
enabled: false
|
|
146
|
+
|
|
147
|
+
# Memcached cluster — overhead for single-binary
|
|
148
|
+
chunksCache:
|
|
149
|
+
enabled: false
|
|
150
|
+
|
|
151
|
+
# second Memcached cluster — overhead for single-binary
|
|
152
|
+
resultsCache:
|
|
153
|
+
enabled: false
|
|
154
|
+
|
|
155
|
+
# minio is off because storage.type=filesystem, but be explicit
|
|
156
|
+
minio:
|
|
157
|
+
enabled: false
|
|
158
|
+
|
|
159
|
+
# Sidecar that watches ConfigMaps for runtime config reloads — we don't ship one.
|
|
160
|
+
sidecar:
|
|
161
|
+
rules:
|
|
162
|
+
enabled: false
|
|
163
|
+
datasources:
|
|
164
|
+
enabled: false
|
|
165
|
+
configs:
|
|
166
|
+
enabled: false
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Promtail Helm values — Phase A Task A5 staging (Phase B consumes)
|
|
2
|
+
#
|
|
3
|
+
# Tails every container's stdout; ships to Loki single-binary (Phase B installs Loki).
|
|
4
|
+
# Per OQ-p3-6: Traefik native config can redact HEADERS but NOT URL query params —
|
|
5
|
+
# query-param scrubbing for `?token=`, `?code=`, `?access_token=`, `?state=` happens
|
|
6
|
+
# HERE at Promtail ingest via pipeline_stages.replace regex.
|
|
7
|
+
#
|
|
8
|
+
# Resource limits per OQ-p3-37 (Promtail OOM risk under chatty container-cp 100ms cadence):
|
|
9
|
+
# - memory limit 256Mi
|
|
10
|
+
# - pipeline_stages.limit rate 100 lines/sec/stream
|
|
11
|
+
#
|
|
12
|
+
# Scrape config matches every pod log; namespace-scope labels are added so Loki LogQL queries
|
|
13
|
+
# can filter by service / namespace / pod.
|
|
14
|
+
#
|
|
15
|
+
# SECURITY NOTE — replace stage regex semantics (load-bearing):
|
|
16
|
+
# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
|
|
17
|
+
# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
|
|
18
|
+
# template syntax and silently becomes a literal. The correct pattern is:
|
|
19
|
+
# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
|
|
20
|
+
# replace: 'REDACTED' — replace captured secret with literal
|
|
21
|
+
# See promtail-values.yaml header comment for full details.
|
|
22
|
+
|
|
23
|
+
deploymentMode: DaemonSet
|
|
24
|
+
|
|
25
|
+
resources:
|
|
26
|
+
requests:
|
|
27
|
+
cpu: 50m
|
|
28
|
+
memory: 64Mi
|
|
29
|
+
limits:
|
|
30
|
+
cpu: 200m
|
|
31
|
+
memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
|
|
32
|
+
|
|
33
|
+
config:
|
|
34
|
+
clients:
|
|
35
|
+
- url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
|
|
36
|
+
|
|
37
|
+
snippets:
|
|
38
|
+
pipelineStages:
|
|
39
|
+
# 1. Parse JSON access logs from Traefik (key field present in JSON line)
|
|
40
|
+
- match:
|
|
41
|
+
selector: '{container="traefik"}'
|
|
42
|
+
stages:
|
|
43
|
+
- json:
|
|
44
|
+
expressions:
|
|
45
|
+
request_method: RequestMethod
|
|
46
|
+
request_path: RequestPath
|
|
47
|
+
status: DownstreamStatus
|
|
48
|
+
request_id: requestId
|
|
49
|
+
service: ServiceName
|
|
50
|
+
router: RouterName
|
|
51
|
+
|
|
52
|
+
# 2. Scrub OAuth/token values from URL query params and Authorization headers.
|
|
53
|
+
#
|
|
54
|
+
# IMPORTANT — capture group semantics:
|
|
55
|
+
# The replace stage replaces each CAPTURE GROUP with the `replace` template
|
|
56
|
+
# value. Capture groups must wrap ONLY the secret value, not the surrounding
|
|
57
|
+
# context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
|
|
58
|
+
# it is preserved in the output while only the secret is replaced.
|
|
59
|
+
- replace:
|
|
60
|
+
# OAuth code= callback values — capture only the token value after `code=`
|
|
61
|
+
expression: '(?:\?|&)code=([^&\s]+)'
|
|
62
|
+
replace: 'REDACTED'
|
|
63
|
+
- replace:
|
|
64
|
+
# Bearer / access tokens in query strings — capture only the value
|
|
65
|
+
expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
|
|
66
|
+
replace: 'REDACTED'
|
|
67
|
+
- replace:
|
|
68
|
+
# OAuth state param (may carry session info) — capture only the value
|
|
69
|
+
expression: '(?:\?|&)state=([^&\s]+)'
|
|
70
|
+
replace: 'REDACTED'
|
|
71
|
+
- replace:
|
|
72
|
+
# Authorization header Bearer value — capture only the token after `Bearer `
|
|
73
|
+
expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
|
|
74
|
+
replace: 'REDACTED'
|
|
75
|
+
|
|
76
|
+
# 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
|
|
77
|
+
- limit:
|
|
78
|
+
rate: 100 # max log lines/sec per stream
|
|
79
|
+
burst: 200
|
|
80
|
+
drop: true # drop excess lines; do NOT block tail
|
|
81
|
+
|
|
82
|
+
# 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
|
|
83
|
+
- labels:
|
|
84
|
+
service: # from Traefik JSON access log; matches taxonomy `service` label
|
|
85
|
+
router: # Traefik router name
|
|
86
|
+
status: # HTTP status code (within taxonomy)
|
|
87
|
+
|
|
88
|
+
# Retention is configured on Loki side (Phase B), not Promtail.
|
|
89
|
+
# Sample retention target: 7 days, per the Performance budget row.
|
|
90
|
+
|
|
91
|
+
serviceMonitor:
|
|
92
|
+
enabled: true # Prometheus (Phase C) scrapes Promtail's own /metrics for self-observability
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Promtail Helm values — k3s-ingress-observability Phase B Task B1 (production)
|
|
2
|
+
#
|
|
3
|
+
# Production Promtail values. Staging copy at promtail-staging.yaml has the
|
|
4
|
+
# same scrubbing pipeline shape; this file sets the Loki client URL +
|
|
5
|
+
# production resource limits.
|
|
6
|
+
#
|
|
7
|
+
# Scrubbing pipeline:
|
|
8
|
+
# - 4 `replace` stages: code=, token/access_token/api_key/secret=, state=, Authorization
|
|
9
|
+
# - `limit` stage: rate=100/burst=200/drop=true (OQ-p3-37: Promtail OOM under chatty containers)
|
|
10
|
+
# Client URL: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
|
|
11
|
+
# Service name `olam-loki` is the Helm release name used in scripts/e2e/loki-ingest.sh
|
|
12
|
+
# (`helm upgrade --install olam-loki grafana/loki ...`); the chart's Service
|
|
13
|
+
# is named after the release, so `olam-loki` is the in-cluster DNS hostname.
|
|
14
|
+
#
|
|
15
|
+
# SECURITY NOTE — replace stage regex semantics (load-bearing):
|
|
16
|
+
# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
|
|
17
|
+
# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
|
|
18
|
+
# template syntax and silently becomes a literal. The correct pattern is:
|
|
19
|
+
# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
|
|
20
|
+
# replace: 'REDACTED' — replace captured secret with literal
|
|
21
|
+
# This leaves the surrounding context (e.g. `?code=`) intact and redacts only
|
|
22
|
+
# the value. The broken pattern `(\?|&)code=[^&\s]+` with `replace: '${1}code=REDACTED'`
|
|
23
|
+
# was the root cause of the Phase B scrubbing regression (PR #776).
|
|
24
|
+
#
|
|
25
|
+
# See: docs/plans/k3s-ingress-observability/DESIGN.md (T8, T9)
|
|
26
|
+
|
|
27
|
+
deploymentMode: DaemonSet
|
|
28
|
+
|
|
29
|
+
resources:
|
|
30
|
+
requests:
|
|
31
|
+
cpu: 50m
|
|
32
|
+
memory: 64Mi
|
|
33
|
+
limits:
|
|
34
|
+
cpu: 200m
|
|
35
|
+
memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
|
|
36
|
+
|
|
37
|
+
config:
|
|
38
|
+
clients:
|
|
39
|
+
- url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
|
|
40
|
+
|
|
41
|
+
snippets:
|
|
42
|
+
pipelineStages:
|
|
43
|
+
# 1. Parse JSON access logs from Traefik (key field present in JSON line)
|
|
44
|
+
- match:
|
|
45
|
+
selector: '{container="traefik"}'
|
|
46
|
+
stages:
|
|
47
|
+
- json:
|
|
48
|
+
expressions:
|
|
49
|
+
request_method: RequestMethod
|
|
50
|
+
request_path: RequestPath
|
|
51
|
+
status: DownstreamStatus
|
|
52
|
+
request_id: requestId
|
|
53
|
+
service: ServiceName
|
|
54
|
+
router: RouterName
|
|
55
|
+
|
|
56
|
+
# 2. Scrub OAuth/token values from URL query params and Authorization headers.
|
|
57
|
+
#
|
|
58
|
+
# IMPORTANT — capture group semantics:
|
|
59
|
+
# The replace stage replaces each CAPTURE GROUP with the `replace` template
|
|
60
|
+
# value. Capture groups must wrap ONLY the secret value, not the surrounding
|
|
61
|
+
# context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
|
|
62
|
+
# it is preserved in the output while only the secret is replaced.
|
|
63
|
+
- replace:
|
|
64
|
+
# OAuth code= callback values — capture only the token value after `code=`
|
|
65
|
+
expression: '(?:\?|&)code=([^&\s]+)'
|
|
66
|
+
replace: 'REDACTED'
|
|
67
|
+
- replace:
|
|
68
|
+
# Bearer / access tokens in query strings — capture only the value
|
|
69
|
+
expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
|
|
70
|
+
replace: 'REDACTED'
|
|
71
|
+
- replace:
|
|
72
|
+
# OAuth state param (may carry session info) — capture only the value
|
|
73
|
+
expression: '(?:\?|&)state=([^&\s]+)'
|
|
74
|
+
replace: 'REDACTED'
|
|
75
|
+
- replace:
|
|
76
|
+
# Authorization header Bearer value — capture only the token after `Bearer `
|
|
77
|
+
expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
|
|
78
|
+
replace: 'REDACTED'
|
|
79
|
+
|
|
80
|
+
# 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
|
|
81
|
+
- limit:
|
|
82
|
+
rate: 100 # max log lines/sec per stream
|
|
83
|
+
burst: 200
|
|
84
|
+
drop: true # drop excess lines; do NOT block tail
|
|
85
|
+
|
|
86
|
+
# 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
|
|
87
|
+
- labels:
|
|
88
|
+
service: # from Traefik JSON access log; matches taxonomy `service` label
|
|
89
|
+
router: # Traefik router name
|
|
90
|
+
status: # HTTP status code (within taxonomy)
|
|
91
|
+
|
|
92
|
+
# Retention is configured on Loki side (loki-values.yaml: 7 days / 168h).
|
|
93
|
+
|
|
94
|
+
serviceMonitor:
|
|
95
|
+
# Disabled in the source-of-truth values file so a standalone Phase B install
|
|
96
|
+
# (without kube-prometheus-stack) does not hard-fail with
|
|
97
|
+
# "no matches for kind ServiceMonitor in version monitoring.coreos.com/v1".
|
|
98
|
+
# The C1 e2e script flips this on at RUNTIME via
|
|
99
|
+
# helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
|
|
100
|
+
# AFTER kube-prom-stack has installed the ServiceMonitor CRD. Source-of-truth
|
|
101
|
+
# stays standalone-friendly; runtime override wires Prometheus discovery.
|
|
102
|
+
enabled: false
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Traefik Helm values — k3s-ingress-observability Phase A Task A3
|
|
2
|
+
# Pinned NodePort 30080 per OQ-p3-7 (world hooks bake this URL).
|
|
3
|
+
# Structured JSON access logs ready for Phase A Task A5 + Phase B Promtail pickup.
|
|
4
|
+
|
|
5
|
+
deployment:
|
|
6
|
+
replicas: 1 # SPOF mitigation = host systemd watchdog (Phase A Task A11), not HA replicas
|
|
7
|
+
|
|
8
|
+
ports:
|
|
9
|
+
web:
|
|
10
|
+
port: 8000
|
|
11
|
+
expose:
|
|
12
|
+
default: true
|
|
13
|
+
exposedPort: 80
|
|
14
|
+
nodePort: 30080 # PIN (OQ-p3-7); world hooks reach via host.docker.internal:30080
|
|
15
|
+
protocol: TCP
|
|
16
|
+
websecure:
|
|
17
|
+
port: 8443
|
|
18
|
+
expose:
|
|
19
|
+
default: true
|
|
20
|
+
exposedPort: 443
|
|
21
|
+
nodePort: 30443
|
|
22
|
+
protocol: TCP
|
|
23
|
+
# v1: HTTPS deferred to fatbox multi-org (Out-of-scope of this plan); TLS not configured.
|
|
24
|
+
|
|
25
|
+
service:
|
|
26
|
+
type: NodePort
|
|
27
|
+
|
|
28
|
+
# Structured access logs to stdout — Promtail picks up in Phase B.
|
|
29
|
+
# Authorization and Cookie header redaction happens here; URL query-param scrubbing happens
|
|
30
|
+
# at Promtail pipeline_stages.replace per OQ-p3-6 (Traefik can't scrub query params natively).
|
|
31
|
+
logs:
|
|
32
|
+
general:
|
|
33
|
+
level: INFO
|
|
34
|
+
format: json
|
|
35
|
+
access:
|
|
36
|
+
enabled: true
|
|
37
|
+
format: json
|
|
38
|
+
fields:
|
|
39
|
+
headers:
|
|
40
|
+
defaultMode: keep
|
|
41
|
+
names:
|
|
42
|
+
Authorization: redact
|
|
43
|
+
Cookie: redact
|
|
44
|
+
|
|
45
|
+
# Built-in /metrics for Phase C Prometheus scrape
|
|
46
|
+
metrics:
|
|
47
|
+
prometheus:
|
|
48
|
+
enabled: true
|
|
49
|
+
addEntryPointsLabels: true
|
|
50
|
+
addRoutersLabels: true
|
|
51
|
+
addServicesLabels: true
|
|
52
|
+
|
|
53
|
+
# Dashboard disabled in cluster — operator uses Grafana (Phase B)
|
|
54
|
+
ingressRoute:
|
|
55
|
+
dashboard:
|
|
56
|
+
enabled: false
|
|
57
|
+
|
|
58
|
+
# IngressRoute CRD enabled
|
|
59
|
+
providers:
|
|
60
|
+
kubernetesCRD:
|
|
61
|
+
enabled: true
|
|
62
|
+
allowCrossNamespace: false # explicit; matches namespace-isolation strategy from A1
|
|
63
|
+
kubernetesIngress:
|
|
64
|
+
enabled: false # CRD-only; vanilla Ingress not supported in this stack
|
|
65
|
+
|
|
66
|
+
# Resource bounds — observability stack target <500MB RAM idle (P2)
|
|
67
|
+
resources:
|
|
68
|
+
requests:
|
|
69
|
+
cpu: 100m
|
|
70
|
+
memory: 64Mi
|
|
71
|
+
limits:
|
|
72
|
+
cpu: 500m
|
|
73
|
+
memory: 256Mi
|