@pleri/olam-cli 0.1.161 → 0.1.162

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +4 -4
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
  3. package/dist/commands/bootstrap.d.ts.map +1 -1
  4. package/dist/commands/bootstrap.js +35 -11
  5. package/dist/commands/bootstrap.js.map +1 -1
  6. package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
  7. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
  8. package/dist/commands/flywheel/migrate-overlays.js +29 -3
  9. package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
  10. package/dist/commands/skills-source.d.ts.map +1 -1
  11. package/dist/commands/skills-source.js +57 -2
  12. package/dist/commands/skills-source.js.map +1 -1
  13. package/dist/commands/skills.d.ts.map +1 -1
  14. package/dist/commands/skills.js +14 -0
  15. package/dist/commands/skills.js.map +1 -1
  16. package/dist/image-digests.json +7 -7
  17. package/dist/index.js +996 -618
  18. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
  19. package/dist/lib/bootstrap-kubernetes.js +93 -13
  20. package/dist/lib/bootstrap-kubernetes.js.map +1 -1
  21. package/dist/mcp-server.js +568 -368
  22. package/hermes-bundle/version.json +1 -1
  23. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  24. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  25. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  26. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  27. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  28. package/host-cp/observability/grafana-port-forward.sh +12 -2
  29. package/host-cp/observability/kyverno-cardinality-mutate.sh +12 -2
  30. package/host-cp/observability/loki-ingest.sh +12 -2
  31. package/host-cp/observability/prom-no-double-grafana.sh +15 -5
  32. package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
  33. package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
  34. package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
  35. package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
  36. package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
  37. package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
  38. package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
  39. package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
  40. package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
  41. package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
  42. package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
  43. package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
  44. package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
  45. package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
  46. package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
  47. package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
  48. package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
  49. package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
  50. package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
  51. package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
  52. package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
  53. package/host-cp/src/plan-chat-service.mjs +147 -1
  54. package/package.json +1 -1
@@ -0,0 +1,85 @@
1
+ # Kyverno Helm values — k3s-ingress-observability Phase C C8 follow-up.
2
+ #
3
+ # Kyverno is the policy-as-code layer for cluster-wide cardinality
4
+ # enforcement (closes codex's C2 concern on PR #783). The companion
5
+ # ClusterPolicy in
6
+ # `packages/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml`
7
+ # mutates every incoming ServiceMonitor and PodMonitor to inject the
8
+ # labeldrop rule before the object is persisted — so a third-party
9
+ # chart (or hand-rolled object) cannot bypass the layer-2
10
+ # per-ServiceMonitor enforcement landed in C2.
11
+ #
12
+ # Chart: kyverno/kyverno; pinned to 3.8.1 (app v1.18.1, 2026-05-21 latest stable).
13
+ # Upgrade discipline: this pin AND the helm-install line in
14
+ # `scripts/e2e/kyverno-cardinality-mutate.sh` must stay in sync.
15
+ #
16
+ # Footprint posture (single-operator k3s scale):
17
+ # We only run admission-time mutation. The ClusterPolicy uses
18
+ # `spec.background: false`, so the background-scan controller is
19
+ # unused. Cleanup + reports controllers are also dead weight for
20
+ # a single ClusterPolicy with no PolicyExceptions — they're disabled
21
+ # so Kyverno's pod count stays minimal (1 pod, not 4).
22
+ #
23
+ # Footprint (Phase C C8 contribution to P2 target <500MB idle / <1GB typical):
24
+ # admissionController: 256Mi req / 512Mi limit (chart default 128Mi/384Mi)
25
+ # Total addition: ~256Mi req / ~512Mi limit
26
+ #
27
+ # If/when we want policy reports populated for observability dashboards,
28
+ # flip `reportsController.enabled: true` (the `features.policyReports`
29
+ # block below is already on). Same for cleanup.
30
+ #
31
+ # Resource limits — tuned upward from chart default for admission webhook
32
+ # stability under burst churn (kube-prom-stack ships ~10 ServiceMonitors at
33
+ # once during `helm upgrade`, which arrives as a burst of AdmissionReviews).
34
+
35
+ # -------------------------------------------------------------------------
36
+ # Disable controllers we don't need
37
+ # -------------------------------------------------------------------------
38
+ backgroundController:
39
+ enabled: false # ClusterPolicy is admission-only (background: false)
40
+
41
+ cleanupController:
42
+ enabled: false # no CleanupPolicy objects in this repo
43
+
44
+ reportsController:
45
+ enabled: false # no policy-reports surface wired into Grafana yet
46
+
47
+ # -------------------------------------------------------------------------
48
+ # Features — admissionReports + policyReports remain ON inside the
49
+ # admission controller itself even when the standalone reports controller
50
+ # is disabled. This keeps `kubectl get clusterpolicyreport` queryable
51
+ # during dogfood; the reports controller would only AGGREGATE them
52
+ # cluster-wide, which we don't need yet.
53
+ # -------------------------------------------------------------------------
54
+ features:
55
+ admissionReports:
56
+ enabled: true
57
+ policyReports:
58
+ enabled: true
59
+ # Background scan is N/A — the policy uses background: false. Explicit
60
+ # off avoids the controller scheduling unnecessary scan workers even
61
+ # when the controller pod is disabled above.
62
+ backgroundScan:
63
+ enabled: false
64
+ # Logging volume defaults are fine; level 2 = info-ish.
65
+ logging:
66
+ format: text
67
+ verbosity: 2
68
+
69
+ # -------------------------------------------------------------------------
70
+ # Admission controller — the only pod we run.
71
+ # -------------------------------------------------------------------------
72
+ admissionController:
73
+ replicas: 1 # single-operator k3s scale; HA is N/A for dogfood
74
+
75
+ rbac:
76
+ create: true # ClusterPolicy needs cluster-wide watch on ServiceMonitor + PodMonitor
77
+
78
+ container:
79
+ resources:
80
+ requests:
81
+ cpu: 100m
82
+ memory: 256Mi
83
+ limits:
84
+ cpu: 500m
85
+ memory: 512Mi
@@ -0,0 +1,166 @@
1
+ # Loki Helm values — k3s-ingress-observability Phase B Task B1
2
+ #
3
+ # Single-binary mode (Decision-16 + Phase B scope):
4
+ # Distributed mode (microservices) adds 5+ independent Deployments + a Minio
5
+ # or S3 backend for object storage — pure overhead for a single-operator
6
+ # k3s install where Loki's write throughput is bounded by one Promtail
7
+ # DaemonSet and a handful of containers. SingleBinary collapses all roles
8
+ # (ingester, querier, compactor) into one Pod, fits within the <500MB idle
9
+ # LGTM RAM target (P2), and is trivially replaceable if scale demands change.
10
+ #
11
+ # See: docs/plans/k3s-ingress-observability/DESIGN.md (P2, S2)
12
+ #
13
+ # Chart: grafana/loki; pinned to 6.7.4 (latest stable as of 2026-05-20).
14
+ # Upgrade discipline: chart version is embedded in the e2e script comment.
15
+
16
+ deploymentMode: SingleBinary
17
+
18
+ loki:
19
+ auth_enabled: false # single-tenant; multi-tenancy adds header overhead with no benefit here
20
+
21
+ commonConfig:
22
+ replication_factor: 1 # single-binary; no replicas = no cross-replica consistency needed
23
+
24
+ # -------------------------------------------------------------------------
25
+ # Storage backend: filesystem (tsdb index, schema v13; local PV).
26
+ # Object storage (S3/GCS/MinIO) deferred to fatbox multi-org Phase F+.
27
+ # For single-operator k3s, local PV is simpler and sufficient.
28
+ # -------------------------------------------------------------------------
29
+ storage:
30
+ type: filesystem
31
+
32
+ schemaConfig:
33
+ configs:
34
+ - from: "2024-01-01"
35
+ store: tsdb
36
+ object_store: filesystem
37
+ schema: v13
38
+ index:
39
+ prefix: loki_index_
40
+ period: 24h
41
+
42
+ # -------------------------------------------------------------------------
43
+ # Retention: 7 days (168h) per Performance budget acceptance criterion #6.
44
+ # compactor.retention_enabled enables deletion; ring config required for
45
+ # single-binary mode.
46
+ # -------------------------------------------------------------------------
47
+ limits_config:
48
+ retention_period: 168h # 7 days
49
+ ingestion_rate_mb: 4 # per-tenant ingestion cap (single tenant)
50
+ ingestion_burst_size_mb: 8
51
+ max_query_series: 5000 # cap log-derived queries from going wide (P3 <3s p95)
52
+ max_entries_limit_per_query: 5000
53
+
54
+ compactor:
55
+ retention_enabled: true
56
+ delete_request_store: filesystem
57
+ compaction_interval: 10m
58
+ working_directory: /var/loki/compactor
59
+
60
+ ingester:
61
+ chunk_idle_period: 30m # flush to storage; appropriate for low write rate
62
+ chunk_retain_period: 1m
63
+ max_chunk_age: 2h
64
+
65
+ # Self-metrics endpoint — Phase C Prometheus scrapes this.
66
+ # Server block exposed on port 3100 (default); /metrics is always available.
67
+
68
+ singleBinary:
69
+ replicas: 1
70
+
71
+ # -------------------------------------------------------------------------
72
+ # Persistence: 10Gi PV.
73
+ #
74
+ # Rationale: 7-day retention at olam scale (<500 containers, access logs
75
+ # estimated 1–2MB/day compressed) → ~100MB typical stored. 10Gi gives ~100x
76
+ # headroom for burst (failed deploy loops, chatty containers), while typical
77
+ # usage stays well within the <1GB acceptance criterion #6. Cloud provider default SC
78
+ # is fine; on bare-metal k3s the local-path provisioner is used.
79
+ # -------------------------------------------------------------------------
80
+ persistence:
81
+ enabled: true
82
+ size: 10Gi # ~100× headroom over 7-day typical (~100MB); <1GB usage target per AC#6
83
+
84
+ # -------------------------------------------------------------------------
85
+ # Resources: memory limit 512Mi per task spec.
86
+ # Typical usage at olam scale: <200MB idle (tsdb index + block cache).
87
+ # 512Mi limit prevents compaction spikes from triggering OOM on the node.
88
+ # -------------------------------------------------------------------------
89
+ resources:
90
+ requests:
91
+ cpu: 100m
92
+ memory: 128Mi
93
+ limits:
94
+ cpu: 500m
95
+ memory: 512Mi # P2: <500MB idle / <1GB typical; limit prevents spike OOM
96
+
97
+ # -------------------------------------------------------------------------
98
+ # Self-metrics for Phase C Prometheus scrape.
99
+ # ServiceMonitor is declared here but left disabled; it is enabled at runtime once the CRD exists (Phase C).
100
+ # -------------------------------------------------------------------------
101
+ monitoring:
102
+ selfMonitoring:
103
+ enabled: false # disables the bundled GrafanaAgent sub-chart dependency
104
+ grafanaAgent:
105
+ installOperator: false
106
+ serviceMonitor:
107
+ # Disabled in the source-of-truth values file so a standalone Phase B install
108
+ # (without kube-prometheus-stack) does not hard-fail when the CRD is absent.
109
+ # The C1 e2e script flips this on at RUNTIME via
110
+ # helm upgrade ... --reuse-values --set monitoring.serviceMonitor.enabled=true
111
+ # AFTER kube-prom-stack has installed the ServiceMonitor CRD.
112
+ # NOTE: Loki 6.7.4 uses monitoring.serviceMonitor (not top-level serviceMonitor).
113
+ enabled: false
114
+
115
+ # -------------------------------------------------------------------------
116
+ # Backend, read, and write targets: disabled for SingleBinary mode.
117
+ # These are microservices-mode components and must be off or the chart
118
+ # emits validation errors when deploymentMode=SingleBinary.
119
+ # -------------------------------------------------------------------------
120
+ backend:
121
+ replicas: 0
122
+ read:
123
+ replicas: 0
124
+ write:
125
+ replicas: 0
126
+
127
+ # Grafana agent / canary: not needed; disable to keep resource footprint minimal.
128
+ lokiCanary:
129
+ enabled: false
130
+
131
+ test:
132
+ enabled: false
133
+
134
+ # -------------------------------------------------------------------------
135
+ # Sub-component slimming — chart 6.7.4 defaults include nginx gateway +
136
+ # two Memcached clusters + minio + sidecar watchers that single-binary
137
+ # mode doesn't need. Each adds image-pull and Ready-wait time. Disabling
138
+ # all of them brings the install Ready-time within the harness budget.
139
+ # If a future scenario needs query-result caching, re-evaluate
140
+ # resultsCache specifically.
141
+ # -------------------------------------------------------------------------
142
+
143
+ # nginx routing front; Promtail writes direct to single-binary :3100
144
+ gateway:
145
+ enabled: false
146
+
147
+ # Memcached cluster — overhead for single-binary
148
+ chunksCache:
149
+ enabled: false
150
+
151
+ # second Memcached cluster — overhead for single-binary
152
+ resultsCache:
153
+ enabled: false
154
+
155
+ # minio is off because storage.type=filesystem, but be explicit
156
+ minio:
157
+ enabled: false
158
+
159
+ # Sidecar that watches ConfigMaps for runtime config reloads — we don't ship one.
160
+ sidecar:
161
+ rules:
162
+ enabled: false
163
+ datasources:
164
+ enabled: false
165
+ configs:
166
+ enabled: false
@@ -0,0 +1,92 @@
1
+ # Promtail Helm values — Phase A Task A5 staging (Phase B consumes)
2
+ #
3
+ # Tails every container's stdout; ships to Loki single-binary (Phase B installs Loki).
4
+ # Per OQ-p3-6: Traefik native config can redact HEADERS but NOT URL query params —
5
+ # query-param scrubbing for `?token=`, `?code=`, `?access_token=`, `?state=` happens
6
+ # HERE at Promtail ingest via pipeline_stages.replace regex.
7
+ #
8
+ # Resource limits per OQ-p3-37 (Promtail OOM risk under chatty container-cp 100ms cadence):
9
+ # - memory limit 256Mi
10
+ # - pipeline_stages.limit rate 100 lines/sec/stream
11
+ #
12
+ # Scrape config matches every pod log; namespace-scope labels are added so Loki LogQL queries
13
+ # can filter by service / namespace / pod.
14
+ #
15
+ # SECURITY NOTE — replace stage regex semantics (load-bearing):
16
+ # Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
17
+ # The `replace` field is a Go text/template string; `${1}` is NOT valid Go
18
+ # template syntax and silently becomes a literal. The correct pattern is:
19
+ # expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
20
+ # replace: 'REDACTED' — replace captured secret with literal
21
+ # See promtail-values.yaml header comment for full details.
22
+
23
+ deploymentMode: DaemonSet
24
+
25
+ resources:
26
+ requests:
27
+ cpu: 50m
28
+ memory: 64Mi
29
+ limits:
30
+ cpu: 200m
31
+ memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
32
+
33
+ config:
34
+ clients:
35
+ - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
36
+
37
+ snippets:
38
+ pipelineStages:
39
+ # 1. Parse JSON access logs from Traefik (key field present in JSON line)
40
+ - match:
41
+ selector: '{container="traefik"}'
42
+ stages:
43
+ - json:
44
+ expressions:
45
+ request_method: RequestMethod
46
+ request_path: RequestPath
47
+ status: DownstreamStatus
48
+ request_id: requestId
49
+ service: ServiceName
50
+ router: RouterName
51
+
52
+ # 2. Scrub OAuth/token values from URL query params and Authorization headers.
53
+ #
54
+ # IMPORTANT — capture group semantics:
55
+ # The replace stage replaces each CAPTURE GROUP with the `replace` template
56
+ # value. Capture groups must wrap ONLY the secret value, not the surrounding
57
+ # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
58
+ # it is preserved in the output while only the secret is replaced.
59
+ - replace:
60
+ # OAuth code= callback values — capture only the token value after `code=`
61
+ expression: '(?:\?|&)code=([^&\s]+)'
62
+ replace: 'REDACTED'
63
+ - replace:
64
+ # Bearer / access tokens in query strings — capture only the value
65
+ expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
66
+ replace: 'REDACTED'
67
+ - replace:
68
+ # OAuth state param (may carry session info) — capture only the value
69
+ expression: '(?:\?|&)state=([^&\s]+)'
70
+ replace: 'REDACTED'
71
+ - replace:
72
+ # Authorization header Bearer value — capture only the token after `Bearer `
73
+ expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
74
+ replace: 'REDACTED'
75
+
76
+ # 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
77
+ - limit:
78
+ rate: 100 # max log lines/sec per stream
79
+ burst: 200
80
+ drop: true # drop excess lines; do NOT block tail
81
+
82
+ # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
83
+ - labels:
84
+ service: # from Traefik JSON access log; matches taxonomy `service` label
85
+ router: # Traefik router name
86
+ status: # HTTP status code (within taxonomy)
87
+
88
+ # Retention is configured on Loki side (Phase B), not Promtail.
89
+ # Retention target: 7 days, per the Performance budget row.
90
+
91
+ serviceMonitor:
92
+ enabled: true # Prometheus (Phase C) scrapes Promtail's own /metrics for self-observability
@@ -0,0 +1,102 @@
1
+ # Promtail Helm values — k3s-ingress-observability Phase B Task B1 (production)
2
+ #
3
+ # Production Promtail values. Staging copy at promtail-staging.yaml has the
4
+ # same scrubbing pipeline shape; this file sets the Loki client URL +
5
+ # production resource limits.
6
+ #
7
+ # Scrubbing pipeline:
8
+ # - 4 `replace` stages: code=, token/access_token/api_key/secret=, state=, Authorization
9
+ # - `limit` stage: rate=100/burst=200/drop=true (OQ-p3-37: Promtail OOM under chatty containers)
10
+ # Client URL: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
11
+ # Service name `olam-loki` is the Helm release name used in scripts/e2e/loki-ingest.sh
12
+ # (`helm upgrade --install olam-loki grafana/loki ...`); the chart's Service
13
+ # is named after the release, so `olam-loki` is the in-cluster DNS hostname.
14
+ #
15
+ # SECURITY NOTE — replace stage regex semantics (load-bearing):
16
+ # Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
17
+ # The `replace` field is a Go text/template string; `${1}` is NOT valid Go
18
+ # template syntax and silently becomes a literal. The correct pattern is:
19
+ # expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
20
+ # replace: 'REDACTED' — replace captured secret with literal
21
+ # This leaves the surrounding context (e.g. `?code=`) intact and redacts only
22
+ # the value. The broken pattern `(\?|&)code=[^&\s]+` with `replace: '${1}code=REDACTED'`
23
+ # was the root cause of the Phase B scrubbing regression (PR #776).
24
+ #
25
+ # See: docs/plans/k3s-ingress-observability/DESIGN.md (T8, T9)
26
+
27
+ deploymentMode: DaemonSet
28
+
29
+ resources:
30
+ requests:
31
+ cpu: 50m
32
+ memory: 64Mi
33
+ limits:
34
+ cpu: 200m
35
+ memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
36
+
37
+ config:
38
+ clients:
39
+ - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
40
+
41
+ snippets:
42
+ pipelineStages:
43
+ # 1. Parse JSON access logs from Traefik (key field present in JSON line)
44
+ - match:
45
+ selector: '{container="traefik"}'
46
+ stages:
47
+ - json:
48
+ expressions:
49
+ request_method: RequestMethod
50
+ request_path: RequestPath
51
+ status: DownstreamStatus
52
+ request_id: requestId
53
+ service: ServiceName
54
+ router: RouterName
55
+
56
+ # 2. Scrub OAuth/token values from URL query params and Authorization headers.
57
+ #
58
+ # IMPORTANT — capture group semantics:
59
+ # The replace stage replaces each CAPTURE GROUP with the `replace` template
60
+ # value. Capture groups must wrap ONLY the secret value, not the surrounding
61
+ # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
62
+ # it is preserved in the output while only the secret is replaced.
63
+ - replace:
64
+ # OAuth code= callback values — capture only the token value after `code=`
65
+ expression: '(?:\?|&)code=([^&\s]+)'
66
+ replace: 'REDACTED'
67
+ - replace:
68
+ # Bearer / access tokens in query strings — capture only the value
69
+ expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
70
+ replace: 'REDACTED'
71
+ - replace:
72
+ # OAuth state param (may carry session info) — capture only the value
73
+ expression: '(?:\?|&)state=([^&\s]+)'
74
+ replace: 'REDACTED'
75
+ - replace:
76
+ # Authorization header Bearer value — capture only the token after `Bearer `
77
+ expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
78
+ replace: 'REDACTED'
79
+
80
+ # 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
81
+ - limit:
82
+ rate: 100 # max log lines/sec per stream
83
+ burst: 200
84
+ drop: true # drop excess lines; do NOT block tail
85
+
86
+ # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
87
+ - labels:
88
+ service: # from Traefik JSON access log; matches taxonomy `service` label
89
+ router: # Traefik router name
90
+ status: # HTTP status code (within taxonomy)
91
+
92
+ # Retention is configured on Loki side (loki-values.yaml: 7 days / 168h).
93
+
94
+ serviceMonitor:
95
+ # Disabled in the source-of-truth values file so a standalone Phase B install
96
+ # (without kube-prometheus-stack) does not hard-fail with
97
+ # "no matches for kind ServiceMonitor in version monitoring.coreos.com/v1".
98
+ # The C1 e2e script flips this on at RUNTIME via
99
+ # helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
100
+ # AFTER kube-prom-stack has installed the ServiceMonitor CRD. Source-of-truth
101
+ # stays standalone-friendly; runtime override wires Prometheus discovery.
102
+ enabled: false
@@ -0,0 +1,73 @@
1
+ # Traefik Helm values — k3s-ingress-observability Phase A Task A3
2
+ # Pinned NodePort 30080 per OQ-p3-7 (world hooks bake this URL).
3
+ # Structured JSON access logs ready for Phase A Task A5 + Phase B Promtail pickup.
4
+
5
+ deployment:
6
+ replicas: 1 # SPOF mitigation = host systemd watchdog (Phase A Task A11), not HA replicas
7
+
8
+ ports:
9
+ web:
10
+ port: 8000
11
+ expose:
12
+ default: true
13
+ exposedPort: 80
14
+ nodePort: 30080 # PIN (OQ-p3-7); world hooks reach via host.docker.internal:30080
15
+ protocol: TCP
16
+ websecure:
17
+ port: 8443
18
+ expose:
19
+ default: true
20
+ exposedPort: 443
21
+ nodePort: 30443
22
+ protocol: TCP
23
+ # v1: HTTPS deferred to fatbox multi-org (Out-of-scope of this plan); TLS not configured.
24
+
25
+ service:
26
+ type: NodePort
27
+
28
+ # Structured access logs to stdout — Promtail picks up in Phase B.
29
+ # Authorization header redaction here; URL query-param scrubbing happens
30
+ # at Promtail pipeline_stages.replace per OQ-p3-6 (Traefik can't scrub query params natively).
31
+ logs:
32
+ general:
33
+ level: INFO
34
+ format: json
35
+ access:
36
+ enabled: true
37
+ format: json
38
+ fields:
39
+ headers:
40
+ defaultMode: keep
41
+ names:
42
+ Authorization: redact
43
+ Cookie: redact
44
+
45
+ # Built-in /metrics for Phase C Prometheus scrape
46
+ metrics:
47
+ prometheus:
48
+ enabled: true
49
+ addEntryPointsLabels: true
50
+ addRoutersLabels: true
51
+ addServicesLabels: true
52
+
53
+ # Dashboard disabled in cluster — operator uses Grafana (Phase B)
54
+ ingressRoute:
55
+ dashboard:
56
+ enabled: false
57
+
58
+ # IngressRoute CRD enabled
59
+ providers:
60
+ kubernetesCRD:
61
+ enabled: true
62
+ allowCrossNamespace: false # explicit; matches namespace-isolation strategy from A1
63
+ kubernetesIngress:
64
+ enabled: false # CRD-only; vanilla Ingress not supported in this stack
65
+
66
+ # Resource bounds — observability stack target <500MB RAM idle (P2)
67
+ resources:
68
+ requests:
69
+ cpu: 100m
70
+ memory: 64Mi
71
+ limits:
72
+ cpu: 500m
73
+ memory: 256Mi
@@ -0,0 +1,6 @@
1
+ # Namespace for k3s-ingress-observability peripheral services
2
+ # (Traefik installs to kube-system; observability stack to monitoring; this is for IngressRoute CRDs targeting olam services)
3
+ apiVersion: v1
4
+ kind: Namespace
5
+ metadata:
6
+ name: olam