@pleri/olam-cli 0.1.160 → 0.1.162
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -0
- package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
- package/dist/commands/bootstrap.d.ts +15 -0
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +58 -5
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
- package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.js +29 -3
- package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
- package/dist/commands/skills-source.d.ts.map +1 -1
- package/dist/commands/skills-source.js +57 -2
- package/dist/commands/skills-source.js.map +1 -1
- package/dist/commands/skills.d.ts.map +1 -1
- package/dist/commands/skills.js +14 -0
- package/dist/commands/skills.js.map +1 -1
- package/dist/image-digests.json +7 -7
- package/dist/index.js +2424 -1781
- package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
- package/dist/lib/bootstrap-kubernetes.js +367 -0
- package/dist/lib/bootstrap-kubernetes.js.map +1 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +6 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/mcp-server.js +568 -368
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/observability/grafana-port-forward.sh +283 -0
- package/host-cp/observability/kyverno-cardinality-mutate.sh +462 -0
- package/host-cp/observability/loki-ingest.sh +253 -0
- package/host-cp/observability/prom-no-double-grafana.sh +311 -0
- package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
- package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
- package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
- package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
- package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
- package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
- package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
- package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
- package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
- package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
- package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
- package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
- package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
- package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
- package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
- package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
- package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
- package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
- package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
- package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
- package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
- package/host-cp/src/plan-chat-service.mjs +147 -1
- package/package.json +1 -1
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Promtail Helm values — Phase A Task A5 staging (Phase B consumes)
|
|
2
|
+
#
|
|
3
|
+
# Tails every container's stdout; ships to Loki single-binary (Phase B installs Loki).
|
|
4
|
+
# Per OQ-p3-6: Traefik native config can redact HEADERS but NOT URL query params —
|
|
5
|
+
# query-param scrubbing for `?token=`, `?code=`, `?access_token=`, `?state=` happens
|
|
6
|
+
# HERE at Promtail ingest via pipeline_stages.replace regex.
|
|
7
|
+
#
|
|
8
|
+
# Resource limits per OQ-p3-37 (Promtail OOM risk under chatty container-cp 100ms cadence):
|
|
9
|
+
# - memory limit 256Mi
|
|
10
|
+
# - pipeline_stages.limit rate 100 lines/sec/stream
|
|
11
|
+
#
|
|
12
|
+
# Scrape config matches every pod log; namespace-scope labels are added so Loki LogQL queries
|
|
13
|
+
# can filter by service / namespace / pod.
|
|
14
|
+
#
|
|
15
|
+
# SECURITY NOTE — replace stage regex semantics (load-bearing):
|
|
16
|
+
# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
|
|
17
|
+
# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
|
|
18
|
+
# template syntax and silently becomes a literal. The correct pattern is:
|
|
19
|
+
# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
|
|
20
|
+
# replace: 'REDACTED' — replace captured secret with literal
|
|
21
|
+
# See promtail-values.yaml header comment for full details.
|
|
22
|
+
|
|
23
|
+
# NOTE(review): the grafana/promtail chart normally selects the workload kind
# via `daemonset.enabled` / `deployment.enabled`; confirm `deploymentMode` is
# actually consumed by the chart version in use (DaemonSet is the chart default).
deploymentMode: DaemonSet

resources:
  requests:
    cpu: 50m
    memory: 64Mi
  limits:
    cpu: 200m
    # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
    memory: 256Mi

config:
  clients:
    - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push

  snippets:
    # Helm replaces list values wholesale, so this list REPLACES the chart's
    # default `pipelineStages` (`- cri: {}`) instead of extending it. The CRI
    # stage therefore has to be restated here.
    pipelineStages:
      # 0. Unwrap the CRI log envelope (`<timestamp> <stream> <flags> <content>`)
      #    written by containerd on k3s. Without this stage the `json` stage
      #    below receives the raw CRI line, fails to parse it, and the
      #    service / router / status labels are never extracted.
      - cri: {}

      # 1. Parse JSON access logs from Traefik (key field present in JSON line)
      - match:
          selector: '{container="traefik"}'
          stages:
            - json:
                expressions:
                  request_method: RequestMethod
                  request_path: RequestPath
                  status: DownstreamStatus
                  request_id: requestId
                  service: ServiceName
                  router: RouterName

      # 2. Scrub OAuth/token values from URL query params and Authorization headers.
      #
      # IMPORTANT — capture group semantics:
      # The replace stage replaces each CAPTURE GROUP with the `replace` template
      # value. Capture groups must wrap ONLY the secret value, not the surrounding
      # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
      # it is preserved in the output while only the secret is replaced.
      #
      # NOTE(review): `[^&\s]+` does not stop at `"`, so on a compact JSON line
      # the capture can run past the end of the field and redact trailing JSON
      # up to the next `&` or whitespace. That is over-redaction, not a leak —
      # confirm it is acceptable for downstream log parsing.
      - replace:
          # OAuth code= callback values — capture only the token value after `code=`
          expression: '(?:\?|&)code=([^&\s]+)'
          replace: 'REDACTED'
      - replace:
          # Bearer / access tokens in query strings — capture only the value
          expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
          replace: 'REDACTED'
      - replace:
          # OAuth state param (may carry session info) — capture only the value
          expression: '(?:\?|&)state=([^&\s]+)'
          replace: 'REDACTED'
      - replace:
          # Authorization header Bearer value — capture only the token after `Bearer `
          expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
          replace: 'REDACTED'

      # 3. Rate-limit ingestion to prevent OOM cascade under chatty containers (OQ-p3-37)
      # NOTE(review): no `by_label_name` is set, so the limiter presumably applies
      # to the whole pipeline rather than to each stream — confirm intended scope.
      - limit:
          rate: 100   # max log lines/sec
          burst: 200
          drop: true  # drop excess lines; do NOT block tail

      # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
      - labels:
          service:  # from Traefik JSON access log; matches taxonomy `service` label
          router:   # Traefik router name
          status:   # HTTP status code (within taxonomy)

# Retention is configured on Loki side (Phase B), not Promtail.
# Sample retention target: 7 days per Performance budget Row.

serviceMonitor:
  # Prometheus (Phase C) scrapes Promtail's own /metrics for self-observability.
  # NOTE(review): promtail-values.yaml keeps this `false` so installs without the
  # ServiceMonitor CRD do not fail — confirm staging always has the CRD present.
  enabled: true
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Promtail Helm values — k3s-ingress-observability Phase B Task B1 (production)
|
|
2
|
+
#
|
|
3
|
+
# Production Promtail values. Staging copy at promtail-staging.yaml has the
|
|
4
|
+
# same scrubbing pipeline shape; this file sets the Loki client URL +
|
|
5
|
+
# production resource limits.
|
|
6
|
+
#
|
|
7
|
+
# Scrubbing pipeline:
|
|
8
|
+
# - 4 `replace` stages: code=, token/access_token/api_key/secret=, state=, Authorization
|
|
9
|
+
# - `limit` stage: rate=100/burst=200/drop=true (OQ-p3-37: Promtail OOM under chatty containers)
|
|
10
|
+
# Client URL: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
|
|
11
|
+
# Service name `olam-loki` is the Helm release name used in scripts/e2e/loki-ingest.sh
|
|
12
|
+
# (`helm upgrade --install olam-loki grafana/loki ...`); the chart's Service
|
|
13
|
+
# is named after the release, so `olam-loki` is the in-cluster DNS hostname.
|
|
14
|
+
#
|
|
15
|
+
# SECURITY NOTE — replace stage regex semantics (load-bearing):
|
|
16
|
+
# Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
|
|
17
|
+
# The `replace` field is a Go text/template string; `${1}` is NOT valid Go
|
|
18
|
+
# template syntax and silently becomes a literal. The correct pattern is:
|
|
19
|
+
# expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
|
|
20
|
+
# replace: 'REDACTED' — replace captured secret with literal
|
|
21
|
+
# This leaves the surrounding context (e.g. `?code=`) intact and redacts only
|
|
22
|
+
# the value. The broken pattern `(\?|&)code=[^&\s]+` with `replace: '${1}code=REDACTED'`
|
|
23
|
+
# was the root cause of the Phase B scrubbing regression (PR #776).
|
|
24
|
+
#
|
|
25
|
+
# See: docs/plans/k3s-ingress-observability/DESIGN.md (T8, T9)
|
|
26
|
+
|
|
27
|
+
# NOTE(review): the grafana/promtail chart normally selects the workload kind
# via `daemonset.enabled` / `deployment.enabled`; confirm `deploymentMode` is
# actually consumed by the chart version in use (DaemonSet is the chart default).
deploymentMode: DaemonSet

resources:
  requests:
    cpu: 50m
    memory: 64Mi
  limits:
    cpu: 200m
    # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
    memory: 256Mi

config:
  clients:
    - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push

  snippets:
    # Helm replaces list values wholesale, so this list REPLACES the chart's
    # default `pipelineStages` (`- cri: {}`) instead of extending it. The CRI
    # stage therefore has to be restated here.
    pipelineStages:
      # 0. Unwrap the CRI log envelope (`<timestamp> <stream> <flags> <content>`)
      #    written by containerd on k3s. Without this stage the `json` stage
      #    below receives the raw CRI line, fails to parse it, and the
      #    service / router / status labels are never extracted.
      - cri: {}

      # 1. Parse JSON access logs from Traefik (key field present in JSON line)
      - match:
          selector: '{container="traefik"}'
          stages:
            - json:
                expressions:
                  request_method: RequestMethod
                  request_path: RequestPath
                  status: DownstreamStatus
                  request_id: requestId
                  service: ServiceName
                  router: RouterName

      # 2. Scrub OAuth/token values from URL query params and Authorization headers.
      #
      # IMPORTANT — capture group semantics:
      # The replace stage replaces each CAPTURE GROUP with the `replace` template
      # value. Capture groups must wrap ONLY the secret value, not the surrounding
      # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
      # it is preserved in the output while only the secret is replaced.
      #
      # NOTE(review): `[^&\s]+` does not stop at `"`, so on a compact JSON line
      # the capture can run past the end of the field and redact trailing JSON
      # up to the next `&` or whitespace. That is over-redaction, not a leak —
      # confirm it is acceptable for downstream log parsing.
      - replace:
          # OAuth code= callback values — capture only the token value after `code=`
          expression: '(?:\?|&)code=([^&\s]+)'
          replace: 'REDACTED'
      - replace:
          # Bearer / access tokens in query strings — capture only the value
          expression: '(?:\?|&)(?:access_token|token|api_key|secret)=([^&\s]+)'
          replace: 'REDACTED'
      - replace:
          # OAuth state param (may carry session info) — capture only the value
          expression: '(?:\?|&)state=([^&\s]+)'
          replace: 'REDACTED'
      - replace:
          # Authorization header Bearer value — capture only the token after `Bearer `
          expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+(\S+)'
          replace: 'REDACTED'

      # 3. Rate-limit ingestion to prevent OOM cascade under chatty containers (OQ-p3-37)
      # NOTE(review): no `by_label_name` is set, so the limiter presumably applies
      # to the whole pipeline rather than to each stream — confirm intended scope.
      - limit:
          rate: 100   # max log lines/sec
          burst: 200
          drop: true  # drop excess lines; do NOT block tail

      # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
      - labels:
          service:  # from Traefik JSON access log; matches taxonomy `service` label
          router:   # Traefik router name
          status:   # HTTP status code (within taxonomy)

# Retention is configured on Loki side (loki-values.yaml: 7 days / 168h).

serviceMonitor:
  # Disabled in the source-of-truth values file so a standalone Phase B install
  # (without kube-prometheus-stack) does not hard-fail with
  # "no matches for kind ServiceMonitor in version monitoring.coreos.com/v1".
  # The C1 e2e script flips this on at RUNTIME via
  #   helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
  # AFTER kube-prom-stack has installed the ServiceMonitor CRD. Source-of-truth
  # stays standalone-friendly; runtime override wires Prometheus discovery.
  enabled: false
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# Traefik Helm values — k3s-ingress-observability Phase A Task A3
|
|
2
|
+
# Pinned NodePort 30080 per OQ-p3-7 (world hooks bake this URL).
|
|
3
|
+
# Structured JSON access logs ready for Phase A Task A5 + Phase B Promtail pickup.
|
|
4
|
+
|
|
5
|
+
deployment:
  # SPOF mitigation = host systemd watchdog (Phase A Task A11), not HA replicas
  replicas: 1

ports:
  web:
    port: 8000
    expose:
      default: true
    exposedPort: 80
    # PIN (OQ-p3-7); world hooks reach via host.docker.internal:30080
    nodePort: 30080
    protocol: TCP
  websecure:
    port: 8443
    expose:
      default: true
    exposedPort: 443
    nodePort: 30443
    protocol: TCP
    # v1: HTTPS deferred to fatbox multi-org (Out-of-scope of this plan); TLS not configured.

service:
  type: NodePort

# Structured access logs to stdout — Promtail picks up in Phase B.
# Credential-bearing header redaction happens here; URL query-param scrubbing
# happens at Promtail pipeline_stages.replace per OQ-p3-6 (Traefik can't scrub
# query params natively).
logs:
  general:
    level: INFO
    format: json
  access:
    enabled: true
    format: json
    fields:
      headers:
        # Every header not listed under `names` is logged verbatim, so each
        # header that can carry a credential must be listed explicitly.
        defaultMode: keep
        names:
          Authorization: redact
          Cookie: redact
          # Proxy credentials use the same scheme/token form as Authorization.
          Proxy-Authorization: redact
          # Response-side session cookies; with `defaultMode: keep` these would
          # otherwise reach Loki in clear text.
          Set-Cookie: redact

# Built-in /metrics for Phase C Prometheus scrape
metrics:
  prometheus:
    enabled: true
    addEntryPointsLabels: true
    addRoutersLabels: true
    addServicesLabels: true

# Dashboard disabled in cluster — operator uses Grafana (Phase B)
ingressRoute:
  dashboard:
    enabled: false

# IngressRoute CRD enabled
providers:
  kubernetesCRD:
    enabled: true
    # explicit; matches namespace-isolation strategy from A1
    allowCrossNamespace: false
  kubernetesIngress:
    # CRD-only; vanilla Ingress not supported in this stack
    enabled: false

# Resource bounds — observability stack target <500MB RAM idle (P2)
resources:
  requests:
    cpu: 100m
    memory: 64Mi
  limits:
    cpu: 500m
    memory: 256Mi
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# 24-deploy-kg-service.yaml — kg-service Service + Deployment for local k3s dogfood.
|
|
2
|
+
#
|
|
3
|
+
# Bridges the gap between Phase C's ServiceMonitor (92-servicemonitor-kg-service.yaml)
|
|
4
|
+
# and a running service. The ServiceMonitor targets namespace `olam`,
|
|
5
|
+
# label `app: olam-kg-service`, port name `http` — this manifest satisfies that
|
|
6
|
+
# contract so Prometheus can scrape kg-service's /metrics endpoint.
|
|
7
|
+
#
|
|
8
|
+
# Canonical per-service manifest tree: packages/host-cp/k8s/manifests/kg-service/
|
|
9
|
+
# This file is the "peripheral-services entry point" view — it folds every
|
|
10
|
+
# kg-service object (ServiceAccount, RBAC, ConfigMap, PVC, Deployment, Service) into a single file for `kubectl apply -f manifests/` convenience.
|
|
11
|
+
#
|
|
12
|
+
# Secrets prerequisite: operator MUST create `olam-kg-service-secret` in the
|
|
13
|
+
# `olam` namespace BEFORE applying this manifest. See README.md § Secrets.
|
|
14
|
+
#
|
|
15
|
+
# Image: pinned to sha256 digest (not :latest) per T4 threat model.
|
|
16
|
+
# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158.
|
|
17
|
+
# To update:
|
|
18
|
+
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token)
|
|
19
|
+
# curl -sI -H "Authorization: Bearer $TOKEN" \
|
|
20
|
+
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
|
|
21
|
+
# https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest
|
|
22
|
+
#
|
|
23
|
+
# Memory: bge-small-en-v1.5 ONNX model is pre-cached in the image (~90 MB).
|
|
24
|
+
# Container needs ≥512Mi to load the model + serve requests. Limit set to 1Gi.
|
|
25
|
+
#
|
|
26
|
+
# Apply-manifests.sh: this file is SKIPPED by the phase-a-e2e harness
|
|
27
|
+
# (apply-manifests.sh skip-list includes 2[3-4]-deploy-*) because the
|
|
28
|
+
# harness cluster has no operator secrets or kg-data PVC.
|
|
29
|
+
# Operator-side `kubectl apply -f manifests/` applies it.
|
|
30
|
+
---
|
|
31
|
+
# Identity the kg-service pod runs as; bound to the Role of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
|
|
39
|
+
---
|
|
40
|
+
# Namespaced Role: read/patch/watch access limited to the single Deployment
# named `olam-kg-service` in the `olam` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
rules:
  - apiGroups:
      - apps
    resources:
      - deployments
    resourceNames:
      - olam-kg-service
    verbs:
      - get
      - patch
      - watch
|
|
53
|
+
---
|
|
54
|
+
# Grants the `olam-kg-service` Role to the ServiceAccount of the same name.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: olam-kg-service
subjects:
  - kind: ServiceAccount
    namespace: olam
    name: olam-kg-service
|
|
70
|
+
---
|
|
71
|
+
# ConfigMap — environment for kg-service that is safe to store in plain text.
# Anything sensitive (OLAM_KG_BEARER_TOKEN) belongs in `olam-kg-service-secret`.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: olam
  name: olam-kg-service-env
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
data:
  # Cluster-internal DNS name of auth-service (olam namespace).
  OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
  # Where kg-service keeps its data; lives on the PVC mounted at /data.
  OLAM_KG_DATA_PATH: "/data/kg"
  # CRITICAL: kg-service binds 127.0.0.1 by default. The readiness probe targets
  # the pod IP, so a loopback-only listener fails the probe. Binding every
  # interface here avoids having to rebuild the image.
  OLAM_KG_SERVICE_BIND: "0.0.0.0"
  # Listen port — keep in sync with the Service targetPort.
  OLAM_KG_SERVICE_PORT: "9997"
|
|
92
|
+
---
|
|
93
|
+
# PersistentVolumeClaim — storage behind /data (KG index + savings telemetry).
# Sized at 10Gi because the graph index grows with the codebase; rationale is
# in kg-service/45-pvc.yaml.
# `local-path` is the StorageClass k3d ships; replace it on other clusters.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: olam
  name: olam-kg-data
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
spec:
  storageClassName: local-path
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 10Gi
|
|
111
|
+
---
|
|
112
|
+
# Deployment — single-replica kg-service pod backed by the `olam-kg-data` PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
spec:
  replicas: 1
  selector:
    matchLabels:
      app: olam-kg-service
  # Surge-then-retire rollout: a replacement pod must be Ready before the old
  # one is removed.
  # NOTE(review): both pods mount the same ReadWriteOnce claim during a rollout;
  # that only works while they share a node (as with `local-path`). Confirm
  # before moving to a multi-node StorageClass.
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 0
      maxSurge: 1
  template:
    metadata:
      labels:
        app: olam-kg-service
    spec:
      serviceAccountName: olam-kg-service
      imagePullSecrets:
        - name: ghcr-pull
      # Turn off automatic Service env injection. Left on, k8s would set
      # OLAM_KG_SERVICE_PORT to a "tcp://..." URL, which breaks Python's int()
      # parse of the port env var.
      enableServiceLinks: false
      securityContext:
        fsGroup: 1000
        runAsGroup: 1000
        runAsUser: 1000
        runAsNonRoot: true
      volumes:
        - name: tmp
          emptyDir: {}
        - name: kg-data
          persistentVolumeClaim:
            claimName: olam-kg-data
      initContainers:
        # Runs as root once, before the main container, to hand /data to
        # uid/gid 1000.
        - name: chown-data
          # busybox:1.36 — sha256-pinned per T4 threat model.
          image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
          imagePullPolicy: IfNotPresent
          command:
            - chown
            - -R
            - "1000:1000"
            - /data
          securityContext:
            allowPrivilegeEscalation: false
            runAsNonRoot: false
            runAsUser: 0
          volumeMounts:
            - mountPath: /data
              name: kg-data
      containers:
        - name: olam-kg-service
          # Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158
          # Run `npm run refresh:manifest-digests` to update.
          image: ghcr.io/pleri/olam-kg-service@sha256:72030f3054315e7ebf575f6dcb9b4965e1ddee13ea7bfdeb0bde32253beeb1c7
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            runAsUser: 1000
            capabilities:
              drop:
                - ALL
          envFrom:
            - configMapRef:
                name: olam-kg-service-env
            - secretRef:
                name: olam-kg-service-secret
          ports:
            # CRITICAL: the port must be named `http`; that is the name
            # 92-servicemonitor-kg-service.yaml uses in endpoints[0].port.
            - name: http
              protocol: TCP
              containerPort: 9997
          volumeMounts:
            - mountPath: /data
              name: kg-data
            # Writable scratch space; the root filesystem is read-only.
            - mountPath: /tmp
              name: tmp
          resources:
            requests:
              cpu: 100m
              # bge-small ONNX model requires ~400Mi at runtime; 512Mi is the
              # minimum viable request. Set higher if OOM-killed on first classify.
              memory: 512Mi
            limits:
              cpu: 1000m
              # 1Gi: bge-small model (~90Mi) + index cache + request headroom.
              memory: 1Gi
          readinessProbe:
            # kg-service answers {"ok":true,"ready":true} once the bge-small
            # model has loaded; the 30s initial delay leaves room for the
            # warmup thread to finish.
            httpGet:
              port: 9997
              path: /health
            initialDelaySeconds: 30
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 12
          livenessProbe:
            httpGet:
              port: 9997
              path: /health
            initialDelaySeconds: 60
            periodSeconds: 20
            timeoutSeconds: 5
            failureThreshold: 3
|
|
223
|
+
---
|
|
224
|
+
# Service — cluster-internal endpoint for kg-service.
# CRITICAL: the port name `http` is what 92-servicemonitor-kg-service.yaml
# references in endpoints[0].port, and namespace `olam` is what its
# namespaceSelector.matchNames lists.
apiVersion: v1
kind: Service
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    # CRITICAL: matches 92-servicemonitor-kg-service.yaml spec.selector.matchLabels.
    app: olam-kg-service
spec:
  type: ClusterIP
  selector:
    app: olam-kg-service
  ports:
    # CRITICAL: keep the name `http` (ServiceMonitor endpoints[0].port).
    - name: http
      protocol: TCP
      port: 9997
      targetPort: 9997
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# IngressRoute — host-cp (bare /api/* per Decision 3 hybrid routing)
|
|
2
|
+
# host-cp preserves 50+ existing SPA fetch sites at /api/* (no strip-prefix).
|
|
3
|
+
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: olam
  name: olam-host-cp
spec:
  entryPoints: [web]
  routes:
    # host-cp is the catch-all route (Decision 3 hybrid routing). Its priority
    # is pinned low so that service-prefix routes (kg, agent-memory, ...) win
    # whenever their longer prefix matches.
    # Traefik's default priority is the length of the rule string, and this
    # OR'd rule is long enough to outrank the more specific PathPrefix routes —
    # /api/kg/* would then land on host-cp. The explicit priority prevents that
    # silent precedence bug (caught in PR #736 live-validation).
    - kind: Rule
      priority: 10
      match: 'PathPrefix(`/api/`) || PathPrefix(`/session/`) || PathPrefix(`/v1/`) || Path(`/health`)'
      services:
        - port: 19000
          name: olam-host-cp
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# IngressRoute — kg-service via /api/kg/* strip-prefix (Decision 3 new-services pattern)
|
|
2
|
+
# Removes the `/api/kg` prefix before the request is forwarded upstream.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  namespace: olam
  name: strip-api-kg
spec:
  stripPrefix:
    prefixes: [/api/kg]
|
|
11
|
+
---
|
|
12
|
+
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: olam
  name: olam-kg-service
spec:
  entryPoints: [web]
  routes:
    # Priority 100 outranks host-cp's 10, so /api/kg/* is served here instead
    # of by host-cp's catch-all /api/* route.
    - kind: Rule
      priority: 100
      match: 'PathPrefix(`/api/kg/`)'
      middlewares:
        - name: strip-api-kg
      services:
        - port: 9997
          name: olam-kg-service
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# IngressRoute — agent-memory via /api/agent-memory/* strip-prefix (Decision 3 new-services pattern)
|
|
2
|
+
# Removes the `/api/agent-memory` prefix before the request is forwarded upstream.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  namespace: olam
  name: strip-api-agent-memory
spec:
  stripPrefix:
    prefixes: [/api/agent-memory]
|
|
11
|
+
---
|
|
12
|
+
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: olam
  name: olam-agent-memory
spec:
  entryPoints: [web]
  routes:
    # Priority 100 outranks host-cp's 10, so /api/agent-memory/* is served here
    # instead of by host-cp's catch-all /api/* route.
    - kind: Rule
      priority: 100
      match: 'PathPrefix(`/api/agent-memory/`)'
      middlewares:
        - name: strip-api-agent-memory
      services:
        - name: olam-memory-service
          # Real memory-service listen port (per
          # packages/memory-service/src/worker.ts:206 and
          # AGENTMEMORY_HOST_INTERNAL_URL in container.ts:101). The Pass-1 plan
          # said 3112 (incorrect); A6 corrects it to 3111.
          port: 3111
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# NetworkPolicy — olam namespace ingress fence (Phase A Task A9)
|
|
2
|
+
#
|
|
3
|
+
# Defense-in-depth: even if a world agent escapes its container or steals a
|
|
4
|
+
# bearer token, NetworkPolicy ensures it can only reach olam services via the
|
|
5
|
+
# Traefik ingress path (which enforces bearer auth on world-originated calls
|
|
6
|
+
# per A6 — see packages/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml).
|
|
7
|
+
# Direct pod-to-pod access bypassing ingress is denied.
|
|
8
|
+
#
|
|
9
|
+
# Enforcement matrix — two separate enforcement paths exist; the comment below
|
|
10
|
+
# previously conflated them (corrected 2026-05-21, see dogfood incident finding #2):
|
|
11
|
+
#
|
|
12
|
+
# k3d/k3s with --disable-network-policy=false (production k3s default):
|
|
13
|
+
# k3s ships a built-in NetworkPolicy controller that enforces NetworkPolicies
|
|
14
|
+
# via iptables rules, INDEPENDENT of the CNI. Flannel itself does not enforce,
|
|
15
|
+
# but the k3s controller does. Result: NetworkPolicies ARE enforced even on
|
|
16
|
+
# default Flannel k3s/k3d clusters — this is what the operator's colima+k3d
|
|
17
|
+
# dogfood cluster experienced (the fence was live despite using Flannel).
|
|
18
|
+
#
|
|
19
|
+
# k3d/k3s with --disable-network-policy=true (this harness — cluster-up.sh):
|
|
20
|
+
# The harness explicitly passes --k3s-arg '--disable-network-policy@server:*'
|
|
21
|
+
# to disable the k3s built-in controller. With the controller off, enforcement
|
|
22
|
+
# depends entirely on the CNI: Flannel = no enforcement; Calico = enforced.
|
|
23
|
+
# The harness uses Calico precisely so tests exercise real enforcement.
|
|
24
|
+
#
|
|
25
|
+
# Production k3s (default, no --disable-network-policy):
|
|
26
|
+
# Controller-enforced via iptables unless the operator explicitly disables it.
|
|
27
|
+
#
|
|
28
|
+
# See docs/architecture/networkpolicy-fence.md for the full environment matrix
|
|
29
|
+
# and docs/incidents/2026-05-21-phase-c-dogfood.md (finding #2) for the live
|
|
30
|
+
# evidence that k3s' bundled controller enforces on Flannel clusters.
|
|
31
|
+
#
|
|
32
|
+
# Threat mitigated: T6 (world→host SSRF via unauthenticated ingress route).
|
|
33
|
+
# Companion mitigations (do not remove A6 + A9 together): bearer auth (A6),
|
|
34
|
+
# 127.0.0.1 bind on host-cp + kube-apiserver (OS-level, separate from k8s).
|
|
35
|
+
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  namespace: olam
  name: olam-ingress-fence
  labels:
    app.kubernetes.io/component: security-fence
    app.kubernetes.io/part-of: olam
    olam.io/phase: "a"
    olam.io/task: "a9"
spec:
  # The empty selector covers every pod in the olam namespace. Only the three
  # sources listed under `ingress` may connect: Traefik, pods in this same
  # namespace, and the monitoring namespace. All other cross-namespace or
  # external traffic has to come in through Traefik.
  podSelector: {}
  policyTypes: [Ingress]
  ingress:
    # Rule 1 — Traefik, the canonical ingress path. Both selectors sit in ONE
    # `from` entry, so a peer must be in kube-system AND carry the Traefik
    # label. `app.kubernetes.io/name: traefik` is the standard Helm-chart label
    # set by k3s' bundled Traefik install and by the upstream `traefik/traefik`
    # chart used in Phase A Task A3.
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
          namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
    # Rule 2 — pod-to-pod traffic inside the namespace. olam services may call
    # each other directly (host-cp → kg-service, etc.) without a round trip
    # through Traefik. World-originated calls are still audited at the
    # bearer-auth layer (A6), so this does not weaken the T6 mitigation.
    - from:
        - podSelector: {}
    # Rule 3 — the monitoring namespace. Phase C's Prometheus
    # (kube-prometheus-stack) scrapes pod IPs directly for /metrics. Without
    # this rule the scrape connections are silently dropped by whichever
    # NetworkPolicy enforcer is active, and ServiceMonitor targets in `olam`
    # yield 0 samples. Surfaced during the 2026-05-21 operator dogfood — see
    # docs/incidents/2026-05-21-phase-c-dogfood.md, finding #2.
    # Scope: monitoring → olam ingress only (not the reverse direction).
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
|