@pleri/olam-cli 0.1.160 → 0.1.162

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/README.md +11 -0
  2. package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
  3. package/dist/commands/bootstrap.d.ts +15 -0
  4. package/dist/commands/bootstrap.d.ts.map +1 -1
  5. package/dist/commands/bootstrap.js +58 -5
  6. package/dist/commands/bootstrap.js.map +1 -1
  7. package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
  8. package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
  9. package/dist/commands/flywheel/migrate-overlays.js +29 -3
  10. package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
  11. package/dist/commands/skills-source.d.ts.map +1 -1
  12. package/dist/commands/skills-source.js +57 -2
  13. package/dist/commands/skills-source.js.map +1 -1
  14. package/dist/commands/skills.d.ts.map +1 -1
  15. package/dist/commands/skills.js +14 -0
  16. package/dist/commands/skills.js.map +1 -1
  17. package/dist/image-digests.json +7 -7
  18. package/dist/index.js +2424 -1781
  19. package/dist/lib/bootstrap-kubernetes.d.ts +42 -0
  20. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -0
  21. package/dist/lib/bootstrap-kubernetes.js +367 -0
  22. package/dist/lib/bootstrap-kubernetes.js.map +1 -0
  23. package/dist/lib/config.d.ts.map +1 -1
  24. package/dist/lib/config.js +6 -1
  25. package/dist/lib/config.js.map +1 -1
  26. package/dist/mcp-server.js +568 -368
  27. package/hermes-bundle/version.json +1 -1
  28. package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
  29. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  30. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  31. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  32. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  33. package/host-cp/observability/grafana-port-forward.sh +283 -0
  34. package/host-cp/observability/kyverno-cardinality-mutate.sh +462 -0
  35. package/host-cp/observability/loki-ingest.sh +253 -0
  36. package/host-cp/observability/prom-no-double-grafana.sh +311 -0
  37. package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
  38. package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
  39. package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
  40. package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
  41. package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
  42. package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
  43. package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
  44. package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
  45. package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
  46. package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
  47. package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
  48. package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
  49. package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
  50. package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
  51. package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
  52. package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
  53. package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
  54. package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
  55. package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
  56. package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
  57. package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
  58. package/host-cp/src/plan-chat-service.mjs +147 -1
  59. package/package.json +1 -1
@@ -0,0 +1,92 @@
1
+ # Promtail Helm values — Phase A Task A5 staging (Phase B consumes)
2
+ #
3
+ # Tails every container's stdout; ships to Loki single-binary (Phase B installs Loki).
4
+ # Per OQ-p3-6: Traefik native config can redact HEADERS but NOT URL query params —
5
+ # query-param scrubbing for `?token=`, `?code=`, `?access_token=`, `?state=` happens
6
+ # HERE at Promtail ingest via pipeline_stages.replace regex.
7
+ #
8
+ # Resource limits per OQ-p3-37 (Promtail OOM risk under chatty container-cp 100ms cadence):
9
+ # - memory limit 256Mi
10
+ # - pipeline_stages.limit rate 100 lines/sec/stream
11
+ #
12
+ # Scrape config matches every pod log; namespace-scope labels are added so Loki LogQL queries
13
+ # can filter by service / namespace / pod.
14
+ #
15
+ # SECURITY NOTE — replace stage regex semantics (load-bearing):
16
+ # Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
17
+ # The `replace` field is a Go text/template string; `${1}` is NOT valid Go
18
+ # template syntax and silently becomes a literal. The correct pattern is:
19
+ # expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
20
+ # replace: 'REDACTED' — replace captured secret with literal
21
+ # See promtail-values.yaml header comment for full details.
22
+
23
+ deploymentMode: DaemonSet
24
+
25
+ resources:
26
+ requests:
27
+ cpu: 50m
28
+ memory: 64Mi
29
+ limits:
30
+ cpu: 200m
31
+ memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
32
+
33
+ config:
34
+ clients:
35
+ - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
36
+
37
+ snippets:
38
+ pipelineStages:
39
+ # 1. Parse JSON access logs from Traefik (key field present in JSON line)
40
+ - match:
41
+ selector: '{container="traefik"}'
42
+ stages:
43
+ - json:
44
+ expressions:
45
+ request_method: RequestMethod
46
+ request_path: RequestPath
47
+ status: DownstreamStatus
48
+ request_id: requestId
49
+ service: ServiceName
50
+ router: RouterName
51
+
52
+ # 2. Scrub OAuth/token values from URL query params and Authorization headers.
53
+ #
54
+ # IMPORTANT — capture group semantics:
55
+ # The replace stage replaces each CAPTURE GROUP with the `replace` template
56
+ # value. Capture groups must wrap ONLY the secret value, not the surrounding
57
+ # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
58
+ # it is preserved in the output while only the secret is replaced.
59
+ - replace:
60
+ # OAuth code= callback values — capture only the value after `code=`. The value stops at `&`, whitespace, `"` or `\` so a URL embedded in a JSON log line is not over-captured past its closing quote; the prefix also accepts `\u0026`, the escape Go's JSON encoder emits for `&` by default.
61
+ expression: '(?:\?|&|\\u0026)code=([^&\s"\\]+)'
62
+ replace: 'REDACTED'
63
+ - replace:
64
+ # Bearer / access tokens in query strings — capture only the value (same terminators and `\u0026` separator handling as the code= stage)
65
+ expression: '(?:\?|&|\\u0026)(?:access_token|token|api_key|secret)=([^&\s"\\]+)'
66
+ replace: 'REDACTED'
67
+ - replace:
68
+ # OAuth state param (may carry session info) — capture only the value (same terminators and `\u0026` separator handling as the code= stage)
69
+ expression: '(?:\?|&|\\u0026)state=([^&\s"\\]+)'
70
+ replace: 'REDACTED'
71
+ - replace:
72
+ # Authorization header Bearer value — capture only the token after `Bearer `; stops at whitespace, `"` or `\` so a header quoted inside a JSON string does not swallow the rest of the line
73
+ expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+([^\s"\\]+)'
74
+ replace: 'REDACTED'
75
+
76
+ # 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
77
+ - limit:
78
+ rate: 100 # max log lines/sec per stream
79
+ burst: 200
80
+ drop: true # drop excess lines; do NOT block tail
81
+
82
+ # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
83
+ - labels:
84
+ service: # from Traefik JSON access log; matches taxonomy `service` label
85
+ router: # Traefik router name
86
+ status: # HTTP status code (within taxonomy)
87
+
88
+ # Retention is configured on Loki side (Phase B), not Promtail.
89
+ # Retention target: 7 days (168h), per the "Performance budget" row.
90
+
91
+ serviceMonitor:
92
+ enabled: true # Prometheus (Phase C) scrapes Promtail's own /metrics for self-observability
@@ -0,0 +1,102 @@
1
+ # Promtail Helm values — k3s-ingress-observability Phase B Task B1 (production)
2
+ #
3
+ # Production Promtail values. Staging copy at promtail-staging.yaml carries the
4
+ # same scrubbing pipeline, Loki client URL and resource limits; the two files
5
+ # differ only in `serviceMonitor.enabled` (true in staging, false here).
6
+ #
7
+ # Scrubbing pipeline:
8
+ # - 4 `replace` stages: code=, token/access_token/api_key/secret=, state=, Authorization
9
+ # - `limit` stage: rate=100/burst=200/drop=true (OQ-p3-37: Promtail OOM under chatty containers)
10
+ # Client URL: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
11
+ # Service name `olam-loki` is the Helm release name used in scripts/e2e/loki-ingest.sh
12
+ # (`helm upgrade --install olam-loki grafana/loki ...`); the chart's Service
13
+ # is named after the release, so `olam-loki` is the in-cluster DNS hostname.
14
+ #
15
+ # SECURITY NOTE — replace stage regex semantics (load-bearing):
16
+ # Promtail's `replace` stage iterates over CAPTURE GROUPS, not full matches.
17
+ # The `replace` field is a Go text/template string; `${1}` is NOT valid Go
18
+ # template syntax and silently becomes a literal. The correct pattern is:
19
+ # expression: '(?:prefix)(secret_value_only)' — capture ONLY the secret part
20
+ # replace: 'REDACTED' — replace captured secret with literal
21
+ # This leaves the surrounding context (e.g. `?code=`) intact and redacts only
22
+ # the value. The broken pattern `(\?|&)code=[^&\s]+` with `replace: '${1}code=REDACTED'`
23
+ # was the root cause of the Phase B scrubbing regression (PR #776).
24
+ #
25
+ # See: docs/plans/k3s-ingress-observability/DESIGN.md (T8, T9)
26
+
27
+ deploymentMode: DaemonSet
28
+
29
+ resources:
30
+ requests:
31
+ cpu: 50m
32
+ memory: 64Mi
33
+ limits:
34
+ cpu: 200m
35
+ memory: 256Mi # OQ-p3-37: bounded; OOM-kill restart preferred over runaway memory
36
+
37
+ config:
38
+ clients:
39
+ - url: http://olam-loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
40
+
41
+ snippets:
42
+ pipelineStages:
43
+ # 1. Parse JSON access logs from Traefik (key field present in JSON line)
44
+ - match:
45
+ selector: '{container="traefik"}'
46
+ stages:
47
+ - json:
48
+ expressions:
49
+ request_method: RequestMethod
50
+ request_path: RequestPath
51
+ status: DownstreamStatus
52
+ request_id: requestId
53
+ service: ServiceName
54
+ router: RouterName
55
+
56
+ # 2. Scrub OAuth/token values from URL query params and Authorization headers.
57
+ #
58
+ # IMPORTANT — capture group semantics:
59
+ # The replace stage replaces each CAPTURE GROUP with the `replace` template
60
+ # value. Capture groups must wrap ONLY the secret value, not the surrounding
61
+ # context. The prefix (e.g. `?code=`) uses a non-capturing group `(?:...)` so
62
+ # it is preserved in the output while only the secret is replaced.
63
+ - replace:
64
+ # OAuth code= callback values — capture only the value after `code=`. The value stops at `&`, whitespace, `"` or `\` so a URL embedded in a JSON log line is not over-captured past its closing quote; the prefix also accepts `\u0026`, the escape Go's JSON encoder emits for `&` by default.
65
+ expression: '(?:\?|&|\\u0026)code=([^&\s"\\]+)'
66
+ replace: 'REDACTED'
67
+ - replace:
68
+ # Bearer / access tokens in query strings — capture only the value (same terminators and `\u0026` separator handling as the code= stage)
69
+ expression: '(?:\?|&|\\u0026)(?:access_token|token|api_key|secret)=([^&\s"\\]+)'
70
+ replace: 'REDACTED'
71
+ - replace:
72
+ # OAuth state param (may carry session info) — capture only the value (same terminators and `\u0026` separator handling as the code= stage)
73
+ expression: '(?:\?|&|\\u0026)state=([^&\s"\\]+)'
74
+ replace: 'REDACTED'
75
+ - replace:
76
+ # Authorization header Bearer value — capture only the token after `Bearer `; stops at whitespace, `"` or `\` so a header quoted inside a JSON string does not swallow the rest of the line
77
+ expression: '(?:Authorization|authorization):\s*(?:Bearer|bearer)\s+([^\s"\\]+)'
78
+ replace: 'REDACTED'
79
+
80
+ # 3. Rate-limit ingestion per-stream to prevent OOM cascade under chatty containers (OQ-p3-37)
81
+ - limit:
82
+ rate: 100 # max log lines/sec per stream
83
+ burst: 200
84
+ drop: true # drop excess lines; do NOT block tail
85
+
86
+ # 4. Promote parsed fields to labels (low-cardinality only — taxonomy compliance)
87
+ - labels:
88
+ service: # from Traefik JSON access log; matches taxonomy `service` label
89
+ router: # Traefik router name
90
+ status: # HTTP status code (within taxonomy)
91
+
92
+ # Retention is configured on Loki side (loki-values.yaml: 7 days / 168h).
93
+
94
+ serviceMonitor:
95
+ # Disabled in the source-of-truth values file so a standalone Phase B install
96
+ # (without kube-prometheus-stack) does not hard-fail with
97
+ # "no matches for kind ServiceMonitor in version monitoring.coreos.com/v1".
98
+ # The C1 e2e script flips this on at RUNTIME via
99
+ # helm upgrade ... --reuse-values --set serviceMonitor.enabled=true
100
+ # AFTER kube-prom-stack has installed the ServiceMonitor CRD. Source-of-truth
101
+ # stays standalone-friendly; runtime override wires Prometheus discovery.
102
+ enabled: false
@@ -0,0 +1,73 @@
1
+ # Traefik Helm values — k3s-ingress-observability Phase A Task A3
2
+ # Pinned NodePort 30080 per OQ-p3-7 (world hooks bake this URL).
3
+ # Structured JSON access logs ready for Phase A Task A5 + Phase B Promtail pickup.
4
+
5
+ deployment:
6
+ replicas: 1 # SPOF mitigation = host systemd watchdog (Phase A Task A11), not HA replicas
7
+
8
+ ports:
9
+ web:
10
+ port: 8000
11
+ expose:
12
+ default: true
13
+ exposedPort: 80
14
+ nodePort: 30080 # PIN (OQ-p3-7); world hooks reach via host.docker.internal:30080
15
+ protocol: TCP
16
+ websecure:
17
+ port: 8443
18
+ expose:
19
+ default: true
20
+ exposedPort: 443
21
+ nodePort: 30443
22
+ protocol: TCP
23
+ # v1: HTTPS deferred to fatbox multi-org (Out-of-scope of this plan); TLS not configured.
24
+
25
+ service:
26
+ type: NodePort
27
+
28
+ # Structured access logs to stdout — Promtail picks up in Phase B.
29
+ # Authorization header redaction here; URL query-param scrubbing happens
30
+ # at Promtail pipeline_stages.replace per OQ-p3-6 (Traefik can't scrub query params natively).
31
+ logs:
32
+ general:
33
+ level: INFO
34
+ format: json
35
+ access:
36
+ enabled: true
37
+ format: json
38
+ fields:
39
+ headers:
40
+ defaultMode: keep
41
+ names:
42
+ Authorization: redact
43
+ Cookie: redact
44
+
45
+ # Built-in /metrics for Phase C Prometheus scrape
46
+ metrics:
47
+ prometheus:
48
+ enabled: true
49
+ addEntryPointsLabels: true
50
+ addRoutersLabels: true
51
+ addServicesLabels: true
52
+
53
+ # Dashboard disabled in cluster — operator uses Grafana (Phase B)
54
+ ingressRoute:
55
+ dashboard:
56
+ enabled: false
57
+
58
+ # IngressRoute CRD enabled
59
+ providers:
60
+ kubernetesCRD:
61
+ enabled: true
62
+ allowCrossNamespace: false # explicit; matches namespace-isolation strategy from A1
63
+ kubernetesIngress:
64
+ enabled: false # CRD-only; vanilla Ingress not supported in this stack
65
+
66
+ # Resource bounds — observability stack target <500MB RAM idle (P2)
67
+ resources:
68
+ requests:
69
+ cpu: 100m
70
+ memory: 64Mi
71
+ limits:
72
+ cpu: 500m
73
+ memory: 256Mi
@@ -0,0 +1,6 @@
1
+ # Namespace for k3s-ingress-observability peripheral services
2
+ # (Traefik installs to kube-system; observability stack to monitoring; this namespace holds the olam services and the IngressRoute resources that target them)
3
+ apiVersion: v1
4
+ kind: Namespace
5
+ metadata:
6
+ name: olam
@@ -0,0 +1,245 @@
1
+ # 24-deploy-kg-service.yaml — kg-service Service + Deployment for local k3s dogfood.
2
+ #
3
+ # Bridges the gap between Phase C's ServiceMonitor (92-servicemonitor-kg-service.yaml)
4
+ # and a running service. The ServiceMonitor targets namespace `olam`,
5
+ # label `app: olam-kg-service`, port name `http` — this manifest satisfies that
6
+ # contract so Prometheus can scrape kg-service's /metrics endpoint.
7
+ #
8
+ # Canonical per-service manifest tree: packages/host-cp/k8s/manifests/kg-service/
9
+ # This file is the "peripheral-services entry point" view — it folds Service +
10
+ # Deployment into a single file for `kubectl apply -f manifests/` convenience.
11
+ #
12
+ # Secrets prerequisite: operator MUST create `olam-kg-service-secret` in the
13
+ # `olam` namespace BEFORE applying this manifest. See README.md § Secrets.
14
+ #
15
+ # Image: pinned to sha256 digest (not :latest) per T4 threat model.
16
+ # Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158.
17
+ # To update:
18
+ # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token)
19
+ # curl -sI -H "Authorization: Bearer $TOKEN" \
20
+ # -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
21
+ # https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest
22
+ #
23
+ # Memory: bge-small-en-v1.5 ONNX model is pre-cached in the image (~90 MB).
24
+ # Container needs ≥512Mi to load the model + serve requests. Limit set to 1Gi.
25
+ #
26
+ # Apply-manifests.sh: this file is SKIPPED by the phase-a-e2e harness
27
+ # (apply-manifests.sh skip-list includes 2[3-4]-deploy-*) because the
28
+ # harness cluster has no operator secrets or kg-data PVC.
29
+ # Operator-side `kubectl apply -f manifests/` applies it.
30
+ ---
31
+ apiVersion: v1
32
+ kind: ServiceAccount
33
+ metadata:
34
+ name: olam-kg-service
35
+ namespace: olam
36
+ labels:
37
+ app: olam-kg-service
38
+ app.kubernetes.io/managed-by: olam
39
+ ---
40
+ apiVersion: rbac.authorization.k8s.io/v1
41
+ kind: Role
42
+ metadata:
43
+ name: olam-kg-service
44
+ namespace: olam
45
+ labels:
46
+ app: olam-kg-service
47
+ app.kubernetes.io/managed-by: olam
48
+ rules:
49
+ - apiGroups: ["apps"]
50
+ resources: ["deployments"]
51
+ resourceNames: ["olam-kg-service"]
52
+ verbs: ["get", "patch", "watch"]
53
+ ---
54
+ apiVersion: rbac.authorization.k8s.io/v1
55
+ kind: RoleBinding
56
+ metadata:
57
+ name: olam-kg-service
58
+ namespace: olam
59
+ labels:
60
+ app: olam-kg-service
61
+ app.kubernetes.io/managed-by: olam
62
+ subjects:
63
+ - kind: ServiceAccount
64
+ name: olam-kg-service
65
+ namespace: olam
66
+ roleRef:
67
+ kind: Role
68
+ name: olam-kg-service
69
+ apiGroup: rbac.authorization.k8s.io
70
+ ---
71
+ # ConfigMap — non-sensitive env vars.
72
+ # Sensitive values (OLAM_KG_BEARER_TOKEN) live in `olam-kg-service-secret`.
73
+ apiVersion: v1
74
+ kind: ConfigMap
75
+ metadata:
76
+ name: olam-kg-service-env
77
+ namespace: olam
78
+ labels:
79
+ app: olam-kg-service
80
+ app.kubernetes.io/managed-by: olam
81
+ data:
82
+ # Port kg-service listens on — must match Service targetPort below.
83
+ OLAM_KG_SERVICE_PORT: "9997"
84
+ # CRITICAL: kg-service defaults to 127.0.0.1 bind. In k8s the readiness
85
+ # probe hits the pod IP, so 127.0.0.1-only listener causes probe failures.
86
+ # Force all-interfaces bind without requiring an image rebuild.
87
+ OLAM_KG_SERVICE_BIND: "0.0.0.0"
88
+ # Data directory — backed by the PVC mounted at /data.
89
+ OLAM_KG_DATA_PATH: "/data/kg"
90
+ # Auth-service URL — cluster-internal DNS (olam namespace).
91
+ OLAM_AUTH_SERVICE_URL: "http://olam-auth-service.olam.svc.cluster.local:9999"
92
+ ---
93
+ # PersistentVolumeClaim — backs /data (KG index + savings telemetry).
94
+ # 10Gi: graph index grows with codebase size. See kg-service/45-pvc.yaml rationale.
95
+ # local-path StorageClass ships with k3d. Substitute for non-k3d clusters.
96
+ apiVersion: v1
97
+ kind: PersistentVolumeClaim
98
+ metadata:
99
+ name: olam-kg-data
100
+ namespace: olam
101
+ labels:
102
+ app: olam-kg-service
103
+ app.kubernetes.io/managed-by: olam
104
+ spec:
105
+ accessModes:
106
+ - ReadWriteOnce
107
+ storageClassName: local-path
108
+ resources:
109
+ requests:
110
+ storage: 10Gi
111
+ ---
112
+ apiVersion: apps/v1
113
+ kind: Deployment
114
+ metadata:
115
+ name: olam-kg-service
116
+ namespace: olam
117
+ labels:
118
+ app: olam-kg-service
119
+ app.kubernetes.io/managed-by: olam
120
+ spec:
121
+ replicas: 1
122
+ strategy:
123
+ type: RollingUpdate
124
+ rollingUpdate:
125
+ maxSurge: 1
126
+ maxUnavailable: 0
127
+ selector:
128
+ matchLabels:
129
+ app: olam-kg-service
130
+ template:
131
+ metadata:
132
+ labels:
133
+ app: olam-kg-service
134
+ spec:
135
+ # Disable k8s automatic Service env injection.
136
+ # Without this, k8s injects OLAM_KG_SERVICE_PORT as "tcp://..." which
137
+ # breaks Python's int() parse of the port env var.
138
+ enableServiceLinks: false
139
+ imagePullSecrets:
140
+ - name: ghcr-pull
141
+ serviceAccountName: olam-kg-service
142
+ securityContext:
143
+ runAsNonRoot: true
144
+ runAsUser: 1000
145
+ runAsGroup: 1000
146
+ fsGroup: 1000
147
+ initContainers:
148
+ - name: chown-data
149
+ # busybox:1.36 — sha256-pinned per T4 threat model.
150
+ image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
151
+ imagePullPolicy: IfNotPresent
152
+ securityContext:
153
+ runAsUser: 0
154
+ runAsNonRoot: false
155
+ allowPrivilegeEscalation: false
156
+ command: ["chown", "-R", "1000:1000", "/data"]
157
+ volumeMounts:
158
+ - name: kg-data
159
+ mountPath: /data
160
+ containers:
161
+ - name: olam-kg-service
162
+ # Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158
163
+ # Run `npm run refresh:manifest-digests` to update.
164
+ image: ghcr.io/pleri/olam-kg-service@sha256:72030f3054315e7ebf575f6dcb9b4965e1ddee13ea7bfdeb0bde32253beeb1c7
165
+ imagePullPolicy: IfNotPresent
166
+ securityContext:
167
+ runAsNonRoot: true
168
+ runAsUser: 1000
169
+ readOnlyRootFilesystem: true
170
+ allowPrivilegeEscalation: false
171
+ capabilities:
172
+ drop: ["ALL"]
173
+ ports:
174
+ # CRITICAL: port name `http` must match ServiceMonitor
175
+ # 92-servicemonitor-kg-service.yaml endpoints[0].port.
176
+ - name: http
177
+ containerPort: 9997
178
+ protocol: TCP
179
+ envFrom:
180
+ - configMapRef:
181
+ name: olam-kg-service-env
182
+ - secretRef:
183
+ name: olam-kg-service-secret
184
+ volumeMounts:
185
+ - name: kg-data
186
+ mountPath: /data
187
+ - name: tmp
188
+ mountPath: /tmp
189
+ readinessProbe:
190
+ # kg-service returns {"ok":true,"ready":true} once bge-small model loads.
191
+ # initialDelaySeconds 30 gives the model warmup thread time to complete.
192
+ httpGet:
193
+ path: /health
194
+ port: 9997
195
+ initialDelaySeconds: 30
196
+ periodSeconds: 5
197
+ timeoutSeconds: 3
198
+ failureThreshold: 12
199
+ livenessProbe:
200
+ httpGet:
201
+ path: /health
202
+ port: 9997
203
+ initialDelaySeconds: 60
204
+ periodSeconds: 20
205
+ timeoutSeconds: 5
206
+ failureThreshold: 3
207
+ resources:
208
+ requests:
209
+ cpu: "100m"
210
+ # bge-small ONNX model requires ~400Mi at runtime; 512Mi is the
211
+ # minimum viable request. Set higher if OOM-killed on first classify.
212
+ memory: "512Mi"
213
+ limits:
214
+ cpu: "1000m"
215
+ # 1Gi: bge-small model (~90 MB on disk, ~400Mi once loaded) + index cache + request headroom.
216
+ memory: "1Gi"
217
+ volumes:
218
+ - name: kg-data
219
+ persistentVolumeClaim:
220
+ claimName: olam-kg-data
221
+ - name: tmp
222
+ emptyDir: {}
223
+ ---
224
+ # Service — exposes kg-service to the cluster.
225
+ # CRITICAL: `name: http` matches 92-servicemonitor-kg-service.yaml endpoints[0].port.
226
+ # Namespace `olam` matches ServiceMonitor's namespaceSelector.matchNames.
227
+ apiVersion: v1
228
+ kind: Service
229
+ metadata:
230
+ name: olam-kg-service
231
+ namespace: olam
232
+ labels:
233
+ # CRITICAL: matches 92-servicemonitor-kg-service.yaml spec.selector.matchLabels.
234
+ app: olam-kg-service
235
+ app.kubernetes.io/managed-by: olam
236
+ spec:
237
+ type: ClusterIP
238
+ selector:
239
+ app: olam-kg-service
240
+ ports:
241
+ # CRITICAL: name `http` matches ServiceMonitor endpoints[0].port.
242
+ - name: http
243
+ port: 9997
244
+ targetPort: 9997
245
+ protocol: TCP
@@ -0,0 +1,22 @@
1
+ # IngressRoute — host-cp (bare /api/* per Decision 3 hybrid routing)
2
+ # host-cp preserves 50+ existing SPA fetch sites at /api/* (no strip-prefix).
3
+ apiVersion: traefik.io/v1alpha1
4
+ kind: IngressRoute
5
+ metadata:
6
+ name: olam-host-cp
7
+ namespace: olam
8
+ spec:
9
+ entryPoints:
10
+ - web
11
+ routes:
12
+ # host-cp is the catch-all (per Decision 3 hybrid routing); explicit low priority
13
+ # so service-prefix routes (kg, agent-memory, etc.) win when their longer prefix matches.
14
+ # Default Traefik priority is rule-string length; OR'd rules inflate the host-cp aggregate
15
+ # ABOVE more-specific PathPrefix matches, causing /api/kg/* to land on host-cp incorrectly.
16
+ # Explicit priority avoids the silent precedence bug (caught in PR #736 live-validation).
17
+ - match: PathPrefix(`/api/`) || PathPrefix(`/session/`) || PathPrefix(`/v1/`) || Path(`/health`)
18
+ kind: Rule
19
+ priority: 10
20
+ services:
21
+ - name: olam-host-cp
22
+ port: 19000
@@ -0,0 +1,29 @@
1
+ # IngressRoute — kg-service via /api/kg/* strip-prefix (Decision 3 new-services pattern)
2
+ apiVersion: traefik.io/v1alpha1
3
+ kind: Middleware
4
+ metadata:
5
+ name: strip-api-kg
6
+ namespace: olam
7
+ spec:
8
+ stripPrefix:
9
+ prefixes:
10
+ - /api/kg
11
+ ---
12
+ apiVersion: traefik.io/v1alpha1
13
+ kind: IngressRoute
14
+ metadata:
15
+ name: olam-kg-service
16
+ namespace: olam
17
+ spec:
18
+ entryPoints:
19
+ - web
20
+ routes:
21
+ # Priority 100 > host-cp's 10 so /api/kg/* wins over host-cp's catch-all /api/*.
22
+ - match: PathPrefix(`/api/kg/`)
23
+ kind: Rule
24
+ priority: 100
25
+ services:
26
+ - name: olam-kg-service
27
+ port: 9997
28
+ middlewares:
29
+ - name: strip-api-kg
@@ -0,0 +1,29 @@
1
+ # IngressRoute — agent-memory via /api/agent-memory/* strip-prefix (Decision 3 new-services pattern)
2
+ apiVersion: traefik.io/v1alpha1
3
+ kind: Middleware
4
+ metadata:
5
+ name: strip-api-agent-memory
6
+ namespace: olam
7
+ spec:
8
+ stripPrefix:
9
+ prefixes:
10
+ - /api/agent-memory
11
+ ---
12
+ apiVersion: traefik.io/v1alpha1
13
+ kind: IngressRoute
14
+ metadata:
15
+ name: olam-agent-memory
16
+ namespace: olam
17
+ spec:
18
+ entryPoints:
19
+ - web
20
+ routes:
21
+ # Priority 100 > host-cp's 10 so /api/agent-memory/* wins over host-cp's catch-all /api/*.
22
+ - match: PathPrefix(`/api/agent-memory/`)
23
+ kind: Rule
24
+ priority: 100
25
+ services:
26
+ - name: olam-memory-service
27
+ port: 3111 # Real memory-service listen port (per packages/memory-service/src/worker.ts:206 + AGENTMEMORY_HOST_INTERNAL_URL in container.ts:101). Pass-1 plan said 3112 (incorrect); A6 corrects to 3111.
28
+ middlewares:
29
+ - name: strip-api-agent-memory
@@ -0,0 +1,80 @@
1
+ # NetworkPolicy — olam namespace ingress fence (Phase A Task A9)
2
+ #
3
+ # Defense-in-depth: even if a world agent escapes its container or steals a
4
+ # bearer token, NetworkPolicy ensures it can only reach olam services via the
5
+ # Traefik ingress path (which enforces bearer auth on world-originated calls
6
+ # per A6 — see packages/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml).
7
+ # Direct pod-to-pod access bypassing ingress is denied.
8
+ #
9
+ # Enforcement matrix — two separate enforcement paths exist; an earlier revision
10
+ # of this comment conflated them (corrected 2026-05-21, see dogfood incident finding #2):
11
+ #
12
+ # k3d/k3s with --disable-network-policy=false (production k3s default):
13
+ # k3s ships a built-in NetworkPolicy controller that enforces NetworkPolicies
14
+ # via iptables rules, INDEPENDENT of the CNI. Flannel itself does not enforce,
15
+ # but the k3s controller does. Result: NetworkPolicies ARE enforced even on
16
+ # default Flannel k3s/k3d clusters — this is what the operator's colima+k3d
17
+ # dogfood cluster experienced (the fence was live despite using Flannel).
18
+ #
19
+ # k3d/k3s with --disable-network-policy=true (this harness — cluster-up.sh):
20
+ # The harness explicitly passes --k3s-arg '--disable-network-policy@server:*'
21
+ # to disable the k3s built-in controller. With the controller off, enforcement
22
+ # depends entirely on the CNI: Flannel = no enforcement; Calico = enforced.
23
+ # The harness uses Calico precisely so tests exercise real enforcement.
24
+ #
25
+ # Production k3s (default, no --disable-network-policy):
26
+ # Controller-enforced via iptables unless the operator explicitly disables it.
27
+ #
28
+ # See docs/architecture/networkpolicy-fence.md for the full environment matrix
29
+ # and docs/incidents/2026-05-21-phase-c-dogfood.md (finding #2) for the live
30
+ # evidence that k3s' bundled controller enforces on Flannel clusters.
31
+ #
32
+ # Threat mitigated: T6 (world→host SSRF via unauthenticated ingress route).
33
+ # Companion mitigations (do not remove A6 + A9 together): bearer auth (A6),
34
+ # 127.0.0.1 bind on host-cp + kube-apiserver (OS-level, separate from k8s).
35
+ apiVersion: networking.k8s.io/v1
36
+ kind: NetworkPolicy
37
+ metadata:
38
+ name: olam-ingress-fence
39
+ namespace: olam
40
+ labels:
41
+ app.kubernetes.io/part-of: olam
42
+ app.kubernetes.io/component: security-fence
43
+ olam.io/phase: a
44
+ olam.io/task: a9
45
+ spec:
46
+ # Selects every pod in the olam namespace. Intra-namespace traffic is allowed
47
+ # explicitly below (second rule) so olam services can call each other; other traffic
48
+ # must traverse Traefik (first rule), except scrapes from the `monitoring` namespace (third rule).
49
+ podSelector: {}
50
+ policyTypes:
51
+ - Ingress
52
+ ingress:
53
+ # Allow inbound from Traefik (canonical ingress path). The label selector
54
+ # matches the standard Helm-chart label that k3s' bundled Traefik install
55
+ # sets (`app.kubernetes.io/name: traefik`); also matched by the upstream
56
+ # `traefik/traefik` chart used by Phase A Task A3.
57
+ - from:
58
+ - namespaceSelector:
59
+ matchLabels:
60
+ kubernetes.io/metadata.name: kube-system
61
+ podSelector:
62
+ matchLabels:
63
+ app.kubernetes.io/name: traefik
64
+ # Allow intra-namespace pod-to-pod traffic — olam services may call each
65
+ # other directly (host-cp → kg-service, etc.) without round-tripping
66
+ # through Traefik. Audit log on world-originated calls still fires at the
67
+ # bearer-auth layer (A6), so this allowance does not weaken T6 mitigation.
68
+ - from:
69
+ - podSelector: {}
70
+ # Allow inbound from the monitoring namespace — Phase C's Prometheus
71
+ # (kube-prometheus-stack) scrapes pod IPs directly for /metrics
72
+ # collection. Without this rule, ServiceMonitor targets in `olam` ns
73
+ # appear "up" but yield 0 samples (the scrape connection is silently dropped
74
+ # wherever the policy is enforced — k3s controller or enforcing CNI). Surfaced during 2026-05-21 operator
75
+ # dogfood — see docs/incidents/2026-05-21-phase-c-dogfood.md, finding #2.
76
+ # Scope: ingress from any pod in `monitoring` to any olam pod and port (no podSelector/ports filter); olam → monitoring is not granted here.
77
+ - from:
78
+ - namespaceSelector:
79
+ matchLabels:
80
+ kubernetes.io/metadata.name: monitoring