@pleri/olam-cli 0.1.161 → 0.1.162
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -4
- package/dist/agent-stream/agent-sdk-to-chunks.js +20 -2
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +35 -11
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.d.ts +1 -0
- package/dist/commands/flywheel/migrate-overlays.d.ts.map +1 -1
- package/dist/commands/flywheel/migrate-overlays.js +29 -3
- package/dist/commands/flywheel/migrate-overlays.js.map +1 -1
- package/dist/commands/skills-source.d.ts.map +1 -1
- package/dist/commands/skills-source.js +57 -2
- package/dist/commands/skills-source.js.map +1 -1
- package/dist/commands/skills.d.ts.map +1 -1
- package/dist/commands/skills.js +14 -0
- package/dist/commands/skills.js.map +1 -1
- package/dist/image-digests.json +7 -7
- package/dist/index.js +996 -618
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +93 -13
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/mcp-server.js +568 -368
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/observability/grafana-port-forward.sh +12 -2
- package/host-cp/observability/kyverno-cardinality-mutate.sh +12 -2
- package/host-cp/observability/loki-ingest.sh +12 -2
- package/host-cp/observability/prom-no-double-grafana.sh +15 -5
- package/host-cp/peripheral-services/helm-values/grafana-values.yaml +159 -0
- package/host-cp/peripheral-services/helm-values/kube-prom-stack-values.yaml +229 -0
- package/host-cp/peripheral-services/helm-values/kyverno-values.yaml +85 -0
- package/host-cp/peripheral-services/helm-values/loki-values.yaml +166 -0
- package/host-cp/peripheral-services/helm-values/promtail-staging.yaml +92 -0
- package/host-cp/peripheral-services/helm-values/promtail-values.yaml +102 -0
- package/host-cp/peripheral-services/helm-values/traefik-values.yaml +73 -0
- package/host-cp/peripheral-services/manifests/20-namespace.yaml +6 -0
- package/host-cp/peripheral-services/manifests/24-deploy-kg-service.yaml +245 -0
- package/host-cp/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml +22 -0
- package/host-cp/peripheral-services/manifests/40-traefik-ingressroute-kg.yaml +29 -0
- package/host-cp/peripheral-services/manifests/50-traefik-ingressroute-agent-memory.yaml +29 -0
- package/host-cp/peripheral-services/manifests/60-networkpolicy-ingress.yaml +80 -0
- package/host-cp/peripheral-services/manifests/65-networkpolicy-loki-prom-deny.yaml +67 -0
- package/host-cp/peripheral-services/manifests/80-grafana-dashboard-configmap.yaml +1349 -0
- package/host-cp/peripheral-services/manifests/90-prom-alert-cardinality.yaml +50 -0
- package/host-cp/peripheral-services/manifests/91-servicemonitor-host-cp.yaml +70 -0
- package/host-cp/peripheral-services/manifests/92-servicemonitor-kg-service.yaml +70 -0
- package/host-cp/peripheral-services/manifests/93-servicemonitor-memory-service.yaml +87 -0
- package/host-cp/peripheral-services/manifests/95-prom-recording-rules.yaml +108 -0
- package/host-cp/peripheral-services/manifests/96-kyverno-cardinality-mutate.yaml +195 -0
- package/host-cp/src/plan-chat-service.mjs +147 -1
- package/package.json +1 -1
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# 24-deploy-kg-service.yaml — kg-service Service + Deployment for local k3s dogfood.
|
|
2
|
+
#
|
|
3
|
+
# Bridges the gap between Phase C's ServiceMonitor (92-servicemonitor-kg-service.yaml)
|
|
4
|
+
# and a running service. The ServiceMonitor targets namespace `olam`,
|
|
5
|
+
# label `app: olam-kg-service`, port name `http` — this manifest satisfies that
|
|
6
|
+
# contract so Prometheus can scrape kg-service's /metrics endpoint.
|
|
7
|
+
#
|
|
8
|
+
# Canonical per-service manifest tree: packages/host-cp/k8s/manifests/kg-service/
|
|
9
|
+
# This file is the "peripheral-services entry point" view — it folds Service +
|
|
10
|
+
# Deployment into a single file for `kubectl apply -f manifests/` convenience.
|
|
11
|
+
#
|
|
12
|
+
# Secrets prerequisite: operator MUST create `olam-kg-service-secret` in the
|
|
13
|
+
# `olam` namespace BEFORE applying this manifest. See README.md § Secrets.
|
|
14
|
+
#
|
|
15
|
+
# Image: pinned to sha256 digest (not :latest) per T4 threat model.
|
|
16
|
+
# Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158.
|
|
17
|
+
# To update:
|
|
18
|
+
# TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-kg-service:pull&service=ghcr.io" | jq -r .token)
|
|
19
|
+
# curl -sI -H "Authorization: Bearer $TOKEN" \
|
|
20
|
+
# -H "Accept: application/vnd.oci.image.index.v1+json,application/vnd.docker.distribution.manifest.list.v2+json" \
|
|
21
|
+
# https://ghcr.io/v2/pleri/olam-kg-service/manifests/<tag> | grep docker-content-digest
|
|
22
|
+
#
|
|
23
|
+
# Memory: bge-small-en-v1.5 ONNX model is pre-cached in the image (~90 MB).
|
|
24
|
+
# Container needs ≥512Mi to load the model + serve requests. Limit set to 1Gi.
|
|
25
|
+
#
|
|
26
|
+
# Apply-manifests.sh: this file is SKIPPED by the phase-a-e2e harness
|
|
27
|
+
# (apply-manifests.sh skip-list includes 2[3-4]-deploy-*) because the
|
|
28
|
+
# harness cluster has no operator secrets or kg-data PVC.
|
|
29
|
+
# Operator-side `kubectl apply -f manifests/` applies it.
|
|
30
|
+
---
|
|
31
|
+
# ServiceAccount for kg-service. The Deployment in this file names it via
# serviceAccountName, and the RoleBinding in this file lists it as its subject.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
|
|
39
|
+
---
|
|
40
|
+
# Namespaced Role: grants get/patch/watch on exactly one object, the
# `olam-kg-service` Deployment in the `olam` namespace.
# NOTE(review): RBAC only authorizes a `watch` restricted by resourceNames when
# the client sends a matching `metadata.name` field selector — confirm the
# consumer of this Role does so.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
rules:
  - apiGroups:
      - apps
    resources:
      - deployments
    resourceNames:
      - olam-kg-service
    verbs:
      - get
      - patch
      - watch
|
|
53
|
+
---
|
|
54
|
+
# Binds the olam-kg-service Role to the olam-kg-service ServiceAccount
# (both in the `olam` namespace).
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: olam-kg-service
subjects:
  - kind: ServiceAccount
    namespace: olam
    name: olam-kg-service
|
|
70
|
+
---
|
|
71
|
+
# ConfigMap — non-sensitive env vars.
|
|
72
|
+
# Sensitive values (OLAM_KG_BEARER_TOKEN) live in `olam-kg-service-secret`.
|
|
73
|
+
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: olam
  name: olam-kg-service-env
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
data:
  # Listen port for kg-service; keep in sync with the Service targetPort and
  # the Deployment containerPort (both 9997 in this file).
  OLAM_KG_SERVICE_PORT: '9997'
  # CRITICAL: kg-service binds 127.0.0.1 unless overridden. The readiness probe
  # connects to the pod IP, so a loopback-only listener fails every probe.
  # Binding all interfaces here avoids rebuilding the image.
  OLAM_KG_SERVICE_BIND: '0.0.0.0'
  # Data directory, located on the PVC that the Deployment mounts at /data.
  OLAM_KG_DATA_PATH: '/data/kg'
  # Cluster-internal DNS name of auth-service (olam namespace).
  OLAM_AUTH_SERVICE_URL: 'http://olam-auth-service.olam.svc.cluster.local:9999'
|
|
92
|
+
---
|
|
93
|
+
# PersistentVolumeClaim — backs /data (KG index + savings telemetry).
|
|
94
|
+
# 10Gi: graph index grows with codebase size. See kg-service/45-pvc.yaml rationale.
|
|
95
|
+
# local-path StorageClass ships with k3d. Substitute for non-k3d clusters.
|
|
96
|
+
# 10Gi ReadWriteOnce claim on the `local-path` StorageClass; the Deployment in
# this file mounts it at /data under the volume name `kg-data`.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: olam
  name: olam-kg-data
  labels:
    app.kubernetes.io/managed-by: olam
    app: olam-kg-service
spec:
  storageClassName: local-path
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 10Gi
|
|
111
|
+
---
|
|
112
|
+
# Deployment — single kg-service replica backed by the olam-kg-data PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: olam-kg-service
  namespace: olam
  labels:
    app: olam-kg-service
    app.kubernetes.io/managed-by: olam
spec:
  replicas: 1
  # Recreate instead of RollingUpdate (maxSurge 1 / maxUnavailable 0): this is a
  # single replica on a ReadWriteOnce PVC. A surge pod would run the root
  # `chown -R /data` init step and then start a second kg-service against the
  # same volume while the old pod is still serving from it; and if the surge pod
  # is scheduled where it cannot attach the RWO volume, maxUnavailable 0 keeps
  # the old pod alive and the rollout stalls. Recreate stops the old pod first,
  # at the cost of a short gap in availability during rollouts.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: olam-kg-service
  template:
    metadata:
      labels:
        app: olam-kg-service
    spec:
      # Disable k8s automatic Service env injection. With it enabled, k8s
      # injects OLAM_KG_SERVICE_PORT as "tcp://..." which breaks Python's
      # int() parse of the port env var.
      enableServiceLinks: false
      imagePullSecrets:
        - name: ghcr-pull
      serviceAccountName: olam-kg-service
      # Pod-wide default: run as non-root uid/gid 1000. The init container
      # below overrides this for itself only.
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        runAsGroup: 1000
        fsGroup: 1000
      initContainers:
        # Hands ownership of the data volume to uid/gid 1000 before the main
        # container starts.
        # NOTE(review): presumably needed because the volume's backing
        # directory is not owned by uid 1000 when first provisioned — confirm.
        - name: chown-data
          # busybox:1.36 — sha256-pinned per T4 threat model.
          image: busybox@sha256:73aaf090f3d85aa34ee199857f03fa3a95c8ede2ffd4cc2cdb5b94e566b11662
          imagePullPolicy: IfNotPresent
          # Runs as root (overriding the pod-level runAsNonRoot) so chown can
          # change ownership.
          securityContext:
            runAsUser: 0
            runAsNonRoot: false
            allowPrivilegeEscalation: false
          command: ["chown", "-R", "1000:1000", "/data"]
          volumeMounts:
            - name: kg-data
              mountPath: /data
      containers:
        - name: olam-kg-service
          # Digest resolves to ghcr.io/pleri/olam-kg-service:0.1.158
          # Run `npm run refresh:manifest-digests` to update.
          image: ghcr.io/pleri/olam-kg-service@sha256:72030f3054315e7ebf575f6dcb9b4965e1ddee13ea7bfdeb0bde32253beeb1c7
          imagePullPolicy: IfNotPresent
          # Read-only root filesystem; the writable paths are the two volume
          # mounts below (/data and /tmp).
          securityContext:
            runAsNonRoot: true
            runAsUser: 1000
            readOnlyRootFilesystem: true
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          ports:
            # CRITICAL: port name `http` must match ServiceMonitor
            # 92-servicemonitor-kg-service.yaml endpoints[0].port.
            - name: http
              containerPort: 9997
              protocol: TCP
          # Non-sensitive settings come from the ConfigMap; sensitive ones from
          # the operator-created `olam-kg-service-secret`.
          envFrom:
            - configMapRef:
                name: olam-kg-service-env
            - secretRef:
                name: olam-kg-service-secret
          volumeMounts:
            - name: kg-data
              mountPath: /data
            - name: tmp
              mountPath: /tmp
          readinessProbe:
            # kg-service returns {"ok":true,"ready":true} once bge-small model
            # loads. initialDelaySeconds 30 gives the model warmup thread time
            # to complete.
            httpGet:
              path: /health
              port: 9997
            initialDelaySeconds: 30
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 12
          livenessProbe:
            httpGet:
              path: /health
              port: 9997
            initialDelaySeconds: 60
            periodSeconds: 20
            timeoutSeconds: 5
            failureThreshold: 3
          resources:
            requests:
              cpu: "100m"
              # bge-small ONNX model requires ~400Mi at runtime; 512Mi is the
              # minimum viable request. Set higher if OOM-killed on first
              # classify.
              memory: "512Mi"
            limits:
              cpu: "1000m"
              # 1Gi: bge-small model (~90Mi) + index cache + request headroom.
              memory: "1Gi"
      volumes:
        - name: kg-data
          persistentVolumeClaim:
            claimName: olam-kg-data
        # Scratch space for /tmp, required because the root filesystem is
        # read-only.
        - name: tmp
          emptyDir: {}
|
|
223
|
+
---
|
|
224
|
+
# Service — exposes kg-service to the cluster.
|
|
225
|
+
# CRITICAL: `name: http` matches 92-servicemonitor-kg-service.yaml endpoints[0].port.
|
|
226
|
+
# Namespace `olam` matches ServiceMonitor's namespaceSelector.matchNames.
|
|
227
|
+
apiVersion: v1
kind: Service
metadata:
  namespace: olam
  name: olam-kg-service
  labels:
    # CRITICAL: `app` must equal spec.selector.matchLabels in
    # 92-servicemonitor-kg-service.yaml.
    app: olam-kg-service
    app.kubernetes.io/managed-by: olam
spec:
  type: ClusterIP
  ports:
    # CRITICAL: the port name `http` is what the ServiceMonitor's
    # endpoints[0].port refers to.
    - name: http
      protocol: TCP
      port: 9997
      targetPort: 9997
  selector:
    app: olam-kg-service
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# IngressRoute — host-cp (bare /api/* per Decision 3 hybrid routing)
|
|
2
|
+
# host-cp preserves 50+ existing SPA fetch sites at /api/* (no strip-prefix).
|
|
3
|
+
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: olam
  name: olam-host-cp
spec:
  entryPoints: [web]
  routes:
    # Catch-all route for host-cp (Decision 3 hybrid routing), pinned to a low
    # explicit priority so that the service-prefix routes (kg, agent-memory,
    # etc.) take precedence whenever their longer prefix matches.
    # Traefik's default priority is the rule-string length, and this OR'd rule
    # is long enough to outrank the more specific PathPrefix routes — which
    # sent /api/kg/* to host-cp. The explicit value prevents that silent
    # precedence bug (caught in PR #736 live-validation).
    - kind: Rule
      priority: 10
      match: 'PathPrefix(`/api/`) || PathPrefix(`/session/`) || PathPrefix(`/v1/`) || Path(`/health`)'
      services:
        - name: olam-host-cp
          port: 19000
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# IngressRoute — kg-service via /api/kg/* strip-prefix (Decision 3 new-services pattern)
|
|
2
|
+
# Strips the `/api/kg` prefix from the request path; attached to the
# olam-kg-service IngressRoute in this file.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  namespace: olam
  name: strip-api-kg
spec:
  stripPrefix:
    prefixes: ['/api/kg']
|
|
11
|
+
---
|
|
12
|
+
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: olam
  name: olam-kg-service
spec:
  entryPoints: [web]
  routes:
    # Priority 100 outranks host-cp's priority 10, so /api/kg/* is routed here
    # rather than to host-cp's catch-all /api/* rule.
    - kind: Rule
      priority: 100
      match: 'PathPrefix(`/api/kg/`)'
      middlewares:
        - name: strip-api-kg
      services:
        - name: olam-kg-service
          port: 9997
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# IngressRoute — agent-memory via /api/agent-memory/* strip-prefix (Decision 3 new-services pattern)
|
|
2
|
+
# Strips the `/api/agent-memory` prefix from the request path; attached to the
# olam-agent-memory IngressRoute in this file.
apiVersion: traefik.io/v1alpha1
kind: Middleware
metadata:
  namespace: olam
  name: strip-api-agent-memory
spec:
  stripPrefix:
    prefixes: ['/api/agent-memory']
|
|
11
|
+
---
|
|
12
|
+
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  namespace: olam
  name: olam-agent-memory
spec:
  entryPoints: [web]
  routes:
    # Priority 100 outranks host-cp's priority 10, so /api/agent-memory/* is
    # routed here rather than to host-cp's catch-all /api/* rule.
    - kind: Rule
      priority: 100
      match: 'PathPrefix(`/api/agent-memory/`)'
      middlewares:
        - name: strip-api-agent-memory
      services:
        - name: olam-memory-service
          # Real memory-service listen port (per
          # packages/memory-service/src/worker.ts:206 +
          # AGENTMEMORY_HOST_INTERNAL_URL in container.ts:101).
          # Pass-1 plan said 3112 (incorrect); A6 corrects to 3111.
          port: 3111
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# NetworkPolicy — olam namespace ingress fence (Phase A Task A9)
|
|
2
|
+
#
|
|
3
|
+
# Defense-in-depth: even if a world agent escapes its container or steals a
|
|
4
|
+
# bearer token, NetworkPolicy ensures it can only reach olam services via the
|
|
5
|
+
# Traefik ingress path (which enforces bearer auth on world-originated calls
|
|
6
|
+
# per A6 — see packages/peripheral-services/manifests/30-traefik-ingressroute-host-cp.yaml).
|
|
7
|
+
# Direct cross-namespace pod-to-pod access bypassing ingress is denied; the rules below still allow intra-namespace traffic and scrapes from `monitoring`.
|
|
8
|
+
#
|
|
9
|
+
# Enforcement matrix — two separate enforcement paths exist; the comment below
|
|
10
|
+
# previously conflated them (corrected 2026-05-21, see dogfood incident finding #2):
|
|
11
|
+
#
|
|
12
|
+
# k3d/k3s with --disable-network-policy=false (production k3s default):
|
|
13
|
+
# k3s ships a built-in NetworkPolicy controller that enforces NetworkPolicies
|
|
14
|
+
# via iptables rules, INDEPENDENT of the CNI. Flannel itself does not enforce,
|
|
15
|
+
# but the k3s controller does. Result: NetworkPolicies ARE enforced even on
|
|
16
|
+
# default Flannel k3s/k3d clusters — this is what the operator's colima+k3d
|
|
17
|
+
# dogfood cluster experienced (the fence was live despite using Flannel).
|
|
18
|
+
#
|
|
19
|
+
# k3d/k3s with --disable-network-policy=true (this harness — cluster-up.sh):
|
|
20
|
+
# The harness explicitly passes --k3s-arg '--disable-network-policy@server:*'
|
|
21
|
+
# to disable the k3s built-in controller. With the controller off, enforcement
|
|
22
|
+
# depends entirely on the CNI: Flannel = no enforcement; Calico = enforced.
|
|
23
|
+
# The harness uses Calico precisely so tests exercise real enforcement.
|
|
24
|
+
#
|
|
25
|
+
# Production k3s (default, no --disable-network-policy):
|
|
26
|
+
# Controller-enforced via iptables unless the operator explicitly disables it.
|
|
27
|
+
#
|
|
28
|
+
# See docs/architecture/networkpolicy-fence.md for the full environment matrix
|
|
29
|
+
# and docs/incidents/2026-05-21-phase-c-dogfood.md (finding #2) for the live
|
|
30
|
+
# evidence that k3s' bundled controller enforces on Flannel clusters.
|
|
31
|
+
#
|
|
32
|
+
# Threat mitigated: T6 (world→host SSRF via unauthenticated ingress route).
|
|
33
|
+
# Companion mitigations (do not remove A6 + A9 together): bearer auth (A6),
|
|
34
|
+
# 127.0.0.1 bind on host-cp + kube-apiserver (OS-level, separate from k8s).
|
|
35
|
+
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  namespace: olam
  name: olam-ingress-fence
  labels:
    app.kubernetes.io/component: security-fence
    app.kubernetes.io/part-of: olam
    olam.io/phase: a
    olam.io/task: a9
spec:
  # The empty selector applies this policy to every pod in the olam namespace.
  # Only ingress is restricted; the three rules below list the permitted
  # sources, and anything else inbound is dropped.
  podSelector: {}
  policyTypes: [Ingress]
  ingress:
    # 1. Traefik, the canonical ingress path. A peer must be in kube-system AND
    #    carry `app.kubernetes.io/name: traefik` — the standard Helm-chart label
    #    set by k3s' bundled Traefik install and by the upstream
    #    `traefik/traefik` chart used by Phase A Task A3.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: kube-system
          podSelector:
            matchLabels:
              app.kubernetes.io/name: traefik
    # 2. Any pod in the olam namespace itself, so olam services can call each
    #    other directly (host-cp → kg-service, etc.) without going through
    #    Traefik. The audit log on world-originated calls still fires at the
    #    bearer-auth layer (A6), so this does not weaken the T6 mitigation.
    - from:
        - podSelector: {}
    # 3. Any pod in the monitoring namespace. Phase C's Prometheus
    #    (kube-prometheus-stack) scrapes pod IPs directly for /metrics; without
    #    this rule, ServiceMonitor targets in `olam` appear "up" but yield 0
    #    samples because the scrape connection is silently dropped. Surfaced
    #    during the 2026-05-21 operator dogfood — see
    #    docs/incidents/2026-05-21-phase-c-dogfood.md, finding #2.
    #    Scope: monitoring → olam ingress only (not the reverse direction).
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# NetworkPolicy — monitoring namespace default-deny + same-namespace allow
|
|
2
|
+
# (Phase A Task A9; companion to 60-networkpolicy-ingress.yaml)
|
|
3
|
+
#
|
|
4
|
+
# Loki + Prometheus + Grafana accept inbound ONLY from pods in the same
|
|
5
|
+
# `monitoring` namespace (intra-stack: Promtail → Loki, Grafana → Loki + Prom,
|
|
6
|
+
# kube-prometheus-stack scrape targets within the stack). Cross-namespace
|
|
7
|
+
# traffic — including from `olam` (host-cp, kg-service, agent-memory) and
|
|
8
|
+
# kube-system (Traefik) — is denied.
|
|
9
|
+
#
|
|
10
|
+
# Operator access pattern is `kubectl port-forward -n monitoring svc/grafana
|
|
11
|
+
# 3000` (Decision 16). port-forward is tunnelled through the kube-apiserver and kubelet,
|
|
12
|
+
# NOT pod-to-pod networking, so it bypasses NetworkPolicy by design.
|
|
13
|
+
#
|
|
14
|
+
# Decision 17 forbids any IngressRoute / Ingress that exposes Loki / Prom /
|
|
15
|
+
# Grafana from outside the cluster; audit:no-ingress-route enforces that at
|
|
16
|
+
# commit time, and this NetworkPolicy is the runtime defense-in-depth layer
|
|
17
|
+
# (caught even if the audit is bypassed or a Helm chart renders a route).
|
|
18
|
+
#
|
|
19
|
+
# Forward-declaration note: Loki + Prometheus land in Phase B/C. Until those
|
|
20
|
+
# manifests add pods to the `monitoring` namespace, this policy applies to an
|
|
21
|
+
# empty pod set and is a no-op. Phase B/C pods need no extra labels: the
|
|
22
|
+
# empty podSelector matches every pod in the namespace, and the ingress rule
|
|
23
|
+
# keys off the namespace's own `kubernetes.io/metadata.name: monitoring` label.
|
|
24
|
+
#
|
|
25
|
+
# Enforcement requires k3s' built-in NetworkPolicy controller or a NetworkPolicy-capable CNI (see 60-* doc block).
|
|
26
|
+
# Threat mitigated: T7 (Grafana admin secret exfil) + secondary T6 mitigation.
|
|
27
|
+
---
|
|
28
|
+
# Forward-declare the monitoring namespace so the NetworkPolicy below has a
|
|
29
|
+
# valid target. Phase B/C kube-prometheus-stack installs into this namespace
|
|
30
|
+
# and may add labels — its install MUST NOT delete the namespace; Helm uses
|
|
31
|
+
# `--create-namespace=false` once this manifest seeds it.
|
|
32
|
+
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    # This label is what the NetworkPolicy namespaceSelectors in the 60-* and
    # 65-* manifests match on.
    kubernetes.io/metadata.name: monitoring
    app.kubernetes.io/part-of: olam-observability
    olam.io/task: a9
    olam.io/phase: a
|
|
41
|
+
---
|
|
42
|
+
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  namespace: monitoring
  name: monitoring-default-deny
  labels:
    app.kubernetes.io/component: security-fence
    app.kubernetes.io/part-of: olam-observability
    olam.io/phase: a
    olam.io/task: a9
spec:
  # The empty selector covers every pod in the monitoring namespace, so
  # whatever Phase B/C renders there (loki, prometheus, grafana, promtail,
  # alertmanager, ...) is fenced without further configuration.
  podSelector: {}
  policyTypes: [Ingress]
  ingress:
    # Single allow rule: sources in a namespace labelled
    # `kubernetes.io/metadata.name: monitoring`, i.e. same-namespace pods.
    # Everything cross-namespace (olam services, kube-system Traefik, default
    # ns) is denied. Operators reach Grafana via kubectl port-forward instead,
    # which the file header explains is not subject to NetworkPolicy.
    - from:
        - namespaceSelector:
            matchLabels:
              kubernetes.io/metadata.name: monitoring
|