sdtk-ops-kit 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/README.md +146 -0
  2. package/assets/manifest/toolkit-bundle.manifest.json +187 -0
  3. package/assets/manifest/toolkit-bundle.sha256.txt +36 -0
  4. package/assets/toolkit/toolkit/AGENTS.md +65 -0
  5. package/assets/toolkit/toolkit/SDTKOPS_TOOLKIT.md +166 -0
  6. package/assets/toolkit/toolkit/install.ps1 +138 -0
  7. package/assets/toolkit/toolkit/scripts/install-claude-skills.ps1 +81 -0
  8. package/assets/toolkit/toolkit/scripts/install-codex-skills.ps1 +127 -0
  9. package/assets/toolkit/toolkit/scripts/uninstall-claude-skills.ps1 +65 -0
  10. package/assets/toolkit/toolkit/scripts/uninstall-codex-skills.ps1 +53 -0
  11. package/assets/toolkit/toolkit/sdtk-spec.config.json +6 -0
  12. package/assets/toolkit/toolkit/sdtk-spec.config.profiles.example.json +12 -0
  13. package/assets/toolkit/toolkit/skills/ops-backup/SKILL.md +93 -0
  14. package/assets/toolkit/toolkit/skills/ops-backup/references/backup-script-patterns.md +108 -0
  15. package/assets/toolkit/toolkit/skills/ops-ci-cd/SKILL.md +88 -0
  16. package/assets/toolkit/toolkit/skills/ops-ci-cd/references/pipeline-examples.md +113 -0
  17. package/assets/toolkit/toolkit/skills/ops-compliance/SKILL.md +105 -0
  18. package/assets/toolkit/toolkit/skills/ops-container/SKILL.md +95 -0
  19. package/assets/toolkit/toolkit/skills/ops-container/references/k8s-manifest-patterns.md +116 -0
  20. package/assets/toolkit/toolkit/skills/ops-cost/SKILL.md +88 -0
  21. package/assets/toolkit/toolkit/skills/ops-debug/SKILL.md +311 -0
  22. package/assets/toolkit/toolkit/skills/ops-debug/references/root-cause-tracing.md +138 -0
  23. package/assets/toolkit/toolkit/skills/ops-deploy/SKILL.md +102 -0
  24. package/assets/toolkit/toolkit/skills/ops-discover/SKILL.md +102 -0
  25. package/assets/toolkit/toolkit/skills/ops-incident/SKILL.md +113 -0
  26. package/assets/toolkit/toolkit/skills/ops-incident/references/communication-templates.md +34 -0
  27. package/assets/toolkit/toolkit/skills/ops-incident/references/postmortem-template.md +69 -0
  28. package/assets/toolkit/toolkit/skills/ops-incident/references/runbook-template.md +69 -0
  29. package/assets/toolkit/toolkit/skills/ops-infra-plan/SKILL.md +123 -0
  30. package/assets/toolkit/toolkit/skills/ops-infra-plan/references/iac-patterns.md +141 -0
  31. package/assets/toolkit/toolkit/skills/ops-monitor/SKILL.md +110 -0
  32. package/assets/toolkit/toolkit/skills/ops-monitor/references/alert-rules.md +80 -0
  33. package/assets/toolkit/toolkit/skills/ops-monitor/references/slo-templates.md +83 -0
  34. package/assets/toolkit/toolkit/skills/ops-parallel/SKILL.md +177 -0
  35. package/assets/toolkit/toolkit/skills/ops-plan/SKILL.md +169 -0
  36. package/assets/toolkit/toolkit/skills/ops-security-infra/SKILL.md +126 -0
  37. package/assets/toolkit/toolkit/skills/ops-security-infra/references/cicd-security-pipeline.md +55 -0
  38. package/assets/toolkit/toolkit/skills/ops-security-infra/references/security-headers.md +24 -0
  39. package/assets/toolkit/toolkit/skills/ops-verify/SKILL.md +180 -0
  40. package/bin/sdtk-ops.js +14 -0
  41. package/package.json +46 -0
  42. package/src/commands/generate.js +12 -0
  43. package/src/commands/help.js +53 -0
  44. package/src/commands/init.js +86 -0
  45. package/src/commands/runtime.js +201 -0
  46. package/src/index.js +65 -0
  47. package/src/lib/args.js +107 -0
  48. package/src/lib/errors.js +41 -0
  49. package/src/lib/powershell.js +65 -0
  50. package/src/lib/scope.js +58 -0
  51. package/src/lib/toolkit-payload.js +123 -0
@@ -0,0 +1,80 @@
1
+ <!-- Based on agency-agents by AgentLand Contributors (MIT License, 2025). Adapted for SDTK-OPS. -->
2
+
3
+ # Alert Rules
4
+
5
+ ## Prometheus Rules Example
6
+
7
+ ```yaml
8
+ groups:
9
+ - name: application.rules
10
+ rules:
11
+ - alert: HighErrorRate
12
+ expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
13
+ for: 5m
14
+ labels:
15
+ severity: critical
16
+ annotations:
17
+ summary: "High error rate detected"
18
+ description: "Error rate is above threshold for the last 5 minutes"
19
+ runbook: "docs/runbooks/high-error-rate.md"
20
+
21
+ - alert: HighResponseTime
22
+ expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 0.5
23
+ for: 2m
24
+ labels:
25
+ severity: warning
26
+ annotations:
27
+ summary: "High response time detected"
28
+ description: "95th percentile latency is above 500ms"
29
+ runbook: "docs/runbooks/high-latency.md"
30
+
31
+ - name: infrastructure.rules
32
+ rules:
33
+ - alert: HighCPUUsage
34
+ expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
35
+ for: 5m
36
+ labels:
37
+ severity: warning
38
+ annotations:
39
+ summary: "High CPU usage detected"
40
+ description: "CPU usage is above 80% for 5 minutes"
41
+ runbook: "docs/runbooks/high-cpu.md"
42
+
43
+ - alert: HighMemoryUsage
44
+ expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
45
+ for: 5m
46
+ labels:
47
+ severity: critical
48
+ annotations:
49
+ summary: "High memory usage detected"
50
+ description: "Memory usage is above 90%"
51
+ runbook: "docs/runbooks/high-memory.md"
52
+
53
+ - alert: DiskSpaceLow
54
+ expr: 100 - ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes) > 85
55
+ for: 2m
56
+ labels:
57
+ severity: warning
58
+ annotations:
59
+ summary: "Low disk space"
60
+ description: "Disk usage is above 85%"
61
+ runbook: "docs/runbooks/disk-space-low.md"
62
+
63
+ - alert: ServiceDown
64
+ expr: up == 0
65
+ for: 1m
66
+ labels:
67
+ severity: critical
68
+ annotations:
69
+ summary: "Service is down"
70
+ description: "A monitored target has been unavailable for more than 1 minute"
71
+ runbook: "docs/runbooks/service-down.md"
72
+ ```
73
+
74
+ ## Design Notes
75
+
76
+ - alert on user impact and resource exhaustion, not every internal event
77
+ - attach a runbook to every page-worthy alert
78
+ - use warning and critical levels where the system benefits from early warning
79
+ - deduplicate rules that describe the same symptom
80
+
@@ -0,0 +1,83 @@
1
+ <!-- Based on agency-agents by AgentLand Contributors (MIT License, 2025). Adapted for SDTK-OPS. -->
2
+
3
+ # SLO Templates
4
+
5
+ ## SLI Definition Template
6
+
7
+ ```yaml
8
+ service: service-name
9
+ owner: owning-team
10
+ review_cadence: monthly
11
+
12
+ slis:
13
+ availability:
14
+ description: "Successful responses to valid requests"
15
+ metric: |
16
+ sum(rate(http_requests_total{service="service-name", status!~"5.."}[5m]))
17
+ /
18
+ sum(rate(http_requests_total{service="service-name"}[5m]))
19
+ good_event: "HTTP status below 500"
20
+ valid_event: "Any user-visible request excluding health checks"
21
+
22
+ latency:
23
+ description: "Requests served within the required threshold"
24
+ metric: |
25
+ histogram_quantile(0.99,
26
+ sum(rate(http_request_duration_seconds_bucket{service="service-name"}[5m]))
27
+ by (le)
28
+ )
29
+ threshold: "300ms at p99"
30
+
31
+ correctness:
32
+ description: "Requests returning correct business results"
33
+ metric: "1 - (business_logic_errors_total / requests_total)"
34
+ good_event: "No business logic error"
35
+ ```
36
+
37
+ ## SLO Definition Template
38
+
39
+ ```yaml
40
+ slos:
41
+ - sli: availability
42
+ target: 99.95%
43
+ window: 30d
44
+ error_budget: "21.6 minutes per month"
45
+ burn_rate_alerts:
46
+ - severity: critical
47
+ short_window: 5m
48
+ long_window: 1h
49
+ burn_rate: 14.4x
50
+ - severity: warning
51
+ short_window: 30m
52
+ long_window: 6h
53
+ burn_rate: 6x
54
+
55
+ - sli: latency
56
+ target: 99.0%
57
+ window: 30d
58
+ error_budget: "7.2 hours per month"
59
+
60
+ - sli: correctness
61
+ target: 99.99%
62
+ window: 30d
63
+ ```
64
+
65
+ ## Error Budget Policy Template
66
+
67
+ ```yaml
68
+ error_budget_policy:
69
+ budget_remaining_above_50pct: "Normal feature development"
70
+ budget_remaining_25_to_50pct: "Reliability review before risky changes"
71
+ budget_remaining_below_25pct: "Prioritize reliability work until budget recovers"
72
+ budget_exhausted: "Freeze all non-critical deploys and require leadership review"
73
+ ```
74
+
75
+ ## Review Checklist
76
+
77
+ Before approving an SLO definition, confirm:
78
+ - the SLI measures user-visible behavior
79
+ - the target matches business criticality
80
+ - the window is explicit
81
+ - burn-rate alerts exist for fast and slow budget exhaustion
82
+ - the team can explain what action follows each budget state
83
+
@@ -0,0 +1,177 @@
1
+ <!-- Based on superpowers by Jesse Vincent (MIT License, 2025). Adapted for SDTK-OPS. -->
2
+ ---
3
+ name: ops-parallel
4
+ description: Parallel operations dispatch. Use when facing 2+ independent infrastructure tasks that can be worked on without shared state or sequential dependencies.
5
+ ---
6
+
7
+ # Ops Parallel
8
+
9
+ ## Overview
10
+
11
+ You delegate tasks to specialized agents with isolated context. By precisely crafting their instructions and context, you ensure they stay focused and succeed at their task. They should never inherit your session's context or history. You construct exactly what they need. This also preserves your own context for coordination work.
12
+
13
+ When you have multiple unrelated operational problems or task slices, investigating them sequentially wastes time. Each investigation is independent and can happen in parallel.
14
+
15
+ **Core principle:** Dispatch one agent per independent problem domain. Let them work concurrently.
16
+
17
+ ## When to Use
18
+
19
+ ```dot
20
+ digraph when_to_use {
21
+ "Multiple failures?" [shape=diamond];
22
+ "Are they independent?" [shape=diamond];
23
+ "Single agent investigates all" [shape=box];
24
+ "One agent per problem domain" [shape=box];
25
+ "Can they work in parallel?" [shape=diamond];
26
+ "Sequential agents" [shape=box];
27
+ "Parallel dispatch" [shape=box];
28
+
29
+ "Multiple failures?" -> "Are they independent?" [label="yes"];
30
+ "Are they independent?" -> "Single agent investigates all" [label="no - related"];
31
+ "Are they independent?" -> "Can they work in parallel?" [label="yes"];
32
+ "Can they work in parallel?" -> "Parallel dispatch" [label="yes"];
33
+ "Can they work in parallel?" -> "Sequential agents" [label="no - shared state"];
34
+ }
35
+ ```
36
+
37
+ **Use when:**
38
+ - multiple infrastructure failures have different root causes
39
+ - multiple subsystems need work independently
40
+ - each problem can be understood without context from the others
41
+ - no shared state exists between investigations
42
+
43
+ **Do not use when:**
44
+ - failures are related (fixing one might fix others)
45
+ - you need to understand the full system state first
46
+ - agents would interfere with each other
47
+
48
+ ## The Pattern
49
+
50
+ ### 1. Identify Independent Domains
51
+
52
+ Group failures by what is broken:
53
+ - monitoring alert thresholds
54
+ - CI/CD deployment gates
55
+ - backup retention procedures
56
+
57
+ Each domain is independent. Fixing monitoring thresholds should not affect backup retention.
58
+
59
+ ### 2. Create Focused Agent Tasks
60
+
61
+ Each agent gets:
62
+ - **Specific scope:** one subsystem, environment, or operational workflow
63
+ - **Clear goal:** solve exactly one task slice
64
+ - **Constraints:** do not change unrelated code or infrastructure
65
+ - **Expected output:** summary of what was found and what changed
66
+
67
+ ### 3. Dispatch in Parallel
68
+
69
+ ```text
70
+ Task("Tune monitoring alerts for checkout service")
71
+ Task("Fix CI deployment gate for staging rollouts")
72
+ Task("Update backup retention procedure and verification notes")
73
+ ```
74
+
75
+ All three run concurrently.
76
+
77
+ ### 4. Review and Integrate
78
+
79
+ When agents return:
80
+ - read each summary
81
+ - verify fixes do not conflict
82
+ - run the top-level verification for the combined result
83
+ - integrate all accepted changes
84
+
85
+ ## Agent Prompt Structure
86
+
87
+ Good agent prompts are:
88
+ 1. **Focused** - one clear problem domain
89
+ 2. **Self-contained** - all context needed to understand the problem
90
+ 3. **Specific about output** - what the agent should return
91
+
92
+ ```markdown
93
+ Fix the monitoring configuration for checkout alerts:
94
+
95
+ 1. alert fires on every rollout even when service recovers within 30 seconds
96
+ 2. paging threshold should reflect sustained error rate, not one failed probe
97
+ 3. do not change CI/CD or backup files
98
+
99
+ Your task:
100
+
101
+ 1. read the current alert config and related runbook
102
+ 2. identify the root cause of noisy alerts
103
+ 3. fix only the monitoring slice
104
+ 4. return a short summary of root cause and changes
105
+
106
+ Do NOT refactor unrelated infrastructure.
107
+ ```
108
+
109
+ ## Common Mistakes
110
+
111
+ **Bad:** "Fix all infra issues" - agent gets lost
112
+ **Good:** "Fix checkout monitoring alerts" - focused scope
113
+
114
+ **Bad:** "Fix the pipeline" - no context
115
+ **Good:** include failing step names, exact files, and constraints
116
+
117
+ **Bad:** no constraints - agent may refactor everything
118
+ **Good:** "Do NOT touch unrelated systems"
119
+
120
+ **Bad:** vague output - you do not know what changed
121
+ **Good:** "Return summary of root cause and changes"
122
+
123
+ ## When NOT to Use
124
+
125
+ **Related failures:** fixing one might fix others, so investigate together first
126
+ **Need full context:** understanding requires seeing the whole system
127
+ **Exploratory debugging:** you do not know what is broken yet
128
+ **Shared state:** agents would interfere by editing the same files or environments
129
+
130
+ ## Real Example from Session
131
+
132
+ **Scenario:** three independent operations tasks after a reliability review
133
+
134
+ **Tasks:**
135
+ - monitoring setup for checkout alerts needs tuning
136
+ - CI/CD pipeline rollout gate is missing a health checkpoint
137
+ - backup procedure lacks a restore verification note
138
+
139
+ **Decision:** independent domains. Monitoring, CI/CD, and backup procedures can be investigated separately.
140
+
141
+ **Dispatch:**
142
+ ```
143
+ Agent 1 -> Fix monitoring alert configuration
144
+ Agent 2 -> Fix CI/CD rollout gate
145
+ Agent 3 -> Fix backup verification procedure
146
+ ```
147
+
148
+ **Results:**
149
+ - Agent 1: tuned thresholds and reduced noisy paging
150
+ - Agent 2: added a health gate before promotion
151
+ - Agent 3: documented restore verification evidence
152
+
153
+ **Integration:** all fixes were independent, no conflicts, combined verification succeeded
154
+
155
+ **Time saved:** three problem domains advanced in parallel instead of one by one
156
+
157
+ ## Key Benefits
158
+
159
+ 1. **Parallelization** - multiple investigations happen simultaneously
160
+ 2. **Focus** - each agent has narrow scope and less context to track
161
+ 3. **Independence** - agents do not interfere with each other
162
+ 4. **Speed** - three problems solved in the time of one
163
+
164
+ ## Verification
165
+
166
+ After agents return:
167
+ 1. **Review each summary** - understand what changed
168
+ 2. **Check for conflicts** - did agents edit the same files or assumptions?
169
+ 3. **Run top-level verification** - use `ops-verify` on the combined result
170
+ 4. **Spot check** - agents can make systematic errors
171
+
172
+ ## Real-World Impact
173
+
174
+ From debugging sessions:
175
+ - multiple independent failures were investigated concurrently
176
+ - all investigations completed faster than a sequential pass
177
+ - focused prompts reduced confusion and rework
@@ -0,0 +1,169 @@
1
+ <!-- Based on superpowers by Jesse Vincent (MIT License, 2025) and gstack by Garry Tan (MIT License, 2026). Adapted for SDTK-OPS. -->
2
+ ---
3
+ name: ops-plan
4
+ description: Infrastructure and operations planning. Use when planning infrastructure changes, deployment strategies, or operational procedures before execution -- numbered steps, dependency ordering, rollback strategy per step.
5
+ ---
6
+
7
+ # Ops Plan
8
+
9
+ ## Overview
10
+
11
+ Write infrastructure and operations plans assuming the implementer has zero context for the system, environment, or operational history. Document exactly what changes, in what order, how to verify each step, and how to roll it back safely.
12
+
13
+ Keep the plan reviewable, explicit, and small enough to execute without improvisation.
14
+
15
+ ## Scope Check
16
+
17
+ If the request covers multiple independent systems, environments, or operational initiatives, suggest splitting it into separate plans. One plan should produce one coherent, reviewable operational outcome.
18
+
19
+ Challenge scope before planning:
20
+ - What already exists that partially solves this?
21
+ - What is the minimum change that achieves the goal safely?
22
+ - Which work is required now, and which work can be deferred?
23
+
24
+ ## File Structure And Affected Systems
25
+
26
+ Before defining tasks, map out what will be created or modified and what each item owns.
27
+
28
+ Include:
29
+ - manifests, IaC modules, runbooks, scripts, or policy files
30
+ - affected services, environments, regions, or accounts
31
+ - external dependencies such as DNS, secrets, CI/CD, or databases
32
+
33
+ Lock in decomposition here. Each task should have one clear operational responsibility.
34
+
35
+ ## Bite-Sized Task Granularity
36
+
37
+ Each step should be one operational action that can be verified independently.
38
+
39
+ Examples:
40
+ - update one manifest or values file
41
+ - validate one configuration change
42
+ - apply one change to staging
43
+ - verify one health gate
44
+ - record one rollback checkpoint
45
+
46
+ Do not write giant plan steps like "deploy all infrastructure" or "complete migration".
47
+
48
+ ## Plan Document Header
49
+
50
+ Every plan MUST start with this header:
51
+
52
+ ```markdown
53
+ # [Change Name] Operations Plan
54
+
55
+ > For implementers: execute steps in order. Do not mark any step complete until `ops-verify` confirms the expected evidence.
56
+
57
+ **Goal:** [One sentence describing what this change achieves]
58
+
59
+ **Architecture:** [2-3 sentences about the approach]
60
+
61
+ **Affected Systems:** [Services, environments, accounts, regions, pipelines]
62
+
63
+ **Rollback Strategy:** [One sentence summary of rollback posture]
64
+
65
+ **Risk Level:** [LOW | MEDIUM | HIGH]
66
+
67
+ ---
68
+ ```
69
+
70
+ ## Required Sections For Infrastructure Plans
71
+
72
+ Every infrastructure plan must include:
73
+ - **Resource Sizing**
74
+ - CPU, memory, storage estimates
75
+ - auto-scaling boundaries
76
+ - **Networking**
77
+ - DNS changes
78
+ - ingress or load balancer updates
79
+ - security groups, network policies, firewall rules
80
+ - **Security**
81
+ - IAM roles
82
+ - secrets handling
83
+ - network policy or access boundary changes
84
+ - **Rollback Checklist per step**
85
+ - use this exact shape:
86
+
87
+ | Step | Rollback Action | Verification |
88
+ |------|-----------------|--------------|
89
+ | 1 | Revert manifest to previous revision | Health endpoint returns 200 |
90
+
91
+ - **Migration Safety**
92
+ - backward compatibility
93
+ - feature flags
94
+ - dual-write, dual-read, or phased rollout needs
95
+
96
+ ## Assumption Tracking
97
+
98
+ Record assumptions in this exact table format:
99
+
100
+ | # | Assumption | Verified | Risk if wrong |
101
+ |---|------------|----------|---------------|
102
+ | A1 | Example assumption | No | Medium |
103
+
104
+ Do not bury assumptions in prose. If an assumption can break rollout or rollback, it must be tracked explicitly.
105
+
106
+ ## Infrastructure Review Lens
107
+
108
+ Before approving the plan, check:
109
+ - dependency order across systems and environments
110
+ - blast radius if one step fails
111
+ - rollback feasibility after each step
112
+ - health checks, smoke checks, and stability windows
113
+ - happy path, no-op path, failure path, and rollback path
114
+ - observability coverage during and after the change
115
+ - operator-visible evidence for each important milestone
116
+
117
+ ## Task Structure
118
+
119
+ Use this shape for each task:
120
+
121
+ ````markdown
122
+ ### Task N: [Change Slice]
123
+
124
+ **Files / Systems:**
125
+ - Modify: `exact/path/to/file`
126
+ - Environment: `staging|production|shared`
127
+ - Verify: `exact command or evidence`
128
+
129
+ **Rollback Checklist:**
130
+
131
+ | Step | Rollback Action | Verification |
132
+ |------|-----------------|--------------|
133
+ | N | [Exact rollback action] | [Exact evidence] |
134
+
135
+ - [ ] **Step 1: Prepare the change**
136
+ - update the target manifest, script, or runbook
137
+
138
+ - [ ] **Step 2: Validate locally or in dry-run mode**
139
+ - run the exact validation command
140
+
141
+ - [ ] **Step 3: Apply to the target environment**
142
+ - perform one bounded operational change
143
+
144
+ - [ ] **Step 4: Verify expected state**
145
+ - record the exact health or status evidence
146
+
147
+ - [ ] **Step 5: Record rollback checkpoint**
148
+ - confirm the rollback command and previous good state
149
+ ````
150
+
151
+ ## Common Mistakes
152
+
153
+ | Mistake | Why it fails |
154
+ |---------|--------------|
155
+ | "Implement the infrastructure" as one step | No safe checkpoint, no isolated rollback |
156
+ | Missing rollback action per task | Recovery becomes guesswork during incidents |
157
+ | No resource sizing or network notes | Hidden capacity and connectivity risks surface late |
158
+ | Security assumptions left implicit | Secrets, IAM, or access drift breaks rollout |
159
+ | Verification only at the end | You lose the exact step where the system broke |
160
+ | Migration plan ignores backward compatibility | Deploy succeeds but runtime traffic fails |
161
+
162
+ ## Execution Handoff
163
+
164
+ After saving the plan:
165
+ - review assumptions, dependencies, and rollback notes
166
+ - use `ops-parallel` only for truly independent slices
167
+ - invoke `ops-verify` before marking any step complete
168
+
169
+ The plan is not complete until the implementer can execute it without guessing.
@@ -0,0 +1,126 @@
1
+ <!-- Based on agency-agents by AgentLand Contributors (MIT License, 2025). Adapted for SDTK-OPS. -->
2
+ ---
3
+ name: ops-security-infra
4
+ description: Infrastructure security. Use when hardening infrastructure, managing secrets, defining network policies, or setting up security scanning in CI/CD -- covers STRIDE for infrastructure, secrets management, and detection-as-code.
5
+ ---
6
+
7
+ # Ops Security Infra
8
+
9
+ ## Overview
10
+
11
+ Infrastructure security must be designed into identity, network, secrets, logging, and delivery paths at the same time. The goal is not perfect theoretical safety. The goal is to remove obvious exposure, narrow trust boundaries, and make security controls reviewable and repeatable.
12
+
13
+ ## When to Use
14
+
15
+ Use for:
16
+ - infrastructure hardening reviews
17
+ - IAM and access-boundary changes
18
+ - secrets-management design
19
+ - network policy and ingress review
20
+ - CI/CD security scanning design
21
+ - detection rule delivery and coverage tracking
22
+
23
+ ## STRIDE For Infrastructure
24
+
25
+ | Threat | Infrastructure Example | Primary Control |
26
+ |--------|------------------------|-----------------|
27
+ | Spoofing | compromised IAM role or forged workload identity | MFA, short-lived credentials, strong workload identity |
28
+ | Tampering | unauthorized infrastructure-as-code or config change | peer review, protected branches, signed change path |
29
+ | Repudiation | operator denies a risky change | immutable audit logging |
30
+ | Information Disclosure | secrets exposed in config, logs, or state files | secret manager, redaction, encryption |
31
+ | Denial of Service | public endpoints or shared resources overwhelmed | rate limiting, WAF, scaling limits, network controls |
32
+ | Elevation of Privilege | broad admin roles or wildcard policies | least-privilege IAM and isolated admin paths |
33
+
34
+ ## Network Security
35
+
36
+ Use these defaults:
37
+ - security groups and firewall rules default deny
38
+ - no `0.0.0.0/0` access except for intentionally public load balancers or edge endpoints
39
+ - private services stay private by default
40
+ - Kubernetes network policies start from deny-all and then allow only required traffic
41
+ - management interfaces live behind stronger access controls than the workload path
42
+
43
+ ## Secrets Management
44
+
45
+ | Option | When To Use | Security Level |
46
+ |--------|-------------|----------------|
47
+ | Cloud KMS | key management and envelope encryption | high |
48
+ | Secret Manager | application and infrastructure secrets at runtime | high |
49
+ | HashiCorp Vault | complex multi-platform secret workflows and dynamic credentials | high |
50
+ | Sealed Secrets | Kubernetes-native encrypted secret delivery | medium to high |
51
+
52
+ Rules:
53
+ - never hardcode secrets in repo files
54
+ - never log secrets or raw tokens
55
+ - prefer short-lived credentials over long-lived static keys
56
+ - rotate secrets on a fixed schedule and on compromise
57
+
58
+ ## <HARD-GATE>
59
+
60
+ NEVER:
61
+ - store secrets in git
62
+ - log secrets
63
+ - leave credentials unrotated beyond 90 days without a reviewed exception
64
+
65
+ ALWAYS:
66
+ - prefer short-lived credentials such as OIDC or STS where available
67
+ - audit secret access
68
+ - review who can decrypt or retrieve production secrets
69
+
70
+ ## CI CD Security Pipeline
71
+
72
+ The pipeline should scan:
73
+ - static code issues with Semgrep or equivalent
74
+ - dependency and image vulnerabilities with Trivy or equivalent
75
+ - secret exposure with Gitleaks or equivalent
76
+
77
+ Use `./references/cicd-security-pipeline.md` for a GitHub Actions example adapted from the source material.
78
+
79
+ ## Detection As Code
80
+
81
+ Treat security detections as code:
82
+ - keep rules in version control
83
+ - validate them in CI before deployment
84
+ - map each rule to MITRE ATT&CK coverage where that model is relevant
85
+ - record known false positives and data-source dependencies
86
+ - deploy through a controlled pipeline, not ad hoc console edits
87
+
88
+ The goal is awareness and repeatability, not a specific SIEM vendor.
89
+
90
+ ## Security Hardening Checklist
91
+
92
+ Review these items:
93
+ 1. default-deny network posture
94
+ 2. encryption at rest
95
+ 3. encryption in transit
96
+ 4. secrets in a manager, not config files
97
+ 5. least-privilege IAM
98
+ 6. security scanning in CI/CD
99
+ 7. immutable or strongly protected audit logging
100
+ 8. container image scanning
101
+ 9. SSH key rotation or stronger admin access controls
102
+ 10. MFA for privileged infrastructure access
103
+
104
+ ## Common Mistakes
105
+
106
+ | Mistake | Why it fails |
107
+ |---------|--------------|
108
+ | Overly permissive IAM | One compromise turns into full-environment access |
109
+ | Secrets in environment dumps or logs | Detection becomes recovery plus disclosure response |
110
+ | Security treated as a final review step | Core architecture assumptions stay unsafe |
111
+ | No audit logging for privileged actions | Investigation and compliance both fail |
112
+ | Security scans exist but do not block anything | Vulnerable changes keep shipping |
113
+
114
+ ## Reference Files
115
+
116
+ Use:
117
+ - `./references/cicd-security-pipeline.md`
118
+ - `./references/security-headers.md`
119
+
120
+ ## Execution Handoff
121
+
122
+ After defining the security change:
123
+ - send infrastructure boundary changes to `ops-infra-plan`
124
+ - route CI/CD controls through `ops-ci-cd`
125
+ - verify the final control state with `ops-verify`
126
+
@@ -0,0 +1,55 @@
1
+ <!-- Based on agency-agents by AgentLand Contributors (MIT License, 2025). Adapted for SDTK-OPS. -->
2
+
3
+ # CI CD Security Pipeline
4
+
5
+ ```yaml
6
+ name: Security Scan
7
+
8
+ on:
9
+ pull_request:
10
+ branches: [main]
11
+
12
+ jobs:
13
+ sast:
14
+ name: Static Analysis
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - name: Run Semgrep SAST
19
+ uses: semgrep/semgrep-action@v1
20
+ with:
21
+ config: >-
22
+ p/owasp-top-ten
23
+ p/cwe-top-25
24
+
25
+ dependency_scan:
26
+ name: Dependency Audit
27
+ runs-on: ubuntu-latest
28
+ steps:
29
+ - uses: actions/checkout@v4
30
+ - name: Run Trivy vulnerability scanner
31
+ uses: aquasecurity/trivy-action@master
32
+ with:
33
+ scan-type: fs
34
+ severity: CRITICAL,HIGH
35
+ exit-code: "1"
36
+
37
+ secrets_scan:
38
+ name: Secrets Detection
39
+ runs-on: ubuntu-latest
40
+ steps:
41
+ - uses: actions/checkout@v4
42
+ with:
43
+ fetch-depth: 0
44
+ - name: Run Gitleaks
45
+ uses: gitleaks/gitleaks-action@v2
46
+ env:
47
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
48
+ ```
49
+
50
+ ## Pattern Notes
51
+
52
+ - fail the pipeline on critical findings, not just log them
53
+ - keep secrets in the platform secret store
54
+ - expand beyond GitHub Actions only if the team actually uses another runner
55
+