faultray 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faultray-1.2.0.dist-info/METADATA +705 -0
- faultray-1.2.0.dist-info/RECORD +168 -0
- faultray-1.2.0.dist-info/WHEEL +4 -0
- faultray-1.2.0.dist-info/entry_points.txt +4 -0
- faultray-1.2.0.dist-info/licenses/LICENSE +21 -0
- infrasim/__init__.py +79 -0
- infrasim/ai/__init__.py +0 -0
- infrasim/ai/analyzer.py +636 -0
- infrasim/ai/architecture_advisor.py +1205 -0
- infrasim/ai/nl_to_infra.py +1059 -0
- infrasim/api/__init__.py +0 -0
- infrasim/api/auth.py +170 -0
- infrasim/api/billing.py +190 -0
- infrasim/api/database.py +265 -0
- infrasim/api/graphql_api.py +386 -0
- infrasim/api/insurance_api.py +562 -0
- infrasim/api/leaderboard.py +362 -0
- infrasim/api/oauth.py +189 -0
- infrasim/api/server.py +1736 -0
- infrasim/api/static/graph.js +319 -0
- infrasim/api/static/style.css +2125 -0
- infrasim/api/static/sw.js +66 -0
- infrasim/api/teams.py +481 -0
- infrasim/api/templates/advisor.html +422 -0
- infrasim/api/templates/analyze.html +186 -0
- infrasim/api/templates/base.html +261 -0
- infrasim/api/templates/blast_radius.html +1141 -0
- infrasim/api/templates/compliance.html +265 -0
- infrasim/api/templates/components.html +164 -0
- infrasim/api/templates/cost.html +121 -0
- infrasim/api/templates/dashboard.html +279 -0
- infrasim/api/templates/graph.html +89 -0
- infrasim/api/templates/landing.html +787 -0
- infrasim/api/templates/reports.html +132 -0
- infrasim/api/templates/security.html +149 -0
- infrasim/api/templates/settings.html +170 -0
- infrasim/api/templates/simulation.html +310 -0
- infrasim/api/widget.py +120 -0
- infrasim/cache.py +177 -0
- infrasim/ci/__init__.py +1 -0
- infrasim/ci/github_action.py +443 -0
- infrasim/ci/sarif_exporter.py +403 -0
- infrasim/cli/__init__.py +44 -0
- infrasim/cli/admin.py +862 -0
- infrasim/cli/advisor_cmd.py +395 -0
- infrasim/cli/analyze.py +248 -0
- infrasim/cli/auto_fix.py +151 -0
- infrasim/cli/autoscale_cmd.py +355 -0
- infrasim/cli/backtest.py +106 -0
- infrasim/cli/benchmark_cmd.py +281 -0
- infrasim/cli/config_cmd.py +93 -0
- infrasim/cli/daemon_cmd.py +110 -0
- infrasim/cli/diff_cmd.py +118 -0
- infrasim/cli/discovery.py +472 -0
- infrasim/cli/dna_cmd.py +124 -0
- infrasim/cli/drift_cmd.py +341 -0
- infrasim/cli/evaluate.py +1094 -0
- infrasim/cli/feeds.py +228 -0
- infrasim/cli/genome.py +585 -0
- infrasim/cli/history_cmd.py +132 -0
- infrasim/cli/main.py +412 -0
- infrasim/cli/marketplace_cmd.py +464 -0
- infrasim/cli/nl_command.py +210 -0
- infrasim/cli/ops.py +1292 -0
- infrasim/cli/predictive.py +363 -0
- infrasim/cli/quickstart.py +179 -0
- infrasim/cli/replay_cmd.py +567 -0
- infrasim/cli/simulate.py +420 -0
- infrasim/cli/sla_cmd.py +337 -0
- infrasim/cli/supply_chain_cmd.py +122 -0
- infrasim/cli/tf_check.py +422 -0
- infrasim/cli/timeline_cmd.py +376 -0
- infrasim/cli/twin_cmd.py +138 -0
- infrasim/config.py +119 -0
- infrasim/daemon.py +206 -0
- infrasim/differ.py +149 -0
- infrasim/discovery/__init__.py +0 -0
- infrasim/discovery/aws_scanner.py +1167 -0
- infrasim/discovery/azure_scanner.py +769 -0
- infrasim/discovery/flow_analyzer.py +290 -0
- infrasim/discovery/gcp_scanner.py +704 -0
- infrasim/discovery/k8s_scanner.py +568 -0
- infrasim/discovery/metric_calibrator.py +324 -0
- infrasim/discovery/prometheus.py +308 -0
- infrasim/discovery/prometheus_monitor.py +82 -0
- infrasim/discovery/scanner.py +193 -0
- infrasim/discovery/terraform.py +600 -0
- infrasim/features.py +48 -0
- infrasim/feeds/__init__.py +1 -0
- infrasim/feeds/analyzer.py +416 -0
- infrasim/feeds/fetcher.py +152 -0
- infrasim/feeds/sources.py +69 -0
- infrasim/feeds/store.py +139 -0
- infrasim/history.py +379 -0
- infrasim/i18n.py +79 -0
- infrasim/integrations/__init__.py +1 -0
- infrasim/integrations/datadog.py +122 -0
- infrasim/integrations/grafana.py +85 -0
- infrasim/integrations/incident_correlator.py +471 -0
- infrasim/integrations/issue_tracker.py +110 -0
- infrasim/integrations/opsgenie.py +61 -0
- infrasim/integrations/slack_bot.py +376 -0
- infrasim/integrations/terraform_provider.py +264 -0
- infrasim/integrations/webhooks.py +446 -0
- infrasim/licensing.py +76 -0
- infrasim/log_config.py +33 -0
- infrasim/marketplace/__init__.py +19 -0
- infrasim/marketplace/builtin_packages.py +1341 -0
- infrasim/marketplace/catalog.py +516 -0
- infrasim/marketplace.py +266 -0
- infrasim/model/__init__.py +0 -0
- infrasim/model/components.py +310 -0
- infrasim/model/demo.py +123 -0
- infrasim/model/dna.py +229 -0
- infrasim/model/graph.py +380 -0
- infrasim/model/loader.py +282 -0
- infrasim/plugins/__init__.py +5 -0
- infrasim/plugins/registry.py +129 -0
- infrasim/remediation/__init__.py +0 -0
- infrasim/remediation/auto_pipeline.py +344 -0
- infrasim/remediation/iac_exporter.py +1708 -0
- infrasim/remediation/iac_generator.py +863 -0
- infrasim/reporter/__init__.py +0 -0
- infrasim/reporter/compliance.py +438 -0
- infrasim/reporter/evidence_generator.py +513 -0
- infrasim/reporter/executive_report.py +625 -0
- infrasim/reporter/export.py +368 -0
- infrasim/reporter/html_report.py +308 -0
- infrasim/reporter/pdf_report.py +215 -0
- infrasim/reporter/report.py +144 -0
- infrasim/reporter/templates/report.html +509 -0
- infrasim/scoring.py +415 -0
- infrasim/simulator/__init__.py +0 -0
- infrasim/simulator/advisor_engine.py +407 -0
- infrasim/simulator/autoscaling_engine.py +339 -0
- infrasim/simulator/availability_model.py +485 -0
- infrasim/simulator/backtest_engine.py +139 -0
- infrasim/simulator/bayesian_model.py +236 -0
- infrasim/simulator/benchmarking.py +652 -0
- infrasim/simulator/capacity_engine.py +668 -0
- infrasim/simulator/carbon_engine.py +274 -0
- infrasim/simulator/cascade.py +701 -0
- infrasim/simulator/chaos_calendar.py +362 -0
- infrasim/simulator/chaos_genome.py +1286 -0
- infrasim/simulator/compliance_engine.py +703 -0
- infrasim/simulator/cost_engine.py +340 -0
- infrasim/simulator/digital_twin.py +286 -0
- infrasim/simulator/dr_engine.py +348 -0
- infrasim/simulator/drift_detector.py +982 -0
- infrasim/simulator/dynamic_engine.py +1091 -0
- infrasim/simulator/engine.py +296 -0
- infrasim/simulator/financial_risk.py +345 -0
- infrasim/simulator/gameday_engine.py +387 -0
- infrasim/simulator/incident_db.py +982 -0
- infrasim/simulator/incident_replay.py +812 -0
- infrasim/simulator/markov_model.py +278 -0
- infrasim/simulator/monte_carlo.py +276 -0
- infrasim/simulator/ops_engine.py +1914 -0
- infrasim/simulator/planner.py +670 -0
- infrasim/simulator/predictive_engine.py +347 -0
- infrasim/simulator/resilience_timeline.py +612 -0
- infrasim/simulator/scenarios.py +932 -0
- infrasim/simulator/security_engine.py +525 -0
- infrasim/simulator/sla_validator.py +788 -0
- infrasim/simulator/supply_chain_engine.py +371 -0
- infrasim/simulator/traffic.py +460 -0
- infrasim/simulator/whatif_engine.py +858 -0
- infrasim/telemetry.py +61 -0
|
@@ -0,0 +1,705 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: faultray
|
|
3
|
+
Version: 1.2.0
|
|
4
|
+
Summary: FaultRay — Zero-risk infrastructure chaos engineering. Prove your availability ceiling mathematically.
|
|
5
|
+
Project-URL: Homepage, https://github.com/mattyopon/infrasim
|
|
6
|
+
Project-URL: Documentation, https://github.com/mattyopon/infrasim#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/mattyopon/infrasim
|
|
8
|
+
Project-URL: Issues, https://github.com/mattyopon/infrasim/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/mattyopon/infrasim#changelog
|
|
10
|
+
Author-email: Yutaro Maeda <mattyopon@gmail.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: chaos-engineering,devops,infrastructure,reliability,simulation,slo,sre
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Intended Audience :: System Administrators
|
|
17
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Software Development :: Testing
|
|
22
|
+
Classifier: Topic :: System :: Networking
|
|
23
|
+
Classifier: Topic :: System :: Systems Administration
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.11
|
|
26
|
+
Requires-Dist: aiosqlite>=0.20
|
|
27
|
+
Requires-Dist: fastapi>=0.115
|
|
28
|
+
Requires-Dist: httpx>=0.28
|
|
29
|
+
Requires-Dist: jinja2>=3.1
|
|
30
|
+
Requires-Dist: networkx>=3.0
|
|
31
|
+
Requires-Dist: psutil>=6.0
|
|
32
|
+
Requires-Dist: pydantic>=2.0
|
|
33
|
+
Requires-Dist: pyyaml>=6.0
|
|
34
|
+
Requires-Dist: rich>=13.0
|
|
35
|
+
Requires-Dist: sqlalchemy>=2.0
|
|
36
|
+
Requires-Dist: typer>=0.15
|
|
37
|
+
Requires-Dist: uvicorn>=0.34
|
|
38
|
+
Provides-Extra: all-clouds
|
|
39
|
+
Requires-Dist: azure-identity>=1.15; extra == 'all-clouds'
|
|
40
|
+
Requires-Dist: boto3>=1.28; extra == 'all-clouds'
|
|
41
|
+
Requires-Dist: google-cloud-compute>=1.0; extra == 'all-clouds'
|
|
42
|
+
Requires-Dist: kubernetes>=28.0; extra == 'all-clouds'
|
|
43
|
+
Provides-Extra: aws
|
|
44
|
+
Requires-Dist: boto3>=1.28; extra == 'aws'
|
|
45
|
+
Provides-Extra: azure
|
|
46
|
+
Requires-Dist: azure-identity>=1.15; extra == 'azure'
|
|
47
|
+
Requires-Dist: azure-mgmt-compute>=30.0; extra == 'azure'
|
|
48
|
+
Requires-Dist: azure-mgmt-redis>=14.0; extra == 'azure'
|
|
49
|
+
Requires-Dist: azure-mgmt-sql>=3.0; extra == 'azure'
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
52
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: ruff>=0.8; extra == 'dev'
|
|
54
|
+
Provides-Extra: gcp
|
|
55
|
+
Requires-Dist: google-cloud-compute>=1.0; extra == 'gcp'
|
|
56
|
+
Requires-Dist: google-cloud-redis>=2.0; extra == 'gcp'
|
|
57
|
+
Requires-Dist: google-cloud-sql-admin>=1.0; extra == 'gcp'
|
|
58
|
+
Requires-Dist: google-cloud-storage>=2.0; extra == 'gcp'
|
|
59
|
+
Provides-Extra: k8s
|
|
60
|
+
Requires-Dist: kubernetes>=28.0; extra == 'k8s'
|
|
61
|
+
Description-Content-Type: text/markdown
|
|
62
|
+
|
|
63
|
+
# FaultRay — Zero-Risk Infrastructure Chaos Simulation
|
|
64
|
+
|
|
65
|
+
> **Simulate infrastructure failures without touching production.**
|
|
66
|
+
> **Prove your system's availability ceiling mathematically.**
|
|
67
|
+
|
|
68
|
+
[Python 3.11+](https://www.python.org/downloads/)
|
|
69
|
+
[License: MIT](LICENSE)
|
|
70
|
+
[]()
|
|
71
|
+
[]()
|
|
72
|
+
[Docker](Dockerfile)
|
|
73
|
+
[]()
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Why FaultRay?
|
|
78
|
+
|
|
79
|
+
Most chaos engineering tools inject real faults into real infrastructure. FaultRay takes a fundamentally different approach: **pure mathematical simulation** that models your entire dependency graph in memory, runs 150+ failure scenarios, and proves your system's theoretical availability ceiling — all without touching a single server.
|
|
80
|
+
|
|
81
|
+
| | **Gremlin** | **Steadybit** | **AWS FIS** | **FaultRay** |
|
|
82
|
+
|---|---|---|---|---|
|
|
83
|
+
| **Approach** | Fault injection | Fault injection | Fault injection | Mathematical simulation |
|
|
84
|
+
| **Risk to production** | Medium-High | Medium | Medium | **Zero** |
|
|
85
|
+
| **Setup required** | Agent per host | Agent per host | AWS-only | **Single pip install** |
|
|
86
|
+
| **Scenario count** | Manual config | Manual config | AWS services only | **150+ auto-generated** |
|
|
87
|
+
| **Availability proof** | No | No | No | **3-Layer Limit Model** |
|
|
88
|
+
| **Cost** | $$$$ | $$$ | $$ (AWS-only) | **Free / OSS** |
|
|
89
|
+
| **Dependency graph** | No | Limited | No | **Full NetworkX graph** |
|
|
90
|
+
| **Terraform integration** | No | No | Native | **tfstate + plan analysis** |
|
|
91
|
+
| **Security feed** | No | No | No | **Auto CVE scenarios** |
|
|
92
|
+
|
|
93
|
+
**Key differentiators:**
|
|
94
|
+
|
|
95
|
+
- **Zero risk** — Runs entirely in memory. No agents, no sidecars, no production impact.
|
|
96
|
+
- **5 simulation engines** — Cascade, Dynamic, Ops, What-If, and Capacity engines working together.
|
|
97
|
+
- **3-Layer Availability Limit Model** — The only tool that mathematically proves your system's availability ceiling (see below).
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Quick Start
|
|
102
|
+
|
|
103
|
+
### pip
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Install from a source checkout (editable mode)
|
|
107
|
+
pip install -e .
|
|
108
|
+
|
|
109
|
+
# Run demo (6-component web stack simulation)
|
|
110
|
+
faultray demo
|
|
111
|
+
|
|
112
|
+
# With web dashboard
|
|
113
|
+
faultray demo --web
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Docker
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Web dashboard (http://localhost:8000)
|
|
120
|
+
docker compose up web
|
|
121
|
+
|
|
122
|
+
# Demo mode with dashboard
|
|
123
|
+
docker compose --profile demo up demo
|
|
124
|
+
|
|
125
|
+
# CLI mode
|
|
126
|
+
docker compose --profile cli run cli simulate
|
|
127
|
+
|
|
128
|
+
# Build from source
|
|
129
|
+
docker build -t faultray .
|
|
130
|
+
docker run -p 8000:8000 faultray
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Demo Output
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
╭────────── FaultRay Chaos Simulation Report ──────────╮
|
|
137
|
+
│ Resilience Score: 36/100 │
|
|
138
|
+
│ Scenarios tested: 150 │
|
|
139
|
+
│ Critical: 7 Warning: 66 Passed: 77 │
|
|
140
|
+
╰──────────────────────────────────────────────────────╯
|
|
141
|
+
|
|
142
|
+
CRITICAL FINDINGS
|
|
143
|
+
|
|
144
|
+
10.0/10 CRITICAL Traffic spike (10x)
|
|
145
|
+
Cascade path:
|
|
146
|
+
├── DOWN nginx (LB)
|
|
147
|
+
├── DOWN api-server-1
|
|
148
|
+
├── DOWN api-server-2
|
|
149
|
+
├── DOWN PostgreSQL (primary)
|
|
150
|
+
├── DOWN Redis (cache)
|
|
151
|
+
└── DOWN RabbitMQ
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Features
|
|
157
|
+
|
|
158
|
+
| | Feature | Description |
|
|
159
|
+
|---|---|---|
|
|
160
|
+
| :shield: | **Zero Risk Simulation** | Runs entirely in memory — no agents, no sidecars, no production impact |
|
|
161
|
+
| :chart_with_upwards_trend: | **150+ Chaos Scenarios** | 30 categories of failure scenarios auto-generated from your topology |
|
|
162
|
+
| :link: | **Dependency Graph Analysis** | NetworkX-powered graph modeling with cascade fault prediction |
|
|
163
|
+
| :triangular_ruler: | **3-Layer Availability Proof** | Mathematically proves your system's theoretical availability ceiling |
|
|
164
|
+
| :dart: | **SLO/SLI Tracking** | Availability, latency, and error rate tracking against SLO targets |
|
|
165
|
+
| :crystal_ball: | **What-If Analysis** | Parameter sweep for fault tolerance sensitivity analysis |
|
|
166
|
+
| :bar_chart: | **Capacity Planning** | Growth forecasting with SLO compliance evaluation |
|
|
167
|
+
| :ocean: | **10 Traffic Models** | DDoS, diurnal, flash crowd, growth trend, and more |
|
|
168
|
+
| :clock1: | **Ops Simulation** | Long-running (days/weeks) operational simulation with SLO tracking |
|
|
169
|
+
| :zap: | **Dynamic Simulation** | Time-stepped simulation with traffic pattern integration |
|
|
170
|
+
| :newspaper: | **Security Feed** | Auto-generates scenarios from CISA, NVD, Krebs, BleepingComputer |
|
|
171
|
+
| :globe_with_meridians: | **Terraform Integration** | Import from tfstate/plan with change impact analysis |
|
|
172
|
+
| :desktop_computer: | **Web Dashboard** | D3.js interactive graph + Grafana-style dashboard |
|
|
173
|
+
| :mag: | **Multiple Discovery** | Local scan, Prometheus, Terraform, YAML |
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## 3-Layer Availability Limit Model
|
|
178
|
+
|
|
179
|
+
**This is FaultRay's unique contribution to chaos engineering.**
|
|
180
|
+
|
|
181
|
+
Traditional chaos tools answer "what breaks?" FaultRay answers **"what is the maximum availability your architecture can physically achieve?"** using a three-layer mathematical model.
|
|
182
|
+
|
|
183
|
+
```
|
|
184
|
+
┌─────────────────────────────────────────┐
|
|
185
|
+
│ │
|
|
186
|
+
Layer 3 ──────── │ Theoretical Limit 6.65 nines │ ── Upper bound
|
|
187
|
+
│ (perfect redundancy + perfect failover)│ (unreachable)
|
|
188
|
+
│ │
|
|
189
|
+
Layer 2 ──────── │ Hardware Limit 5.91 nines │ ── Physical ceiling
|
|
190
|
+
│ (component MTBF × redundancy) │ (hard constraint)
|
|
191
|
+
│ │
|
|
192
|
+
Layer 1 ──────── │ Software Limit 4.00 nines │ ── Practical ceiling
|
|
193
|
+
│ (deployment + config + human error) │ (your real target)
|
|
194
|
+
│ │
|
|
195
|
+
└─────────────────────────────────────────┘
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Layer 1: Software Availability Limit (practical ceiling)
|
|
199
|
+
|
|
200
|
+
Accounts for deployment failures, configuration drift, human error, and software bugs. Most organizations cannot exceed **4.00 nines (99.99%)** at this layer without extreme operational maturity.
|
|
201
|
+
|
|
202
|
+
### Layer 2: Hardware Availability Limit (physical ceiling)
|
|
203
|
+
|
|
204
|
+
Calculated from component MTBF (Mean Time Between Failures), redundancy factor, and failover time. Even with perfect software, hardware constraints cap availability at approximately **5.91 nines (≈99.99988%)**.
|
|
205
|
+
|
|
206
|
+
### Layer 3: Theoretical Availability Limit (mathematical upper bound)
|
|
207
|
+
|
|
208
|
+
Assumes perfect redundancy, instant failover, and zero software errors. This is the mathematical ceiling your architecture can never exceed: **6.65 nines (≈99.999978%)**.
|
|
209
|
+
|
|
210
|
+
**Why this matters:** If your SLO target is 99.99% but your Layer 1 limit is 99.95%, no amount of operational tuning will close the gap — only architectural changes can. FaultRay tells you this **before** you waste months trying.
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## 5 Simulation Engines
|
|
215
|
+
|
|
216
|
+
### 1. Cascade Engine
|
|
217
|
+
Models fault propagation through dependency graphs. Identifies single points of failure, compound failures, and cascade paths.
|
|
218
|
+
```bash
|
|
219
|
+
faultray load infra.yaml
|
|
220
|
+
faultray simulate --html report.html
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### 2. Dynamic Engine
|
|
224
|
+
Time-stepped simulation with traffic pattern integration. Models real-world load variations over hours or days.
|
|
225
|
+
```bash
|
|
226
|
+
faultray dynamic infra.yaml --traffic diurnal --duration 24h --step 1min
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
### 3. Ops Engine
|
|
230
|
+
Long-running operational simulation (days to weeks) with SLO tracking, incident generation, and deployment events.
|
|
231
|
+
```bash
|
|
232
|
+
faultray ops-sim infra.yaml --days 7 --step 5min
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
### 4. What-If Engine
|
|
236
|
+
Parameter sweep analysis to understand fault tolerance sensitivity across multiple dimensions.
|
|
237
|
+
```bash
|
|
238
|
+
faultray whatif infra.yaml --parameter mttr_factor --values "0.5,1.0,2.0,4.0"
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### 5. Capacity Engine
|
|
242
|
+
Growth forecasting with resource exhaustion prediction and SLO compliance evaluation.
|
|
243
|
+
```bash
|
|
244
|
+
faultray capacity infra.yaml --growth 0.15 --slo 99.9
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
---
|
|
248
|
+
|
|
249
|
+
## Usage
|
|
250
|
+
|
|
251
|
+
### From YAML Definition
|
|
252
|
+
|
|
253
|
+
```yaml
|
|
254
|
+
# infra.yaml
|
|
255
|
+
components:
|
|
256
|
+
- id: nginx
|
|
257
|
+
type: load_balancer
|
|
258
|
+
port: 443
|
|
259
|
+
replicas: 2
|
|
260
|
+
metrics: { cpu_percent: 25, memory_percent: 30 }
|
|
261
|
+
capacity: { max_connections: 10000 }
|
|
262
|
+
|
|
263
|
+
- id: api
|
|
264
|
+
type: app_server
|
|
265
|
+
port: 8080
|
|
266
|
+
metrics: { cpu_percent: 65, memory_percent: 70 }
|
|
267
|
+
capacity: { max_connections: 500, connection_pool_size: 100 }
|
|
268
|
+
|
|
269
|
+
- id: postgres
|
|
270
|
+
type: database
|
|
271
|
+
port: 5432
|
|
272
|
+
metrics: { cpu_percent: 45, memory_percent: 80, disk_percent: 72 }
|
|
273
|
+
capacity: { max_connections: 100 }
|
|
274
|
+
|
|
275
|
+
dependencies:
|
|
276
|
+
- source: nginx
|
|
277
|
+
target: api
|
|
278
|
+
type: requires
|
|
279
|
+
- source: api
|
|
280
|
+
target: postgres
|
|
281
|
+
type: requires
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
faultray load infra.yaml
|
|
286
|
+
faultray simulate --html report.html
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
### From Terraform
|
|
290
|
+
|
|
291
|
+
```bash
|
|
292
|
+
# Import from state file
|
|
293
|
+
faultray tf-import --state terraform.tfstate
|
|
294
|
+
|
|
295
|
+
# Import from a live Terraform working directory
|
|
296
|
+
faultray tf-import --dir ./terraform
|
|
297
|
+
|
|
298
|
+
# Analyze plan impact
|
|
299
|
+
terraform plan -out=plan.out
|
|
300
|
+
faultray tf-plan plan.out --html plan-report.html
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### From Prometheus
|
|
304
|
+
|
|
305
|
+
```bash
|
|
306
|
+
faultray scan --prometheus-url http://prometheus:9090
|
|
307
|
+
faultray simulate
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
### Security News Feed
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
# Fetch latest security news and generate scenarios
|
|
314
|
+
faultray feed-update
|
|
315
|
+
|
|
316
|
+
# View generated scenarios
|
|
317
|
+
faultray feed-list
|
|
318
|
+
|
|
319
|
+
# Simulate with feed scenarios included automatically
|
|
320
|
+
faultray simulate
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### Web Dashboard
|
|
324
|
+
|
|
325
|
+
```bash
|
|
326
|
+
faultray serve --port 8080
|
|
327
|
+
# Open http://localhost:8080
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
### Operational Simulation
|
|
331
|
+
|
|
332
|
+
Simulate long-running operations and track SLO compliance and incident patterns over time.
|
|
333
|
+
|
|
334
|
+
```bash
|
|
335
|
+
# Run 7-day operational simulation with 5-minute time steps
|
|
336
|
+
faultray ops-sim infra.yaml --days 7 --step 5min
|
|
337
|
+
|
|
338
|
+
# Run with default parameters
|
|
339
|
+
faultray ops-sim --defaults
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
### What-If Analysis
|
|
343
|
+
|
|
344
|
+
Sweep parameters to analyze fault tolerance sensitivity across multiple dimensions.
|
|
345
|
+
|
|
346
|
+
```bash
|
|
347
|
+
# Run with default parameter sweep
|
|
348
|
+
faultray whatif infra.yaml --defaults
|
|
349
|
+
|
|
350
|
+
# Sweep a specific parameter
|
|
351
|
+
faultray whatif infra.yaml --parameter mttr_factor --values "0.5,1.0,2.0,4.0"
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
### Capacity Planning
|
|
355
|
+
|
|
356
|
+
Forecast resource exhaustion and evaluate SLO compliance under growth projections.
|
|
357
|
+
|
|
358
|
+
```bash
|
|
359
|
+
# Capacity planning with 15% annual growth targeting 99.9% SLO
|
|
360
|
+
faultray capacity infra.yaml --growth 0.15 --slo 99.9
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
### Traffic Patterns
|
|
364
|
+
|
|
365
|
+
10 traffic models available for dynamic simulation:
|
|
366
|
+
|
|
367
|
+
| Pattern | Description |
|
|
368
|
+
|---------|-------------|
|
|
369
|
+
| `CONSTANT` | Steady-state constant traffic |
|
|
370
|
+
| `RAMP` | Linear traffic increase |
|
|
371
|
+
| `SPIKE` | Instantaneous traffic spike |
|
|
372
|
+
| `WAVE` | Sinusoidal wave pattern |
|
|
373
|
+
| `DDoS_VOLUMETRIC` | High-volume DDoS attack |
|
|
374
|
+
| `DDoS_SLOWLORIS` | Slowloris-style DDoS attack |
|
|
375
|
+
| `FLASH_CROWD` | Sudden viral popularity surge |
|
|
376
|
+
| `DIURNAL` | Daily cycle (high daytime, low nighttime) |
|
|
377
|
+
| `DIURNAL_WEEKLY` | Weekly cycle (high weekdays, low weekends) |
|
|
378
|
+
| `GROWTH_TREND` | Long-term organic growth trend |
|
|
379
|
+
|
|
380
|
+
```bash
|
|
381
|
+
# Dynamic simulation with traffic pattern
|
|
382
|
+
faultray dynamic infra.yaml --traffic diurnal --duration 24h --step 1min
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
---
|
|
386
|
+
|
|
387
|
+
## Chaos Scenarios (30 Categories)
|
|
388
|
+
|
|
389
|
+
| Category | Examples |
|
|
390
|
+
|----------|---------|
|
|
391
|
+
| **Single Failures** | Component down, CPU saturation, OOM, disk full, network partition |
|
|
392
|
+
| **Traffic** | 1.5x, 2x, 3x, 5x, 10x (DDoS-level) traffic spikes |
|
|
393
|
+
| **Compound** | All pairwise (C(n,2)) and triple (C(n,3)) simultaneous failures |
|
|
394
|
+
| **DB-Specific** | Log explosion, replication lag, connection storm, lock contention |
|
|
395
|
+
| **Cache-Specific** | Stampede, eviction storm, split brain |
|
|
396
|
+
| **Queue-Specific** | Backpressure, poison message |
|
|
397
|
+
| **LB-Specific** | Health check failure, TLS expiry, config reload failure |
|
|
398
|
+
| **App-Specific** | Memory leak, thread exhaustion, GC pause, bad deployment |
|
|
399
|
+
| **Infrastructure** | Zone failure, cascading timeouts, total meltdown, rolling restart |
|
|
400
|
+
| **Real-World** | Black Friday (10x + cache pressure), noisy neighbor, slow DB at peak |
|
|
401
|
+
| **Security Feed** | Auto-generated from CISA, NVD, Krebs, BleepingComputer, etc. |
|
|
402
|
+
|
|
403
|
+
---
|
|
404
|
+
|
|
405
|
+
## Risk Scoring
|
|
406
|
+
|
|
407
|
+
```
|
|
408
|
+
severity = (impact x spread) x likelihood
|
|
409
|
+
|
|
410
|
+
impact = weighted health status (DOWN=1.0, OVERLOADED=0.5, DEGRADED=0.25)
|
|
411
|
+
spread = affected_components / total_components
|
|
412
|
+
likelihood = proximity to failure threshold (0.2 = unlikely, 1.0 = imminent)
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
| Level | Score | Meaning |
|
|
416
|
+
|-------|-------|---------|
|
|
417
|
+
| CRITICAL | 7.0-10.0 | Cascading failure, major outage risk |
|
|
418
|
+
| WARNING | 4.0-6.9 | Degradation, limited cascade |
|
|
419
|
+
| PASSED | 0.0-3.9 | Low risk, contained impact |
|
|
420
|
+
|
|
421
|
+
---
|
|
422
|
+
|
|
423
|
+
## Architecture
|
|
424
|
+
|
|
425
|
+
```
|
|
426
|
+
Discovery Layer Model Layer Simulator Layer
|
|
427
|
+
┌─────────────┐ ┌─────────────────┐ ┌──────────────────┐
|
|
428
|
+
│ Local Scan │ │ InfraGraph │ │ 30-cat Scenarios │
|
|
429
|
+
│ Prometheus │───>│ Components │───>│ Cascade Engine │
|
|
430
|
+
│ Terraform │ │ Dependencies │ │ Dynamic Engine │
|
|
431
|
+
│ YAML Loader │ │ NetworkX Graph │ │ Ops Engine │
|
|
432
|
+
└─────────────┘ └─────────────────┘ │ What-If Engine │
|
|
433
|
+
│ Capacity Engine │
|
|
434
|
+
│ Traffic Models │
|
|
435
|
+
│ Feed Scenarios │
|
|
436
|
+
│ Risk Scoring │
|
|
437
|
+
│ 3-Layer Limits │
|
|
438
|
+
└──────────────────┘
|
|
439
|
+
│
|
|
440
|
+
┌─────────────────┐ ┌──────────────────┐
|
|
441
|
+
│ Web Dashboard │<───│ CLI Reporter │
|
|
442
|
+
│ FastAPI + D3.js │ │ HTML Reporter │
|
|
443
|
+
│ Docker Ready │ │ JSON Export │
|
|
444
|
+
└─────────────────┘ └──────────────────┘
|
|
445
|
+
```
|
|
446
|
+
|
|
447
|
+
---
|
|
448
|
+
|
|
449
|
+
## CLI Commands
|
|
450
|
+
|
|
451
|
+
| Command | Description |
|
|
452
|
+
|---------|-------------|
|
|
453
|
+
| `faultray scan` | Discover local system or Prometheus infrastructure |
|
|
454
|
+
| `faultray simulate` | Run chaos simulation (150+ scenarios) |
|
|
455
|
+
| `faultray dynamic` | Run dynamic time-stepped simulation with traffic patterns |
|
|
456
|
+
| `faultray ops-sim` | Long-running operational simulation with SLO tracking |
|
|
457
|
+
| `faultray show` | Display infrastructure model summary |
|
|
458
|
+
| `faultray load <yaml>` | Load infrastructure from YAML |
|
|
459
|
+
| `faultray tf-import` | Import from Terraform state |
|
|
460
|
+
| `faultray tf-plan <plan>` | Analyze Terraform plan impact |
|
|
461
|
+
| `faultray report` | Generate HTML report |
|
|
462
|
+
| `faultray serve` | Launch web dashboard |
|
|
463
|
+
| `faultray demo` | Run demo with sample infrastructure |
|
|
464
|
+
| `faultray feed-update` | Update scenarios from security news |
|
|
465
|
+
| `faultray feed-list` | Show stored feed scenarios |
|
|
466
|
+
| `faultray feed-sources` | Show configured news sources |
|
|
467
|
+
| `faultray feed-clear` | Clear feed scenario store |
|
|
468
|
+
| `faultray whatif` | Run what-if analysis (parameter sweep) |
|
|
469
|
+
| `faultray capacity` | Capacity planning with growth forecasting |
|
|
470
|
+
|
|
471
|
+
---
|
|
472
|
+
|
|
473
|
+
## Docker
|
|
474
|
+
|
|
475
|
+
### Docker Compose Services
|
|
476
|
+
|
|
477
|
+
| Service | Description | Command |
|
|
478
|
+
|---------|-------------|---------|
|
|
479
|
+
| `web` | Web dashboard on port 8000 | `docker compose up web` |
|
|
480
|
+
| `demo` | Demo mode with sample infrastructure | `docker compose --profile demo up demo` |
|
|
481
|
+
| `cli` | CLI mode for running simulations | `docker compose --profile cli run cli <command>` |
|
|
482
|
+
|
|
483
|
+
### Docker Build
|
|
484
|
+
|
|
485
|
+
```bash
|
|
486
|
+
# Build
|
|
487
|
+
docker build -t faultray .
|
|
488
|
+
|
|
489
|
+
# Run web dashboard
|
|
490
|
+
docker run -p 8000:8000 faultray
|
|
491
|
+
|
|
492
|
+
# Run CLI command
|
|
493
|
+
docker run --rm faultray faultray simulate
|
|
494
|
+
|
|
495
|
+
# Mount custom infrastructure definition
|
|
496
|
+
docker run --rm -v $(pwd)/infra.yaml:/app/infra.yaml faultray faultray load /app/infra.yaml
|
|
497
|
+
```
|
|
498
|
+
|
|
499
|
+
### Docker Compose Examples
|
|
500
|
+
|
|
501
|
+
```bash
|
|
502
|
+
# Start web dashboard
|
|
503
|
+
docker compose up web
|
|
504
|
+
|
|
505
|
+
# Run a simulation via CLI
|
|
506
|
+
docker compose --profile cli run cli load examples/demo-infra.yaml
|
|
507
|
+
|
|
508
|
+
# Run with Terraform state mounted
|
|
509
|
+
docker compose --profile cli run -v $(pwd)/terraform.tfstate:/app/terraform.tfstate \
|
|
510
|
+
cli tf-import --state /app/terraform.tfstate
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
---
|
|
514
|
+
|
|
515
|
+
## Development
|
|
516
|
+
|
|
517
|
+
```bash
|
|
518
|
+
# Install in development mode
|
|
519
|
+
pip install -e ".[dev]"
|
|
520
|
+
|
|
521
|
+
# Run tests (89 tests, < 1 second)
|
|
522
|
+
pytest tests/ -v
|
|
523
|
+
|
|
524
|
+
# Lint
|
|
525
|
+
ruff check src/ tests/
|
|
526
|
+
|
|
527
|
+
# Build Docker image
|
|
528
|
+
docker build -t faultray:dev .
|
|
529
|
+
```
|
|
530
|
+
|
|
531
|
+
### Test Coverage
|
|
532
|
+
|
|
533
|
+
| Module | Tests | Coverage |
|
|
534
|
+
|--------|-------|----------|
|
|
535
|
+
| Cascade Engine | 14 | Fault propagation, severity scoring, compound failures |
|
|
536
|
+
| Dynamic Engine | 14 | CLI output, severity classification, boundary values |
|
|
537
|
+
| Ops Engine | 9 | SLO tracking, traffic patterns, deployments |
|
|
538
|
+
| Capacity Engine | 8 | Forecasting, right-sizing, SLO targets |
|
|
539
|
+
| Scenarios | 4 | Rolling restart edge cases, scenario generation |
|
|
540
|
+
| Traffic | 11 | All 10 traffic patterns + determinism |
|
|
541
|
+
| Feeds | 11 | Analysis, scoring, store operations |
|
|
542
|
+
| Loader | 10 | YAML parsing, validation, circular dependency detection |
|
|
543
|
+
| Graph | 2 | Cascade paths, critical path limits |
|
|
544
|
+
| **Total** | **89** | **All passing** |
|
|
545
|
+
|
|
546
|
+
### Requirements
|
|
547
|
+
|
|
548
|
+
- Python 3.11+
|
|
549
|
+
- Dependencies: typer, rich, pydantic, networkx, psutil, fastapi, uvicorn, jinja2, httpx, pyyaml, sqlalchemy, aiosqlite
|
|
550
|
+
|
|
551
|
+
---
|
|
552
|
+
|
|
553
|
+
## Changelog
|
|
554
|
+
|
|
555
|
+
### v5.14 (2026-03-14)
|
|
556
|
+
- 3-Layer Availability Limit Model: mathematical proof of system availability ceiling
|
|
557
|
+
- Layer 1 (Software 4.00 nines), Layer 2 (Hardware 5.91 nines), Layer 3 (Theoretical 6.65 nines)
|
|
558
|
+
- README overhauled to commercial/OSS quality with bilingual EN/JP support
|
|
559
|
+
|
|
560
|
+
### v5.13 (2026-03-14)
|
|
561
|
+
- Docker Compose multi-service configuration (web, demo, cli profiles)
|
|
562
|
+
- Volume mounts for persistent feed data and report output
|
|
563
|
+
|
|
564
|
+
### v5.12 (2026-03-14)
|
|
565
|
+
- Dockerfile with Python 3.11-slim base
|
|
566
|
+
- Container-ready web dashboard deployment
|
|
567
|
+
|
|
568
|
+
### v5.11 (2026-03-14)
|
|
569
|
+
- Competitive positioning against Gremlin, Steadybit, AWS FIS
|
|
570
|
+
- Feature matrix documentation
|
|
571
|
+
|
|
572
|
+
### v5.10 (2026-03-14)
|
|
573
|
+
- Architecture diagram updated with all 5 engines and 3-Layer Limits
|
|
574
|
+
- JSON export support for simulation results
|
|
575
|
+
|
|
576
|
+
### v5.9 (2026-03-14)
|
|
577
|
+
- Traffic model descriptions translated to English
|
|
578
|
+
- Bilingual documentation structure (EN/JP)
|
|
579
|
+
|
|
580
|
+
### v5.8 (2026-03-14)
|
|
581
|
+
- Fix: Dynamic Engine label in the architecture diagram (was a duplicate "Ops Engine" label)
|
|
582
|
+
- CLI command table aligned with all registered subcommands
|
|
583
|
+
|
|
584
|
+
### v5.7 (2026-03-14)
|
|
585
|
+
- Risk scoring formula documentation improvements
|
|
586
|
+
- Severity threshold boundary clarification
|
|
587
|
+
|
|
588
|
+
### v5.6 (2026-03-14)
|
|
589
|
+
- Fix: Rolling restart scenario now keeps at least 1 server running
|
|
590
|
+
- 4 new scenario edge case tests
|
|
591
|
+
|
|
592
|
+
### v5.5 (2026-03-14)
|
|
593
|
+
- Fix: Dynamic simulation results always showed 0 critical/0 warning (float vs string comparison)
|
|
594
|
+
- Fix: `dynamic` command passed report object instead of results list
|
|
595
|
+
- Fix: `--deploy-hour` validation (0-23 range)
|
|
596
|
+
- 14 new dynamic CLI tests
|
|
597
|
+
|
|
598
|
+
### v5.4 (2026-03-14)
|
|
599
|
+
- Pydantic `field_validator`s added to enforce input boundary checks
|
|
600
|
+
|
|
601
|
+
### v5.3 (2026-03-13)
|
|
602
|
+
- Fix TypeError in dynamic CLI command
|
|
603
|
+
|
|
604
|
+
### v5.2 (2026-03-13)
|
|
605
|
+
- Security hardening and robustness improvements
|
|
606
|
+
|
|
607
|
+
### v5.1 (2026-03-13)
|
|
608
|
+
- Consistency fixes, test coverage, CLI validation
|
|
609
|
+
|
|
610
|
+
### v5.0 (2026-03-13)
|
|
611
|
+
- README overhaul, graph fixes, CLI UX improvements
|
|
612
|
+
|
|
613
|
+
---
|
|
614
|
+
|
|
615
|
+
## License
|
|
616
|
+
|
|
617
|
+
MIT License - see [LICENSE](LICENSE)
|
|
618
|
+
|
|
619
|
+
---
|
|
620
|
+
|
|
621
|
+
---
|
|
622
|
+
|
|
623
|
+
# FaultRay — ゼロリスク・インフラ障害シミュレーション(日本語)
|
|
624
|
+
|
|
625
|
+
> **本番環境に一切触れずにインフラ障害をシミュレーション。**
|
|
626
|
+
> **システムの可用性上限を数学的に証明。**
|
|
627
|
+
|
|
628
|
+
## なぜ FaultRay なのか?
|
|
629
|
+
|
|
630
|
+
従来のカオスエンジニアリングツール(Gremlin, Steadybit, AWS FIS)は**実際のインフラに障害を注入**します。FaultRay はまったく異なるアプローチを取ります。**純粋な数学的シミュレーション**で依存関係グラフ全体をメモリ上にモデル化し、150以上の障害シナリオを実行して、システムの理論的可用性上限を証明します。サーバーには一切触れません。
|
|
631
|
+
|
|
632
|
+
| | **Gremlin** | **Steadybit** | **AWS FIS** | **FaultRay** |
|
|
633
|
+
|---|---|---|---|---|
|
|
634
|
+
| **アプローチ** | 障害注入 | 障害注入 | 障害注入 | 数学的シミュレーション |
|
|
635
|
+
| **本番リスク** | 中〜高 | 中 | 中 | **ゼロ** |
|
|
636
|
+
| **セットアップ** | ホスト毎にエージェント | ホスト毎にエージェント | AWSのみ | **pip install のみ** |
|
|
637
|
+
| **シナリオ数** | 手動設定 | 手動設定 | AWSサービスのみ | **150+自動生成** |
|
|
638
|
+
| **可用性証明** | なし | なし | なし | **3層限界モデル** |
|
|
639
|
+
| **コスト** | $$$$ | $$$ | $$ | **無料 / OSS** |
|
|
640
|
+
|
|
641
|
+
## クイックスタート
|
|
642
|
+
|
|
643
|
+
### pip
|
|
644
|
+
|
|
645
|
+
```bash
|
|
646
|
+
# インストール
|
|
647
|
+
pip install -e .
|
|
648
|
+
|
|
649
|
+
# デモ実行(6コンポーネントWebスタック)
|
|
650
|
+
faultray demo
|
|
651
|
+
|
|
652
|
+
# Web ダッシュボード付き
|
|
653
|
+
faultray demo --web
|
|
654
|
+
```
|
|
655
|
+
|
|
656
|
+
### Docker
|
|
657
|
+
|
|
658
|
+
```bash
|
|
659
|
+
# Web ダッシュボード(http://localhost:8000)
|
|
660
|
+
docker compose up web
|
|
661
|
+
|
|
662
|
+
# デモモード
|
|
663
|
+
docker compose --profile demo up demo
|
|
664
|
+
|
|
665
|
+
# CLI モード
|
|
666
|
+
docker compose --profile cli run cli simulate
|
|
667
|
+
```
|
|
668
|
+
|
|
669
|
+
## 主要機能
|
|
670
|
+
|
|
671
|
+
- :shield: **ゼロリスクシミュレーション** — 完全にメモリ上で実行。エージェント不要、本番への影響ゼロ
|
|
672
|
+
- :chart_with_upwards_trend: **150以上のカオスシナリオ** — 30カテゴリの障害シナリオをトポロジーから自動生成
|
|
673
|
+
- :link: **依存関係グラフ解析** — NetworkX によるグラフモデリングと連鎖障害予測
|
|
674
|
+
- :triangular_ruler: **3層可用性限界証明** — システムの理論的可用性上限を数学的に証明
|
|
675
|
+
- :dart: **SLO/SLI 追跡** — 可用性・レイテンシ・エラー率のSLO目標に対する追跡
|
|
676
|
+
- :crystal_ball: **What-If 分析** — パラメータスイープによる障害耐性の感度分析
|
|
677
|
+
- :bar_chart: **キャパシティプランニング** — 成長予測に基づくSLO達成可否の評価
|
|
678
|
+
- :ocean: **10種類のトラフィックモデル** — DDoS・日次変動・フラッシュクラウド等
|
|
679
|
+
- :newspaper: **セキュリティフィード** — CISA, NVD等から最新脅威シナリオを自動追加
|
|
680
|
+
- :globe_with_meridians: **Terraform 連携** — tfstate/plan からインフラ自動インポートと変更影響分析
|
|
681
|
+
- :desktop_computer: **Web ダッシュボード** — D3.js インタラクティブグラフ + Grafana風ダッシュボード
|
|
682
|
+
|
|
683
|
+
## 3層可用性限界モデル(最大の特徴)
|
|
684
|
+
|
|
685
|
+
FaultRay 独自の理論モデルです。従来のカオスツールが「何が壊れるか?」に答えるのに対し、FaultRay は **「あなたのアーキテクチャが物理的に達成できる最大可用性はいくつか?」** に答えます。
|
|
686
|
+
|
|
687
|
+
| 層 | 名称 | 上限 | 説明 |
|
|
688
|
+
|---|---|---|---|
|
|
689
|
+
| **Layer 3** | 理論限界 | 6.65 nines | 完全な冗長性+瞬時フェイルオーバーを仮定した数学的上限(到達不可) |
|
|
690
|
+
| **Layer 2** | ハードウェア限界 | 5.91 nines | コンポーネントMTBF × 冗長係数から算出される物理的上限 |
|
|
691
|
+
| **Layer 1** | ソフトウェア限界 | 4.00 nines | デプロイ失敗・設定ドリフト・ヒューマンエラーを考慮した実用上限 |
|
|
692
|
+
|
|
693
|
+
**重要な意味:** SLO目標が99.99%でもLayer 1の限界が99.95%なら、どれだけエンジニアリング努力を重ねてもアーキテクチャ変更なしにはギャップを埋められません。FaultRay は**数ヶ月の無駄な努力の前に**それを教えてくれます。
|
|
694
|
+
|
|
695
|
+
## 5つのシミュレーションエンジン
|
|
696
|
+
|
|
697
|
+
1. **カスケードエンジン** — 依存関係グラフを通じた障害伝搬モデリング
|
|
698
|
+
2. **ダイナミックエンジン** — トラフィックパターン連動の時間ステップ型シミュレーション
|
|
699
|
+
3. **Opsエンジン** — 長期間(数日〜数週間)の運用シミュレーション
|
|
700
|
+
4. **What-Ifエンジン** — パラメータスイープによる感度分析
|
|
701
|
+
5. **キャパシティエンジン** — 成長予測とリソース枯渇予測
|
|
702
|
+
|
|
703
|
+
## ライセンス
|
|
704
|
+
|
|
705
|
+
MIT License - [LICENSE](LICENSE) を参照
|