upright 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.md +10 -0
  3. data/README.md +455 -0
  4. data/Rakefile +6 -0
  5. data/app/assets/stylesheets/upright/_global.css +104 -0
  6. data/app/assets/stylesheets/upright/artifact.css +148 -0
  7. data/app/assets/stylesheets/upright/base.css +68 -0
  8. data/app/assets/stylesheets/upright/buttons.css +21 -0
  9. data/app/assets/stylesheets/upright/dashboard.css +287 -0
  10. data/app/assets/stylesheets/upright/forms.css +104 -0
  11. data/app/assets/stylesheets/upright/header.css +124 -0
  12. data/app/assets/stylesheets/upright/layout.css +100 -0
  13. data/app/assets/stylesheets/upright/map.css +25 -0
  14. data/app/assets/stylesheets/upright/pagination.css +45 -0
  15. data/app/assets/stylesheets/upright/probes.css +72 -0
  16. data/app/assets/stylesheets/upright/reset.css +26 -0
  17. data/app/assets/stylesheets/upright/tables.css +63 -0
  18. data/app/assets/stylesheets/upright/typography.css +27 -0
  19. data/app/assets/stylesheets/upright/uptime-bars.css +154 -0
  20. data/app/controllers/concerns/upright/authentication.rb +21 -0
  21. data/app/controllers/concerns/upright/subdomain_scoping.rb +18 -0
  22. data/app/controllers/upright/alertmanager_proxy_controller.rb +21 -0
  23. data/app/controllers/upright/application_controller.rb +12 -0
  24. data/app/controllers/upright/artifacts_controller.rb +5 -0
  25. data/app/controllers/upright/dashboards/uptimes_controller.rb +6 -0
  26. data/app/controllers/upright/jobs_controller.rb +4 -0
  27. data/app/controllers/upright/probe_results_controller.rb +17 -0
  28. data/app/controllers/upright/prometheus_proxy_controller.rb +62 -0
  29. data/app/controllers/upright/sessions_controller.rb +29 -0
  30. data/app/controllers/upright/sites_controller.rb +5 -0
  31. data/app/helpers/upright/application_helper.rb +11 -0
  32. data/app/helpers/upright/dashboards_helper.rb +31 -0
  33. data/app/helpers/upright/probe_results_helper.rb +49 -0
  34. data/app/javascript/upright/application.js +2 -0
  35. data/app/javascript/upright/controllers/application.js +5 -0
  36. data/app/javascript/upright/controllers/form_controller.js +7 -0
  37. data/app/javascript/upright/controllers/index.js +4 -0
  38. data/app/javascript/upright/controllers/popover_controller.js +15 -0
  39. data/app/javascript/upright/controllers/probe_results_chart_controller.js +79 -0
  40. data/app/javascript/upright/controllers/results_table_controller.js +16 -0
  41. data/app/javascript/upright/controllers/sites_map_controller.js +33 -0
  42. data/app/jobs/upright/application_job.rb +2 -0
  43. data/app/jobs/upright/probe_check_job.rb +42 -0
  44. data/app/models/concerns/upright/exception_recording.rb +38 -0
  45. data/app/models/concerns/upright/playwright/form_authentication.rb +27 -0
  46. data/app/models/concerns/upright/playwright/helpers.rb +7 -0
  47. data/app/models/concerns/upright/playwright/lifecycle.rb +44 -0
  48. data/app/models/concerns/upright/playwright/logging.rb +87 -0
  49. data/app/models/concerns/upright/playwright/otel_tracing.rb +137 -0
  50. data/app/models/concerns/upright/playwright/video_recording.rb +60 -0
  51. data/app/models/concerns/upright/probe_yaml_source.rb +10 -0
  52. data/app/models/concerns/upright/probeable.rb +125 -0
  53. data/app/models/concerns/upright/staggerable.rb +22 -0
  54. data/app/models/concerns/upright/traceroute/otel_tracing.rb +108 -0
  55. data/app/models/upright/application_record.rb +3 -0
  56. data/app/models/upright/artifact.rb +61 -0
  57. data/app/models/upright/current.rb +9 -0
  58. data/app/models/upright/http/request.rb +59 -0
  59. data/app/models/upright/http/response.rb +55 -0
  60. data/app/models/upright/playwright/authenticator/base.rb +128 -0
  61. data/app/models/upright/playwright/storage_state.rb +31 -0
  62. data/app/models/upright/probe_result.rb +31 -0
  63. data/app/models/upright/probes/http_probe.rb +102 -0
  64. data/app/models/upright/probes/playwright/base.rb +48 -0
  65. data/app/models/upright/probes/smtp_probe.rb +48 -0
  66. data/app/models/upright/probes/traceroute_probe.rb +32 -0
  67. data/app/models/upright/probes/uptime/summary.rb +36 -0
  68. data/app/models/upright/probes/uptime.rb +36 -0
  69. data/app/models/upright/traceroute/hop.rb +49 -0
  70. data/app/models/upright/traceroute/ip_metadata_lookup.rb +107 -0
  71. data/app/models/upright/traceroute/mtr_parser.rb +47 -0
  72. data/app/models/upright/traceroute/result.rb +57 -0
  73. data/app/models/upright/user.rb +14 -0
  74. data/app/views/layouts/upright/_header.html.erb +23 -0
  75. data/app/views/layouts/upright/application.html.erb +25 -0
  76. data/app/views/upright/active_storage/attachments/_attachment.html.erb +21 -0
  77. data/app/views/upright/alertmanager_proxy/show.html.erb +1 -0
  78. data/app/views/upright/artifacts/show.html.erb +9 -0
  79. data/app/views/upright/dashboards/_uptime_bars.html.erb +17 -0
  80. data/app/views/upright/dashboards/_uptime_probe_row.html.erb +22 -0
  81. data/app/views/upright/dashboards/uptimes/show.html.erb +17 -0
  82. data/app/views/upright/jobs/show.html.erb +1 -0
  83. data/app/views/upright/probe_results/_pagination.html.erb +19 -0
  84. data/app/views/upright/probe_results/index.html.erb +72 -0
  85. data/app/views/upright/prometheus_proxy/show.html.erb +1 -0
  86. data/app/views/upright/sessions/new.html.erb +6 -0
  87. data/app/views/upright/sites/index.html.erb +22 -0
  88. data/config/brakeman.ignore +39 -0
  89. data/config/ci.rb +7 -0
  90. data/config/importmap.rb +18 -0
  91. data/config/routes.rb +41 -0
  92. data/db/migrate/20250114000001_create_upright_probe_results.rb +19 -0
  93. data/lib/generators/upright/install/install_generator.rb +83 -0
  94. data/lib/generators/upright/install/templates/alertmanager.yml +14 -0
  95. data/lib/generators/upright/install/templates/deploy.yml +118 -0
  96. data/lib/generators/upright/install/templates/development_alertmanager.yml +11 -0
  97. data/lib/generators/upright/install/templates/development_prometheus.yml +12 -0
  98. data/lib/generators/upright/install/templates/docker-compose.yml +38 -0
  99. data/lib/generators/upright/install/templates/http_probes.yml +14 -0
  100. data/lib/generators/upright/install/templates/omniauth.rb +8 -0
  101. data/lib/generators/upright/install/templates/otel_collector.yml +24 -0
  102. data/lib/generators/upright/install/templates/prometheus.yml +10 -0
  103. data/lib/generators/upright/install/templates/puma.rb +40 -0
  104. data/lib/generators/upright/install/templates/sites.yml +26 -0
  105. data/lib/generators/upright/install/templates/smtp_probes.yml +9 -0
  106. data/lib/generators/upright/install/templates/upright.rb +21 -0
  107. data/lib/generators/upright/install/templates/upright.rules.yml +256 -0
  108. data/lib/generators/upright/playwright_probe/playwright_probe_generator.rb +30 -0
  109. data/lib/generators/upright/playwright_probe/templates/authenticator.rb.tt +14 -0
  110. data/lib/generators/upright/playwright_probe/templates/probe.rb.tt +14 -0
  111. data/lib/omniauth/strategies/static_credentials.rb +57 -0
  112. data/lib/tasks/upright_tasks.rake +4 -0
  113. data/lib/upright/configuration.rb +106 -0
  114. data/lib/upright/engine.rb +157 -0
  115. data/lib/upright/metrics.rb +62 -0
  116. data/lib/upright/playwright/collect_performance_metrics.js +36 -0
  117. data/lib/upright/site.rb +49 -0
  118. data/lib/upright/tracing.rb +49 -0
  119. data/lib/upright/version.rb +3 -0
  120. data/lib/upright.rb +68 -0
  121. metadata +513 -0
@@ -0,0 +1,118 @@
1
+ # Upright Kamal Configuration
2
+ # Deploy with: bin/kamal deploy
3
+
4
+ service: <%= app_name %>
5
+ image: <%= app_name %>/<%= app_name %>
6
+
7
+ servers:
8
+ web:
9
+ hosts:
10
+ - <%= app_domain %>: [london]
11
+ # Add more sites:
12
+ # - nyc.<%= app_domain %>: [new_york]
13
+ # - sfo.<%= app_domain %>: [san_francisco]
14
+ jobs:
15
+ hosts:
16
+ - <%= app_domain %>: [london]
17
+ # Add more sites:
18
+ # - nyc.<%= app_domain %>: [new_york]
19
+ # - sfo.<%= app_domain %>: [san_francisco]
20
+ cmd: bin/jobs
21
+
22
+ ssh:
23
+ user: app
24
+
25
+ registry:
26
+ server: ghcr.io
27
+ username:
28
+ - KAMAL_REGISTRY_USERNAME
29
+ password:
30
+ - KAMAL_REGISTRY_PASSWORD
31
+
32
+ builder:
33
+ arch: amd64
34
+ secrets:
35
+ - GITHUB_TOKEN
36
+
37
+ proxy:
38
+ app_port: 3000
39
+ ssl: true
40
+ hosts:
41
+ - "*.<%= app_domain %>"
42
+
43
+ env:
44
+ clear:
45
+ OTEL_EXPORTER_OTLP_ENDPOINT: http://<%= app_name %>-otel_collector:4318/v1/traces
46
+ PROMETHEUS_URL: http://<%= app_name %>-prometheus:9090
47
+ ALERTMANAGER_URL: http://<%= app_name %>-alertmanager:9093
48
+ secret:
49
+ - RAILS_MASTER_KEY
50
+ - PROMETHEUS_OTLP_TOKEN
51
+ tags:
52
+ london:
53
+ SITE_SUBDOMAIN: lon
54
+ # Add more sites:
55
+ # new_york:
56
+ # SITE_SUBDOMAIN: nyc
57
+ # san_francisco:
58
+ # SITE_SUBDOMAIN: sfo
59
+
60
+ volumes:
61
+ - "<%= app_name %>_storage:/rails/storage"
62
+
63
+ accessories:
64
+ otel_collector:
65
+ image: otel/opentelemetry-collector-contrib:0.126.0
66
+ env:
67
+ secret:
68
+ - OTEL_GATEWAY_USERNAME
69
+ - OTEL_GATEWAY_PASSWORD
70
+ - OTEL_GATEWAY_AUTH
71
+ - PROMETHEUS_OTLP_TOKEN
72
+ files:
73
+ - config/otel_collector.yml:/etc/otelcol-contrib/config.yaml
74
+ volumes:
75
+ - /var/run/docker.sock:/var/run/docker.sock
76
+ - /var/lib/docker/containers:/var/lib/docker/containers
77
+ options:
78
+ user: 0
79
+ roles:
80
+ - jobs
81
+
82
+ playwright:
83
+ image: jacoblincool/playwright:chromium-server-1.55.0
84
+ port: "127.0.0.1:53333:53333"
85
+ roles:
86
+ - jobs
87
+
88
+ prometheus:
89
+ image: prom/prometheus:v3.2.1
90
+ hosts:
91
+ - <%= app_domain %>
92
+ cmd: >-
93
+ --config.file=/etc/prometheus/prometheus.yml
94
+ --storage.tsdb.path=/prometheus
95
+ --storage.tsdb.retention.time=30d
96
+ --web.enable-otlp-receiver
97
+ --web.enable-lifecycle
98
+ files:
99
+ - config/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
100
+ - config/prometheus/rules/upright.rules.yml:/etc/prometheus/rules/upright.rules.yml
101
+ volumes:
102
+ - prometheus_data:/prometheus
103
+
104
+ alertmanager:
105
+ image: prom/alertmanager:v0.28.1
106
+ hosts:
107
+ - <%= app_domain %>
108
+ cmd: --config.file=/etc/alertmanager/alertmanager.yml
109
+ files:
110
+ - config/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
111
+ volumes:
112
+ - alertmanager_data:/alertmanager
113
+
114
+ aliases:
115
+ console: app exec -i "bin/rails console"
116
+ logs: app logs -f
117
+
118
+ retain_containers: 3
@@ -0,0 +1,11 @@
1
+ global:
2
+ resolve_timeout: 5m
3
+
4
+ route:
5
+ receiver: default
6
+ group_wait: 30s
7
+ group_interval: 5m
8
+ repeat_interval: 4h
9
+
10
+ receivers:
11
+ - name: default
@@ -0,0 +1,12 @@
1
+ global:
2
+ scrape_interval: 15s
3
+ evaluation_interval: 15s
4
+
5
+ rule_files:
6
+ - rules/*.yml
7
+
8
+ scrape_configs:
9
+ - job_name: "upright"
10
+ scrape_interval: 5s
11
+ static_configs:
12
+ - targets: ["localhost:9394"]
@@ -0,0 +1,38 @@
1
+ services:
2
+ prometheus:
3
+ image: prom/prometheus:v3.4.1
4
+ container_name: <%= app_name %>-prometheus
5
+ network_mode: host
6
+ volumes:
7
+ - ./config/prometheus/development/prometheus.yml:/etc/prometheus/prometheus.yml:ro
8
+ - ./config/prometheus/rules:/etc/prometheus/rules:ro
9
+ - prometheus_data:/prometheus
10
+ command:
11
+ - '--config.file=/etc/prometheus/prometheus.yml'
12
+ - '--storage.tsdb.path=/prometheus'
13
+ - '--web.enable-lifecycle'
14
+ - '--web.enable-remote-write-receiver'
15
+
16
+ alertmanager:
17
+ image: prom/alertmanager:v0.28.1
18
+ container_name: <%= app_name %>-alertmanager
19
+ ports:
20
+ - "9093:9093"
21
+ volumes:
22
+ - ./config/alertmanager/development/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
23
+ - alertmanager_data:/alertmanager
24
+ command:
25
+ - '--config.file=/etc/alertmanager/alertmanager.yml'
26
+ - '--storage.path=/alertmanager'
27
+
28
+ playwright:
29
+ image: jacoblincool/playwright:chromium-server-1.56.1
30
+ container_name: <%= app_name %>-playwright
31
+ ports:
32
+ - "53333:53333"
33
+ environment:
34
+ - DEBUG=true
35
+
36
+ volumes:
37
+ prometheus_data:
38
+ alertmanager_data:
@@ -0,0 +1,14 @@
1
+ # HTTP Probe Definitions
2
+ # Add your HTTP health check probes here
3
+ #
4
+ # Required fields:
5
+ # - name: Unique identifier for the probe
6
+ # - url: The URL to check
7
+ #
8
+ # Optional fields:
9
+ # - expected_status: HTTP status code to expect (default: 200-399)
10
+ # - basic_auth_credentials: Key in Rails credentials for basic auth
11
+ # - proxy: Key in Rails credentials for proxy settings
12
+
13
+ - name: Example Health Check
14
+ url: https://example.com
@@ -0,0 +1,8 @@
1
+ # WARNING: Change the default password before deploying to production!
2
+ # Set the ADMIN_PASSWORD environment variable or update the credentials below.
3
+
4
+ Rails.application.config.middleware.use OmniAuth::Builder do
5
+ provider :static_credentials,
6
+ title: "Sign In",
7
+ credentials: { "admin" => ENV.fetch("ADMIN_PASSWORD", "upright") }
8
+ end
@@ -0,0 +1,24 @@
1
+ receivers:
2
+ otlp:
3
+ protocols:
4
+ http:
5
+ endpoint: 0.0.0.0:4318
6
+
7
+ processors:
8
+ batch:
9
+ timeout: 2s
10
+
11
+ exporters:
12
+ debug:
13
+ verbosity: detailed
14
+
15
+ service:
16
+ pipelines:
17
+ traces:
18
+ receivers: [otlp]
19
+ processors: [batch]
20
+ exporters: [debug]
21
+ metrics:
22
+ receivers: [otlp]
23
+ processors: [batch]
24
+ exporters: [debug]
@@ -0,0 +1,10 @@
1
+ global:
2
+ evaluation_interval: 15s
3
+
4
+ alerting:
5
+ alertmanagers:
6
+ - static_configs:
7
+ - targets: ['localhost:9093']
8
+
9
+ rule_files:
10
+ - rules/*.yml
@@ -0,0 +1,40 @@
1
+ # This configuration file will be evaluated by Puma. The top-level methods that
2
+ # are invoked here are part of Puma's configuration DSL. For more information
3
+ # about methods provided by the DSL, see https://puma.io/puma/Puma/DSL.html.
4
+
5
+ # Puma starts a configurable number of processes (workers) and each process
6
+ # serves each request in a thread from an internal thread pool.
7
+ #
8
+ # The ideal number of threads per worker depends both on how much time the
9
+ # application spends waiting for IO operations and on how much you wish to
10
+ # prioritize throughput over latency.
11
+ #
12
+ # As a rule of thumb, increasing the number of threads will increase how much
13
+ # traffic a given process can handle (throughput), but due to CRuby's
14
+ # Global VM Lock (GVL) it has diminishing returns and will degrade the
15
+ # response time (latency) of the application.
16
+ #
17
+ # The default is set to 3 threads as it's deemed a decent compromise between
18
+ # throughput and latency for the average Rails application.
19
+ #
20
+ # Any libraries that use a connection pool or another resource pool should
21
+ # be configured to provide at least as many connections as the number of
22
+ # threads. This includes Active Record's `pool` parameter in `database.yml`.
23
+ threads_count = ENV.fetch("RAILS_MAX_THREADS", 3)
24
+ threads threads_count, threads_count
25
+
26
+ # Specifies the `port` that Puma will listen on to receive requests; default is 3000.
27
+ port ENV.fetch("PORT", 3000)
28
+
29
+ # Allow puma to be restarted by `bin/rails restart` command.
30
+ plugin :tmp_restart
31
+
32
+ # Expose Prometheus metrics at http://0.0.0.0:9394/metrics
33
+ activate_control_app "auto", no_token: true
34
+ plugin :yabeda
35
+ plugin :yabeda_prometheus
36
+ prometheus_silence_logger true
37
+
38
+ # Specify the PID file. Defaults to tmp/pids/server.pid in development.
39
+ # In other environments, only set the PID file if requested.
40
+ pidfile ENV["PIDFILE"] if ENV["PIDFILE"]
@@ -0,0 +1,26 @@
1
+ # Upright Site Configuration
2
+ # Define the geographic locations where probes run
3
+ # Domain is automatically inferred as {code}.{hostname}
4
+
5
+ shared:
6
+ sites:
7
+ - code: lon
8
+ city: London
9
+ country: GB
10
+ geohash: gcpvj0
11
+
12
+ # Multi-site example:
13
+ #
14
+ # shared:
15
+ # sites:
16
+ # - code: ams
17
+ # city: Amsterdam
18
+ # country: NL
19
+ # geohash: u17982
20
+ # provider: digitalocean
21
+ #
22
+ # - code: nyc
23
+ # city: New York City
24
+ # country: US
25
+ # geohash: dr5reg
26
+ # provider: digitalocean
@@ -0,0 +1,9 @@
1
+ # SMTP Probe Definitions
2
+ # Add your SMTP health check probes here
3
+ #
4
+ # Required fields:
5
+ # - name: Unique identifier for the probe
6
+ # - host: SMTP server hostname
7
+
8
+ # - name: Example Mail Server
9
+ # host: mail.example.com
@@ -0,0 +1,21 @@
1
+ # See: https://github.com/basecamp/upright
2
+
3
+ Upright.configure do |config|
4
+ config.service_name = "<%= Rails.application.class.module_parent_name.underscore %>"
5
+ config.user_agent = "<%= Rails.application.class.module_parent_name.underscore %>/1.0"
6
+ config.hostname = Rails.env.local? ? "<%= Rails.application.class.module_parent_name.downcase %>.localhost" : "<%= Rails.application.class.module_parent_name.downcase %>.com"
7
+
8
+ # Playwright browser server URL
9
+ # config.playwright_server_url = ENV["PLAYWRIGHT_SERVER_URL"]
10
+
11
+ # OpenTelemetry endpoint
12
+ # config.otel_endpoint = ENV["OTEL_EXPORTER_OTLP_ENDPOINT"]
13
+
14
+ # Authentication via OpenID Connect (Logto, Keycloak, Duo, Okta, etc.)
15
+ # config.auth_provider = :openid_connect
16
+ # config.auth_options = {
17
+ # issuer: ENV["OIDC_ISSUER"],
18
+ # client_id: ENV["OIDC_CLIENT_ID"],
19
+ # client_secret: ENV["OIDC_CLIENT_SECRET"]
20
+ # }
21
+ end
@@ -0,0 +1,256 @@
1
+ # Upright Prometheus Alert Rules
2
+ # Generated by: rails generate upright:install
3
+
4
+ groups:
5
+ - name: upright_recording
6
+ interval: 30s
7
+ rules:
8
+ # Fraction of regions reporting DOWN (0.0 to 1.0)
9
+ - record: upright:probe_down_fraction
10
+ expr: |
11
+ count by (name, type, probe_target) (upright_probe_up == 0)
12
+ /
13
+ count by (name, type, probe_target) (upright_probe_up)
14
+
15
+ # Daily uptime fraction (0.0 to 1.0)
16
+ # Uptime = fraction of time in the past day when a majority of sites reported UP
17
+ - record: upright:probe_uptime_daily
18
+ expr: |
19
+ avg_over_time((
20
+ sum by (name, type, probe_target) (upright_probe_up)
21
+ /
22
+ count by (name, type, probe_target) (upright_probe_up)
23
+ > bool 0.5
24
+ )[1d:])
25
+
26
+ - name: upright_alerts
27
+ rules:
28
+ # Mass failure alert - fires when >20% of all probes are down in a majority (>50%) of regions
29
+ # When this fires, individual probe alerts are suppressed via inhibition rules
30
+ - alert: UprightMassFailure
31
+ annotations:
32
+ summary: "Mass failure - {{ $value | humanizePercentage }} of probes down"
33
+ description: "Many probes failing simultaneously, likely an Upright or network issue"
34
+ upright: "https://app.<%= app_domain %>/?status=fail"
35
+ expr: |
36
+ count(upright:probe_down_fraction > 0.5)
37
+ /
38
+ count(count by (name, type, probe_target) (upright_probe_up))
39
+ > 0.2
40
+ labels:
41
+ severity: critical
42
+ group: upright
43
+
44
+ # Pool has fewer active sites than peak in last hour
45
+ - alert: UprightPoolDegraded
46
+ annotations:
47
+ summary: "Upright pool degraded - {{ $value }} site(s) missing"
48
+ description: "Upright pool degraded - {{ $value }} site(s) missing"
49
+ upright: "https://app.<%= app_domain %>/"
50
+ expr: |
51
+ max_over_time(count(count by (site_code) (upright_probe_up))[1h:1m])
52
+ -
53
+ count(count by (site_code) (upright_probe_up))
54
+ > 0
55
+ for: 5m
56
+ labels:
57
+ severity: critical
58
+ group: upright
59
+
60
+ # No HTTP metrics at all - probes completely stopped
61
+ - alert: UprightHTTPProbesMissing
62
+ annotations:
63
+ summary: "HTTP probes missing - no metrics received"
64
+ description: "No HTTP probe metrics are being reported. Upright may be down or probes are stuck."
65
+ upright: "https://app.<%= app_domain %>/?probe_type=http"
66
+ expr: absent(upright_probe_up{type="http"})
67
+ for: 5m
68
+ labels:
69
+ severity: critical
70
+ group: upright
71
+
72
+ # No SMTP metrics at all - probes completely stopped
73
+ - alert: UprightSMTPProbesMissing
74
+ annotations:
75
+ summary: "SMTP probes missing - no metrics received"
76
+ description: "No SMTP probe metrics are being reported. Upright may be down or probes are stuck."
77
+ upright: "https://app.<%= app_domain %>/?probe_type=smtp"
78
+ expr: absent(upright_probe_up{type="smtp"})
79
+ for: 5m
80
+ labels:
81
+ severity: critical
82
+ group: upright
83
+
84
+ # No Playwright metrics at all - probes completely stopped
85
+ - alert: UprightPlaywrightProbesMissing
86
+ annotations:
87
+ summary: "Playwright probes missing - no metrics received"
88
+ description: "No Playwright probe metrics are being reported. Browser probes may be stuck or Upright is down."
89
+ upright: "https://app.<%= app_domain %>/?probe_type=playwright"
90
+ expr: absent(upright_probe_up{type="playwright"})
91
+ for: 30m
92
+ labels:
93
+ severity: critical
94
+ group: upright
95
+
96
+ # Site stopped reporting HTTP probes (was reporting in last hour, not now)
97
+ - alert: UprightHTTPSiteMissing
98
+ annotations:
99
+ summary: "HTTP probes missing from {{ $labels.site_city }}"
100
+ description: "{{ $labels.site_city }} ({{ $labels.site_code }}) stopped reporting HTTP probe metrics."
101
+ upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type=http"
102
+ expr: |
103
+ count by (site_code, site_city) (max_over_time(upright_probe_up{type="http"}[1h]))
104
+ unless
105
+ count by (site_code, site_city) (upright_probe_up{type="http"})
106
+ for: 5m
107
+ labels:
108
+ severity: warning
109
+ group: upright
110
+
111
+ # Site stopped reporting SMTP probes (was reporting in last hour, not now)
112
+ - alert: UprightSMTPSiteMissing
113
+ annotations:
114
+ summary: "SMTP probes missing from {{ $labels.site_city }}"
115
+ description: "{{ $labels.site_city }} ({{ $labels.site_code }}) stopped reporting SMTP probe metrics."
116
+ upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type=smtp"
117
+ expr: |
118
+ count by (site_code, site_city) (max_over_time(upright_probe_up{type="smtp"}[1h]))
119
+ unless
120
+ count by (site_code, site_city) (upright_probe_up{type="smtp"})
121
+ for: 5m
122
+ labels:
123
+ severity: warning
124
+ group: upright
125
+
126
+ # Site stopped reporting Playwright probes (was reporting in last hour, not now)
127
+ - alert: UprightPlaywrightSiteMissing
128
+ annotations:
129
+ summary: "Playwright probes missing from {{ $labels.site_city }}"
130
+ description: "{{ $labels.site_city }} ({{ $labels.site_code }}) stopped reporting Playwright probe metrics. Probes may be stuck."
131
+ upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type=playwright"
132
+ expr: |
133
+ count by (site_code, site_city) (max_over_time(upright_probe_up{type="playwright"}[1h]))
134
+ unless
135
+ count by (site_code, site_city) (upright_probe_up{type="playwright"})
136
+ for: 30m
137
+ labels:
138
+ severity: warning
139
+ group: upright
140
+
141
+ # HTTP probe alerts (simple endpoint monitoring)
142
+ - name: upright_http_alerts
143
+ rules:
144
+ - alert: UprightHTTPProbeDown
145
+ annotations:
146
+ summary: "HTTP: {{ $labels.name }} is DOWN"
147
+ description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
148
+ upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
149
+ expr: upright:probe_down_fraction{type="http"} > 0.5
150
+ labels:
151
+ severity: page
152
+ group: upright
153
+
154
+ - alert: UprightHTTPProbeDegraded
155
+ annotations:
156
+ summary: "HTTP: {{ $labels.name }} degraded - {{ $value | humanizePercentage }} of regions down"
157
+ description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
158
+ upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
159
+ expr: upright:probe_down_fraction{type="http"} > 0.25 and upright:probe_down_fraction{type="http"} <= 0.5
160
+ for: 5m
161
+ labels:
162
+ severity: warning
163
+ group: upright
164
+
165
+ - alert: UprightHTTPRegionalFailure
166
+ annotations:
167
+ summary: "HTTP: {{ $labels.name }} failing from {{ $labels.site_city }}"
168
+ description: "HTTP: {{ $labels.probe_target }} failing from {{ $labels.site_city }}"
169
+ upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
170
+ # Only fire when exactly 1 site is down (isolated regional issue)
171
+ expr: |
172
+ upright_probe_up{type="http"} == 0
173
+ and on (name, type, probe_target)
174
+ count by (name, type, probe_target) (upright_probe_up == 0) == 1
175
+ for: 2m
176
+ labels:
177
+ severity: warning
178
+ group: upright
179
+
180
+ # SMTP probe alerts (mail server monitoring)
181
+ - name: upright_smtp_alerts
182
+ rules:
183
+ - alert: UprightSMTPProbeDown
184
+ annotations:
185
+ summary: "SMTP: {{ $labels.name }} is DOWN"
186
+ description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
187
+ upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
188
+ expr: upright:probe_down_fraction{type="smtp"} > 0.5
189
+ labels:
190
+ severity: page
191
+ group: upright
192
+
193
+ - alert: UprightSMTPProbeDegraded
194
+ annotations:
195
+ summary: "SMTP: {{ $labels.name }} degraded - {{ $value | humanizePercentage }} of regions down"
196
+ description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
197
+ upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
198
+ expr: upright:probe_down_fraction{type="smtp"} > 0.25 and upright:probe_down_fraction{type="smtp"} <= 0.5
199
+ for: 5m
200
+ labels:
201
+ severity: warning
202
+ group: upright
203
+
204
+ - alert: UprightSMTPRegionalFailure
205
+ annotations:
206
+ summary: "SMTP: {{ $labels.name }} failing from {{ $labels.site_city }}"
207
+ description: "SMTP: {{ $labels.probe_target }} failing from {{ $labels.site_city }}"
208
+ upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
209
+ # Only fire when exactly 1 site is down (isolated regional issue)
210
+ expr: |
211
+ upright_probe_up{type="smtp"} == 0
212
+ and on (name, type, probe_target)
213
+ count by (name, type, probe_target) (upright_probe_up == 0) == 1
214
+ for: 2m
215
+ labels:
216
+ severity: warning
217
+ group: upright
218
+
219
+ # Playwright probe alerts (browser-based login flows)
220
+ - name: upright_playwright_alerts
221
+ rules:
222
+ - alert: UprightPlaywrightProbeFailed
223
+ annotations:
224
+ summary: "Browser: {{ $labels.name }} probe FAILED"
225
+ description: "{{ $value | humanizePercentage }} of regions report failure for browser-based probe on {{ $labels.name }}"
226
+ upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
227
+ expr: upright:probe_down_fraction{type="playwright"} > 0.5
228
+ labels:
229
+ severity: page
230
+ group: upright
231
+
232
+ - alert: UprightPlaywrightProbeDegraded
233
+ annotations:
234
+ summary: "Browser: {{ $labels.name }} degraded - {{ $value | humanizePercentage }} of regions down"
235
+ description: "Not all regions report success for browser-based probe {{ $labels.name }}"
236
+ upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
237
+ expr: upright:probe_down_fraction{type="playwright"} > 0.25 and upright:probe_down_fraction{type="playwright"} <= 0.5
238
+ for: 20m
239
+ labels:
240
+ severity: warning
241
+ group: upright
242
+
243
+ - alert: UprightPlaywrightRegionalFailure
244
+ annotations:
245
+ summary: "Browser: {{ $labels.name }} failing from {{ $labels.site_city }}"
246
+ description: "Browser: {{ $labels.name }} failing from {{ $labels.site_city }}"
247
+ upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
248
+ # Only fire when exactly 1 site is down (isolated regional issue)
249
+ expr: |
250
+ upright_probe_up{type="playwright"} == 0
251
+ and on (name, type, probe_target)
252
+ count by (name, type, probe_target) (upright_probe_up == 0) == 1
253
+ for: 20m
254
+ labels:
255
+ severity: warning
256
+ group: upright
@@ -0,0 +1,30 @@
1
+ module Upright
2
+ module Generators
3
+ class PlaywrightProbeGenerator < Rails::Generators::NamedBase # Renders a Playwright probe class, and optionally an authenticator class, from templates.
4
+ source_root File.expand_path("templates", __dir__) # Templates are looked up in the "templates" directory next to this file.
5
+
6
+ class_option :with_authenticator, type: :boolean, default: false,
7
+ desc: "Generate an authenticator class for this probe"
8
+
9
+ def create_probe_file # Renders probe.rb.tt into probes/<file_name>_probe.rb.
10
+ template "probe.rb.tt", File.join("probes", "#{file_name}_probe.rb")
11
+ end
12
+
13
+ def create_authenticator_file # Renders authenticator.rb.tt into probes/authenticators/<file_name>.rb, only when the with_authenticator option is set.
14
+ if options[:with_authenticator]
15
+ template "authenticator.rb.tt", File.join("probes/authenticators", "#{file_name}.rb")
16
+ end
17
+ end
18
+
19
+ private
20
+
21
+ def probe_class_name # Class name interpolated by probe.rb.tt: the generator's class_name with a "Probe" suffix.
22
+ "#{class_name}Probe"
23
+ end
24
+
25
+ def authenticator_name # Symbol interpolated by probe.rb.tt as the argument to authenticate_with_form.
26
+ file_name.underscore.to_sym
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,14 @@
1
+ class Playwright::Authenticator::<%= class_name %> < Upright::Playwright::Authenticator::Base
2
+ def signin_redirect_url = "https://example.com/dashboard"
3
+ def signin_path = "/login"
4
+ def service_name = :<%= file_name %>
5
+
6
+ def authenticate
7
+ # Your authentication logic here
8
+ # page.goto("https://example.com/login")
9
+ # page.get_by_label("Email").fill(credentials.<%= file_name %>.email)
10
+ # page.get_by_label("Password").fill(credentials.<%= file_name %>.password)
11
+ # page.get_by_role("button", name: "Sign in").click
12
+ raise NotImplementedError, "Implement the authenticate method for <%= class_name %>"
13
+ end
14
+ end
@@ -0,0 +1,14 @@
1
+ class Probes::Playwright::<%= probe_class_name %> < Upright::Probes::Playwright::Base
2
+ <% if options[:with_authenticator] -%>
3
+ authenticate_with_form :<%= authenticator_name %>
4
+
5
+ <% end -%>
6
+ def check
7
+ # Your probe logic here
8
+ # page.goto("https://example.com")
9
+ # page.fill('[name="email"]', "user@example.com")
10
+ # page.click('button[type="submit"]')
11
+ # page.wait_for_selector(".success")
12
+ raise NotImplementedError, "Implement the check method for <%= probe_class_name %>"
13
+ end
14
+ end