upright 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.md +10 -0
- data/README.md +455 -0
- data/Rakefile +6 -0
- data/app/assets/stylesheets/upright/_global.css +104 -0
- data/app/assets/stylesheets/upright/artifact.css +148 -0
- data/app/assets/stylesheets/upright/base.css +68 -0
- data/app/assets/stylesheets/upright/buttons.css +21 -0
- data/app/assets/stylesheets/upright/dashboard.css +287 -0
- data/app/assets/stylesheets/upright/forms.css +104 -0
- data/app/assets/stylesheets/upright/header.css +124 -0
- data/app/assets/stylesheets/upright/layout.css +100 -0
- data/app/assets/stylesheets/upright/map.css +25 -0
- data/app/assets/stylesheets/upright/pagination.css +45 -0
- data/app/assets/stylesheets/upright/probes.css +72 -0
- data/app/assets/stylesheets/upright/reset.css +26 -0
- data/app/assets/stylesheets/upright/tables.css +63 -0
- data/app/assets/stylesheets/upright/typography.css +27 -0
- data/app/assets/stylesheets/upright/uptime-bars.css +154 -0
- data/app/controllers/concerns/upright/authentication.rb +21 -0
- data/app/controllers/concerns/upright/subdomain_scoping.rb +18 -0
- data/app/controllers/upright/alertmanager_proxy_controller.rb +21 -0
- data/app/controllers/upright/application_controller.rb +12 -0
- data/app/controllers/upright/artifacts_controller.rb +5 -0
- data/app/controllers/upright/dashboards/uptimes_controller.rb +6 -0
- data/app/controllers/upright/jobs_controller.rb +4 -0
- data/app/controllers/upright/probe_results_controller.rb +17 -0
- data/app/controllers/upright/prometheus_proxy_controller.rb +62 -0
- data/app/controllers/upright/sessions_controller.rb +29 -0
- data/app/controllers/upright/sites_controller.rb +5 -0
- data/app/helpers/upright/application_helper.rb +11 -0
- data/app/helpers/upright/dashboards_helper.rb +31 -0
- data/app/helpers/upright/probe_results_helper.rb +49 -0
- data/app/javascript/upright/application.js +2 -0
- data/app/javascript/upright/controllers/application.js +5 -0
- data/app/javascript/upright/controllers/form_controller.js +7 -0
- data/app/javascript/upright/controllers/index.js +4 -0
- data/app/javascript/upright/controllers/popover_controller.js +15 -0
- data/app/javascript/upright/controllers/probe_results_chart_controller.js +79 -0
- data/app/javascript/upright/controllers/results_table_controller.js +16 -0
- data/app/javascript/upright/controllers/sites_map_controller.js +33 -0
- data/app/jobs/upright/application_job.rb +2 -0
- data/app/jobs/upright/probe_check_job.rb +42 -0
- data/app/models/concerns/upright/exception_recording.rb +38 -0
- data/app/models/concerns/upright/playwright/form_authentication.rb +27 -0
- data/app/models/concerns/upright/playwright/helpers.rb +7 -0
- data/app/models/concerns/upright/playwright/lifecycle.rb +44 -0
- data/app/models/concerns/upright/playwright/logging.rb +87 -0
- data/app/models/concerns/upright/playwright/otel_tracing.rb +137 -0
- data/app/models/concerns/upright/playwright/video_recording.rb +60 -0
- data/app/models/concerns/upright/probe_yaml_source.rb +10 -0
- data/app/models/concerns/upright/probeable.rb +125 -0
- data/app/models/concerns/upright/staggerable.rb +22 -0
- data/app/models/concerns/upright/traceroute/otel_tracing.rb +108 -0
- data/app/models/upright/application_record.rb +3 -0
- data/app/models/upright/artifact.rb +61 -0
- data/app/models/upright/current.rb +9 -0
- data/app/models/upright/http/request.rb +59 -0
- data/app/models/upright/http/response.rb +55 -0
- data/app/models/upright/playwright/authenticator/base.rb +128 -0
- data/app/models/upright/playwright/storage_state.rb +31 -0
- data/app/models/upright/probe_result.rb +31 -0
- data/app/models/upright/probes/http_probe.rb +102 -0
- data/app/models/upright/probes/playwright/base.rb +48 -0
- data/app/models/upright/probes/smtp_probe.rb +48 -0
- data/app/models/upright/probes/traceroute_probe.rb +32 -0
- data/app/models/upright/probes/uptime/summary.rb +36 -0
- data/app/models/upright/probes/uptime.rb +36 -0
- data/app/models/upright/traceroute/hop.rb +49 -0
- data/app/models/upright/traceroute/ip_metadata_lookup.rb +107 -0
- data/app/models/upright/traceroute/mtr_parser.rb +47 -0
- data/app/models/upright/traceroute/result.rb +57 -0
- data/app/models/upright/user.rb +14 -0
- data/app/views/layouts/upright/_header.html.erb +23 -0
- data/app/views/layouts/upright/application.html.erb +25 -0
- data/app/views/upright/active_storage/attachments/_attachment.html.erb +21 -0
- data/app/views/upright/alertmanager_proxy/show.html.erb +1 -0
- data/app/views/upright/artifacts/show.html.erb +9 -0
- data/app/views/upright/dashboards/_uptime_bars.html.erb +17 -0
- data/app/views/upright/dashboards/_uptime_probe_row.html.erb +22 -0
- data/app/views/upright/dashboards/uptimes/show.html.erb +17 -0
- data/app/views/upright/jobs/show.html.erb +1 -0
- data/app/views/upright/probe_results/_pagination.html.erb +19 -0
- data/app/views/upright/probe_results/index.html.erb +72 -0
- data/app/views/upright/prometheus_proxy/show.html.erb +1 -0
- data/app/views/upright/sessions/new.html.erb +6 -0
- data/app/views/upright/sites/index.html.erb +22 -0
- data/config/brakeman.ignore +39 -0
- data/config/ci.rb +7 -0
- data/config/importmap.rb +18 -0
- data/config/routes.rb +41 -0
- data/db/migrate/20250114000001_create_upright_probe_results.rb +19 -0
- data/lib/generators/upright/install/install_generator.rb +83 -0
- data/lib/generators/upright/install/templates/alertmanager.yml +14 -0
- data/lib/generators/upright/install/templates/deploy.yml +118 -0
- data/lib/generators/upright/install/templates/development_alertmanager.yml +11 -0
- data/lib/generators/upright/install/templates/development_prometheus.yml +12 -0
- data/lib/generators/upright/install/templates/docker-compose.yml +38 -0
- data/lib/generators/upright/install/templates/http_probes.yml +14 -0
- data/lib/generators/upright/install/templates/omniauth.rb +8 -0
- data/lib/generators/upright/install/templates/otel_collector.yml +24 -0
- data/lib/generators/upright/install/templates/prometheus.yml +10 -0
- data/lib/generators/upright/install/templates/puma.rb +40 -0
- data/lib/generators/upright/install/templates/sites.yml +26 -0
- data/lib/generators/upright/install/templates/smtp_probes.yml +9 -0
- data/lib/generators/upright/install/templates/upright.rb +21 -0
- data/lib/generators/upright/install/templates/upright.rules.yml +256 -0
- data/lib/generators/upright/playwright_probe/playwright_probe_generator.rb +30 -0
- data/lib/generators/upright/playwright_probe/templates/authenticator.rb.tt +14 -0
- data/lib/generators/upright/playwright_probe/templates/probe.rb.tt +14 -0
- data/lib/omniauth/strategies/static_credentials.rb +57 -0
- data/lib/tasks/upright_tasks.rake +4 -0
- data/lib/upright/configuration.rb +106 -0
- data/lib/upright/engine.rb +157 -0
- data/lib/upright/metrics.rb +62 -0
- data/lib/upright/playwright/collect_performance_metrics.js +36 -0
- data/lib/upright/site.rb +49 -0
- data/lib/upright/tracing.rb +49 -0
- data/lib/upright/version.rb +3 -0
- data/lib/upright.rb +68 -0
- metadata +513 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Upright Kamal Configuration
|
|
2
|
+
# Deploy with: bin/kamal deploy
|
|
3
|
+
|
|
4
|
+
service: <%= app_name %>
|
|
5
|
+
image: <%= app_name %>/<%= app_name %>
|
|
6
|
+
|
|
7
|
+
servers:
|
|
8
|
+
web:
|
|
9
|
+
hosts:
|
|
10
|
+
- <%= app_domain %>: [london]
|
|
11
|
+
# Add more sites:
|
|
12
|
+
# - nyc.<%= app_domain %>: [new_york]
|
|
13
|
+
# - sfo.<%= app_domain %>: [san_francisco]
|
|
14
|
+
jobs:
|
|
15
|
+
hosts:
|
|
16
|
+
- <%= app_domain %>: [london]
|
|
17
|
+
# Add more sites:
|
|
18
|
+
# - nyc.<%= app_domain %>: [new_york]
|
|
19
|
+
# - sfo.<%= app_domain %>: [san_francisco]
|
|
20
|
+
cmd: bin/jobs
|
|
21
|
+
|
|
22
|
+
ssh:
|
|
23
|
+
user: app
|
|
24
|
+
|
|
25
|
+
registry:
|
|
26
|
+
server: ghcr.io
|
|
27
|
+
username:
|
|
28
|
+
- KAMAL_REGISTRY_USERNAME
|
|
29
|
+
password:
|
|
30
|
+
- KAMAL_REGISTRY_PASSWORD
|
|
31
|
+
|
|
32
|
+
builder:
|
|
33
|
+
arch: amd64
|
|
34
|
+
secrets:
|
|
35
|
+
- GITHUB_TOKEN
|
|
36
|
+
|
|
37
|
+
proxy:
|
|
38
|
+
app_port: 3000
|
|
39
|
+
ssl: true
|
|
40
|
+
hosts:
|
|
41
|
+
- "*.<%= app_domain %>"
|
|
42
|
+
|
|
43
|
+
env:
|
|
44
|
+
clear:
|
|
45
|
+
OTEL_EXPORTER_OTLP_ENDPOINT: http://<%= app_name %>-otel_collector:4318/v1/traces
|
|
46
|
+
PROMETHEUS_URL: http://<%= app_name %>-prometheus:9090
|
|
47
|
+
ALERTMANAGER_URL: http://<%= app_name %>-alertmanager:9093
|
|
48
|
+
secret:
|
|
49
|
+
- RAILS_MASTER_KEY
|
|
50
|
+
- PROMETHEUS_OTLP_TOKEN
|
|
51
|
+
tags:
|
|
52
|
+
london:
|
|
53
|
+
SITE_SUBDOMAIN: lon
|
|
54
|
+
# Add more sites:
|
|
55
|
+
# new_york:
|
|
56
|
+
# SITE_SUBDOMAIN: nyc
|
|
57
|
+
# san_francisco:
|
|
58
|
+
# SITE_SUBDOMAIN: sfo
|
|
59
|
+
|
|
60
|
+
volumes:
|
|
61
|
+
- "<%= app_name %>_storage:/rails/storage"
|
|
62
|
+
|
|
63
|
+
accessories:
|
|
64
|
+
otel_collector:
|
|
65
|
+
image: otel/opentelemetry-collector-contrib:0.126.0
|
|
66
|
+
env:
|
|
67
|
+
secret:
|
|
68
|
+
- OTEL_GATEWAY_USERNAME
|
|
69
|
+
- OTEL_GATEWAY_PASSWORD
|
|
70
|
+
- OTEL_GATEWAY_AUTH
|
|
71
|
+
- PROMETHEUS_OTLP_TOKEN
|
|
72
|
+
files:
|
|
73
|
+
- config/otel_collector.yml:/etc/otelcol-contrib/config.yaml
|
|
74
|
+
volumes:
|
|
75
|
+
- /var/run/docker.sock:/var/run/docker.sock
|
|
76
|
+
- /var/lib/docker/containers:/var/lib/docker/containers
|
|
77
|
+
options:
|
|
78
|
+
user: 0
|
|
79
|
+
roles:
|
|
80
|
+
- jobs
|
|
81
|
+
|
|
82
|
+
playwright:
|
|
83
|
+
image: jacoblincool/playwright:chromium-server-1.55.0
|
|
84
|
+
port: "127.0.0.1:53333:53333"
|
|
85
|
+
roles:
|
|
86
|
+
- jobs
|
|
87
|
+
|
|
88
|
+
prometheus:
|
|
89
|
+
image: prom/prometheus:v3.2.1
|
|
90
|
+
hosts:
|
|
91
|
+
- <%= app_domain %>
|
|
92
|
+
cmd: >-
|
|
93
|
+
--config.file=/etc/prometheus/prometheus.yml
|
|
94
|
+
--storage.tsdb.path=/prometheus
|
|
95
|
+
--storage.tsdb.retention.time=30d
|
|
96
|
+
--web.enable-otlp-receiver
|
|
97
|
+
--web.enable-lifecycle
|
|
98
|
+
files:
|
|
99
|
+
- config/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
|
100
|
+
- config/prometheus/rules/upright.rules.yml:/etc/prometheus/rules/upright.rules.yml
|
|
101
|
+
volumes:
|
|
102
|
+
- prometheus_data:/prometheus
|
|
103
|
+
|
|
104
|
+
alertmanager:
|
|
105
|
+
image: prom/alertmanager:v0.28.1
|
|
106
|
+
hosts:
|
|
107
|
+
- <%= app_domain %>
|
|
108
|
+
cmd: --config.file=/etc/alertmanager/alertmanager.yml
|
|
109
|
+
files:
|
|
110
|
+
- config/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
|
|
111
|
+
volumes:
|
|
112
|
+
- alertmanager_data:/alertmanager
|
|
113
|
+
|
|
114
|
+
aliases:
|
|
115
|
+
console: app exec -i "bin/rails console"
|
|
116
|
+
logs: app logs -f
|
|
117
|
+
|
|
118
|
+
retain_containers: 3
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
services:
|
|
2
|
+
prometheus:
|
|
3
|
+
image: prom/prometheus:v3.4.1
|
|
4
|
+
container_name: <%= app_name %>-prometheus
|
|
5
|
+
network_mode: host
|
|
6
|
+
volumes:
|
|
7
|
+
- ./config/prometheus/development/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
|
8
|
+
- ./config/prometheus/rules:/etc/prometheus/rules:ro
|
|
9
|
+
- prometheus_data:/prometheus
|
|
10
|
+
command:
|
|
11
|
+
- '--config.file=/etc/prometheus/prometheus.yml'
|
|
12
|
+
- '--storage.tsdb.path=/prometheus'
|
|
13
|
+
- '--web.enable-lifecycle'
|
|
14
|
+
- '--web.enable-remote-write-receiver'
|
|
15
|
+
|
|
16
|
+
alertmanager:
|
|
17
|
+
image: prom/alertmanager:v0.28.1
|
|
18
|
+
container_name: <%= app_name %>-alertmanager
|
|
19
|
+
ports:
|
|
20
|
+
- "9093:9093"
|
|
21
|
+
volumes:
|
|
22
|
+
- ./config/alertmanager/development/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
|
23
|
+
- alertmanager_data:/alertmanager
|
|
24
|
+
command:
|
|
25
|
+
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
|
26
|
+
- '--storage.path=/alertmanager'
|
|
27
|
+
|
|
28
|
+
playwright:
|
|
29
|
+
image: jacoblincool/playwright:chromium-server-1.56.1
|
|
30
|
+
container_name: <%= app_name %>-playwright
|
|
31
|
+
ports:
|
|
32
|
+
- "53333:53333"
|
|
33
|
+
environment:
|
|
34
|
+
- DEBUG=true
|
|
35
|
+
|
|
36
|
+
volumes:
|
|
37
|
+
prometheus_data:
|
|
38
|
+
alertmanager_data:
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# HTTP Probe Definitions
|
|
2
|
+
# Add your HTTP health check probes here
|
|
3
|
+
#
|
|
4
|
+
# Required fields:
|
|
5
|
+
# - name: Unique identifier for the probe
|
|
6
|
+
# - url: The URL to check
|
|
7
|
+
#
|
|
8
|
+
# Optional fields:
|
|
9
|
+
# - expected_status: HTTP status code to expect (default: 200-399)
|
|
10
|
+
# - basic_auth_credentials: Key in Rails credentials for basic auth
|
|
11
|
+
# - proxy: Key in Rails credentials for proxy settings
|
|
12
|
+
|
|
13
|
+
- name: Example Health Check
|
|
14
|
+
url: https://example.com
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# WARNING: Change the default password before deploying to production!
|
|
2
|
+
# Set the ADMIN_PASSWORD environment variable or update the credentials below.
|
|
3
|
+
|
|
4
|
+
Rails.application.config.middleware.use OmniAuth::Builder do
|
|
5
|
+
provider :static_credentials,
|
|
6
|
+
title: "Sign In",
|
|
7
|
+
credentials: { "admin" => ENV.fetch("ADMIN_PASSWORD", "upright") }
|
|
8
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
receivers:
|
|
2
|
+
otlp:
|
|
3
|
+
protocols:
|
|
4
|
+
http:
|
|
5
|
+
endpoint: 0.0.0.0:4318
|
|
6
|
+
|
|
7
|
+
processors:
|
|
8
|
+
batch:
|
|
9
|
+
timeout: 2s
|
|
10
|
+
|
|
11
|
+
exporters:
|
|
12
|
+
debug:
|
|
13
|
+
verbosity: detailed
|
|
14
|
+
|
|
15
|
+
service:
|
|
16
|
+
pipelines:
|
|
17
|
+
traces:
|
|
18
|
+
receivers: [otlp]
|
|
19
|
+
processors: [batch]
|
|
20
|
+
exporters: [debug]
|
|
21
|
+
metrics:
|
|
22
|
+
receivers: [otlp]
|
|
23
|
+
processors: [batch]
|
|
24
|
+
exporters: [debug]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# This configuration file will be evaluated by Puma. The top-level methods that
|
|
2
|
+
# are invoked here are part of Puma's configuration DSL. For more information
|
|
3
|
+
# about methods provided by the DSL, see https://puma.io/puma/Puma/DSL.html.
|
|
4
|
+
|
|
5
|
+
# Puma starts a configurable number of processes (workers) and each process
|
|
6
|
+
# serves each request in a thread from an internal thread pool.
|
|
7
|
+
#
|
|
8
|
+
# The ideal number of threads per worker depends both on how much time the
|
|
9
|
+
# application spends waiting for IO operations and on how much you wish to
|
|
10
|
+
# prioritize throughput over latency.
|
|
11
|
+
#
|
|
12
|
+
# As a rule of thumb, increasing the number of threads will increase how much
|
|
13
|
+
# traffic a given process can handle (throughput), but due to CRuby's
|
|
14
|
+
# Global VM Lock (GVL) it has diminishing returns and will degrade the
|
|
15
|
+
# response time (latency) of the application.
|
|
16
|
+
#
|
|
17
|
+
# The default is set to 3 threads as it's deemed a decent compromise between
|
|
18
|
+
# throughput and latency for the average Rails application.
|
|
19
|
+
#
|
|
20
|
+
# Any libraries that use a connection pool or another resource pool should
|
|
21
|
+
# be configured to provide at least as many connections as the number of
|
|
22
|
+
# threads. This includes Active Record's `pool` parameter in `database.yml`.
|
|
23
|
+
threads_count = ENV.fetch("RAILS_MAX_THREADS", 3)
|
|
24
|
+
threads threads_count, threads_count
|
|
25
|
+
|
|
26
|
+
# Specifies the `port` that Puma will listen on to receive requests; default is 3000.
|
|
27
|
+
port ENV.fetch("PORT", 3000)
|
|
28
|
+
|
|
29
|
+
# Allow puma to be restarted by `bin/rails restart` command.
|
|
30
|
+
plugin :tmp_restart
|
|
31
|
+
|
|
32
|
+
# Expose Prometheus metrics at http://0.0.0.0:9394/metrics
|
|
33
|
+
activate_control_app "auto", no_token: true
|
|
34
|
+
plugin :yabeda
|
|
35
|
+
plugin :yabeda_prometheus
|
|
36
|
+
prometheus_silence_logger true
|
|
37
|
+
|
|
38
|
+
# Specify the PID file. Defaults to tmp/pids/server.pid in development.
|
|
39
|
+
# In other environments, only set the PID file if requested.
|
|
40
|
+
pidfile ENV["PIDFILE"] if ENV["PIDFILE"]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Upright Site Configuration
|
|
2
|
+
# Define the geographic locations where probes run
|
|
3
|
+
# Domain is automatically inferred as {code}.{hostname}
|
|
4
|
+
|
|
5
|
+
shared:
|
|
6
|
+
sites:
|
|
7
|
+
- code: lon
|
|
8
|
+
city: London
|
|
9
|
+
country: GB
|
|
10
|
+
geohash: gcpvj0
|
|
11
|
+
|
|
12
|
+
# Multi-site example:
|
|
13
|
+
#
|
|
14
|
+
# shared:
|
|
15
|
+
# sites:
|
|
16
|
+
# - code: ams
|
|
17
|
+
# city: Amsterdam
|
|
18
|
+
# country: NL
|
|
19
|
+
# geohash: u17982
|
|
20
|
+
# provider: digitalocean
|
|
21
|
+
#
|
|
22
|
+
# - code: nyc
|
|
23
|
+
# city: New York City
|
|
24
|
+
# country: US
|
|
25
|
+
# geohash: dr5reg
|
|
26
|
+
# provider: digitalocean
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# See: https://github.com/basecamp/upright
|
|
2
|
+
|
|
3
|
+
Upright.configure do |config|
|
|
4
|
+
config.service_name = "<%= Rails.application.class.module_parent_name.underscore %>"
|
|
5
|
+
config.user_agent = "<%= Rails.application.class.module_parent_name.underscore %>/1.0"
|
|
6
|
+
config.hostname = Rails.env.local? ? "<%= Rails.application.class.module_parent_name.downcase %>.localhost" : "<%= Rails.application.class.module_parent_name.downcase %>.com"
|
|
7
|
+
|
|
8
|
+
# Playwright browser server URL
|
|
9
|
+
# config.playwright_server_url = ENV["PLAYWRIGHT_SERVER_URL"]
|
|
10
|
+
|
|
11
|
+
# OpenTelemetry endpoint
|
|
12
|
+
# config.otel_endpoint = ENV["OTEL_EXPORTER_OTLP_ENDPOINT"]
|
|
13
|
+
|
|
14
|
+
# Authentication via OpenID Connect (Logto, Keycloak, Duo, Okta, etc.)
|
|
15
|
+
# config.auth_provider = :openid_connect
|
|
16
|
+
# config.auth_options = {
|
|
17
|
+
# issuer: ENV["OIDC_ISSUER"],
|
|
18
|
+
# client_id: ENV["OIDC_CLIENT_ID"],
|
|
19
|
+
# client_secret: ENV["OIDC_CLIENT_SECRET"]
|
|
20
|
+
# }
|
|
21
|
+
end
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
# Upright Prometheus Alert Rules
|
|
2
|
+
# Generated by: rails generate upright:install
|
|
3
|
+
|
|
4
|
+
groups:
|
|
5
|
+
- name: upright_recording
|
|
6
|
+
interval: 30s
|
|
7
|
+
rules:
|
|
8
|
+
# Fraction of regions reporting DOWN (0.0 to 1.0)
|
|
9
|
+
- record: upright:probe_down_fraction
|
|
10
|
+
expr: |
|
|
11
|
+
count by (name, type, probe_target) (upright_probe_up == 0)
|
|
12
|
+
/
|
|
13
|
+
count by (name, type, probe_target) (upright_probe_up)
|
|
14
|
+
|
|
15
|
+
# Daily uptime as a fraction (0.0 to 1.0)
|
|
16
|
+
# Uptime = percentage of time in past day when majority of sites reported UP
|
|
17
|
+
- record: upright:probe_uptime_daily
|
|
18
|
+
expr: |
|
|
19
|
+
avg_over_time((
|
|
20
|
+
sum by (name, type, probe_target) (upright_probe_up)
|
|
21
|
+
/
|
|
22
|
+
count by (name, type, probe_target) (upright_probe_up)
|
|
23
|
+
> bool 0.5
|
|
24
|
+
)[1d:])
|
|
25
|
+
|
|
26
|
+
- name: upright_alerts
|
|
27
|
+
rules:
|
|
28
|
+
# Mass failure alert - fires when >20% of all probes are down in more than half of their regions
|
|
29
|
+
# When this fires, individual probe alerts are suppressed via inhibition rules
|
|
30
|
+
- alert: UprightMassFailure
|
|
31
|
+
annotations:
|
|
32
|
+
summary: "Mass failure - {{ $value | humanizePercentage }} of probes down"
|
|
33
|
+
description: "Many probes failing simultaneously, likely an Upright or network issue"
|
|
34
|
+
upright: "https://app.<%= app_domain %>/?status=fail"
|
|
35
|
+
expr: |
|
|
36
|
+
count(upright:probe_down_fraction > 0.5)
|
|
37
|
+
/
|
|
38
|
+
count(count by (name, type, probe_target) (upright_probe_up))
|
|
39
|
+
> 0.2
|
|
40
|
+
labels:
|
|
41
|
+
severity: critical
|
|
42
|
+
group: upright
|
|
43
|
+
|
|
44
|
+
# Pool has fewer active sites than peak in last hour
|
|
45
|
+
- alert: UprightPoolDegraded
|
|
46
|
+
annotations:
|
|
47
|
+
summary: "Upright pool degraded - {{ $value }} site(s) missing"
|
|
48
|
+
description: "Upright pool degraded - {{ $value }} site(s) missing"
|
|
49
|
+
upright: "https://app.<%= app_domain %>/"
|
|
50
|
+
expr: |
|
|
51
|
+
max_over_time(count(count by (site_code) (upright_probe_up))[1h:1m])
|
|
52
|
+
-
|
|
53
|
+
count(count by (site_code) (upright_probe_up))
|
|
54
|
+
> 0
|
|
55
|
+
for: 5m
|
|
56
|
+
labels:
|
|
57
|
+
severity: critical
|
|
58
|
+
group: upright
|
|
59
|
+
|
|
60
|
+
# No HTTP metrics at all - probes completely stopped
|
|
61
|
+
- alert: UprightHTTPProbesMissing
|
|
62
|
+
annotations:
|
|
63
|
+
summary: "HTTP probes missing - no metrics received"
|
|
64
|
+
description: "No HTTP probe metrics are being reported. Upright may be down or probes are stuck."
|
|
65
|
+
upright: "https://app.<%= app_domain %>/?probe_type=http"
|
|
66
|
+
expr: absent(upright_probe_up{type="http"})
|
|
67
|
+
for: 5m
|
|
68
|
+
labels:
|
|
69
|
+
severity: critical
|
|
70
|
+
group: upright
|
|
71
|
+
|
|
72
|
+
# No SMTP metrics at all - probes completely stopped
|
|
73
|
+
- alert: UprightSMTPProbesMissing
|
|
74
|
+
annotations:
|
|
75
|
+
summary: "SMTP probes missing - no metrics received"
|
|
76
|
+
description: "No SMTP probe metrics are being reported. Upright may be down or probes are stuck."
|
|
77
|
+
upright: "https://app.<%= app_domain %>/?probe_type=smtp"
|
|
78
|
+
expr: absent(upright_probe_up{type="smtp"})
|
|
79
|
+
for: 5m
|
|
80
|
+
labels:
|
|
81
|
+
severity: critical
|
|
82
|
+
group: upright
|
|
83
|
+
|
|
84
|
+
# No Playwright metrics at all - probes completely stopped
|
|
85
|
+
- alert: UprightPlaywrightProbesMissing
|
|
86
|
+
annotations:
|
|
87
|
+
summary: "Playwright probes missing - no metrics received"
|
|
88
|
+
description: "No Playwright probe metrics are being reported. Browser probes may be stuck or Upright is down."
|
|
89
|
+
upright: "https://app.<%= app_domain %>/?probe_type=playwright"
|
|
90
|
+
expr: absent(upright_probe_up{type="playwright"})
|
|
91
|
+
for: 30m
|
|
92
|
+
labels:
|
|
93
|
+
severity: critical
|
|
94
|
+
group: upright
|
|
95
|
+
|
|
96
|
+
# Site stopped reporting HTTP probes (was reporting in last hour, not now)
|
|
97
|
+
- alert: UprightHTTPSiteMissing
|
|
98
|
+
annotations:
|
|
99
|
+
summary: "HTTP probes missing from {{ $labels.site_city }}"
|
|
100
|
+
description: "{{ $labels.site_city }} ({{ $labels.site_code }}) stopped reporting HTTP probe metrics."
|
|
101
|
+
upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type=http"
|
|
102
|
+
expr: |
|
|
103
|
+
count by (site_code, site_city) (max_over_time(upright_probe_up{type="http"}[1h]))
|
|
104
|
+
unless
|
|
105
|
+
count by (site_code, site_city) (upright_probe_up{type="http"})
|
|
106
|
+
for: 5m
|
|
107
|
+
labels:
|
|
108
|
+
severity: warning
|
|
109
|
+
group: upright
|
|
110
|
+
|
|
111
|
+
# Site stopped reporting SMTP probes (was reporting in last hour, not now)
|
|
112
|
+
- alert: UprightSMTPSiteMissing
|
|
113
|
+
annotations:
|
|
114
|
+
summary: "SMTP probes missing from {{ $labels.site_city }}"
|
|
115
|
+
description: "{{ $labels.site_city }} ({{ $labels.site_code }}) stopped reporting SMTP probe metrics."
|
|
116
|
+
upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type=smtp"
|
|
117
|
+
expr: |
|
|
118
|
+
count by (site_code, site_city) (max_over_time(upright_probe_up{type="smtp"}[1h]))
|
|
119
|
+
unless
|
|
120
|
+
count by (site_code, site_city) (upright_probe_up{type="smtp"})
|
|
121
|
+
for: 5m
|
|
122
|
+
labels:
|
|
123
|
+
severity: warning
|
|
124
|
+
group: upright
|
|
125
|
+
|
|
126
|
+
# Site stopped reporting Playwright probes (was reporting in last hour, not now)
|
|
127
|
+
- alert: UprightPlaywrightSiteMissing
|
|
128
|
+
annotations:
|
|
129
|
+
summary: "Playwright probes missing from {{ $labels.site_city }}"
|
|
130
|
+
description: "{{ $labels.site_city }} ({{ $labels.site_code }}) stopped reporting Playwright probe metrics. Probes may be stuck."
|
|
131
|
+
upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type=playwright"
|
|
132
|
+
expr: |
|
|
133
|
+
count by (site_code, site_city) (max_over_time(upright_probe_up{type="playwright"}[1h]))
|
|
134
|
+
unless
|
|
135
|
+
count by (site_code, site_city) (upright_probe_up{type="playwright"})
|
|
136
|
+
for: 30m
|
|
137
|
+
labels:
|
|
138
|
+
severity: warning
|
|
139
|
+
group: upright
|
|
140
|
+
|
|
141
|
+
# HTTP probe alerts (simple endpoint monitoring)
|
|
142
|
+
- name: upright_http_alerts
|
|
143
|
+
rules:
|
|
144
|
+
- alert: UprightHTTPProbeDown
|
|
145
|
+
annotations:
|
|
146
|
+
summary: "HTTP: {{ $labels.name }} is DOWN"
|
|
147
|
+
description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
|
|
148
|
+
upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
149
|
+
expr: upright:probe_down_fraction{type="http"} > 0.5
|
|
150
|
+
labels:
|
|
151
|
+
severity: page
|
|
152
|
+
group: upright
|
|
153
|
+
|
|
154
|
+
- alert: UprightHTTPProbeDegraded
|
|
155
|
+
annotations:
|
|
156
|
+
summary: "HTTP: {{ $labels.name }} degraded - {{ $value | humanizePercentage }} of regions down"
|
|
157
|
+
description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
|
|
158
|
+
upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
159
|
+
expr: upright:probe_down_fraction{type="http"} > 0.25 and upright:probe_down_fraction{type="http"} <= 0.5
|
|
160
|
+
for: 5m
|
|
161
|
+
labels:
|
|
162
|
+
severity: warning
|
|
163
|
+
group: upright
|
|
164
|
+
|
|
165
|
+
- alert: UprightHTTPRegionalFailure
|
|
166
|
+
annotations:
|
|
167
|
+
summary: "HTTP: {{ $labels.name }} failing from {{ $labels.site_city }}"
|
|
168
|
+
description: "HTTP: {{ $labels.probe_target }} failing from {{ $labels.site_city }}"
|
|
169
|
+
upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
170
|
+
# Only fire when exactly 1 site is down (isolated regional issue)
|
|
171
|
+
expr: |
|
|
172
|
+
upright_probe_up{type="http"} == 0
|
|
173
|
+
and on (name, type, probe_target)
|
|
174
|
+
count by (name, type, probe_target) (upright_probe_up == 0) == 1
|
|
175
|
+
for: 2m
|
|
176
|
+
labels:
|
|
177
|
+
severity: warning
|
|
178
|
+
group: upright
|
|
179
|
+
|
|
180
|
+
# SMTP probe alerts (mail server monitoring)
|
|
181
|
+
- name: upright_smtp_alerts
|
|
182
|
+
rules:
|
|
183
|
+
- alert: UprightSMTPProbeDown
|
|
184
|
+
annotations:
|
|
185
|
+
summary: "SMTP: {{ $labels.name }} is DOWN"
|
|
186
|
+
description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
|
|
187
|
+
upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
188
|
+
expr: upright:probe_down_fraction{type="smtp"} > 0.5
|
|
189
|
+
labels:
|
|
190
|
+
severity: page
|
|
191
|
+
group: upright
|
|
192
|
+
|
|
193
|
+
- alert: UprightSMTPProbeDegraded
|
|
194
|
+
annotations:
|
|
195
|
+
summary: "SMTP: {{ $labels.name }} degraded - {{ $value | humanizePercentage }} of regions down"
|
|
196
|
+
description: "{{ $value | humanizePercentage }} of regions report failure on {{ $labels.probe_target }}"
|
|
197
|
+
upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
198
|
+
expr: upright:probe_down_fraction{type="smtp"} > 0.25 and upright:probe_down_fraction{type="smtp"} <= 0.5
|
|
199
|
+
for: 5m
|
|
200
|
+
labels:
|
|
201
|
+
severity: warning
|
|
202
|
+
group: upright
|
|
203
|
+
|
|
204
|
+
- alert: UprightSMTPRegionalFailure
|
|
205
|
+
annotations:
|
|
206
|
+
summary: "SMTP: {{ $labels.name }} failing from {{ $labels.site_city }}"
|
|
207
|
+
description: "SMTP: {{ $labels.probe_target }} failing from {{ $labels.site_city }}"
|
|
208
|
+
upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
209
|
+
# Only fire when exactly 1 site is down (isolated regional issue)
|
|
210
|
+
expr: |
|
|
211
|
+
upright_probe_up{type="smtp"} == 0
|
|
212
|
+
and on (name, type, probe_target)
|
|
213
|
+
count by (name, type, probe_target) (upright_probe_up == 0) == 1
|
|
214
|
+
for: 2m
|
|
215
|
+
labels:
|
|
216
|
+
severity: warning
|
|
217
|
+
group: upright
|
|
218
|
+
|
|
219
|
+
# Playwright probe alerts (browser-based login flows)
|
|
220
|
+
- name: upright_playwright_alerts
|
|
221
|
+
rules:
|
|
222
|
+
- alert: UprightPlaywrightProbeFailed
|
|
223
|
+
annotations:
|
|
224
|
+
summary: "Browser: {{ $labels.name }} probe FAILED"
|
|
225
|
+
description: "{{ $value | humanizePercentage }} of regions report failure for browser-based probe on {{ $labels.name }}"
|
|
226
|
+
upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
227
|
+
expr: upright:probe_down_fraction{type="playwright"} > 0.5
|
|
228
|
+
labels:
|
|
229
|
+
severity: page
|
|
230
|
+
group: upright
|
|
231
|
+
|
|
232
|
+
- alert: UprightPlaywrightProbeDegraded
|
|
233
|
+
annotations:
|
|
234
|
+
summary: "Browser: {{ $labels.name }} degraded - {{ $value | humanizePercentage }} of regions down"
|
|
235
|
+
description: "Not all regions report success for browser-based probe {{ $labels.name }}"
|
|
236
|
+
upright: "https://app.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
237
|
+
expr: upright:probe_down_fraction{type="playwright"} > 0.25 and upright:probe_down_fraction{type="playwright"} <= 0.5
|
|
238
|
+
for: 20m
|
|
239
|
+
labels:
|
|
240
|
+
severity: warning
|
|
241
|
+
group: upright
|
|
242
|
+
|
|
243
|
+
- alert: UprightPlaywrightRegionalFailure
|
|
244
|
+
annotations:
|
|
245
|
+
summary: "Browser: {{ $labels.name }} failing from {{ $labels.site_city }}"
|
|
246
|
+
description: "Browser: {{ $labels.name }} failing from {{ $labels.site_city }}"
|
|
247
|
+
upright: "https://{{ $labels.site_code }}.<%= app_domain %>/?probe_type={{ $labels.type }}&status=fail&probe_name={{ $labels.name | urlquery }}"
|
|
248
|
+
# Only fire when exactly 1 site is down (isolated regional issue)
|
|
249
|
+
expr: |
|
|
250
|
+
upright_probe_up{type="playwright"} == 0
|
|
251
|
+
and on (name, type, probe_target)
|
|
252
|
+
count by (name, type, probe_target) (upright_probe_up == 0) == 1
|
|
253
|
+
for: 20m
|
|
254
|
+
labels:
|
|
255
|
+
severity: warning
|
|
256
|
+
group: upright
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
module Upright
|
|
2
|
+
module Generators
|
|
3
|
+
class PlaywrightProbeGenerator < Rails::Generators::NamedBase
|
|
4
|
+
source_root File.expand_path("templates", __dir__)
|
|
5
|
+
|
|
6
|
+
class_option :with_authenticator, type: :boolean, default: false,
|
|
7
|
+
desc: "Generate an authenticator class for this probe"
|
|
8
|
+
|
|
9
|
+
def create_probe_file
|
|
10
|
+
template "probe.rb.tt", File.join("probes", "#{file_name}_probe.rb")
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
def create_authenticator_file
|
|
14
|
+
if options[:with_authenticator]
|
|
15
|
+
template "authenticator.rb.tt", File.join("probes/authenticators", "#{file_name}.rb")
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def probe_class_name
|
|
22
|
+
"#{class_name}Probe"
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def authenticator_name
|
|
26
|
+
file_name.underscore.to_sym
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class Playwright::Authenticator::<%= class_name %> < Upright::Playwright::Authenticator::Base
|
|
2
|
+
def signin_redirect_url = "https://example.com/dashboard"
|
|
3
|
+
def signin_path = "/login"
|
|
4
|
+
def service_name = :<%= file_name %>
|
|
5
|
+
|
|
6
|
+
def authenticate
|
|
7
|
+
# Your authentication logic here
|
|
8
|
+
# page.goto("https://example.com/login")
|
|
9
|
+
# page.get_by_label("Email").fill(credentials.<%= file_name %>.email)
|
|
10
|
+
# page.get_by_label("Password").fill(credentials.<%= file_name %>.password)
|
|
11
|
+
# page.get_by_role("button", name: "Sign in").click
|
|
12
|
+
raise NotImplementedError, "Implement the authenticate method for <%= class_name %>"
|
|
13
|
+
end
|
|
14
|
+
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
class Probes::Playwright::<%= probe_class_name %> < Upright::Probes::Playwright::Base
|
|
2
|
+
<% if options[:with_authenticator] -%>
|
|
3
|
+
authenticate_with_form :<%= authenticator_name %>
|
|
4
|
+
|
|
5
|
+
<% end -%>
|
|
6
|
+
def check
|
|
7
|
+
# Your probe logic here
|
|
8
|
+
# page.goto("https://example.com")
|
|
9
|
+
# page.fill('[name="email"]', "user@example.com")
|
|
10
|
+
# page.click('button[type="submit"]')
|
|
11
|
+
# page.wait_for_selector(".success")
|
|
12
|
+
raise NotImplementedError, "Implement the check method for <%= probe_class_name %>"
|
|
13
|
+
end
|
|
14
|
+
end
|