@hasna/uptime 0.1.19 → 0.1.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.md +15 -0
- package/dist/api.js +76 -5
- package/dist/cli/index.js +78 -7
- package/dist/cloud-plan.js +1 -1
- package/dist/index.js +77 -6
- package/docs/architecture.md +43 -0
- package/docs/aws-deployment-runbook.md +26 -3
- package/docs/aws-runtime-security.md +473 -0
- package/docs/cloud-source-of-truth.md +482 -0
- package/docs/deployment-metadata.example.json +52 -0
- package/docs/monitoring-product-contract.md +493 -0
- package/docs/operational-tracking.md +91 -0
- package/infra/aws/main.tf +1 -0
- package/infra/aws/terraform.tfvars.example +1 -1
- package/infra/aws/variables.tf +2 -2
- package/package.json +3 -2
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
# Monitoring Product Contract
|
|
2
|
+
|
|
3
|
+
This document defines the hosted Open Uptime product scope. It translates the
|
|
4
|
+
cloud and AWS architecture into user-visible monitoring, operator workflows,
|
|
5
|
+
data contracts, and dashboard/canvas requirements.
|
|
6
|
+
|
|
7
|
+
Open Uptime should feel closer to Pingdom than Sentry: it watches websites,
|
|
8
|
+
pages, APIs, domains, DNS, TLS, servers, private health checks, deployment
|
|
9
|
+
signals, and heartbeat jobs. It is not an exception-tracing product.
|
|
10
|
+
|
|
11
|
+
## Current Local Product
|
|
12
|
+
|
|
13
|
+
The current release is intentionally small:
|
|
14
|
+
|
|
15
|
+
- monitor kinds: `http` and `tcp`;
|
|
16
|
+
- API routes: summary, report, monitor CRUD, immediate checks, incidents, and
|
|
17
|
+
recent results;
|
|
18
|
+
- dashboard: local HTML for add/edit/check/pause/delete, summary cards, recent
|
|
19
|
+
results, and incident table;
|
|
20
|
+
- reports: immediate snapshot report through Mailery, Telephony, and Logs using
|
|
21
|
+
local/dev integration settings;
|
|
22
|
+
- availability: check-count uptime percentage, not elapsed-time SLA;
|
|
23
|
+
- persistence: local SQLite only.
|
|
24
|
+
|
|
25
|
+
Hosted mode must not expose this local surface as-is. It needs cloud auth,
|
|
26
|
+
workspace/project scope, probe separation, source imports, target policy, cloud
|
|
27
|
+
storage, evidence redaction, and richer operator workflows first.
|
|
28
|
+
|
|
29
|
+
Hard cutover gate: do not expose hosted dashboard, API, MCP-over-HTTP, report,
|
|
30
|
+
render, canvas, artifact, or check surfaces until the P0 hosted gates in this
|
|
31
|
+
document have automated tests.
|
|
32
|
+
|
|
33
|
+
## Product Objects
|
|
34
|
+
|
|
35
|
+
Cloud Open Uptime owns these product objects:
|
|
36
|
+
|
|
37
|
+
- `workspace`: auth and isolation boundary.
|
|
38
|
+
- `asset`: monitorable thing imported from Projects, Servers, Domains,
|
|
39
|
+
Deployment, or created manually.
|
|
40
|
+
- `monitor`: policy for checking an asset or target.
|
|
41
|
+
- `check_job`: scheduled work item for a probe.
|
|
42
|
+
- `check_result`: immutable result for one monitor version and schedule slot.
|
|
43
|
+
- `probe`: public or private worker identity and capabilities.
|
|
44
|
+
- `incident`: duration-based downtime or degradation record.
|
|
45
|
+
- `maintenance_window`: planned suppression or expected outage period.
|
|
46
|
+
- `browser_evidence`: redacted artifact refs for browser/page checks.
|
|
47
|
+
- `report_schedule`: recurring SLA/operator report definition.
|
|
48
|
+
- `report_run`: generated report artifact and delivery state.
|
|
49
|
+
- `notification_policy`: escalation, dedupe, silence, and delivery routing.
|
|
50
|
+
- `import_batch`: preview/apply/rollback record for inventory imports.
|
|
51
|
+
- `dashboard_view`: saved fleet/dashboard filter or JSON Render view.
|
|
52
|
+
- `canvas`: React Flow project canvas backed by Projects cloud stores.
|
|
53
|
+
|
|
54
|
+
Every object is workspace-scoped and carries ownership/provenance/audit metadata.
|
|
55
|
+
|
|
56
|
+
High-cardinality objects such as `check_result`, `browser_evidence`,
|
|
57
|
+
`audit_events`, and report artifacts require retention and partitioning/index
|
|
58
|
+
rules before production. Fleet and report pages must have query plans or load
|
|
59
|
+
tests for the expected monitor count, interval mix, and retention window.
|
|
60
|
+
|
|
61
|
+
## Monitor Kinds
|
|
62
|
+
|
|
63
|
+
Initial hosted monitor kinds:
|
|
64
|
+
|
|
65
|
+
| Kind | Purpose | Source |
|
|
66
|
+
| --- | --- | --- |
|
|
67
|
+
| `http` | Website/API availability, status, latency, redirects, headers, body, JSON assertions. | Manual, Projects, Domains, Deployment |
|
|
68
|
+
| `browser_page` | Page load, console errors, uncaught exceptions, failed resources, screenshot/trace evidence. | Manual, Projects, Domains, Deployment |
|
|
69
|
+
| `tcp` | Public or private port connect. | Manual for public, Servers/Deployment for private |
|
|
70
|
+
| `server_health` | Private server health URL/port from server inventory. | Servers only |
|
|
71
|
+
| `dns` | A/AAAA/CNAME/MX/TXT/NS checks, authoritative/recursive drift. | Domains |
|
|
72
|
+
| `tls` | Certificate expiry, hostname match, chain validity. | Domains, HTTP assets |
|
|
73
|
+
| `domain_expiry` | Domain/RDAP expiry threshold. | Domains |
|
|
74
|
+
| `deployment` | Latest environment URL/resource status and rollback/failure signal. | Deployment |
|
|
75
|
+
| `heartbeat` | Job/service check-in before a deadline. | Manual/API |
|
|
76
|
+
| `report_delivery` | Scheduled report generation/delivery health. | Open Uptime internal |
|
|
77
|
+
|
|
78
|
+
Each kind needs:
|
|
79
|
+
|
|
80
|
+
- config schema;
|
|
81
|
+
- assertion schema;
|
|
82
|
+
- normalized result schema;
|
|
83
|
+
- failure reason taxonomy;
|
|
84
|
+
- allowed probe classes;
|
|
85
|
+
- default interval, timeout, retry, and down policy;
|
|
86
|
+
- CLI/API/MCP/SDK representation;
|
|
87
|
+
- JSON Render summary/detail specs;
|
|
88
|
+
- dashboard creation/editing flow;
|
|
89
|
+
- tests for success, failure, validation, and redaction.
|
|
90
|
+
|
|
91
|
+
Monitor kinds are enabled by feature flag. `http` and `tcp` can be the first
|
|
92
|
+
cloud-safe kinds after the hosted core is in place. `browser_page`, private
|
|
93
|
+
`server_health`, hosted report delivery, and broader deployment/import monitors
|
|
94
|
+
remain disabled until their specific acceptance gates pass.
|
|
95
|
+
|
|
96
|
+
## Target And Probe Rules
|
|
97
|
+
|
|
98
|
+
Public monitors can be manually created only when the target policy classifies
|
|
99
|
+
the target as public and safe.
|
|
100
|
+
|
|
101
|
+
Private monitors must come from approved inventory refs and run only on private
|
|
102
|
+
probes. Hosted forms must not allow an operator to type arbitrary private IPs,
|
|
103
|
+
private DNS names, metadata endpoints, or loopback addresses as private monitor
|
|
104
|
+
targets.
|
|
105
|
+
|
|
106
|
+
Probe selection rules:
|
|
107
|
+
|
|
108
|
+
- public checks run on public probes only;
|
|
109
|
+
- private checks run on private probes only;
|
|
110
|
+
- private probe identity is bound to workspace, machine id, source inventories,
|
|
111
|
+
capabilities, and trust class;
|
|
112
|
+
- each result stores probe id, probe class, probe location, monitor version, and
|
|
113
|
+
schedule slot;
|
|
114
|
+
- down policy defines whether one location, quorum, or authoritative private
|
|
115
|
+
result opens an incident;
|
|
116
|
+
- stale or duplicate result submissions cannot close or reopen incidents.
|
|
117
|
+
|
|
118
|
+
Job lease semantics:
|
|
119
|
+
|
|
120
|
+
- deterministic job identity is
|
|
121
|
+
`workspace_id/monitor_id/monitor_version/schedule_slot/probe_policy`;
|
|
122
|
+
- scheduler inserts jobs idempotently and has a bounded catch-up window;
|
|
123
|
+
- duplicate scheduler instances cannot create duplicate authoritative jobs;
|
|
124
|
+
- probes claim jobs transactionally with `lease_expires_at` and a fencing token;
|
|
125
|
+
- result ingest requires the active fencing token, probe identity, schedule slot,
|
|
126
|
+
monitor version, and idempotency key;
|
|
127
|
+
- stale fencing tokens and duplicate submissions are rejected or marked
|
|
128
|
+
duplicate without mutating incidents;
|
|
129
|
+
- deploy drain pauses new claims and lets in-flight leases expire or finish;
|
|
130
|
+
- alarms fire for stale leases and backlog.
|
|
131
|
+
|
|
132
|
+
## Inventory Import Workflow
|
|
133
|
+
|
|
134
|
+
Imports are preview/apply workflows.
|
|
135
|
+
|
|
136
|
+
Preview sources:
|
|
137
|
+
|
|
138
|
+
- Projects: project identity, status, owner, stage, priority, GitHub refs,
|
|
139
|
+
service metadata, project stores, canvases, and render specs.
|
|
140
|
+
- Servers: hostname, health URL, ports, Tailscale fields, project refs, and
|
|
141
|
+
readiness snapshots for private health monitors.
|
|
142
|
+
- Domains: domain records, DNS records, SSL/TLS expiry, domain expiry, root
|
|
143
|
+
HTTP checks, and discovered page candidates.
|
|
144
|
+
- Deployment: environment URLs, provider/resource status, live deployment refs,
|
|
145
|
+
region, deployment failures, and rollback signals.
|
|
146
|
+
|
|
147
|
+
Source classification:
|
|
148
|
+
|
|
149
|
+
- `cloud-capable`: source has an authenticated cloud API/store with versions,
|
|
150
|
+
tombstones, and safe refs;
|
|
151
|
+
- `preview-only`: source can produce safe candidates but apply is disabled;
|
|
152
|
+
- `link-only`: source can be referenced by id/snapshot but not treated as an
|
|
153
|
+
authoritative hosted dependency;
|
|
154
|
+
- `blocked`: source cannot be used until its owner fixes auth, secrets,
|
|
155
|
+
tombstones, or data quality.
|
|
156
|
+
|
|
157
|
+
Initial classification:
|
|
158
|
+
|
|
159
|
+
- Projects: preview/link identity metadata is feasible; cloud-primary canvases
|
|
160
|
+
require Projects cloud-backed per-project stores and local-path stripping.
|
|
161
|
+
- Servers: preview approved inventory candidates; private apply waits for
|
|
162
|
+
private probe trust and source refs.
|
|
163
|
+
- Domains: preview DNS/TLS/domain-expiry/root HTTP; avoid copying ownership PII,
|
|
164
|
+
raw WHOIS/history, or treating hard deletes as tombstones.
|
|
165
|
+
- Deployment: link-only until authenticated, secret-ref-only, and safe for hosted
|
|
166
|
+
consumption.
|
|
167
|
+
|
|
168
|
+
Preview must show:
|
|
169
|
+
|
|
170
|
+
- proposed asset and monitor rows;
|
|
171
|
+
- source provenance and freshness;
|
|
172
|
+
- target policy decision;
|
|
173
|
+
- dedupe/conflict action: create, update, unchanged, stale, blocked, conflict;
|
|
174
|
+
- owner/team/environment/tag mapping;
|
|
175
|
+
- warnings and required approvals;
|
|
176
|
+
- no secret values and no raw local paths in hosted render payloads.
|
|
177
|
+
|
|
178
|
+
Apply must:
|
|
179
|
+
|
|
180
|
+
- create/update assets and monitors idempotently;
|
|
181
|
+
- store provenance snapshots and import-batch audit events;
|
|
182
|
+
- allow rollback of newly created monitor config from a batch;
|
|
183
|
+
- preserve historical results and incidents;
|
|
184
|
+
- mark stale sources without deleting history automatically;
|
|
185
|
+
- rollback creates, updates, stale markings, provenance changes, and conflict
|
|
186
|
+
decisions from before/after snapshots, while preserving historical results and
|
|
187
|
+
incidents.
|
|
188
|
+
|
|
189
|
+
Preview/apply parity is required across API, CLI, MCP, and SDK for agent use.
|
|
190
|
+
|
|
191
|
+
## Browser/Page Monitoring
|
|
192
|
+
|
|
193
|
+
`browser_page` is hosted-only and remains disabled until the evidence pipeline is
|
|
194
|
+
implemented.
|
|
195
|
+
|
|
196
|
+
Checks capture:
|
|
197
|
+
|
|
198
|
+
- navigation status and final URL;
|
|
199
|
+
- load timing and Core Web Vitals-lite metrics;
|
|
200
|
+
- console errors matching policy;
|
|
201
|
+
- uncaught page exceptions;
|
|
202
|
+
- failed script/image/API resources;
|
|
203
|
+
- mixed content or blocked resources where available;
|
|
204
|
+
- screenshot and optional trace/network artifact refs;
|
|
205
|
+
- DOM assertions where configured.
|
|
206
|
+
|
|
207
|
+
Evidence handling:
|
|
208
|
+
|
|
209
|
+
- redact before persistence;
|
|
210
|
+
- mask configured selectors/regions;
|
|
211
|
+
- scrub cookies, auth headers, tokens, query secrets, storage values, form
|
|
212
|
+
values, console payloads, and network payloads;
|
|
213
|
+
- store artifact refs, checksums, sizes, redaction status, and retention class;
|
|
214
|
+
- expose signed URLs only after workspace/artifact authorization;
|
|
215
|
+
- default retention is short;
|
|
216
|
+
- feature flag remains off until Playwright/container smoke and load tests,
|
|
217
|
+
redaction fail-closed tests, S3 signed URL auth tests, and budget alarms pass.
|
|
218
|
+
|
|
219
|
+
Browser result grouping:
|
|
220
|
+
|
|
221
|
+
- console/page/network errors get grouping keys by monitor, URL pattern, error
|
|
222
|
+
type, normalized message, source file/resource host, and stack signature where
|
|
223
|
+
safe;
|
|
224
|
+
- repeated failures update incident timeline and evidence count instead of
|
|
225
|
+
creating noisy duplicate incidents.
|
|
226
|
+
|
|
227
|
+
## Incident Workflow
|
|
228
|
+
|
|
229
|
+
Incident states:
|
|
230
|
+
|
|
231
|
+
- `open`
|
|
232
|
+
- `acknowledged`
|
|
233
|
+
- `silenced`
|
|
234
|
+
- `maintenance`
|
|
235
|
+
- `resolved`
|
|
236
|
+
- `closed`
|
|
237
|
+
- `reopened`
|
|
238
|
+
|
|
239
|
+
State transitions:
|
|
240
|
+
|
|
241
|
+
- checks can open incidents after monitor down policy is satisfied;
|
|
242
|
+
- checks can auto-resolve incidents only when recovery policy is satisfied and
|
|
243
|
+
the result is from an authoritative schedule slot/probe;
|
|
244
|
+
- operators can acknowledge, unacknowledge, silence, unsilence, assign, comment,
|
|
245
|
+
attach evidence, create maintenance, manually close, and reopen;
|
|
246
|
+
- maintenance suppresses notifications and SLA impact according to report policy
|
|
247
|
+
but does not delete check results;
|
|
248
|
+
- stale probe results and duplicate jobs cannot change incident state.
|
|
249
|
+
|
|
250
|
+
The implementation must publish a state-machine table covering allowed
|
|
251
|
+
transitions, actor/source of transition, required reason, audit action,
|
|
252
|
+
notification behavior, report impact, and reversal behavior. Tests must cover
|
|
253
|
+
auto-open, auto-resolve, ack, silence, maintenance, manual close, reopen,
|
|
254
|
+
assignment, comments, evidence attachment, notification dedupe, and stale-result
|
|
255
|
+
rejection.
|
|
256
|
+
|
|
257
|
+
Incident detail must show:
|
|
258
|
+
|
|
259
|
+
- affected asset and monitor;
|
|
260
|
+
- status, severity, owner/team, assignee;
|
|
261
|
+
- source inventory refs;
|
|
262
|
+
- timeline events;
|
|
263
|
+
- check results and probe identity;
|
|
264
|
+
- notification attempts and suppressions;
|
|
265
|
+
- evidence artifacts;
|
|
266
|
+
- related incidents;
|
|
267
|
+
- SLA impact for selected windows;
|
|
268
|
+
- audit trail for operator actions.
|
|
269
|
+
|
|
270
|
+
## Notifications And Reports
|
|
271
|
+
|
|
272
|
+
Notifications:
|
|
273
|
+
|
|
274
|
+
- use workspace-authorized channel refs only;
|
|
275
|
+
- support email, SMS/phone, Open Logs, and future webhooks through service-owned
|
|
276
|
+
refs;
|
|
277
|
+
- support dedupe, escalation, silence, maintenance suppression, retry/backoff,
|
|
278
|
+
and failure alarms;
|
|
279
|
+
- mask private targets unless the channel is authorized for that target class.
|
|
280
|
+
|
|
281
|
+
Reports:
|
|
282
|
+
|
|
283
|
+
- scheduled SLA/operator reports, not only immediate snapshots;
|
|
284
|
+
- duration-based availability by time window;
|
|
285
|
+
- timezone and business-hour support;
|
|
286
|
+
- maintenance exclusion policy;
|
|
287
|
+
- filters by owner, team, environment, project, source, monitor kind, severity,
|
|
288
|
+
and incident state;
|
|
289
|
+
- generated JSON and HTML artifacts;
|
|
290
|
+
- delivery attempts with idempotency and retry state;
|
|
291
|
+
- report-run monitor to detect stuck or failed reports.
|
|
292
|
+
|
|
293
|
+
Report run state machine:
|
|
294
|
+
|
|
295
|
+
- `scheduled`
|
|
296
|
+
- `generating`
|
|
297
|
+
- `generated`
|
|
298
|
+
- `delivering`
|
|
299
|
+
- `delivered`
|
|
300
|
+
- `partially_failed`
|
|
301
|
+
- `failed`
|
|
302
|
+
- `cancelled`
|
|
303
|
+
|
|
304
|
+
Report acceptance needs DST/timezone golden tests, business-hour window tests,
|
|
305
|
+
recipient/channel authorization recheck at send time, retry exhaustion behavior,
|
|
306
|
+
idempotent delivery keys, redacted artifacts in S3, reporter-specific IAM, and
|
|
307
|
+
alarms for stuck or failed runs.
|
|
308
|
+
|
|
309
|
+
The local direct `apiUrl`/key style can remain for local development. Hosted
|
|
310
|
+
report APIs reject raw URLs, keys, arbitrary recipients, and arbitrary Logs
|
|
311
|
+
project ids.
|
|
312
|
+
|
|
313
|
+
## Dashboard Views
|
|
314
|
+
|
|
315
|
+
Hosted UI is a work-focused operator app, not a marketing surface.
|
|
316
|
+
|
|
317
|
+
Required views:
|
|
318
|
+
|
|
319
|
+
- fleet overview: owner/environment/source/probe health, open incidents,
|
|
320
|
+
stale monitors, muted/maintenance items, report status;
|
|
321
|
+
- monitor list: filters, saved views, bulk actions, source provenance,
|
|
322
|
+
current status, SLA, latency, incident count, probe policy;
|
|
323
|
+
- monitor detail: config, assertions, target policy decision, recent results,
|
|
324
|
+
incidents, evidence, timeline, source refs, audit;
|
|
325
|
+
- incident queue: severity, state, assignee, owner/team, duration, affected
|
|
326
|
+
assets, silence/maintenance state, notification state;
|
|
327
|
+
- incident detail: full operator timeline and action panel;
|
|
328
|
+
- import preview/apply: candidate diffs, warnings, approvals, apply progress,
|
|
329
|
+
rollback record;
|
|
330
|
+
- probe management: public/private probes, heartbeats, version drift,
|
|
331
|
+
capabilities, assigned jobs, failures, revocation;
|
|
332
|
+
- browser errors: grouped console/page/resource failures, screenshots/traces,
|
|
333
|
+
retention and redaction status;
|
|
334
|
+
- reports: schedules, report runs, generated artifacts, delivery attempts,
|
|
335
|
+
retry failures;
|
|
336
|
+
- settings: workspaces, roles/scopes, channel refs, policies, retention,
|
|
337
|
+
maintenance windows.
|
|
338
|
+
|
|
339
|
+
All views need authenticated empty, loading, error, stale-data, and partial-data
|
|
340
|
+
states. Fleet pages need explicit freshness timestamps and pagination or
|
|
341
|
+
virtualized tables for scale.
|
|
342
|
+
|
|
343
|
+
User-facing dashboard acceptance requires:
|
|
344
|
+
|
|
345
|
+
- fleet overview, monitor detail, incident detail, import preview, probe health,
|
|
346
|
+
browser errors, report schedules/runs, settings, and project canvas embedding;
|
|
347
|
+
- RBAC-aware action visibility;
|
|
348
|
+
- freshness indicators on every live data panel;
|
|
349
|
+
- no local SQLite fallback in hosted dashboards;
|
|
350
|
+
- redaction based on viewer authorization.
|
|
351
|
+
|
|
352
|
+
## JSON Render And React Flow
|
|
353
|
+
|
|
354
|
+
Open Uptime exposes JSON Render specs for operator surfaces:
|
|
355
|
+
|
|
356
|
+
- `uptime.fleet`
|
|
357
|
+
- `uptime.monitor`
|
|
358
|
+
- `uptime.incident`
|
|
359
|
+
- `uptime.import_preview`
|
|
360
|
+
- `uptime.probe_health`
|
|
361
|
+
- `uptime.browser_errors`
|
|
362
|
+
- `uptime.report_schedule`
|
|
363
|
+
- `uptime.report_run`
|
|
364
|
+
- `uptime.canvas_node`
|
|
365
|
+
|
|
366
|
+
Projects owns canvas storage. Open Uptime owns render specs and dashboard query
|
|
367
|
+
payloads that can be embedded in project canvases.
|
|
368
|
+
|
|
369
|
+
Canvas node types:
|
|
370
|
+
|
|
371
|
+
- fleet summary node;
|
|
372
|
+
- monitor status node;
|
|
373
|
+
- incident queue node;
|
|
374
|
+
- browser evidence node;
|
|
375
|
+
- probe health node;
|
|
376
|
+
- import batch node;
|
|
377
|
+
- report run node;
|
|
378
|
+
- source inventory node.
|
|
379
|
+
|
|
380
|
+
Canvas/render requirements:
|
|
381
|
+
|
|
382
|
+
- specs are versioned and validated;
|
|
383
|
+
- nodes link to workspace/project/monitor/incident/probe/report/import ids;
|
|
384
|
+
- no raw local paths or secret values in hosted payloads;
|
|
385
|
+
- private target labels are redacted by viewer authorization;
|
|
386
|
+
- React Flow nodes support drill-in links to hosted views;
|
|
387
|
+
- multiple canvases per project are supported through Projects cloud stores.
|
|
388
|
+
|
|
389
|
+
## API Surface
|
|
390
|
+
|
|
391
|
+
Target hosted API namespace: `/api/v1`.
|
|
392
|
+
|
|
393
|
+
Hosted mode uses `/api/v1` as the canonical API. Legacy local `/api/*` routes
|
|
394
|
+
are either local-only or explicit translations that still enforce the hosted
|
|
395
|
+
route-to-scope matrix. Tests must inventory every hosted route and prove 401,
|
|
396
|
+
403, and cross-workspace denial behavior.
|
|
397
|
+
|
|
398
|
+
Required groups:
|
|
399
|
+
|
|
400
|
+
- `/assets`
|
|
401
|
+
- `/monitors`
|
|
402
|
+
- `/monitor-kinds`
|
|
403
|
+
- `/checks/jobs`
|
|
404
|
+
- `/checks/results`
|
|
405
|
+
- `/probes`
|
|
406
|
+
- `/incidents`
|
|
407
|
+
- `/incidents/:id/actions`
|
|
408
|
+
- `/maintenance-windows`
|
|
409
|
+
- `/imports/preview`
|
|
410
|
+
- `/imports/apply`
|
|
411
|
+
- `/imports/:id/rollback`
|
|
412
|
+
- `/browser/errors`
|
|
413
|
+
- `/evidence`
|
|
414
|
+
- `/reports/schedules`
|
|
415
|
+
- `/reports/runs`
|
|
416
|
+
- `/notifications/policies`
|
|
417
|
+
- `/render/*`
|
|
418
|
+
- `/canvases/*` for links/projections, with Projects owning persistence
|
|
419
|
+
|
|
420
|
+
MCP and SDK surfaces must mirror the same product operations. CLI commands can
|
|
421
|
+
lag hosted UI breadth, but must expose safe dry-run/import/report/probe/admin
|
|
422
|
+
operations needed for operators and agents.
|
|
423
|
+
|
|
424
|
+
## First Hosted Milestones
|
|
425
|
+
|
|
426
|
+
Milestone 1: cloud-safe core
|
|
427
|
+
|
|
428
|
+
- hosted auth/RBAC;
|
|
429
|
+
- Postgres store and migrations;
|
|
430
|
+
- target policy;
|
|
431
|
+
- HTTP/TCP monitor parity through jobs/probes;
|
|
432
|
+
- import preview from Projects/Domains/Servers/Deployment in dry-run mode;
|
|
433
|
+
- incident queue with ack/silence/maintenance/comment;
|
|
434
|
+
- scheduled report data model without external delivery;
|
|
435
|
+
- JSON Render fleet/monitor/incident specs.
|
|
436
|
+
|
|
437
|
+
Milestone 1 explicitly excludes hosted browser checks, private server-health
|
|
438
|
+
apply, report delivery to external channels, deployment apply, and cloud-primary
|
|
439
|
+
project canvases. It can include safe dry-run previews and link-only refs.
|
|
440
|
+
|
|
441
|
+
Milestone 2: imports and private probes
|
|
442
|
+
|
|
443
|
+
- import apply/rollback;
|
|
444
|
+
- private probe enrollment for an operator machine;
|
|
445
|
+
- server health checks from Open Servers;
|
|
446
|
+
- DNS/TLS/domain expiry checks from Open Domains;
|
|
447
|
+
- report delivery through authorized Mailery/Telephony/Logs refs;
|
|
448
|
+
- project canvas embedding through cloud-backed Projects stores.
|
|
449
|
+
|
|
450
|
+
Milestone 3: browser/page monitoring
|
|
451
|
+
|
|
452
|
+
- Playwright/container runtime;
|
|
453
|
+
- redaction/artifact pipeline;
|
|
454
|
+
- browser page check kind;
|
|
455
|
+
- grouped page errors;
|
|
456
|
+
- screenshot/trace evidence;
|
|
457
|
+
- retention and cost controls.
|
|
458
|
+
|
|
459
|
+
Milestone 4: broader probe fleet and polish
|
|
460
|
+
|
|
461
|
+
- additional public locations;
|
|
462
|
+
- quorum/down policies;
|
|
463
|
+
- richer dashboards and saved views;
|
|
464
|
+
- deployment monitors;
|
|
465
|
+
- self-monitoring and SLA reporting polish.
|
|
466
|
+
|
|
467
|
+
## Acceptance Criteria
|
|
468
|
+
|
|
469
|
+
- Hosted mode exposes no monitor data without auth.
|
|
470
|
+
- Hosted mode has one canonical `/api/v1` route map and route-to-scope tests.
|
|
471
|
+
- Monitor kind schemas and result schemas exist for every enabled kind.
|
|
472
|
+
- Public probes cannot reach denied target classes; private monitors require
|
|
473
|
+
inventory provenance and private probe authorization.
|
|
474
|
+
- Check jobs have deterministic identity, transactional leases, fencing tokens,
|
|
475
|
+
duplicate/stale rejection, and scheduler/probe race tests.
|
|
476
|
+
- Import preview/apply is idempotent and never copies secrets or raw local paths.
|
|
477
|
+
- Import rollback covers creates, updates, stale markings, provenance changes,
|
|
478
|
+
and conflict decisions from before/after snapshots.
|
|
479
|
+
- Incidents have tested state transitions and cannot be changed by stale/duplicate
|
|
480
|
+
results.
|
|
481
|
+
- Reports use duration-based SLA windows and authorized channel refs.
|
|
482
|
+
- Report runs have a durable state machine, retry/idempotency behavior, timezone
|
|
483
|
+
tests, authorization recheck, and failure alarms.
|
|
484
|
+
- Browser checks are disabled until redacted S3 evidence and signed URL controls
|
|
485
|
+
are implemented.
|
|
486
|
+
- High-cardinality tables have partition, index, retention, and query/load-test
|
|
487
|
+
acceptance.
|
|
488
|
+
- Dashboards cover fleet, monitor detail, incident detail, imports, probes,
|
|
489
|
+
browser errors, reports, and canvases with proper empty/error/stale states.
|
|
490
|
+
- JSON Render specs validate and can be embedded into Projects React Flow
|
|
491
|
+
canvases without leaking private targets or secrets.
|
|
492
|
+
- Feature flags keep browser checks, private probes, hosted report delivery, and
|
|
493
|
+
cloud-primary canvases disabled until their acceptance gates pass.
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Open Uptime Operational Tracking
|
|
2
|
+
|
|
3
|
+
This document describes the public operational-tracking contract for hosted Open
|
|
4
|
+
Uptime work. Deployment-specific goal ids, task ids, local machine names, local
|
|
5
|
+
workspace paths, and private cloud-account details belong in private deployment
|
|
6
|
+
metadata, not in this repository.
|
|
7
|
+
|
|
8
|
+
## Public Ledger
|
|
9
|
+
|
|
10
|
+
Use these repository documents as the public source for hosted design and
|
|
11
|
+
acceptance criteria:
|
|
12
|
+
|
|
13
|
+
- `docs/cloud-source-of-truth.md`
|
|
14
|
+
- `docs/aws-runtime-security.md`
|
|
15
|
+
- `docs/monitoring-product-contract.md`
|
|
16
|
+
- `docs/aws-deployment-runbook.md`
|
|
17
|
+
|
|
18
|
+
Private deployment metadata may record exact task ids, reviewer ids, AWS account
|
|
19
|
+
ids, backend state keys, and machine-local paths. Keep that metadata outside the
|
|
20
|
+
package and outside public docs.
|
|
21
|
+
|
|
22
|
+
## Cloud-Primary Status
|
|
23
|
+
|
|
24
|
+
Local CLI records and local project databases are not cloud authority. Hosted
|
|
25
|
+
Open Uptime must treat cloud-backed state as authoritative only after the cloud
|
|
26
|
+
store, auth, leases, and probe enrollment gates pass.
|
|
27
|
+
|
|
28
|
+
Known public blockers:
|
|
29
|
+
|
|
30
|
+
- Projects, knowledge, notes, mementos, and todos may still be local-first in a
|
|
31
|
+
development environment.
|
|
32
|
+
- The first AWS bridge uses explicit EFS-backed SQLite for a single protected web
|
|
33
|
+
task. The target cloud-primary store is still Postgres plus object storage.
|
|
34
|
+
- Private probe operator machines are not cloud-primary by virtue of local filesystem
|
|
35
|
+
status. Any primary/operator status must be represented as a time-limited
|
|
36
|
+
cloud lease.
|
|
37
|
+
|
|
38
|
+
## Hard Hosted Gate
|
|
39
|
+
|
|
40
|
+
Do not expose hosted dashboard, API, MCP, report delivery, JSON Render/canvas
|
|
41
|
+
specs, artifacts, browser evidence, or check execution until these are tested:
|
|
42
|
+
|
|
43
|
+
- hosted auth/RBAC and workspace scoping
|
|
44
|
+
- explicit EFS-backed hosted SQLite for the first deploy, followed by Postgres
|
|
45
|
+
persistence with migrations, tombstones, audit, and no hidden local fallback
|
|
46
|
+
- shared target policy with SSRF protections
|
|
47
|
+
- scheduler `check_jobs` and probe lease fencing
|
|
48
|
+
- report delivery through open-mailery/open-telephony/open-logs channel refs
|
|
49
|
+
- JSON Render and canvas redaction
|
|
50
|
+
- browser evidence isolation
|
|
51
|
+
- private probe identity, revocation, and isolation
|
|
52
|
+
- AWS IAM/RDS/S3/ALB/ECS runtime boundaries
|
|
53
|
+
- Apache-2.0 OSS release readiness
|
|
54
|
+
- independent adversarial review lanes
|
|
55
|
+
|
|
56
|
+
## Idempotency Keys
|
|
57
|
+
|
|
58
|
+
Use stable idempotency keys for delegated or replayable work:
|
|
59
|
+
|
|
60
|
+
- `open-uptime:hosted-exposure-guard:v1`
|
|
61
|
+
- `open-uptime:hosted-auth-rbac:v1`
|
|
62
|
+
- `open-uptime:target-policy:v1`
|
|
63
|
+
- `open-uptime:postgres-cloud-store:v1`
|
|
64
|
+
- `open-uptime:check-jobs-probe-lease:v1`
|
|
65
|
+
- `open-uptime:monitor-schemas:v1`
|
|
66
|
+
- `open-uptime:inventory-import:v1`
|
|
67
|
+
- `open-uptime:incident-workflow:v1`
|
|
68
|
+
- `open-uptime:report-delivery:v1`
|
|
69
|
+
- `open-uptime:json-render-canvases:v1`
|
|
70
|
+
- `open-uptime:aws-runtime:v1`
|
|
71
|
+
- `open-uptime:private-probe:v1`
|
|
72
|
+
- `open-uptime:release-validation:v1`
|
|
73
|
+
|
|
74
|
+
## Validation Commands
|
|
75
|
+
|
|
76
|
+
Use these baseline checks before resuming implementation:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
git status --short
|
|
80
|
+
bun run build
|
|
81
|
+
bun run typecheck
|
|
82
|
+
bun test
|
|
83
|
+
terraform -chdir=infra/aws fmt -check -recursive
|
|
84
|
+
terraform -chdir=infra/aws validate
|
|
85
|
+
uptime --version
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
AWS plan, apply, and smoke tests should run only from an approved private
|
|
89
|
+
deployment root or infrastructure repository with remote state, locking, reviewed
|
|
90
|
+
KMS/secrets, budget recipients, rollback instructions, and no plaintext secret
|
|
91
|
+
values in Terraform state.
|
package/infra/aws/main.tf
CHANGED
|
@@ -1153,6 +1153,7 @@ resource "aws_ecs_task_definition" "service" {
|
|
|
1153
1153
|
}] : []
|
|
1154
1154
|
environment = concat([
|
|
1155
1155
|
{ name = "HASNA_UPTIME_MODE", value = "hosted" },
|
|
1156
|
+
{ name = "HASNA_UPTIME_HOSTED_AUTH_MODE", value = "production" },
|
|
1156
1157
|
{ name = "HASNA_UPTIME_WORKSPACE_ID", value = var.workspace_id },
|
|
1157
1158
|
{ name = "HASNA_UPTIME_COMPONENT", value = each.key },
|
|
1158
1159
|
{ name = "HASNA_UPTIME_HOSTNAME", value = var.hostname },
|
|
@@ -19,7 +19,7 @@ alb_ingress_cidr_blocks = []
|
|
|
19
19
|
private_subnet_ids = ["subnet-replace-private-a", "subnet-replace-private-b"]
|
|
20
20
|
private_route_table_ids = ["rtb-replace-private"]
|
|
21
21
|
container_image = "123456789012.dkr.ecr.us-east-1.amazonaws.com/open-uptime@sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
|
22
|
-
runtime_package_version = "0.1.
|
|
22
|
+
runtime_package_version = "0.1.21"
|
|
23
23
|
certificate_arn = null
|
|
24
24
|
hosted_zone_id = null
|
|
25
25
|
app_env_secret_arn = "arn:aws:secretsmanager:us-east-1:123456789012:secret:open-uptime/prod/app/env"
|
package/infra/aws/variables.tf
CHANGED
|
@@ -201,7 +201,7 @@ variable "container_image" {
|
|
|
201
201
|
variable "runtime_package_version" {
|
|
202
202
|
description = "Published @hasna/uptime package version that CodeBuild should build into the ECR image."
|
|
203
203
|
type = string
|
|
204
|
-
default = "0.1.
|
|
204
|
+
default = "0.1.21"
|
|
205
205
|
|
|
206
206
|
validation {
|
|
207
207
|
condition = can(regex("^[0-9]+\\.[0-9]+\\.[0-9]+(-[0-9A-Za-z.-]+)?$", var.runtime_package_version))
|
|
@@ -242,7 +242,7 @@ variable "app_env_secret_arn" {
|
|
|
242
242
|
}
|
|
243
243
|
|
|
244
244
|
variable "hosted_token_secret_arn" {
|
|
245
|
-
description = "Secrets Manager/SSM ARN
|
|
245
|
+
description = "Secrets Manager/SSM ARN injected as HASNA_UPTIME_HOSTED_TOKEN. Hosted deployments should store scoped hosted-token JSON descriptors here, not a single broad raw token."
|
|
246
246
|
type = string
|
|
247
247
|
|
|
248
248
|
validation {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hasna/uptime",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.21",
|
|
4
4
|
"description": "Local-first uptime and downtime monitoring service with CLI, MCP, SDK, SQLite persistence, and a dashboard.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"type": "module",
|
|
@@ -30,7 +30,8 @@
|
|
|
30
30
|
"infra/aws/.terraform.lock.hcl",
|
|
31
31
|
"infra/aws/*.tf",
|
|
32
32
|
"infra/aws/terraform.tfvars.example",
|
|
33
|
-
"docs
|
|
33
|
+
"docs/*.md",
|
|
34
|
+
"docs/*.json",
|
|
34
35
|
"CHANGELOG.md",
|
|
35
36
|
"LICENSE",
|
|
36
37
|
"NOTICE",
|