@hasna/uptime 0.1.20 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -6,6 +6,28 @@ project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
6
 
7
7
  ## [Unreleased]
8
8
 
9
+ ## [0.1.22] - 2026-06-28
10
+
11
+ ### Fixed
12
+
13
+ - Clarified production hosted-token errors and docs to cover both explicit
14
+ hosted auth production mode and `NODE_ENV=production`.
15
+ - Added built CLI entrypoint regression coverage for packaged hosted startup
16
+ rejecting raw hosted tokens under `NODE_ENV=production`.
17
+
18
+ ## [0.1.21] - 2026-06-28
19
+
20
+ ### Fixed
21
+
22
+ - Fixed hosted production auth mode detection in bundled package output so
23
+ `NODE_ENV=production` rejects legacy raw hosted tokens unless scoped hosted
24
+ token JSON is configured.
25
+
26
+ ### Changed
27
+
28
+ - Included all cloud deployment docs in the npm package so package consumers
29
+ receive the runtime security, source-of-truth, metadata, and runbook context.
30
+
9
31
  ## [0.1.20] - 2026-06-28
10
32
 
11
33
  ### Added
package/README.md CHANGED
@@ -107,7 +107,8 @@ Hosted tokens can be provided as a single legacy token through
107
107
  ```
108
108
 
109
109
  Use scoped JSON for hosted deployments. A single raw hosted token is kept only
110
- for local compatibility and expands to broad read/write/probe/report scopes.
110
+ for local compatibility and expands to broad read/write/probe/report scopes;
111
+ it is rejected when hosted auth mode or `NODE_ENV` is `production`.
111
112
  Endpoints that accept request bodies require `content-type: application/json`.
112
113
 
113
114
  ## Uptime Semantics
package/dist/api.js CHANGED
@@ -4286,7 +4286,7 @@ function parseHostedTokenValue(value, defaultWorkspaceId, source) {
4286
4286
  return parseHostedTokensConfig(trimmed, defaultWorkspaceId, source);
4287
4287
  }
4288
4288
  if (isHostedProductionMode()) {
4289
- throw new ApiError(`${source} must be scoped hosted token JSON when HASNA_UPTIME_HOSTED_AUTH_MODE=production`, 500);
4289
+ throw new ApiError(`${source} must be scoped hosted token JSON when hosted auth mode or NODE_ENV is production`, 500);
4290
4290
  }
4291
4291
  return [{
4292
4292
  token: trimmed,
@@ -4339,7 +4339,10 @@ function isRecord(value) {
4339
4339
  return typeof value === "object" && value !== null && !Array.isArray(value);
4340
4340
  }
4341
4341
  function isHostedProductionMode() {
4342
- return process.env.HASNA_UPTIME_HOSTED_AUTH_MODE === "production" || false;
4342
+ return runtimeEnv("HASNA_UPTIME_HOSTED_AUTH_MODE") === "production" || runtimeEnv("NODE_ENV") === "production";
4343
+ }
4344
+ function runtimeEnv(name) {
4345
+ return process.env[name];
4343
4346
  }
4344
4347
  function resolveHostedAllowedOrigins(options) {
4345
4348
  const configured = options.hostedAllowedOrigins ?? splitCsv(process.env.HASNA_UPTIME_ALLOWED_ORIGINS);
package/dist/cli/index.js CHANGED
@@ -6880,7 +6880,7 @@ function parseHostedTokenValue(value, defaultWorkspaceId, source) {
6880
6880
  return parseHostedTokensConfig(trimmed, defaultWorkspaceId, source);
6881
6881
  }
6882
6882
  if (isHostedProductionMode()) {
6883
- throw new ApiError(`${source} must be scoped hosted token JSON when HASNA_UPTIME_HOSTED_AUTH_MODE=production`, 500);
6883
+ throw new ApiError(`${source} must be scoped hosted token JSON when hosted auth mode or NODE_ENV is production`, 500);
6884
6884
  }
6885
6885
  return [{
6886
6886
  token: trimmed,
@@ -6933,7 +6933,10 @@ function isRecord(value) {
6933
6933
  return typeof value === "object" && value !== null && !Array.isArray(value);
6934
6934
  }
6935
6935
  function isHostedProductionMode() {
6936
- return process.env.HASNA_UPTIME_HOSTED_AUTH_MODE === "production" || false;
6936
+ return runtimeEnv("HASNA_UPTIME_HOSTED_AUTH_MODE") === "production" || runtimeEnv("NODE_ENV") === "production";
6937
+ }
6938
+ function runtimeEnv(name) {
6939
+ return process.env[name];
6937
6940
  }
6938
6941
  function resolveHostedAllowedOrigins(options) {
6939
6942
  const configured = options.hostedAllowedOrigins ?? splitCsv(process.env.HASNA_UPTIME_ALLOWED_ORIGINS);
@@ -7011,7 +7014,7 @@ function buildAwsDeploymentPlan(options = {}) {
7011
7014
  const image = clean(options.image, `${imageRepositoryUri}@sha256:<image-digest>`);
7012
7015
  const evidenceBucket = clean(options.evidenceBucket, `hasna-${stage}-${prefix}-evidence`);
7013
7016
  const hostedSqliteDbPath = clean(options.hostedSqliteDbPath, DEFAULT_HOSTED_SQLITE_DB);
7014
- const runtimePackageVersion = clean(options.runtimePackageVersion, "0.1.20");
7017
+ const runtimePackageVersion = clean(options.runtimePackageVersion, "0.1.22");
7015
7018
  const protectedAccessMode = options.protectedAccessMode ?? DEFAULT_PROTECTED_ACCESS_MODE;
7016
7019
  const protectedAccessUrl = protectedAccessMode === "cloudfront_default_domain" ? "https://<cloudfront-domain>" : `https://${hostname}`;
7017
7020
  const cluster = `${prefix}-${stage}`;
@@ -21,7 +21,7 @@ function buildAwsDeploymentPlan(options = {}) {
21
21
  const image = clean(options.image, `${imageRepositoryUri}@sha256:<image-digest>`);
22
22
  const evidenceBucket = clean(options.evidenceBucket, `hasna-${stage}-${prefix}-evidence`);
23
23
  const hostedSqliteDbPath = clean(options.hostedSqliteDbPath, DEFAULT_HOSTED_SQLITE_DB);
24
- const runtimePackageVersion = clean(options.runtimePackageVersion, "0.1.20");
24
+ const runtimePackageVersion = clean(options.runtimePackageVersion, "0.1.22");
25
25
  const protectedAccessMode = options.protectedAccessMode ?? DEFAULT_PROTECTED_ACCESS_MODE;
26
26
  const protectedAccessUrl = protectedAccessMode === "cloudfront_default_domain" ? "https://<cloudfront-domain>" : `https://${hostname}`;
27
27
  const cluster = `${prefix}-${stage}`;
package/dist/index.js CHANGED
@@ -4286,7 +4286,7 @@ function parseHostedTokenValue(value, defaultWorkspaceId, source) {
4286
4286
  return parseHostedTokensConfig(trimmed, defaultWorkspaceId, source);
4287
4287
  }
4288
4288
  if (isHostedProductionMode()) {
4289
- throw new ApiError(`${source} must be scoped hosted token JSON when HASNA_UPTIME_HOSTED_AUTH_MODE=production`, 500);
4289
+ throw new ApiError(`${source} must be scoped hosted token JSON when hosted auth mode or NODE_ENV is production`, 500);
4290
4290
  }
4291
4291
  return [{
4292
4292
  token: trimmed,
@@ -4339,7 +4339,10 @@ function isRecord(value) {
4339
4339
  return typeof value === "object" && value !== null && !Array.isArray(value);
4340
4340
  }
4341
4341
  function isHostedProductionMode() {
4342
- return process.env.HASNA_UPTIME_HOSTED_AUTH_MODE === "production" || false;
4342
+ return runtimeEnv("HASNA_UPTIME_HOSTED_AUTH_MODE") === "production" || runtimeEnv("NODE_ENV") === "production";
4343
+ }
4344
+ function runtimeEnv(name) {
4345
+ return process.env[name];
4343
4346
  }
4344
4347
  function resolveHostedAllowedOrigins(options) {
4345
4348
  const configured = options.hostedAllowedOrigins ?? splitCsv(process.env.HASNA_UPTIME_ALLOWED_ORIGINS);
@@ -4417,7 +4420,7 @@ function buildAwsDeploymentPlan(options = {}) {
4417
4420
  const image = clean(options.image, `${imageRepositoryUri}@sha256:<image-digest>`);
4418
4421
  const evidenceBucket = clean(options.evidenceBucket, `hasna-${stage}-${prefix}-evidence`);
4419
4422
  const hostedSqliteDbPath = clean(options.hostedSqliteDbPath, DEFAULT_HOSTED_SQLITE_DB);
4420
- const runtimePackageVersion = clean(options.runtimePackageVersion, "0.1.20");
4423
+ const runtimePackageVersion = clean(options.runtimePackageVersion, "0.1.22");
4421
4424
  const protectedAccessMode = options.protectedAccessMode ?? DEFAULT_PROTECTED_ACCESS_MODE;
4422
4425
  const protectedAccessUrl = protectedAccessMode === "cloudfront_default_domain" ? "https://<cloudfront-domain>" : `https://${hostname}`;
4423
4426
  const cluster = `${prefix}-${stage}`;
@@ -0,0 +1,43 @@
1
+ # Architecture
2
+
3
+ Open Uptime has four public surfaces over one local service model:
4
+
5
+ - SDK: `createUptimeClient()`
6
+ - CLI: `uptime`
7
+ - MCP: `uptime-mcp`
8
+ - API/dashboard: `uptime serve`
9
+
10
+ State is stored in SQLite through `UptimeStore`. `UptimeService` owns monitor
11
+ checks, retry policy, incident reconciliation, scheduler ticks, and summaries.
12
+ The CLI, MCP server, and API call the service rather than maintaining separate
13
+ business logic.
14
+
15
+ The local HTTP API is intended for same-origin dashboard use and local
16
+ automation. State-changing API requests reject mismatched browser `Origin`
17
+ headers, and JSON mutation endpoints require `content-type: application/json`.
18
+
19
+ ## Data Model
20
+
21
+ - `monitors`: configured HTTP/TCP monitors and current status.
22
+ - `check_results`: immutable check attempts after retry resolution.
23
+ - `incidents`: open/closed downtime windows per monitor.
24
+
25
+ ## Check Semantics
26
+
27
+ HTTP monitors are up when the request completes before timeout and the response
28
+ status is either the configured `expectedStatus` or any 2xx/3xx status when no
29
+ specific status is configured. TCP monitors are up when a connection can be
30
+ opened before timeout.
31
+
32
+ Retries happen before a result is recorded. One stored check result represents
33
+ the final outcome for that scheduled check.
34
+
35
+ Monitor interval, timeout, and retry settings are bounded in the store so every
36
+ surface (SDK, CLI, API, and MCP) shares the same protection against runaway
37
+ checks. MCP schemas mirror those bounds for earlier validation.
38
+
39
+ `uptimePercent` is intentionally a check-count availability metric in the first
40
+ release: the number of stored `up` results divided by all stored results for that monitor. It is
41
+ not elapsed-time SLA accounting. Incident windows are stored separately so a
42
+ later report can add duration-based availability without changing the check
43
+ history model.
@@ -0,0 +1,473 @@
1
+ # AWS Runtime And Security
2
+
3
+ This document defines the target AWS architecture for running Open Uptime as an
4
+ internal cloud service in a reviewed AWS account. It is a design contract for
5
+ later infrastructure and deployment implementation work; it does not mean Open
6
+ Uptime is safe to expose today.
7
+
8
+ Current deployment bridge as of 2026-06-28: the repo Terraform includes an
9
+ EFS-backed SQLite runtime path (`HASNA_UPTIME_HOSTED_SQLITE_DB`) with an EFS
10
+ access point and AWS Backup so exactly one protected web task can start without
11
+ the future async Postgres adapter. Scheduler, public-probe, reporter, and
12
+ migration workers must stay at desired count `0` and do not receive EFS mounts
13
+ or EFS write IAM in this bridge. The Postgres/RDS sections below remain
14
+ target-state security requirements for the eventual cloud source of truth.
15
+
16
+ ## Current Account State
17
+
18
+ Private target-account inventory belongs in private deployment evidence, not in
19
+ the OSS repository. Before live deployment, record:
20
+
21
+ - selected AWS account/profile and region;
22
+ - target VPC, public ALB subnets, private task subnets, KMS key, protected edge
23
+ mode, and, when using a custom hostname, Route53/edge ownership plus ACM
24
+ certificate;
25
+ - whether an approved application Postgres exists for the future target state;
26
+ - whether Open Uptime already has service-specific ECR, ECS, ALB, DNS, secrets,
27
+ alarms, backup policy, and evidence bucket resources.
28
+
29
+ The implementation phase must locate/check out the approved infrastructure
30
+ repository or create the infra change in the correct owner repository before
31
+ touching live AWS resources.
32
+
33
+ ## Architecture Decision
34
+
35
+ Use ECS Fargate with ALB and S3. The first deployable runtime may use
36
+ EFS-backed SQLite with AWS Backup for a single web writer while scheduler,
37
+ public probe, reporter, and migration workers remain disabled. The target cloud
38
+ source of truth is Postgres; browser evidence and generated report artifacts
39
+ live in hardened S3.
40
+
41
+ App Runner is not the first choice because Open Uptime needs explicit VPC
42
+ placement, private RDS access, separated worker roles, private probe ingestion,
43
+ egress controls, and future VPC endpoints. Lambda is not the first choice
44
+ because the scheduler/probe model is stateful enough to benefit from long-lived
45
+ workers and cloud leases.
46
+
47
+ ## Runtime Components
48
+
49
+ Deploy separate ECS/Fargate services or task roles:
50
+
51
+ - `open-uptime-web`: dashboard, API, JSON Render endpoints, canvas endpoints,
52
+ report preview, import preview/apply, and health endpoint.
53
+ - `open-uptime-scheduler`: creates deterministic `check_jobs`, reconciles
54
+ incidents, schedules reports, and holds the scheduler lease.
55
+ - `open-uptime-public-probe`: claims public check jobs, runs HTTP/DNS/TLS/domain
56
+ and browser checks, writes results, and uploads redacted evidence.
57
+ - `open-uptime-reporter`: generates scheduled report artifacts, performs
58
+ idempotent delivery attempts through authorized channel refs, and owns report
59
+ retry/backoff state. This can share code with the scheduler, but it needs its
60
+ own IAM scope and alarms.
61
+ - `open-uptime-migration`: one-off task for schema migrations and controlled
62
+ backfills.
63
+ - `open-uptime-private-probe`: not an AWS public service by default. Approved
64
+ private machines run signed probe agents that submit results to
65
+ the hosted API.
66
+
67
+ Each role gets a separate task role and least-privilege policy. The web task
68
+ must not be the component that performs arbitrary target checks.
69
+
70
+ Hosted web can enqueue, preview, or approve authorized work, but it cannot run
71
+ checks inline. The existing local `/api/check-all`, `/api/monitors/:id/check`,
72
+ and `serve --check` behavior must either remain local-only or become cloud
73
+ enqueue operations. Scheduler creates jobs; probes claim jobs with TTL/fencing
74
+ tokens; result ingest requires the active fencing token and idempotency key.
75
+
76
+ ## Probe Locations And Down Semantics
77
+
78
+ Initial public probe location is `us-east-1`. The data model and APIs must still
79
+ support multiple locations so later probes can be added without rewriting
80
+ monitor semantics.
81
+
82
+ Each monitor defines:
83
+
84
+ - allowed probe classes: public, private, or both;
85
+ - allowed probe locations or inventory-linked private probes;
86
+ - interval and retry limits;
87
+ - quorum policy for multi-probe checks;
88
+ - down policy: for example one location after final retry, N-of-M locations, or
89
+ private-probe authoritative;
90
+ - stale-result policy so old probe output cannot reopen or close incidents.
91
+
92
+ The first implementation can run one public location, but it must record
93
+ `probe_id`, `probe_location`, `probe_class`, `monitor_version`, and
94
+ `schedule_slot` on every result. Future regional probes must not change result
95
+ identity or SLA calculations.
96
+
97
+ ## Network Layout
98
+
99
+ Target shape inside the approved VPC:
100
+
101
+ - public subnets: ALB and NAT gateways only;
102
+ - new private application subnets: web, scheduler, migration tasks;
103
+ - controlled egress subnets: public probe workers, routed through NAT or another
104
+ explicitly inspected egress path;
105
+ - RDS access stays private and restricted by security groups;
106
+ - VPC endpoints should be used for S3, Secrets Manager, CloudWatch Logs, ECR,
107
+ and SSM where practical. Interface endpoint private DNS is VPC-wide, so shared
108
+ VPC deployments must either use the approved networking root or explicitly
109
+ allow every affected source security group.
110
+
111
+ Security groups:
112
+
113
+ - `open-uptime-alb-sg`: in `cloudfront_default_domain` mode, inbound `80` only
114
+ from AWS's CloudFront origin-facing managed prefix list; in `alb_https_cert`
115
+ mode, inbound `443` only from the approved edge/source CIDR policy. Outbound
116
+ is only to the web target group.
117
+ - `open-uptime-web-sg`: inbound only from ALB, outbound to RDS, S3 endpoint,
118
+ Secrets Manager, Logs, and internal service endpoints.
119
+ - `open-uptime-scheduler-sg`: no inbound, outbound to RDS, Logs, Secrets
120
+ Manager, and notification services through approved endpoints.
121
+ - `open-uptime-public-probe-sg`: no inbound, outbound through the public target
122
+ policy path once public probe execution is enabled. Keep desired count `0`
123
+ until the public-probe worker claims cloud jobs through the hosted HTTP runner,
124
+ emits target-policy evidence, and has AWS smoke evidence.
125
+ - `open-uptime-rds-client-sg`: allowed by the canonical RDS security group for
126
+ the dedicated Uptime DB user.
127
+
128
+ Private monitors must not run from public probe workers. They run from approved
129
+ private probes and are created only from approved inventory refs.
130
+
131
+ ## Web Exposure And Auth
132
+
133
+ The expected hostname is an approved internal hostname such as
134
+ `uptime.example.com`; the final hosted zone and record must be selected in the
135
+ infra PR.
136
+
137
+ Public web exposure requires defense in depth:
138
+
139
+ - first deployment may terminate viewer TLS at CloudFront's default HTTPS
140
+ domain, restrict ALB HTTP origin ingress to CloudFront origin-facing ranges,
141
+ and require the module's CloudFront-only origin verification header at the ALB
142
+ listener;
143
+ - CloudFront prefix-list ingress is not distribution-bound by itself. In
144
+ `cloudfront_default_domain` mode, set
145
+ `enable_cloudfront_origin_verify_header = true` and provide a high-entropy
146
+ `cloudfront_origin_verify_header_value` from an approved private operator
147
+ workflow before setting web desired count above `0`. Terraform treats the
148
+ value as sensitive, but the value is still persisted in encrypted Terraform
149
+ state and in AWS CloudFront/ALB configuration readable by principals with
150
+ distribution or listener-rule read access;
151
+ - custom hostname deployment terminates TLS with ACM on ALB or CloudFront after
152
+ Route53/edge ownership is approved;
153
+ - edge access can be Cloudflare Access, OIDC, Cognito, or another Hasna-approved
154
+ identity layer;
155
+ - hosted web tasks must set `HASNA_UPTIME_ALLOWED_ORIGINS` to the public HTTPS
156
+ edge origin so browser mutation checks do not compare CloudFront HTTPS origins
157
+ against the private HTTP ALB origin hop;
158
+ - Open Uptime still enforces app-level auth and workspace RBAC on every route
159
+ except `/health`;
160
+ - `/health` returns only service liveness/readiness and no monitor data;
161
+ - all dashboard, API, MCP-over-HTTP, JSON Render, canvas, import, report, and
162
+ artifact endpoints require actor, workspace, and scope.
163
+
164
+ Do not rely on a single shared `HASNA_UPTIME_API_TOKEN` for hosted mode. That
165
+ token style can remain a local/trusted automation compatibility mode only.
166
+
167
+ Minimum route-to-scope matrix:
168
+
169
+ | Surface | Routes | Required scope |
170
+ | --- | --- | --- |
171
+ | Health | `GET /health` | none |
172
+ | Dashboard | `GET /`, static assets | `uptime:read` |
173
+ | Summary/read API | `GET /api/summary`, `/api/monitors`, `/api/incidents`, `/api/results`, `/api/report` | `uptime:read` |
174
+ | Monitor writes | monitor create/update/delete/pause/resume, import apply | `uptime:write` |
175
+ | Check enqueue | check all, check one, schedule preview | `uptime:write` or `uptime:probe:enqueue` |
176
+ | Probe ingest | claim job, heartbeat, submit result, upload evidence ref | `uptime:probe` |
177
+ | Reports | preview, schedule, run, delivery retry | `uptime:report` |
178
+ | Admin | migrations, rollback, token rotation, probe revocation | `uptime:admin` |
179
+ | JSON Render/canvas | render specs, canvases, nodes, edges | `uptime:read` plus project/canvas authorization |
180
+ | Artifacts | signed URL creation and metadata | `uptime:read` plus artifact policy |
181
+
182
+ Hosted tests must prove unauthenticated dashboard/API reads return 401, wrong
183
+ scope returns 403, and cross-workspace requests cannot read, mutate, enqueue,
184
+ ingest, report, render, or access artifacts.
185
+
186
+ ## Persistence
187
+
188
+ Postgres:
189
+
190
+ - choose one exact shape in the infra PR: preferably a dedicated `uptime`
191
+ database on the approved application Postgres instance; a dedicated schema in
192
+ an approved database is acceptable only if ownership, backups, and role grants
193
+ are explicit;
194
+ - use separate least-privileged roles/users: `uptime_migrator`, `uptime_web`,
195
+ `uptime_scheduler`, `uptime_probe`, and `uptime_reporter` (or an equivalent read/report role);
196
+ - runtime roles must not have DDL privileges;
197
+ - migration role is manually invoked, time-limited, and not attached to the
198
+ normal web/scheduler/probe services;
199
+ - require TLS;
200
+ - run migrations before web/scheduler/probe rollout;
201
+ - include `workspace_id`, `version`, `deleted_at`, audit/idempotency fields, and
202
+ optimistic concurrency on mutable tables;
203
+ - enable automated backups and PITR on the RDS instance;
204
+ - take pre-cutover snapshots before migration or destructive schema changes;
205
+ - block destructive migrations unless backup and rollback checks pass.
206
+
207
+ S3:
208
+
209
+ - create a dedicated evidence/artifact bucket or scoped prefix;
210
+ - enable KMS encryption, versioning, lifecycle/retention, and public access
211
+ block;
212
+ - store browser screenshots, traces, network evidence, generated report HTML,
213
+ generated report JSON, and import/export artifacts only after redaction;
214
+ - access artifacts through short-lived signed URLs with workspace authorization
215
+ and audit logging.
216
+
217
+ Target state: no hosted runtime writes authoritative state to EFS or local task
218
+ storage. Current bridge exception: one web task may write the explicit
219
+ `/data/uptime/uptime.db` EFS-backed SQLite file until the async Postgres adapter
220
+ and cloud leases exist. Ephemeral task storage is for temporary files only.
221
+
222
+ Project stores:
223
+
224
+ - project canvases, JSON Render specs, loop refs, handoffs, and linked service
225
+ refs must be in cloud-backed project stores before hosted canvases are
226
+ declared cloud-primary;
227
+ - local `$HASNA_PROJECTS_HOME/data/<workspace_id>/project.db` and older `by-id`
228
+ paths are cache/import sources only;
229
+ - hosted render payloads must not include raw local paths or secrets.
230
+
231
+ ## Secrets And IAM
232
+
233
+ Secrets are referenced, not copied.
234
+
235
+ Expected secret refs:
236
+
237
+ - `open-uptime/prod/rds`
238
+ - `open-uptime/prod/app/env`
239
+ - `open-uptime/prod/probe/public`
240
+ - `open-uptime/prod/probe/private`
241
+ - `open-uptime/prod/reporting`
242
+ - service-owned Mailery, Telephony, Logs, Projects, Todos, Knowledge, Notes,
243
+ Mementos, Servers, Domains, and Deployment refs as needed.
244
+
245
+ ECS task definitions must use Secrets Manager or SSM `valueFrom` entries for
246
+ secret-bearing values. They must not inline plaintext secret values in
247
+ environment arrays. Cloud records store channel ids, secret refs, and redacted
248
+ metadata only.
249
+
250
+ IAM split:
251
+
252
+ - execution role: ECR pull, log write, and ECS runtime secret retrieval needed
253
+ to start the task;
254
+ - web role: read/write Uptime DB, read authorized secret refs, write logs,
255
+ generate signed artifact URLs;
256
+ - scheduler role: Uptime DB, logs, reporting channel metadata, and no arbitrary
257
+ outbound target execution permissions;
258
+ - public probe role: claim jobs, submit results, write evidence artifacts, read
259
+ only probe-scoped secret refs;
260
+ - reporter role: read report schedules/runs, write report artifacts, resolve
261
+ approved delivery channel refs, submit delivery attempts, and write logs;
262
+ - migration role: migrations/backfill only, time-limited and manually invoked.
263
+
264
+ Provider credentials for deployment should come from GitHub OIDC or an operator
265
+ role. Do not store AWS access keys in Open Deployment local DB rows.
266
+
267
+ ## Egress And SSRF Boundaries
268
+
269
+ AWS network controls and application target policy are both required. The
270
+ current hosted API enforces configuration-time checks for direct denied hosts,
271
+ secret-bearing URLs, and private DNS suffixes. The SDK also exposes
272
+ `runHostedHttpCheck` for hosted public HTTP probes; it resolves DNS at
273
+ execution time, denies unsafe answers, pins the validated address into the
274
+ request, validates redirects, and records target-policy decisions. Public probe
275
+ execution remains disabled until cloud check-job leases are wired to this runner
276
+ and the behavior is validated in AWS.
277
+
278
+ The required hosted public-probe policy must deny:
279
+
280
+ - loopback, link-local, metadata endpoints, RFC1918, multicast, wildcard,
281
+ unspecified, carrier-grade NAT, and IPv6 ULA/link-local ranges;
282
+ - DNS names that resolve to denied ranges;
283
+ - redirects to denied ranges;
284
+ - URL userinfo and secret-like query strings;
285
+ - TCP targets not approved by monitor kind and source provenance.
286
+
287
+ Public probe workers should use a restricted egress path and emit target policy
288
+ decision logs. Private targets are routed only to private probes with explicit
289
+ inventory provenance and probe authorization.
290
+
291
+ NAT strategy must be deliberate. Public probe egress may become a major fixed
292
+ cost, so the infra PR must choose between NAT gateways, VPC endpoints where
293
+ possible, or a smaller controlled-egress design, and document expected cost and
294
+ failure modes.
295
+
296
+ ## Browser Worker Requirements
297
+
298
+ Browser/page checks stay disabled in hosted mode until the deployed container and
299
+ artifact pipeline satisfy this contract:
300
+
301
+ - Playwright or equivalent browser runtime is present in the probe image;
302
+ - CPU, memory, and ephemeral storage sizing are documented and load-tested;
303
+ - per-check browser contexts are isolated;
304
+ - concurrency is bounded per task and per workspace;
305
+ - browser sandboxing is enabled where compatible with Fargate, or a documented
306
+ compensating isolation control exists;
307
+ - HAR/trace/console/network data is redacted before upload;
308
+ - screenshots support selector and region masking;
309
+ - evidence uploads fail closed if redaction, encryption, or artifact metadata
310
+ write fails;
311
+ - retention defaults are short and tied to a cost estimate.
312
+
313
+ ## Private Probe Lifecycle
314
+
315
+ Private probes are first-class cloud actors:
316
+
317
+ - enrollment creates a probe identity bound to workspace, machine id, allowed
318
+ source inventories, capabilities, and trust class;
319
+ - credentials are scoped, rotatable, revocable, and never reused by public
320
+ probes;
321
+ - probes heartbeat with version, capabilities, queue lag, and local clock skew;
322
+ - offline buffering is bounded and cannot submit results after lease expiry;
323
+ - upgrade policy defines minimum supported version and forced disable behavior;
324
+ - compromised-probe response revokes credentials, quarantines recent results,
325
+ blocks further job claims, and records audit/log events.
326
+
327
+ ## Observability
328
+
329
+ Minimum CloudWatch/monitoring resources:
330
+
331
+ - log groups for web, scheduler, public probe, migration, and private probe
332
+ ingestion, with retention;
333
+ - metrics and alarms for ALB target health, ALB 5xx, API 5xx, latency, ECS task
334
+ restarts, desired/running count drift, CPU, memory, RDS connections, RDS CPU,
335
+ storage, S3 errors, evidence upload failures, probe heartbeat lag, check job
336
+ backlog, result ingest failures, incident notification failures, reporter lag,
337
+ report delivery retry exhaustion, report run failures, and migration failures;
338
+ - dashboard for current service health, queue/job backlog, probe fleet health,
339
+ open incidents, report delivery, and deploy version;
340
+ - Open Logs integration for structured app events with no secret values;
341
+ - self-monitoring monitor definitions seeded after deployment.
342
+
343
+ Alert destinations must use service-owned channel refs, not raw webhook URLs or
344
+ request-provided credentials.
345
+
346
+ ## Backup, Restore, And Rollback
347
+
348
+ Before production cutover:
349
+
350
+ 1. run migration dry-run with counts/schema versions/conflict counts only;
351
+ 2. back up local `~/.hasna/uptime` data and dependent local stores;
352
+ 3. take RDS snapshot or verify PITR point;
353
+ 4. verify S3 versioning/lifecycle;
354
+ 5. freeze legacy local writes for migrated surfaces;
355
+ 6. run migration/backfill;
356
+ 7. compare cloud and local read-only summaries;
357
+ 8. run restore drill in a non-production target;
358
+ 9. document rollback command sequence and responsible actor.
359
+
360
+ Rollback sequence:
361
+
362
+ 1. pause scheduler and probe claims;
363
+ 2. revoke the private probe primary/operator lease if involved;
364
+ 3. make cloud writes read-only;
365
+ 4. roll ECS service back to previous task definition if the app release failed;
366
+ 5. restore DB or run compensating migration if the data release failed;
367
+ 6. keep S3 artifacts versioned and quarantined;
368
+ 7. point local CLIs back to fallback only if the cloud cutover is explicitly
369
+ rolled back;
370
+ 8. record the event in audit events, Open Logs, Projects, and Todos.
371
+
372
+ Destructive infrastructure actions require final snapshots and deletion
373
+ protection. Generic Open Deployment code that skips final snapshots is not
374
+ acceptable for Open Uptime production resources.
375
+
376
+ ## Deployment Pipeline
377
+
378
+ Minimum implementation path:
379
+
380
+ 1. review the repo-owned `Dockerfile` and package-image `Dockerfile.package`;
381
+ 2. add the ECR repository and CodeBuild package image builder;
382
+ 3. build the published npm package into ECR and record the immutable digest;
383
+ 4. run typecheck, tests, package checks, and container smoke locally/CI;
384
+ 5. for the EFS bridge, keep the desired count at a maximum of one web task and zero
385
+ scheduler/public-probe/reporter/migration tasks;
386
+ 6. deploy ECS services by digest with deployment circuit breaker enabled;
387
+ 7. verify `/health`, auth-denied reads, authenticated dashboard/API mutations
388
+ through the public edge origin, direct-origin denial, EFS backup evidence, and
389
+ web alarms; defer probe heartbeat, check job claim, evidence upload, and
390
+ report delivery smokes until worker roles are cloud-backed;
391
+ 8. publish/update packages only after hosted smoke passes if code changed.
392
+
393
+ Open Deployment may record deployment metadata, but it must not be exposed as a
394
+ public deployment controller and must not inject plaintext secrets into task
395
+ definitions.
396
+
397
+ Before Open Deployment can drive production Uptime deploys, it must:
398
+
399
+ - persist provider deployment ids separately from local deployment row ids;
400
+ - pass `cluster/service` or task definition ids correctly for ECS status, logs,
401
+ and rollback;
402
+ - include execution role, task role, log configuration, health check, target
403
+ group/service networking, deployment circuit breaker, and `secrets.valueFrom`
404
+ in ECS task definitions;
405
+ - disable destructive production helpers that skip final snapshots;
406
+ - stop storing raw production secret values in local provider/environment rows.
407
+
408
+ ## Cost Controls
409
+
410
+ Every AWS resource must carry owner/project/environment/service tags. The infra
411
+ PR must include a rough monthly estimate for:
412
+
413
+ - ALB;
414
+ - ECS Fargate web task for the bridge and later scheduler/probe tasks;
415
+ - NAT gateway and/or approved private VPC endpoints for ECR, Logs, Secrets
416
+ Manager or SSM, and S3, including runtime ECS evidence for image pull, secret
417
+ injection, log delivery, S3 access, and EFS mount behavior;
418
+ - EFS/Backup bridge costs and later RDS incremental usage for the Uptime
419
+ schema/database;
420
+ - S3 evidence/artifact storage and requests;
421
+ - CloudWatch logs/metrics/alarms;
422
+ - KMS requests;
423
+ - CloudFront default-domain edge costs, and Route53/ACM where applicable.
424
+
425
+ Evidence retention and browser trace capture are the primary variable costs.
426
+ Default retention must be short until usage is measured.
427
+
428
+ The AWS Terraform starter exposes optional AWS Budgets alerts through
429
+ `monthly_budget_limit_usd` and `budget_alert_email_addresses`; the approved
430
+ infra root must set real recipients before live scale-out. Budget alarms are
431
+ required before browser evidence or public probe scale-out.
432
+
433
+ ## Implementation Blockers
434
+
435
+ - A private Hasna AWS bridge now has zero-count runtime resources, including
436
+ ECR, dormant ECS services, ALB, CloudFront default-domain distribution,
437
+ evidence bucket, encrypted logs, Backup, EFS, and service secret containers.
438
+ It is not live: services remain at desired count `0`, secrets have
439
+ `AWSCURRENT` values, scoped hosted-token descriptors can be used for operator
440
+ smokes, and no ACM cert or Route53 record exists for a later custom-hostname
441
+ path. Full production identity/RBAC is still not implemented.
442
+ - Open Uptime is still SQLite-only for this bridge; only one protected web task
443
+ may write EFS until Postgres and cloud leases exist.
444
+ - Hosted API/dashboard auth, workspace RBAC, target policy, and Postgres leases
445
+ are not implemented.
446
+ - Route/scope matrix, report worker ownership, private probe lifecycle, probe
447
+ location/down semantics, and browser worker sizing are design requirements but
448
+ not implemented.
449
+ - Open Deployment's current AWS provider is not production-grade for this
450
+ service because it can register task definitions without roles/logs/secrets and
451
+ includes unsafe DB deletion/default secret handling patterns.
452
+
453
+ ## Acceptance Criteria
454
+
455
+ - A reviewed infra PR defines all runtime resources and outputs consumed by Open
456
+ Uptime and any deployment tooling.
457
+ - The first bridge deploy runs one web task maximum with web-only EFS write IAM;
458
+ scheduler, public probe, reporter, and migration roles remain disabled until
459
+ their cloud data paths are implemented.
460
+ - Hosted routes except `/health` require app auth and workspace RBAC.
461
+ - Final cloud-primary runtime state is Postgres plus S3 artifacts; the current
462
+ EFS SQLite bridge is explicitly temporary and not the target source of truth.
463
+ - ECS task definitions use secret refs, not plaintext secret values.
464
+ - ECS task definitions include explicit container health checks: web checks
465
+ `/health`, while disabled non-web roles use a hosted-environment sanity check
466
+ until their long-running worker commands are implemented.
467
+ - Public probes cannot reach denied target classes; private monitors require
468
+ private probes and approved inventory refs.
469
+ - Backups, restore drill, rollback sequence, alarms, and cost estimate are
470
+ documented and verified before production cutover.
471
+ - Route-to-scope tests, reporter worker tests, private probe lifecycle tests,
472
+ browser worker smoke/load tests, and probe quorum/down-semantics tests pass
473
+ before hosted cutover.