npm - @hasna/uptime - Versions diffs - 0.1.9 → 0.1.11 - Mend

@hasna/uptime 0.1.9 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

package/CHANGELOG.md +34 -0
package/SECURITY.md +4 -2
package/dist/api.js +179 -58
package/dist/checks.d.ts +2 -1
package/dist/checks.d.ts.map +1 -1
package/dist/checks.js +2 -1
package/dist/cli/index.js +180 -59
package/dist/cloud-plan.js +1 -1
package/dist/imports.d.ts +6 -2
package/dist/imports.d.ts.map +1 -1
package/dist/imports.js +72 -8
package/dist/index.js +180 -59
package/dist/mcp/index.js +166 -47
package/dist/service.d.ts +36 -10
package/dist/service.d.ts.map +1 -1
package/dist/service.js +166 -47
package/dist/store.d.ts +13 -3
package/dist/store.d.ts.map +1 -1
package/dist/store.js +140 -26
package/dist/types.d.ts +3 -0
package/dist/types.d.ts.map +1 -1
package/docs/aws-deployment-runbook.md +327 -14
package/infra/aws/outputs.tf +35 -0
package/infra/aws/terraform.tfvars.example +1 -1
package/infra/aws/variables.tf +1 -1
package/package.json +1 -1

package/docs/aws-deployment-runbook.md CHANGED Viewed

@@ -40,19 +40,31 @@ write a sourceable env file with a placeholder probe identity.
 1. Locate the real infrastructure repository or create the change in the
    approved owner repository.
-2. Confirm the AWS caller identity:
+2. Set the operator shell variables used by the command snippets:
    ```bash
-   aws sts get-caller-identity --profile <aws-profile>
+   : "${AWS_PROFILE_NAME:?set AWS_PROFILE_NAME to the reviewed AWS profile}"
+   AWS_REGION="${AWS_REGION:-us-east-1}"
+   TF_DIR="${TF_DIR:-infra/aws}"
+   PLAN_FILE="${PLAN_FILE:-open-uptime.tfplan}"
    ```
-3. Confirm the target VPC, private subnets, KMS key, and EFS/Backup plan inputs
+3. Confirm the AWS caller identity:
+   ```bash
+   aws sts get-caller-identity --profile "$AWS_PROFILE_NAME"
+   ```
+4. Confirm the target VPC, private subnets, KMS key, and EFS/Backup plan inputs
    still match the plan.
-4. Confirm the protected access mode. The first deploy can use the CloudFront
+5. Confirm the protected access mode. The first deploy can use the CloudFront
    default HTTPS domain without custom DNS or ACM. Custom hostname deploys still
    require Route53/edge ownership and an ACM certificate.
-5. Confirm the deployment role uses short-lived credentials or OIDC, not copied
+6. Confirm the deployment role uses short-lived credentials or OIDC, not copied
    access keys.
+7. Create a private evidence directory outside the public repository. Store
+   command output, plan summaries, screenshots, and incident notes there. Do
+   not store tokens, database URLs, probe private keys, or secret values.
 ## Required Resources
@@ -81,14 +93,259 @@ copy-pastable AWS mutation commands.
 Plan the included Terraform/OpenTofu starter without a backend:
 ```bash
-terraform -chdir=infra/aws fmt -check
-terraform -chdir=infra/aws init -backend=false
-terraform -chdir=infra/aws validate
-terraform -chdir=infra/aws plan -out open-uptime.tfplan
+terraform -chdir="$TF_DIR" fmt -check
+terraform -chdir="$TF_DIR" init -backend=false
+terraform -chdir="$TF_DIR" validate
+terraform -chdir="$TF_DIR" plan -out "$PLAN_FILE"
 ```
 Use Terraform/OpenTofu 1.9 or newer for this starter.
+## Zero-Count Apply
+The first reviewed apply must create infrastructure with every ECS service at
+desired count `0`.
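+1. Confirm that all ECS services are dormant: every row printed by the command
+   below must show a desired count of `0`. The command does not detect deletes
+   or replacements, so also review the plan summary and confirm it contains
+   none before applying: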
+   ```bash
+   terraform -chdir="$TF_DIR" show -json "$PLAN_FILE" \
+     | jq -r '.resource_changes[] | select(.type=="aws_ecs_service") | [.address, .change.after.desired_count] | @tsv'
+   ```
+2. Confirm Terraform is not managing secret values:
+   ```bash
+   terraform -chdir="$TF_DIR" show -json "$PLAN_FILE" \
+     | jq -r '.resource_changes[] | select(.type | test("secret_version|random_password|random_string")) | .address'
+   ```
+   This command must print nothing.
+3. Apply only the reviewed zero-count plan:
+   ```bash
+   terraform -chdir="$TF_DIR" apply "$PLAN_FILE"
+   ```
+4. Capture outputs, the source commit, the package version, the plan summary,
+   and the caller identity in private deployment evidence.
+## Image And Secrets
+After the zero-count apply, build the image through the approved deploy pipeline
+or the declared image builder. Record only the immutable digest, not build logs
+that contain environment values:
+```bash
+IMAGE_BUILDER_PROJECT="$(terraform -chdir="$TF_DIR" output -raw image_builder_project_name)"
+aws codebuild start-build \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --project-name "$IMAGE_BUILDER_PROJECT"
+```
+Update the approved infra root so `container_image` is the immutable ECR digest,
+then re-plan with all services still at `0`.
+Populate Secrets Manager values out of band. Verify metadata only:
+```bash
+terraform -chdir="$TF_DIR" output -json secret_refs | jq -r '.[]' | while read -r SECRET_ID; do
+  aws secretsmanager describe-secret \
+    --profile "$AWS_PROFILE_NAME" \
+    --region "$AWS_REGION" \
+    --secret-id "$SECRET_ID"
+  aws secretsmanager list-secret-version-ids \
+    --profile "$AWS_PROFILE_NAME" \
+    --region "$AWS_REGION" \
+    --secret-id "$SECRET_ID"
+done
+```
+Each required secret must have an `AWSCURRENT` version before any task is
+started. Never run `get-secret-value` where its output could be captured in
+shared logs or public evidence.
+## Protected Web Scale-Up
+Before setting `desired_counts.web = 1`, verify:
+- the image is an immutable digest, not a mutable tag or placeholder;
+- required secrets have `AWSCURRENT` versions;
+- `HASNA_UPTIME_ALLOWED_ORIGINS` matches the public HTTPS edge origin;
+- CloudFront origin access is distribution-bound, not just narrowed to
+  CloudFront origin-facing ranges;
+- web egress to ECR, Secrets Manager, CloudWatch Logs, S3, EFS, and any required
+  endpoints has been proven through NAT or VPC endpoints;
+- scheduler, public-probe, reporter, and migration remain at `0`.
+Scale only the web task, then capture the ECS deployment id and task definition
+ARN:
+```bash
+ECS_CLUSTER="$(terraform -chdir="$TF_DIR" output -raw ecs_cluster_name)"
+WEB_SERVICE="$(terraform -chdir="$TF_DIR" output -json service_names | jq -r '.[] | select(endswith("-web"))')"
+aws ecs describe-services \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --cluster "$ECS_CLUSTER" \
+  --services "$WEB_SERVICE" \
+  --query 'services[0].{taskDefinition:taskDefinition,deployments:deployments[*].{id:id,status:status,desired:desiredCount,running:runningCount}}'
+```
+## Smoke Checks
+Run these checks through the public edge URL and record status codes and request
+ids. Use a scoped hosted token only from the operator secret store.
+```bash
+EDGE_URL="$(terraform -chdir="$TF_DIR" output -raw protected_access_url)"
+: "${HOSTED_TOKEN_FILE:?set HOSTED_TOKEN_FILE to a 0600 file containing the scoped hosted token}"
+HOSTED_TOKEN="$(tr -d '\n' < "$HOSTED_TOKEN_FILE")"
+curl -fsS "$EDGE_URL/health"
+curl -i "$EDGE_URL/"
+curl -i "$EDGE_URL/api/v1/summary"
+curl -i -H "Authorization: Bearer $HOSTED_TOKEN" "$EDGE_URL/api/v1/summary"
+```
+Expected results:
+- `/health` returns `200` and no monitor data.
+- Dashboard and API reads without auth return `401` or the approved identity
+  layer denial.
+- Authenticated API reads return only the authorized workspace.
+- Direct ALB origin access is denied unless it is the approved CloudFront origin
+  path.
+## Logs And Alarms
+Inspect recent web logs without printing secrets:
+```bash
+WEB_LOG_GROUP="$(terraform -chdir="$TF_DIR" output -json log_group_names | jq -r '.web')"
+aws logs tail "$WEB_LOG_GROUP" \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --since 15m
+```
+Verify the initial web alarms exist and are not already alarming:
+```bash
+WEB_5XX_ALARM="$(terraform -chdir="$TF_DIR" output -json alarm_names | jq -r '.web_5xx')"
+WEB_UNHEALTHY_ALARM="$(terraform -chdir="$TF_DIR" output -json alarm_names | jq -r '.web_unhealthy')"
+aws cloudwatch describe-alarms \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --alarm-names "$WEB_5XX_ALARM" "$WEB_UNHEALTHY_ALARM" \
+  --query 'MetricAlarms[*].{name:AlarmName,state:StateValue,reason:StateReason}'
+```
+Scheduler-stall, stale-probe, and report-delivery alarms stay blocked until
+those workers are implemented, emit metrics, and are enabled.
+## Backups And Restore Evidence
+Verify EFS backup coverage after the first apply:
+```bash
+BACKUP_VAULT="$(terraform -chdir="$TF_DIR" output -raw backup_vault_name)"
+EFS_FILE_SYSTEM_ID="$(terraform -chdir="$TF_DIR" output -raw efs_file_system_id)"
+EFS_FILE_SYSTEM_ARN="$(aws efs describe-file-systems \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --file-system-id "$EFS_FILE_SYSTEM_ID" \
+  --query 'FileSystems[0].FileSystemArn' \
+  --output text)"
+aws backup list-protected-resources \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --query "Results[?ResourceArn=='$EFS_FILE_SYSTEM_ARN'].[ResourceArn,LastBackupTime]"
+aws backup list-recovery-points-by-backup-vault \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --backup-vault-name "$BACKUP_VAULT" \
+  --query "RecoveryPoints[?ResourceArn=='$EFS_FILE_SYSTEM_ARN'].[RecoveryPointArn,Status,CreationDate]"
+```
+A restore drill must restore to a separate file system or staging target first.
+Do not overwrite the production EFS file system during a drill. Record the
+recovery point ARN, restore job id, target resource, validation result, and
+cleanup action.
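+Run the restore drill with a dedicated restore role and a staging security group
+and subnet. The `--metadata` keys are AWS Backup EFS restore metadata; confirm
+their exact names and casing against the AWS Backup EFS restore documentation
+before running, because that documentation lists keys such as `Encrypted`,
+`KmsKeyId`, `PerformanceMode`, and `CreationToken`. Keep the staging file
+system encrypted with the Open Uptime KMS key.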
+```bash
+: "${RECOVERY_POINT_ARN:?set RECOVERY_POINT_ARN to the selected recovery point ARN}"
+: "${RESTORE_ROLE_ARN:?set RESTORE_ROLE_ARN to the AWS Backup restore role ARN}"
+: "${STAGING_SUBNET_ID:?set STAGING_SUBNET_ID to the staging private subnet id}"
+: "${STAGING_SECURITY_GROUP_ID:?set STAGING_SECURITY_GROUP_ID to the staging EFS security group id}"
+KMS_KEY_ARN="$(terraform -chdir="$TF_DIR" output -raw kms_key_arn)"
+RESTORE_JOB_ID="$(aws backup start-restore-job \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --recovery-point-arn "$RECOVERY_POINT_ARN" \
+  --iam-role-arn "$RESTORE_ROLE_ARN" \
+  --resource-type EFS \
+  --metadata "file-system-id=$EFS_FILE_SYSTEM_ID,newFileSystem=true,encrypted=true,kmsKeyId=$KMS_KEY_ARN,performanceMode=generalPurpose,throughputMode=bursting" \
+  --query 'RestoreJobId' \
+  --output text)"
+aws backup describe-restore-job \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --restore-job-id "$RESTORE_JOB_ID" \
+  --query '{status:Status,createdResourceArn:CreatedResourceArn,statusMessage:StatusMessage}'
+```
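+Poll `describe-restore-job` until `Status` is `COMPLETED`. If the job ends as
+`ABORTED` or `FAILED`, stop, record `StatusMessage` in the drill evidence, and
+do not continue. Once it has completed, create a temporary mount target for
+the restored file system in the staging subnet: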
+```bash
+RESTORED_EFS_ID="$(aws backup describe-restore-job \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --restore-job-id "$RESTORE_JOB_ID" \
+  --query 'CreatedResourceArn' \
+  --output text | awk -F/ '{print $NF}')"
+aws efs create-mount-target \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --file-system-id "$RESTORED_EFS_ID" \
+  --subnet-id "$STAGING_SUBNET_ID" \
+  --security-groups "$STAGING_SECURITY_GROUP_ID"
+```
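+Validate the restored copy of `/data/uptime/uptime.db` from a staging host or
+task with read-only SQLite integrity checks. The commands below assume the
+restored file system is mounted at `/mnt/restore`, so the database appears at
+`/mnt/restore/uptime/uptime.db`. Capture only counts and integrity status, not
+monitor targets or secrets: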
+```bash
+sqlite3 -readonly /mnt/restore/uptime/uptime.db 'PRAGMA integrity_check;'
+sqlite3 -readonly /mnt/restore/uptime/uptime.db 'SELECT COUNT(*) FROM monitors;'
+```
+After evidence is recorded, delete the staging mount target and restored file
+system. Never mount the restored file system over production during a drill.
+## Reports And Reporter Gate
+Report preview can be tested locally or through authenticated read APIs. Hosted
+delivery attempts through Mailery, Telephony, or Open Logs must stay disabled
+until the reporter has cloud channel refs, idempotency storage, retry/backoff
+state, audit rows, and delivery alarms.
+Do not set `desired_counts.reporter = 1` until a reviewed runbook section exists
+for report retry, duplicate suppression, provider failure handling, and delivery
+audit export.
 ## Private Probe Operator
 The operator machine should be a private probe/operator machine, not the hosted
@@ -112,6 +369,11 @@ routes are backed by cloud check jobs and cloud audit rows.
   URLs, or probe private keys in task definitions. Use ECS `secrets.valueFrom`
   refs such as `HASNA_UPTIME_HOSTED_TOKEN`.
 - Do not run public probe workers against private targets.
+- Do not enable public probe workers until runtime target policy resolves and
+  pins DNS answers, rejects redirects and DNS rebinding into denied ranges, and
+  emits target-policy decision records. The current configuration-time policy
+  blocks direct denied hosts, including IPv4-mapped IPv6 forms, but it is not a
+  substitute for execution-time DNS and redirect enforcement.
 - Do not enable scheduler, public-probe, reporter, or migration workers against
   the EFS SQLite bridge; those services need Postgres/cloud leases first.
 - Do not expose dashboard/API routes without hosted auth and workspace checks.
@@ -128,8 +390,59 @@ routes are backed by cloud check jobs and cloud audit rows.
 ## Rollback
-Before each service update, record the previous task definition ARN. Roll back
-by disabling scheduler/reporter work first, then restoring the previous web or
-worker task definition. EFS backup restore requires separate operator approval,
-a selected recovery point, a replacement mount target/access point cutover, and
-an audit event.
+Before each service update, record the previous task definition ARN and current
+desired counts:
+```bash
+ECS_CLUSTER="$(terraform -chdir="$TF_DIR" output -raw ecs_cluster_name)"
+WEB_SERVICE="$(terraform -chdir="$TF_DIR" output -json service_names | jq -r '.[] | select(endswith("-web"))')"
+aws ecs describe-services \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --cluster "$ECS_CLUSTER" \
+  --services "$WEB_SERVICE" \
+  --query 'services[0].{taskDefinition:taskDefinition,desired:desiredCount,running:runningCount}'
+```
+If web health fails after scale-up, first scale web back to `0`:
+```bash
+aws ecs update-service \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --cluster "$ECS_CLUSTER" \
+  --service "$WEB_SERVICE" \
+  --desired-count 0
+```
+If a later task definition is bad, restore the previous task definition and keep
+workers disabled:
+```bash
+: "${PREVIOUS_TASK_DEFINITION_ARN:?set PREVIOUS_TASK_DEFINITION_ARN from the pre-update evidence}"
+aws ecs update-service \
+  --profile "$AWS_PROFILE_NAME" \
+  --region "$AWS_REGION" \
+  --cluster "$ECS_CLUSTER" \
+  --service "$WEB_SERVICE" \
+  --task-definition "$PREVIOUS_TASK_DEFINITION_ARN" \
+  --desired-count 1
+```
+Disable scheduler/reporter/probe work before data rollback. EFS backup restore
+requires separate operator approval, a selected recovery point, a replacement
+mount target/access point cutover, validation in staging, and an audit event.
+## Evidence Checklist
+A deployment record is not complete until it contains:
+- source commit, package version, published package integrity, and image digest;
+- Terraform plan summary and zero-count desired-count proof;
+- secret metadata proof showing `AWSCURRENT` without secret values;
+- protected edge smoke results and direct-origin denial evidence;
+- ECS service/task definition evidence;
+- CloudWatch log tail and alarm-state readback;
+- backup vault, protected-resource, recovery-point, and restore-drill evidence;
+- rollback command transcript or dry-run notes;
+- explicit list of remaining disabled workers and why they remain disabled.

package/infra/aws/outputs.tf CHANGED Viewed

@@ -26,6 +26,41 @@ output "evidence_bucket" {
   value = aws_s3_bucket.evidence.bucket
 }
+output "kms_key_arn" {
+  value = var.kms_key_arn
+}
+output "secret_refs" {
+  value = {
+    app_env      = var.app_env_secret_arn
+    hosted_token = var.hosted_token_secret_arn
+    public_probe = var.public_probe_secret_arn
+    reporting    = var.reporting_secret_arn
+  }
+}
+output "log_group_names" {
+  value = merge(
+    { image_builder = aws_cloudwatch_log_group.image_builder.name },
+    { for role, group in aws_cloudwatch_log_group.service : role => group.name },
+  )
+}
+output "alarm_names" {
+  value = {
+    web_5xx       = aws_cloudwatch_metric_alarm.web_5xx.alarm_name
+    web_unhealthy = aws_cloudwatch_metric_alarm.web_unhealthy.alarm_name
+  }
+}
+output "backup_vault_name" {
+  value = aws_backup_vault.data.name
+}
+output "backup_plan_id" {
+  value = aws_backup_plan.data.id
+}
 output "efs_file_system_id" {
   value = aws_efs_file_system.data.id
 }

package/infra/aws/terraform.tfvars.example CHANGED Viewed

@@ -15,7 +15,7 @@ public_subnet_ids        = ["subnet-replace-public-a", "subnet-replace-public-b"
 alb_ingress_cidr_blocks = []
 private_subnet_ids       = ["subnet-replace-private-a", "subnet-replace-private-b"]
 container_image          = "123456789012.dkr.ecr.us-east-1.amazonaws.com/open-uptime@sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
-runtime_package_version  = "0.1.9"
+runtime_package_version  = "0.1.11"
 certificate_arn          = null
 hosted_zone_id           = null
 app_env_secret_arn       = "arn:aws:secretsmanager:us-east-1:123456789012:secret:open-uptime/prod/app/env"

package/infra/aws/variables.tf CHANGED Viewed

@@ -116,7 +116,7 @@ variable "container_image" {
 variable "runtime_package_version" {
   description = "Published @hasna/uptime package version that CodeBuild should build into the ECR image."
   type        = string
-  default     = "0.1.9"
+  default     = "0.1.11"
   validation {
     condition     = can(regex("^[0-9]+\\.[0-9]+\\.[0-9]+(-[0-9A-Za-z.-]+)?$", var.runtime_package_version))

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@hasna/uptime",
-  "version": "0.1.9",
+  "version": "0.1.11",
   "description": "Local-first uptime and downtime monitoring service with CLI, MCP, SDK, SQLite persistence, and a dashboard.",
   "license": "Apache-2.0",
   "type": "module",