@growth-labs/monitoring 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/package.json +6 -3
- package/src/alerting/README.md +13 -0
- package/src/alerting/dedup.ts +70 -0
- package/src/alerting/escalation.ts +29 -0
- package/src/alerting/index.ts +95 -0
- package/src/alerting/thresholds.ts +148 -0
- package/src/index.ts +23 -0
- package/src/prober/README.md +18 -0
- package/src/prober/index.ts +83 -0
- package/src/prober/persist.ts +46 -0
- package/src/prober/runners/get-runner.ts +49 -0
- package/src/prober/runners/happy-path-runner.ts +270 -0
- package/src/prober/runners/post-runner.ts +50 -0
- package/src/prober/surfaces.ts +52 -0
- package/src/schemas/README.md +14 -0
- package/src/schemas/drizzle/schema.ts +59 -0
- package/src/schemas/index.ts +1 -0
- package/src/tail/README.md +18 -0
- package/src/tail/categorize.ts +156 -0
- package/src/tail/fingerprint.ts +21 -0
- package/src/tail/index.ts +71 -0
- package/src/tail/persist.ts +93 -0
- package/src/tail/redact.ts +30 -0
- package/src/tail/sample.ts +34 -0
- package/src/types.ts +111 -0
- package/src/virtual.d.ts +4 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# @growth-labs/monitoring
|
|
2
2
|
|
|
3
|
+
## 0.1.1 - 2026-05-21
|
|
4
|
+
|
|
5
|
+
### Fixed
|
|
6
|
+
|
|
7
|
+
- Include package source files in the published npm tarball so shipped `.js.map`
|
|
8
|
+
and `.d.ts.map` entries resolve without Vite/Astro sourcemap warnings in
|
|
9
|
+
tarball-installed consumers.
|
|
10
|
+
|
|
3
11
|
## 0.1.0 - 2026-05-18
|
|
4
12
|
|
|
5
13
|
- Added synthetic prober factory with GET, POST, and code-signin happy-path runners.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@growth-labs/monitoring",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"description": "Operational observability primitives for Growth Labs Cloudflare Workers: synthetic probes, Tail Worker capture, alerting, D1 schemas, and a status page.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"types": "./dist/index.d.ts",
|
|
@@ -50,8 +50,11 @@
|
|
|
50
50
|
},
|
|
51
51
|
"files": [
|
|
52
52
|
"dist",
|
|
53
|
-
"src
|
|
54
|
-
"src
|
|
53
|
+
"src/**/*.ts",
|
|
54
|
+
"src/**/*.astro",
|
|
55
|
+
"src/**/*.css",
|
|
56
|
+
"src/**/*.md",
|
|
57
|
+
"src/**/*.sql",
|
|
55
58
|
"README.md",
|
|
56
59
|
"CHANGELOG.md",
|
|
57
60
|
"SPEC.md"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Monitoring Alerting
|
|
2
|
+
|
|
3
|
+
The alerting layer is shared by the prober and Tail Worker.
|
|
4
|
+
|
|
5
|
+
- Probe alerts open incidents after consecutive failed checks and close them
|
|
6
|
+
after consecutive successful checks.
|
|
7
|
+
- Error alerts fire on new fingerprints outside the dedup window and on rate
|
|
8
|
+
spikes inside the configured rate window.
|
|
9
|
+
- Incident helpers read and write `gl_uptime_incidents`.
|
|
10
|
+
- Escalation helpers generate actions at 30 minutes, 4 hours, and 24 hours.
|
|
11
|
+
|
|
12
|
+
Alert delivery is delegated to `@growth-labs/notify`. This package owns when an
|
|
13
|
+
alert should fire; notify owns Slack/email transport behavior.
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import { errorMessage, generateId, type IdFactory, type UptimeIncident } from '../types.js'
|
|
2
|
+
import type { AlertAction } from './index.js'
|
|
3
|
+
|
|
4
|
+
/**
 * Looks up the most recently opened incident for `surface` whose `closed_at`
 * is still NULL in `gl_uptime_incidents`.
 *
 * @param surface - Surface name matched against the `surface` column.
 * @param db - D1 database that holds `gl_uptime_incidents`.
 * @returns The newest still-open incident row, or `null` when no row matches.
 */
export async function isIncidentOpen(
  surface: string,
  db: D1Database,
): Promise<UptimeIncident | null> {
  const statement = db.prepare(`
      SELECT id, surface, opened_at, closed_at, trigger_check_id, resolve_check_id, severity, notes
      FROM gl_uptime_incidents
      WHERE surface = ? AND closed_at IS NULL
      ORDER BY opened_at DESC
      LIMIT 1
    `)
  // ORDER BY ... DESC LIMIT 1 means at most the newest open row is returned.
  const newestOpen = await statement.bind(surface).first<UptimeIncident>()
  return newestOpen
}
|
|
19
|
+
|
|
20
|
+
/**
 * Opens an incident for `action.surface` unless one is already open.
 *
 * If `isIncidentOpen` finds an open incident, its id is returned and nothing
 * is written. Otherwise a new row is inserted into `gl_uptime_incidents`.
 * The insert is best-effort: a failure is logged with `console.error` and the
 * freshly generated id is still returned.
 *
 * @param action - Alert action; `surface`, `severity`, `payload.triggerCheckId`
 *   and `payload.notes` are read from it.
 * @param db - D1 database that holds `gl_uptime_incidents`.
 * @param idFactory - Produces the id for a new incident.
 * @param now - Clock returning epoch milliseconds; stored as whole seconds.
 * @returns The id of the existing or newly created incident.
 */
export async function openIncident(
  action: AlertAction,
  db: D1Database,
  idFactory: IdFactory = generateId,
  now = Date.now,
): Promise<string> {
  const alreadyOpen = await isIncidentOpen(action.surface, db)
  if (alreadyOpen) {
    return alreadyOpen.id
  }

  const incidentId = idFactory()
  try {
    const insert = db.prepare(`
        INSERT INTO gl_uptime_incidents
          (id, surface, opened_at, trigger_check_id, severity, notes)
        VALUES (?, ?, ?, ?, ?, ?)
      `)
    const openedAt = Math.floor(now() / 1000)
    // Payload fields are untyped, so only string values are persisted.
    const triggerCheckId =
      typeof action.payload?.triggerCheckId === 'string' ? action.payload.triggerCheckId : null
    // Anything that is not 'critical' is stored as 'warning'.
    const storedSeverity = action.severity === 'critical' ? 'critical' : 'warning'
    const notes = typeof action.payload?.notes === 'string' ? action.payload.notes : null

    await insert
      .bind(incidentId, action.surface, openedAt, triggerCheckId, storedSeverity, notes)
      .run()
  } catch (error) {
    console.error(`[monitoring] failed to open incident ${action.surface}: ${errorMessage(error)}`)
  }
  return incidentId
}
|
|
51
|
+
|
|
52
|
+
/**
 * Marks an incident as closed by setting `closed_at` and `resolve_check_id`.
 *
 * The UPDATE only touches rows whose `closed_at` is still NULL. The write is
 * best-effort: a failure is logged with `console.error` and not rethrown.
 *
 * @param incidentId - Id of the incident row to close.
 * @param resolveCheckId - Id of the check recorded as resolving the incident.
 * @param db - D1 database that holds `gl_uptime_incidents`.
 * @param now - Clock returning epoch milliseconds; stored as whole seconds.
 */
export async function closeIncident(
  incidentId: string,
  resolveCheckId: string,
  db: D1Database,
  now = Date.now,
): Promise<void> {
  try {
    const update = db.prepare(`
        UPDATE gl_uptime_incidents
        SET closed_at = ?, resolve_check_id = ?
        WHERE id = ? AND closed_at IS NULL
      `)
    const closedAt = Math.floor(now() / 1000)
    await update.bind(closedAt, resolveCheckId, incidentId).run()
  } catch (error) {
    console.error(`[monitoring] failed to close incident ${incidentId}: ${errorMessage(error)}`)
  }
}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { UptimeIncident } from '../types.js'
|
|
2
|
+
import type { AlertAction } from './index.js'
|
|
3
|
+
|
|
4
|
+
/**
 * Decides whether an open incident has reached an escalation checkpoint
 * (30 minutes, 4 hours or 24 hours since `opened_at`).
 *
 * Only the longest checkpoint already reached is considered; if its dedup key
 * is in `alreadySent`, no shorter checkpoint is tried.
 *
 * @param incident - Incident to evaluate; closed incidents never escalate.
 * @param nowSeconds - Current time in epoch seconds.
 * @param alreadySent - Dedup keys of escalations that were already delivered.
 * @returns An `escalation` action, or `null` when nothing should be sent.
 */
export function evaluateEscalation(
  incident: UptimeIncident,
  nowSeconds = Math.floor(Date.now() / 1000),
  alreadySent: Set<string> = new Set(),
): AlertAction | null {
  if (incident.closed_at !== null) return null

  const HOUR_SECONDS = 60 * 60
  // Ordered longest-first so the first match is the highest checkpoint reached.
  const ladder = [
    ['24h', 24 * HOUR_SECONDS],
    ['4h', 4 * HOUR_SECONDS],
    ['30m', 30 * 60],
  ] as const

  const openForSeconds = nowSeconds - incident.opened_at
  let reached: string | undefined
  for (const [label, minimumAge] of ladder) {
    if (openForSeconds >= minimumAge) {
      reached = label
      break
    }
  }
  if (reached === undefined) return null

  const dedupKey = `escalation:${incident.id}:${reached}`
  if (alreadySent.has(dedupKey)) return null

  return {
    kind: 'escalation',
    surface: incident.surface,
    // Every checkpoint escalates at critical severity.
    severity: 'critical',
    title: `${incident.surface} incident still open`,
    body: `Incident ${incident.id} has been open for ${reached}.`,
    dedupKey,
    payload: { incidentId: incident.id, checkpoint: reached },
  }
}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import { type NotifyEnv, type NotifyResult, notify } from '@growth-labs/notify'
|
|
2
|
+
import type { CategorizedEvent } from '../tail/categorize.js'
|
|
3
|
+
import type {
|
|
4
|
+
AlertSeverity,
|
|
5
|
+
NotifyConfig,
|
|
6
|
+
PersistedCheckResult,
|
|
7
|
+
ThresholdConfig,
|
|
8
|
+
} from '../types.js'
|
|
9
|
+
import { closeIncident, openIncident } from './dedup.js'
|
|
10
|
+
import { evaluateError, evaluateProbe } from './thresholds.js'
|
|
11
|
+
|
|
12
|
+
export type AlertKind =
|
|
13
|
+
| 'open-incident'
|
|
14
|
+
| 'close-incident'
|
|
15
|
+
| 'rate-spike'
|
|
16
|
+
| 'new-error'
|
|
17
|
+
| 'escalation'
|
|
18
|
+
|
|
19
|
+
export interface AlertAction {
|
|
20
|
+
kind: AlertKind
|
|
21
|
+
surface: string
|
|
22
|
+
severity: AlertSeverity
|
|
23
|
+
title: string
|
|
24
|
+
body: string
|
|
25
|
+
dedupKey: string
|
|
26
|
+
payload?: Record<string, unknown>
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
export interface AlertEngine {
|
|
30
|
+
activeIncidentSurfaces: Set<string>
|
|
31
|
+
handleProbeResult(
|
|
32
|
+
result: PersistedCheckResult,
|
|
33
|
+
surfaceName: string,
|
|
34
|
+
db: D1Database,
|
|
35
|
+
env: Record<string, unknown>,
|
|
36
|
+
): Promise<AlertAction | null>
|
|
37
|
+
handleErrorEvent(
|
|
38
|
+
event: CategorizedEvent & { fingerprint: string },
|
|
39
|
+
db: D1Database,
|
|
40
|
+
env: Record<string, unknown>,
|
|
41
|
+
): Promise<AlertAction | null>
|
|
42
|
+
send(env: Record<string, unknown>, action: AlertAction): Promise<NotifyResult>
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
export interface AlerterConfig {
|
|
46
|
+
notifyConfig: NotifyConfig
|
|
47
|
+
alertingConfig?: ThresholdConfig
|
|
48
|
+
notifyFn?: (env: Record<string, unknown>, action: AlertAction) => Promise<NotifyResult>
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/**
 * Builds an alert engine that evaluates probe results and error events,
 * maintains incident rows, and delivers the resulting alerts.
 *
 * Delivery goes through `config.notifyFn` when provided; otherwise `notify`
 * is called with `config.notifyConfig` plus the action's severity, title,
 * body and dedup key.
 *
 * @param config - Notification settings, optional thresholds and an optional
 *   delivery override.
 * @returns An engine exposing `handleProbeResult`, `handleErrorEvent`, `send`
 *   and the in-memory `activeIncidentSurfaces` set.
 */
export function createAlerter(config: AlerterConfig): AlertEngine {
  const thresholds = config.alertingConfig ?? {}
  const activeIncidentSurfaces = new Set<string>()

  const send = async (env: Record<string, unknown>, action: AlertAction): Promise<NotifyResult> => {
    if (config.notifyFn) {
      return config.notifyFn(env, action)
    }
    const { severity, title, body, dedupKey } = action
    return notify(env as NotifyEnv, { ...config.notifyConfig, severity, title, body, dedupKey })
  }

  return {
    activeIncidentSurfaces,
    send,
    async handleProbeResult(result, surfaceName, db, env) {
      const action = await evaluateProbe(result, surfaceName, db, thresholds)
      if (!action) return null

      // Persist the incident transition before notifying.
      switch (action.kind) {
        case 'open-incident': {
          await openIncident(action, db)
          activeIncidentSurfaces.add(action.surface)
          break
        }
        case 'close-incident': {
          const incidentId = action.payload?.incidentId
          // Without a string incident id there is no row to close.
          if (typeof incidentId === 'string') {
            await closeIncident(incidentId, result.id, db)
            activeIncidentSurfaces.delete(action.surface)
          }
          break
        }
      }

      await send(env, action)
      return action
    },
    async handleErrorEvent(event, db, env) {
      const action = await evaluateError(event, db, thresholds)
      if (!action) return null

      if (action.kind === 'rate-spike') {
        activeIncidentSurfaces.add(action.surface)
      }
      await send(env, action)
      return action
    },
  }
}
|
|
92
|
+
|
|
93
|
+
export { closeIncident, isIncidentOpen, openIncident } from './dedup.js'
|
|
94
|
+
export { evaluateEscalation } from './escalation.js'
|
|
95
|
+
export { evaluateError, evaluateProbe } from './thresholds.js'
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import type { CategorizedEvent } from '../tail/categorize.js'
|
|
2
|
+
import type { CheckResult, ThresholdConfig, UptimeIncident } from '../types.js'
|
|
3
|
+
import { isIncidentOpen } from './dedup.js'
|
|
4
|
+
import type { AlertAction } from './index.js'
|
|
5
|
+
|
|
6
|
+
interface CheckRow {
|
|
7
|
+
id: string
|
|
8
|
+
status: string
|
|
9
|
+
checked_at: number
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
interface CountRow {
|
|
13
|
+
count: number
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
const DEFAULT_FAILURES_TO_OPEN = 2
|
|
17
|
+
const DEFAULT_SUCCESSES_TO_CLOSE = 1
|
|
18
|
+
const DEFAULT_DEDUP_WINDOW_MS = 60 * 60 * 1000
|
|
19
|
+
const DEFAULT_RATE_SPIKE_THRESHOLD = 10
|
|
20
|
+
const DEFAULT_RATE_SPIKE_WINDOW_MS = 5 * 60 * 1000
|
|
21
|
+
|
|
22
|
+
/**
 * Turns a probe result into an incident transition, if one is due.
 *
 * - Passing result with an open incident: returns a `close-incident` action
 *   once the latest `consecutiveSuccessesToClose` stored checks all passed.
 * - Non-passing result with no open incident: returns an `open-incident`
 *   action once the latest `consecutiveFailuresToOpen` stored checks all
 *   have a status other than `'pass'`.
 * - Otherwise returns `null`.
 *
 * @param result - The check result being evaluated.
 * @param surfaceName - Surface whose incident state and history are consulted.
 * @param db - D1 database with the incident and check tables.
 * @param config - Optional threshold overrides.
 */
export async function evaluateProbe(
  result: CheckResult,
  surfaceName: string,
  db: D1Database,
  config: ThresholdConfig = {},
): Promise<AlertAction | null> {
  const incident = await isIncidentOpen(surfaceName, db)

  if (result.status === 'pass') {
    // A pass only matters when there is an incident to close.
    if (!incident) return null
    const required = config.consecutiveSuccessesToClose ?? DEFAULT_SUCCESSES_TO_CLOSE
    const history = await recentChecks(db, surfaceName, required)
    const recovered = history.length >= required && history.every((row) => row.status === 'pass')
    return recovered ? closeIncidentAction(incident, result) : null
  }

  // An incident is already open for this surface; do not open another.
  if (incident) return null

  const failuresToOpen = config.consecutiveFailuresToOpen ?? DEFAULT_FAILURES_TO_OPEN
  const history = await recentChecks(db, surfaceName, failuresToOpen)
  if (history.length < failuresToOpen) return null
  if (history.some((row) => row.status === 'pass')) return null

  const detail = result.errorMessage ? ` (${result.errorMessage})` : ''
  return {
    kind: 'open-incident',
    surface: surfaceName,
    severity: config.minSeverity ?? 'warning',
    title: `${surfaceName} synthetic check failing`,
    body: `${failuresToOpen} consecutive synthetic checks failed. Latest status: ${result.status}${detail}.`,
    dedupKey: `incident:${surfaceName}`,
    payload: { triggerCheckId: result.id },
  }
}
|
|
58
|
+
|
|
59
|
+
/**
 * Decides whether an error event should raise an alert.
 *
 * First counts rows in the rate window ending at the event time; at or above
 * the spike threshold a critical `rate-spike` action is returned. Otherwise,
 * if any row with the same fingerprint exists inside the dedup window, the
 * event is suppressed (`null`). Otherwise a `new-error` action is returned,
 * with `warning` severity for `console-warn` / `slow-request` categories and
 * `critical` for everything else.
 *
 * @param event - Categorized event with its fingerprint.
 * @param db - D1 database queried for fingerprint counts.
 * @param config - Optional window and threshold overrides.
 */
export async function evaluateError(
  event: CategorizedEvent & { fingerprint: string },
  db: D1Database,
  config: ThresholdConfig = {},
): Promise<AlertAction | null> {
  const { fingerprint, surface, category } = event
  const occurredAtSeconds = Math.floor(event.occurredAt / 1000)
  // Lower bound (epoch seconds) of a window of `windowMs` ending at the event.
  const windowStart = (windowMs: number): number => occurredAtSeconds - Math.floor(windowMs / 1000)

  const rateWindowMs = config.rateSpikeWindowMs ?? DEFAULT_RATE_SPIKE_WINDOW_MS
  const spikeThreshold = config.rateSpikeThreshold ?? DEFAULT_RATE_SPIKE_THRESHOLD
  const recentCount = await countFingerprintSince(db, fingerprint, windowStart(rateWindowMs))
  if (recentCount >= spikeThreshold) {
    const windowMinutes = Math.round(rateWindowMs / 60_000)
    return {
      kind: 'rate-spike',
      surface,
      severity: 'critical',
      title: `${surface} error rate spike`,
      body: `${recentCount} events with fingerprint ${fingerprint} occurred in the last ${windowMinutes} minutes.`,
      dedupKey: `rate-spike:${surface}:${fingerprint}`,
      payload: { fingerprint, count: recentCount },
    }
  }

  const dedupWindowMs = config.newErrorDedupWindowMs ?? DEFAULT_DEDUP_WINDOW_MS
  const seenInDedupWindow = await countFingerprintSince(db, fingerprint, windowStart(dedupWindowMs))
  if (seenInDedupWindow > 0) return null

  const lowPriority = category === 'console-warn' || category === 'slow-request'
  return {
    kind: 'new-error',
    surface,
    severity: lowPriority ? 'warning' : 'critical',
    title: `${surface} new error fingerprint`,
    body: `${category} fingerprint ${fingerprint}: ${event.message}`,
    dedupKey: `new-error:${fingerprint}`,
    payload: { fingerprint, category },
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Reads the newest `limit` rows for `surface` from `gl_uptime_checks`,
 * ordered by `checked_at` descending.
 *
 * @returns The matching rows, or an empty array when the query yields no
 *   `results`.
 */
async function recentChecks(db: D1Database, surface: string, limit: number): Promise<CheckRow[]> {
  const query = db.prepare(`
      SELECT id, status, checked_at
      FROM gl_uptime_checks
      WHERE surface = ?
      ORDER BY checked_at DESC
      LIMIT ?
    `)
  const response = await query.bind(surface, limit).all<CheckRow>()
  return response.results ?? []
}
|
|
121
|
+
|
|
122
|
+
/**
 * Counts `gl_errors` rows with the given fingerprint whose `occurred_at` is
 * at or after `since`.
 *
 * @param since - Inclusive lower bound compared against `occurred_at`.
 * @returns The count coerced with `Number`, or 0 when no row/count comes back.
 */
async function countFingerprintSince(
  db: D1Database,
  fingerprint: string,
  since: number,
): Promise<number> {
  const query = db.prepare(`
      SELECT COUNT(*) AS count
      FROM gl_errors
      WHERE fingerprint = ? AND occurred_at >= ?
    `)
  const row = await query.bind(fingerprint, since).first<CountRow>()
  if (row == null || row.count == null) return 0
  return Number(row.count)
}
|
|
137
|
+
|
|
138
|
+
/**
 * Builds the informational `close-incident` action for an incident that the
 * given check result resolves.
 *
 * @param open - The incident being closed; its id and surface are used.
 * @param result - The passing check; its id (or `'unknown'`) is named in the body.
 */
function closeIncidentAction(open: UptimeIncident, result: CheckResult): AlertAction {
  const { id: incidentId, surface } = open
  const checkLabel = result.id ?? 'unknown'
  return {
    kind: 'close-incident',
    surface,
    severity: 'info',
    title: `${surface} recovered`,
    body: `Synthetic check ${checkLabel} passed and closed incident ${incidentId}.`,
    dedupKey: `incident-closed:${incidentId}`,
    payload: { incidentId, resolveCheckId: result.id },
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
export type { AlertAction, AlertEngine, AlertKind } from './alerting/index.js'
|
|
2
|
+
export { createAlerter } from './alerting/index.js'
|
|
3
|
+
export type { ProberConfig } from './prober/index.js'
|
|
4
|
+
export { createProber } from './prober/index.js'
|
|
5
|
+
export type { StatusPageAppConfig } from './status-page/app.js'
|
|
6
|
+
export { createStatusPageApp } from './status-page/app.js'
|
|
7
|
+
export type { TailWorkerConfig } from './tail/index.js'
|
|
8
|
+
export { createTailWorker } from './tail/index.js'
|
|
9
|
+
export type {
|
|
10
|
+
AlertSeverity,
|
|
11
|
+
CheckResult,
|
|
12
|
+
CheckStatus,
|
|
13
|
+
CheckType,
|
|
14
|
+
ErrorRollup,
|
|
15
|
+
ErrorSeverity,
|
|
16
|
+
NotifyConfig,
|
|
17
|
+
PersistedCheckResult,
|
|
18
|
+
SamplingConfig,
|
|
19
|
+
StatusPageConfig,
|
|
20
|
+
SurfaceStatus,
|
|
21
|
+
ThresholdConfig,
|
|
22
|
+
UptimeIncident,
|
|
23
|
+
} from './types.js'
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Monitoring Prober
|
|
2
|
+
|
|
3
|
+
`createProber(config)` returns a `scheduledHandler` for Cloudflare Cron Workers.
|
|
4
|
+
The handler matches `event.cron` against configured surfaces and runs all matching
|
|
5
|
+
checks with `ctx.waitUntil`.
|
|
6
|
+
|
|
7
|
+
Supported checks:
|
|
8
|
+
|
|
9
|
+
- `kind: 'get'`: `GET`, `redirect: 'manual'`, status assertion, optional content marker.
|
|
10
|
+
- `kind: 'post'`: form-url-encoded `POST`, `redirect: 'manual'`, status assertion, optional `Location` regex.
|
|
11
|
+
- `kind: 'happy_path'`: code-signin flow with Gmail polling and account-page assertion.
|
|
12
|
+
|
|
13
|
+
All D1 writes are best-effort. A failed persistence call logs to `console.error`
|
|
14
|
+
but does not throw to the scheduled runtime.
|
|
15
|
+
|
|
16
|
+
Invariant: `auth-monitor` must set `notifyConfig.emailProvider` to `'resend'`
|
|
17
|
+
when email alerts are enabled. The monitor checks Cloudflare-hosted auth
|
|
18
|
+
surfaces, so email alert delivery cannot rely on Cloudflare Email.
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { createAlerter } from '../alerting/index.js'
|
|
2
|
+
import type {
|
|
3
|
+
ExecutionContextLike,
|
|
4
|
+
NotifyConfig,
|
|
5
|
+
ProbeAlertingConfig,
|
|
6
|
+
ScheduledEventLike,
|
|
7
|
+
} from '../types.js'
|
|
8
|
+
import { errorMessage } from '../types.js'
|
|
9
|
+
import { persistCheckResult } from './persist.js'
|
|
10
|
+
import { runGet } from './runners/get-runner.js'
|
|
11
|
+
import { type HappyPathRuntime, runHappyPath } from './runners/happy-path-runner.js'
|
|
12
|
+
import { runPost } from './runners/post-runner.js'
|
|
13
|
+
import type { SurfaceConfig } from './surfaces.js'
|
|
14
|
+
|
|
15
|
+
export interface ProberConfig {
|
|
16
|
+
realmId: string
|
|
17
|
+
d1Binding: string
|
|
18
|
+
notifyConfig: NotifyConfig
|
|
19
|
+
alertingConfig?: ProbeAlertingConfig
|
|
20
|
+
surfaces: SurfaceConfig[]
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/**
 * Builds the synthetic prober for the configured surfaces.
 *
 * @param config - Surfaces to probe, the name of the D1 binding on `env`, and
 *   notification/alerting settings handed to `createAlerter`.
 * @returns `scheduledHandler`, which schedules every surface whose `schedule`
 *   equals `event.cron` via `ctx.waitUntil`, and `checkSurface`, which runs,
 *   persists and alerts on a single surface.
 */
export function createProber(config: ProberConfig) {
  const { notifyConfig, alertingConfig } = config
  const alerter = createAlerter({ notifyConfig, alertingConfig })

  /** Runs one surface; any error is logged rather than rethrown. */
  async function checkSurface(surface: SurfaceConfig, env: Record<string, unknown>): Promise<void> {
    const db = env[config.d1Binding] as D1Database | undefined
    try {
      // The probe runs even when the binding is missing; only persistence
      // and alerting are skipped in that case.
      const result = await runSurface(surface, env)
      if (db) {
        const persisted = await persistCheckResult(db, surface, result)
        await alerter.handleProbeResult(persisted, surface.name, db, env)
      } else {
        console.error(`[monitoring] missing D1 binding env.${config.d1Binding}`)
      }
    } catch (error) {
      console.error(
        `[monitoring] synthetic check failed for ${surface.name}: ${errorMessage(error)}`,
      )
    }
  }

  /** Hands every surface scheduled for `event.cron` to `ctx.waitUntil`. */
  async function scheduledHandler(
    event: ScheduledEventLike,
    env: Record<string, unknown>,
    ctx: ExecutionContextLike,
  ): Promise<void> {
    const due = config.surfaces.filter((candidate) => candidate.schedule === event.cron)
    due.forEach((surface) => {
      ctx.waitUntil(checkSurface(surface, env))
    })
  }

  return { scheduledHandler, checkSurface }
}
|
|
60
|
+
|
|
61
|
+
/**
 * Dispatches a surface to the runner that matches its `kind`.
 *
 * @param surface - Surface configuration; `kind` selects the runner.
 * @param env - Worker environment, forwarded to the happy-path runner only.
 * @returns The check result produced by the selected runner.
 * @throws Error when `surface.kind` is not one of the supported kinds. The
 *   type system rules this out, but configuration built from untyped data can
 *   still reach it; failing here gives a clear message instead of resolving
 *   to `undefined`.
 */
async function runSurface(surface: SurfaceConfig, env: Record<string, unknown>) {
  switch (surface.kind) {
    case 'get':
      return runGet(surface)
    case 'post':
      return runPost(surface)
    case 'happy_path':
      return runHappyPath(surface, { env } satisfies HappyPathRuntime)
    default: {
      // Only reachable when the runtime value violates the SurfaceConfig union.
      const unknownKind = (surface as { kind?: unknown }).kind
      throw new Error(`[monitoring] unsupported surface kind: ${String(unknownKind)}`)
    }
  }
}
|
|
71
|
+
|
|
72
|
+
export { persistCheckResult } from './persist.js'
|
|
73
|
+
export { runGet } from './runners/get-runner.js'
|
|
74
|
+
export { extractCodeFromMessage, runHappyPath } from './runners/happy-path-runner.js'
|
|
75
|
+
export { runPost } from './runners/post-runner.js'
|
|
76
|
+
export type {
|
|
77
|
+
CodeSigninHappyPathConfig,
|
|
78
|
+
GetSurface,
|
|
79
|
+
GmailPollingConfig,
|
|
80
|
+
HappyPathSurface,
|
|
81
|
+
PostSurface,
|
|
82
|
+
SurfaceConfig,
|
|
83
|
+
} from './surfaces.js'
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import {
|
|
2
|
+
type CheckResult,
|
|
3
|
+
errorMessage,
|
|
4
|
+
generateId,
|
|
5
|
+
type IdFactory,
|
|
6
|
+
type PersistedCheckResult,
|
|
7
|
+
} from '../types.js'
|
|
8
|
+
import type { SurfaceConfig } from './surfaces.js'
|
|
9
|
+
|
|
10
|
+
/**
 * Writes a check result to `gl_uptime_checks` and returns it with its id and
 * timestamp filled in.
 *
 * The insert is best-effort: a failure is logged with `console.error` and the
 * populated result is returned regardless.
 *
 * @param db - D1 database that holds `gl_uptime_checks`.
 * @param surface - Surface the check belongs to; `name` and `kind` are stored.
 * @param result - Check outcome; an existing `id` / `checkedAt` is kept.
 * @param idFactory - Produces an id when `result.id` is absent.
 * @param now - Clock returning epoch milliseconds, used when
 *   `result.checkedAt` is absent (stored as whole seconds).
 * @returns `result` extended with the resolved `id` and `checkedAt`.
 */
export async function persistCheckResult(
  db: D1Database,
  surface: SurfaceConfig,
  result: CheckResult,
  idFactory: IdFactory = generateId,
  now = Date.now,
): Promise<PersistedCheckResult> {
  const id = result.id ?? idFactory()
  const checkedAt = result.checkedAt ?? Math.floor(now() / 1000)
  const persisted: PersistedCheckResult = { ...result, id, checkedAt }

  try {
    const insert = db.prepare(`
        INSERT INTO gl_uptime_checks
          (id, surface, check_type, status, status_code, latency_ms, error_message, checked_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
      `)
    // Optional fields are bound as NULL when absent.
    const statusCode = result.statusCode ?? null
    const failureText = result.errorMessage ?? null
    await insert
      .bind(
        id,
        surface.name,
        surface.kind,
        result.status,
        statusCode,
        result.latencyMs,
        failureText,
        checkedAt,
      )
      .run()
  } catch (error) {
    console.error(
      `[monitoring] failed to persist uptime check ${surface.name}: ${errorMessage(error)}`,
    )
  }

  return persisted
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { type CheckResult, errorMessage, isTimeoutMessage, type RuntimeFetch } from '../../types.js'
|
|
2
|
+
import type { GetSurface } from '../surfaces.js'
|
|
3
|
+
|
|
4
|
+
/**
 * Probes a surface with a single `GET` request and checks its assertions.
 *
 * Redirects are not followed (`redirect: 'manual'`) and the request is
 * aborted after `surface.timeoutMs`. Latency is measured up to the point the
 * response is received, before any body is read.
 *
 * @param surface - URL, timeout and assertions (expected status code and an
 *   optional content marker that must appear in the body).
 * @param runtime - Optional `fetcher` override; the global `fetch` is used
 *   otherwise.
 * @returns `pass` when all assertions hold, `fail` with an explanatory
 *   message otherwise; a thrown error becomes `timeout` or `fail` depending
 *   on `isTimeoutMessage`.
 */
export async function runGet(
  surface: GetSurface,
  runtime: RuntimeFetch = {},
): Promise<CheckResult> {
  const startedAt = Date.now()
  const doFetch = runtime.fetcher ?? fetch

  try {
    const response = await doFetch(surface.url, {
      method: 'GET',
      redirect: 'manual',
      signal: AbortSignal.timeout(surface.timeoutMs),
    })
    const latencyMs = Date.now() - startedAt
    const statusCode = response.status
    const expectedStatus = surface.assertions.statusCode

    if (statusCode !== expectedStatus) {
      return {
        status: 'fail',
        statusCode,
        latencyMs,
        errorMessage: `expected ${expectedStatus}, got ${statusCode}`,
      }
    }

    const marker = surface.assertions.contentMarker
    if (marker) {
      // The body is only read when a marker assertion is configured.
      const text = await response.text()
      if (!text.includes(marker)) {
        return {
          status: 'fail',
          statusCode,
          latencyMs,
          errorMessage: `content marker "${marker}" not found in body`,
        }
      }
    }

    return { status: 'pass', statusCode, latencyMs }
  } catch (error) {
    const message = errorMessage(error)
    const latencyMs = Date.now() - startedAt
    const status = isTimeoutMessage(message) ? 'timeout' : 'fail'
    return { status, latencyMs, errorMessage: message }
  }
}
|