@growth-labs/monitoring 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/package.json +6 -3
- package/src/alerting/README.md +13 -0
- package/src/alerting/dedup.ts +70 -0
- package/src/alerting/escalation.ts +29 -0
- package/src/alerting/index.ts +95 -0
- package/src/alerting/thresholds.ts +148 -0
- package/src/index.ts +23 -0
- package/src/prober/README.md +18 -0
- package/src/prober/index.ts +83 -0
- package/src/prober/persist.ts +46 -0
- package/src/prober/runners/get-runner.ts +49 -0
- package/src/prober/runners/happy-path-runner.ts +270 -0
- package/src/prober/runners/post-runner.ts +50 -0
- package/src/prober/surfaces.ts +52 -0
- package/src/schemas/README.md +14 -0
- package/src/schemas/drizzle/schema.ts +59 -0
- package/src/schemas/index.ts +1 -0
- package/src/tail/README.md +18 -0
- package/src/tail/categorize.ts +156 -0
- package/src/tail/fingerprint.ts +21 -0
- package/src/tail/index.ts +71 -0
- package/src/tail/persist.ts +93 -0
- package/src/tail/redact.ts +30 -0
- package/src/tail/sample.ts +34 -0
- package/src/types.ts +111 -0
- package/src/virtual.d.ts +4 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { createAlerter } from '../alerting/index.js'
|
|
2
|
+
import {
|
|
3
|
+
errorMessage,
|
|
4
|
+
type NotifyConfig,
|
|
5
|
+
type SamplingConfig,
|
|
6
|
+
type TailAlertingConfig,
|
|
7
|
+
} from '../types.js'
|
|
8
|
+
import { categorize, type TraceItem } from './categorize.js'
|
|
9
|
+
import { computeFingerprint } from './fingerprint.js'
|
|
10
|
+
import { persistErrorEvent } from './persist.js'
|
|
11
|
+
import { redact } from './redact.js'
|
|
12
|
+
import { shouldKeep } from './sample.js'
|
|
13
|
+
|
|
14
|
+
/**
 * Settings consumed by `createTailWorker`.
 */
export interface TailWorkerConfig {
  /** Realm identifier forwarded to `persistErrorEvent` for every stored event. */
  realmId: string

  /** Name of the `env` property that is read to obtain the D1 database. */
  d1Binding: string

  /** Name of the `env` property that is read to obtain the object exposing `writeDataPoint`. */
  waeBinding: string

  /** Passed through unchanged to `createAlerter`. */
  notifyConfig: NotifyConfig

  /** Optional; passed through unchanged to `createAlerter`. */
  alertingConfig?: TailAlertingConfig

  /**
   * Sampling settings handed to `shouldKeep`; its `slowRequestThresholdMs`
   * is additionally given to `categorize`.
   */
  sampling: SamplingConfig
}
|
|
22
|
+
|
|
23
|
+
/**
 * Builds a tail handler bound to the given configuration.
 *
 * An alerter is created once, up front. The returned handler then, for each
 * trace item, iterates whatever `categorize` yields and for every yielded
 * event: computes a fingerprint, asks `shouldKeep` whether to retain it,
 * redacts it, persists the redacted copy, and finally hands the event to the
 * alerter. A failure while processing one event is logged and does not stop
 * the remaining events.
 *
 * NOTE(review): the alerter is given the event *before* redaction, while
 * only the redacted copy is persisted — confirm that is intentional.
 *
 * @param config - Binding names, realm id, sampling and alerting settings.
 * @returns An object exposing the same handler under `tail` and `tailHandler`.
 */
export function createTailWorker(config: TailWorkerConfig) {
  const { notifyConfig, alertingConfig } = config
  const alerter = createAlerter({ notifyConfig, alertingConfig })

  async function tailHandler(events: TraceItem[], env: Record<string, unknown>): Promise<void> {
    const database = env[config.d1Binding] as D1Database | undefined
    const analytics = env[config.waeBinding] as
      | { writeDataPoint(dataPoint: unknown): unknown }
      | undefined

    // Without the database nothing can be persisted, so the whole batch is dropped.
    if (!database) {
      console.error(`[monitoring] missing D1 binding env.${config.d1Binding}`)
      return
    }

    for (const trace of events) {
      const categorized = categorize(trace, {
        slowRequestThresholdMs: config.sampling.slowRequestThresholdMs,
      })

      for (const candidate of categorized) {
        try {
          const fingerprint = await computeFingerprint(candidate)
          const enriched = { ...candidate, fingerprint }

          const keep = shouldKeep(enriched.category, config.sampling, {
            surface: enriched.surface,
            activeIncidentSurfaces: alerter.activeIncidentSurfaces,
          })

          if (keep) {
            const sanitized = redact(enriched) as typeof enriched
            await persistErrorEvent(database, analytics, { realmId: config.realmId }, sanitized)
            await alerter.handleErrorEvent(enriched, database, env)
          }
        } catch (failure) {
          console.error(`[monitoring] failed to process tail event: ${errorMessage(failure)}`)
        }
      }
    }
  }

  return { tail: tailHandler, tailHandler }
}
|
|
65
|
+
|
|
66
|
+
export type { CategorizedEvent, ErrorCategory, TraceItem } from './categorize.js'
|
|
67
|
+
export { categorize, normalizePath } from './categorize.js'
|
|
68
|
+
export { computeFingerprint, normalizeMessage } from './fingerprint.js'
|
|
69
|
+
export { persistErrorEvent, severityFromCategory } from './persist.js'
|
|
70
|
+
export { redact, redactString, redactSurface } from './redact.js'
|
|
71
|
+
export { shouldKeep } from './sample.js'
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { type ErrorSeverity, errorMessage, generateId, type IdFactory } from '../types.js'
|
|
2
|
+
import type { CategorizedEvent, ErrorCategory } from './categorize.js'
|
|
3
|
+
|
|
4
|
+
/** Per-call settings for `persistErrorEvent`. */
interface PersistConfig {
  /** Written to the `realm_key` column and to the second analytics blob. */
  realmId: string
}

/**
 * Minimal shape required of the analytics binding: a single
 * `writeDataPoint` method whose result may or may not be a promise.
 */
interface AnalyticsEngineBinding {
  writeDataPoint(dataPoint: unknown): Promise<unknown> | unknown
}

/** A categorized event that additionally carries its fingerprint. */
export interface FingerprintedEvent extends CategorizedEvent {
  fingerprint: string
}
|
|
15
|
+
|
|
16
|
+
/**
 * Stores one fingerprinted event: a row in the `gl_errors` table and, when an
 * analytics binding is supplied, one analytics data point.
 *
 * Both writes are best-effort. A failure in either is logged with
 * `console.error` and is not rethrown; a failed database insert does not
 * prevent the analytics write from being attempted.
 *
 * NOTE(review): the database receives `occurredAt / 1000` rounded down while
 * the analytics point receives `occurredAt` unchanged — this presumably means
 * `occurredAt` is in milliseconds and the table stores seconds; confirm.
 *
 * @param db - Database used for the insert.
 * @param wae - Analytics binding; when falsy the analytics write is skipped.
 * @param config - Supplies the realm id recorded with the event.
 * @param event - Event to store.
 * @param idFactory - Produces the row id; defaults to `generateId`.
 */
export async function persistErrorEvent(
  db: D1Database,
  wae: AnalyticsEngineBinding | undefined,
  config: PersistConfig,
  event: FingerprintedEvent,
  idFactory: IdFactory = generateId,
): Promise<void> {
  const severity = severityFromCategory(event.category)

  try {
    const statement = db.prepare(`
        INSERT INTO gl_errors
          (id, realm_key, surface, severity, message, stack, request_id, status_code, duration_ms, occurred_at, fingerprint)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
      `)
    const bound = statement.bind(
      idFactory(),
      config.realmId,
      event.surface,
      severity,
      event.message,
      // Optional fields are bound as null when absent.
      event.stack ?? null,
      event.requestId ?? null,
      event.statusCode ?? null,
      event.durationMs ?? null,
      Math.floor(event.occurredAt / 1000),
      event.fingerprint,
    )
    await bound.run()
  } catch (failure) {
    console.error(`[monitoring] failed to persist error event: ${errorMessage(failure)}`)
  }

  if (!wae) return

  try {
    // The blob list is positional with 20 slots; every slot not assigned
    // below is written as an empty string.
    const blobs: string[] = new Array<string>(20).fill('')
    blobs[0] = severity
    blobs[1] = config.realmId
    blobs[4] = event.surface
    blobs[17] = event.category
    blobs[18] = event.fingerprint

    await wae.writeDataPoint({
      blobs,
      doubles: [event.occurredAt, event.statusCode ?? 0, event.durationMs ?? 0],
      indexes: [severity],
    })
  } catch (failure) {
    console.error(`[monitoring] failed to write WAE error event: ${errorMessage(failure)}`)
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Maps an error category to the severity recorded with the event.
 *
 * `exception` stays `exception`; `console-warn` and `slow-request` become
 * `warning`; `fivexx` and `console-error` become `error`.
 *
 * @param category - Category assigned to the event.
 * @returns The severity for that category.
 */
export function severityFromCategory(category: ErrorCategory): ErrorSeverity {
  const severityByCategory: Record<ErrorCategory, ErrorSeverity> = {
    exception: 'exception',
    'console-warn': 'warning',
    'slow-request': 'warning',
    fivexx: 'error',
    'console-error': 'error',
  }
  return severityByCategory[category]
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { CategorizedEvent } from './categorize.js'
|
|
2
|
+
|
|
3
|
+
/** Upper bound, in UTF-16 code units, on a redacted message. */
const MAX_MESSAGE_LENGTH = 2000
/** Upper bound, in UTF-16 code units, on a redacted stack. */
const MAX_STACK_LENGTH = 4000

/**
 * Returns a copy of the event that is safe to store.
 *
 * The message and stack are passed through `redactString` and then cut to
 * their maximum lengths; the surface is passed through `redactSurface`.
 * An absent or empty stack is returned as `undefined`. All other fields are
 * copied unchanged and the input object is not modified.
 *
 * @param event - Event to sanitize.
 * @returns A new event object with sanitized `message`, `stack` and `surface`.
 */
export function redact(event: CategorizedEvent): CategorizedEvent {
  const message = truncate(redactString(event.message), MAX_MESSAGE_LENGTH)

  let stack: string | undefined
  if (event.stack) {
    stack = truncate(redactString(event.stack), MAX_STACK_LENGTH)
  }

  const surface = redactSurface(event.surface)

  return { ...event, message, stack, surface }
}
|
|
14
|
+
|
|
15
|
+
/**
 * Replaces secret-looking substrings with fixed placeholders.
 *
 * Rules are applied in order, each over the output of the previous one:
 * e-mail addresses, three-part `eyJ…` tokens, `cfut_…` tokens, `sk-…` keys of
 * at least 20 characters, and finally any alphanumeric run of 32 or more
 * characters that contains at least one digit.
 *
 * @param value - Text to scrub.
 * @returns The text with every match replaced by its placeholder.
 */
export function redactString(value: string): string {
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b/g, '[redacted-email]'],
    [/\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g, '[redacted-jwt]'],
    [/\bcfut_[A-Za-z0-9_-]+\b/g, '[redacted-cf-token]'],
    [/\bsk-[A-Za-z0-9_-]{20,}\b/g, '[redacted-api-key]'],
    [/\b(?=[A-Za-z0-9]{32,}\b)(?=[A-Za-z0-9]*\d)[A-Za-z0-9]+\b/g, '[redacted-token]'],
  ]

  let scrubbed = value
  for (const [pattern, placeholder] of rules) {
    scrubbed = scrubbed.replace(pattern, placeholder)
  }
  return scrubbed
}
|
|
23
|
+
|
|
24
|
+
/**
 * Removes the first query string from a surface.
 *
 * The removed span starts at the first `?` that is followed by at least one
 * non-whitespace character and runs up to the next whitespace (or the end).
 * A `?` with nothing after it, and any later query strings, are left alone.
 *
 * @param surface - Surface text, possibly containing a query string.
 * @returns The surface with that span removed, or unchanged if none is found.
 */
export function redactSurface(surface: string): string {
  const found = /\?[^\s]+/.exec(surface)
  if (found === null) {
    return surface
  }
  const before = surface.slice(0, found.index)
  const after = surface.slice(found.index + found[0].length)
  return before + after
}
|
|
27
|
+
|
|
28
|
+
/**
 * Shortens a string to at most `maxLength` UTF-16 code units.
 *
 * Strings that already fit are returned unchanged. When the cut would fall
 * between the two halves of a surrogate pair, the dangling high surrogate is
 * dropped as well, so the result never ends in half of a character (a lone
 * surrogate is not valid text once encoded for storage).
 *
 * @param value - Text to shorten.
 * @param maxLength - Maximum length of the result, in UTF-16 code units.
 * @returns `value` itself, or a prefix of it no longer than `maxLength`.
 */
function truncate(value: string, maxLength: number): string {
  if (!(value.length > maxLength)) {
    return value
  }

  let end = maxLength
  if (end > 0) {
    const lastKept = value.charCodeAt(end - 1)
    const firstDropped = value.charCodeAt(end)
    const isHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff
    const isLowSurrogate = firstDropped >= 0xdc00 && firstDropped <= 0xdfff
    // Only back off when the cut really separates a valid pair.
    if (isHighSurrogate && isLowSurrogate) {
      end -= 1
    }
  }
  return value.slice(0, end)
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import type { SamplingConfig } from '../types.js'
|
|
2
|
+
import type { ErrorCategory } from './categorize.js'
|
|
3
|
+
|
|
4
|
+
/** Optional context for `shouldKeep`. */
interface SamplingOptions {
  /** Surface of the event being considered. */
  surface?: string

  /** Surfaces for which every event is kept regardless of the sampling rate. */
  activeIncidentSurfaces?: Set<string>
}
|
|
8
|
+
|
|
9
|
+
/**
 * Decides whether an event is retained.
 *
 * An event whose surface is in `activeIncidentSurfaces` is always kept.
 * Otherwise the rate configured for its category applies: a rate of 1 or more
 * always keeps, a rate of 0 or less always drops, and anything in between is
 * kept when `Math.random()` falls below the rate.
 *
 * NOTE(review): despite the `Pct` suffix on the config fields, the rate is
 * compared against 1 and against `Math.random()`, i.e. it is treated as a
 * fraction rather than a 0–100 percentage — confirm callers agree.
 *
 * @param category - Category of the event.
 * @param config - Per-category sampling rates.
 * @param options - Optional surface and set of surfaces that bypass sampling.
 * @returns `true` when the event should be kept.
 */
export function shouldKeep(
  category: ErrorCategory,
  config: SamplingConfig,
  options: SamplingOptions = {},
): boolean {
  const { surface, activeIncidentSurfaces } = options
  if (surface && activeIncidentSurfaces?.has(surface)) {
    return true
  }

  const rate = pctForCategory(category, config)
  if (rate >= 1) {
    return true
  }
  return rate <= 0 ? false : Math.random() < rate
}
|
|
20
|
+
|
|
21
|
+
/**
 * Looks up the sampling rate configured for a category.
 *
 * @param category - Category of the event.
 * @param config - Sampling configuration holding one rate per category.
 * @returns The value of the config field that belongs to `category`.
 */
function pctForCategory(category: ErrorCategory, config: SamplingConfig): number {
  const rateKeyByCategory: Record<ErrorCategory, keyof SamplingConfig> = {
    exception: 'exceptionsPct',
    fivexx: 'fivexxPct',
    'console-error': 'consoleErrorPct',
    'console-warn': 'consoleWarnPct',
    'slow-request': 'slowRequestsPct',
  }
  return config[rateKeyByCategory[category]]
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/** Kinds of check. */
export type CheckType = 'get' | 'post' | 'happy_path'

/** Outcome of a check. */
export type CheckStatus = 'pass' | 'fail' | 'timeout'

/** Severity levels for alerts. */
export type AlertSeverity = 'info' | 'warning' | 'critical'

/** Severity levels recorded with error events. */
export type ErrorSeverity = 'error' | 'warning' | 'exception'

/** Result of one check; `id` and `checkedAt` may not be assigned yet. */
export interface CheckResult {
  id?: string
  status: CheckStatus
  statusCode?: number
  latencyMs: number
  errorMessage?: string
  checkedAt?: number
}

/** A check result for which `id` and `checkedAt` are guaranteed present. */
export interface PersistedCheckResult extends CheckResult {
  id: string
  checkedAt: number
}

/** Notification channels and, optionally, which e-mail provider to use. */
export interface NotifyConfig {
  channels: Array<'slack' | 'email'>
  emailProvider?: 'cloudflare' | 'resend'
}

/** Optional alerting thresholds for probe results. */
export interface ProbeAlertingConfig {
  consecutiveFailuresToOpen?: number
  consecutiveSuccessesToClose?: number
  minSeverity?: 'warning' | 'critical'
}

/** Optional alerting thresholds for tail events. */
export interface TailAlertingConfig {
  newErrorDedupWindowMs?: number
  rateSpikeThreshold?: number
  rateSpikeWindowMs?: number
  surfaceDownThresholdPct?: number
  surfaceDownWindowMs?: number
}

/** Union of the probe and tail alerting options. */
export interface ThresholdConfig extends ProbeAlertingConfig, TailAlertingConfig {}

/**
 * Sampling rates, one per error category, plus the slow-request threshold.
 */
export interface SamplingConfig {
  exceptionsPct: number
  fivexxPct: number
  consoleErrorPct: number
  consoleWarnPct: number
  slowRequestsPct: number
  slowRequestThresholdMs: number
}

/** Allows a custom `fetch` implementation to be supplied. */
export interface RuntimeFetch {
  fetcher?: typeof fetch
}

/**
 * An uptime incident.
 * NOTE(review): the snake_case field names presumably mirror a database row — confirm.
 */
export interface UptimeIncident {
  id: string
  surface: string
  opened_at: number
  closed_at: number | null
  trigger_check_id: string | null
  resolve_check_id: string | null
  severity: 'warning' | 'critical'
  notes: string | null
}

/** Status summary for one surface. */
export interface SurfaceStatus {
  name: string
  status: 'green' | 'yellow' | 'red'
  lastCheckedAt: number | null
  uptime7d: number
}

/** Aggregate of error events sharing a fingerprint. */
export interface ErrorRollup {
  fingerprint: string
  surface: string
  message: string
  count: number
  lastOccurredAt: number
}

/** One surface entry in a status page configuration. */
export interface StatusPageSurfaceConfig {
  name: string
}

/** Status page configuration. */
export interface StatusPageConfig {
  realm: string
  d1Binding?: string
  surfaces: StatusPageSurfaceConfig[]
}

/** Minimal shape of a scheduled event: only the cron expression. */
export interface ScheduledEventLike {
  cron: string
}

/** Minimal shape of an execution context: only `waitUntil`. */
export interface ExecutionContextLike {
  waitUntil(promise: Promise<unknown>): void
}

/** Function that produces an identifier string. */
export type IdFactory = () => string
|
|
99
|
+
|
|
100
|
+
/**
 * Default identifier factory.
 *
 * @returns A fresh UUID string from `crypto.randomUUID()`.
 */
export function generateId(): string {
  const uuid = crypto.randomUUID()
  return uuid
}
|
|
103
|
+
|
|
104
|
+
/**
 * Produces a loggable message from any thrown value, without ever throwing.
 *
 * `Error` instances yield their `message`; every other value is converted
 * with `String(...)`. That conversion itself throws for values that cannot be
 * turned into a primitive (for example an object created with a `null`
 * prototype, or one whose `toString` throws). Because this helper runs inside
 * `catch` handlers, such values fall back to the built-in
 * `Object.prototype.toString` tag instead of raising a second error.
 *
 * @param error - The caught value.
 * @returns A string describing the value.
 */
export function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message
  }
  try {
    return String(error)
  } catch {
    return Object.prototype.toString.call(error)
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Tells whether a message mentions a timeout or an abort.
 *
 * The check is case-insensitive and looks for the substrings `timed out`,
 * `timeout` or `aborted` anywhere in the message.
 *
 * @param message - Message text to inspect.
 * @returns `true` when any of the substrings is present.
 */
export function isTimeoutMessage(message: string): boolean {
  const haystack = message.toLowerCase()
  const markers = ['timed out', 'timeout', 'aborted']
  return markers.some((marker) => haystack.includes(marker))
}
|
package/src/virtual.d.ts
ADDED