@dotdo/postgres 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/backup/backup-manager.d.ts +244 -0
- package/dist/backup/backup-manager.d.ts.map +1 -0
- package/dist/backup/backup-manager.js +726 -0
- package/dist/backup/backup-manager.js.map +1 -0
- package/dist/observability/production-metrics.d.ts +318 -0
- package/dist/observability/production-metrics.d.ts.map +1 -0
- package/dist/observability/production-metrics.js +747 -0
- package/dist/observability/production-metrics.js.map +1 -0
- package/dist/pitr/pitr-manager.d.ts +240 -0
- package/dist/pitr/pitr-manager.d.ts.map +1 -0
- package/dist/pitr/pitr-manager.js +837 -0
- package/dist/pitr/pitr-manager.js.map +1 -0
- package/dist/streaming/cdc-iceberg-connector.d.ts +1 -1
- package/dist/streaming/cdc-iceberg-connector.js +1 -1
- package/dist/streaming/live-cdc-stream.d.ts +1 -1
- package/dist/streaming/live-cdc-stream.js +1 -1
- package/package.json +4 -4
- package/src/__tests__/backup.test.ts +944 -0
- package/src/__tests__/observability.test.ts +1089 -0
- package/src/__tests__/pitr.test.ts +1240 -0
- package/src/backup/backup-manager.ts +1006 -0
- package/src/observability/production-metrics.ts +1054 -0
- package/src/pitr/pitr-manager.ts +1136 -0
package/src/observability/production-metrics.ts
@@ -0,0 +1,1054 @@
/**
 * Production Observability Metrics for PostgreSQL Durable Objects
 *
 * Provides query metrics, connection stats, storage tier monitoring,
 * health checks, Prometheus export, and alerting capabilities.
 */

// =============================================================================
// Constants
// =============================================================================

/** Default threshold for classifying queries as "slow" (milliseconds) */
const DEFAULT_SLOW_QUERY_THRESHOLD_MS = 100

/** Maximum number of slow queries to retain in the log */
const MAX_SLOW_QUERY_LOG_SIZE = 100

/** Default maximum number of query digest patterns to track */
const DEFAULT_MAX_QUERY_DIGESTS = 1000

/** Default error rate alert threshold (percent) */
const DEFAULT_ERROR_RATE_THRESHOLD_PERCENT = 5

/** Default P99 latency alert threshold (milliseconds) */
const DEFAULT_P99_LATENCY_THRESHOLD_MS = 1000

/** Reservoir sample size for large dataset percentile calculations */
const PERCENTILE_SAMPLE_SIZE = 1000

/** Threshold for switching to reservoir sampling */
const RESERVOIR_SAMPLING_THRESHOLD = 10000

/** Number of microtask yields for the health check timeout */
const HEALTH_CHECK_TIMEOUT_YIELD_COUNT = 20

/** Default health check timeout (milliseconds) */
export const DEFAULT_HEALTH_CHECK_TIMEOUT_MS = 5000

/** Simulated heap usage in bytes (Workers environment approximation) */
const SIMULATED_HEAP_USED_BYTES = 50 * 1024 * 1024

/** Cloudflare Workers memory limit in bytes */
const WORKER_MEMORY_LIMIT_BYTES = 128 * 1024 * 1024

/** Alert evaluation frequency - evaluate every N queries to avoid performance issues */
const ALERT_EVALUATION_INTERVAL = 100

/** Time window durations for metrics windowing */
const ONE_MINUTE_MS = 60_000
const FIVE_MINUTES_MS = 300_000
const FIFTEEN_MINUTES_MS = 900_000

/** Storage cost estimates per byte (simplified) */
const COST_PER_BYTE_HOT = 0.000001 // ~$1/MB (DO SQLite blob storage)
const COST_PER_BYTE_WARM = 0.0000001 // Free (Cloudflare Cache)
const COST_PER_BYTE_COLD = 0.000000015 // R2 pricing

/** Default Prometheus histogram boundaries (seconds) */
const DEFAULT_HISTOGRAM_BOUNDARIES = [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]

// =============================================================================
// Types
// =============================================================================

/** Configuration for the ProductionMetrics collector */
export interface ProductionMetricsConfig {
  /** Service name used in metric labels and exports */
  serviceName: string
  /** Service version for dashboard display */
  serviceVersion?: string
  /** Durable Object identifier */
  doId: string
  /** Deployment environment (e.g., 'production', 'staging') */
  environment?: string
  /** Collection interval for periodic metrics gathering (milliseconds) */
  collectIntervalMs?: number
  /** Custom histogram bucket boundaries for query duration (seconds) */
  histogramBoundaries?: number[]
  /** Threshold for classifying queries as slow (milliseconds) */
  slowQueryThresholdMs?: number
  /** Maximum number of unique query digest patterns to track */
  maxQueryDigests?: number
  /** Whether to collect detailed per-tier storage metrics */
  enableDetailedStorageMetrics?: boolean
  /** Alert threshold configuration */
  alertThresholds?: AlertThresholdConfig
}

interface AlertThresholdConfig {
  errorRatePercent?: number
  p99LatencyMs?: number
  memoryUsagePercent?: number
  storageErrorRate?: number
}

export interface QueryRecord {
  sql: string
  durationMs: number
  rowsReturned: number
  success: boolean
  error?: string
}

export interface QueryMetricsSnapshot {
  totalQueries: number
  avgDurationMs: number
  p50DurationMs: number
  p95DurationMs: number
  p99DurationMs: number
  queriesPerSecond: number
  errorRate: number
  totalErrors: number
  totalRowsReturned: number
  byOperation: Record<string, { count: number; avgDurationMs: number }>
  windows: {
    oneMinute: WindowMetrics
    fiveMinutes: WindowMetrics
    fifteenMinutes: WindowMetrics
  }
}

interface WindowMetrics {
  totalQueries: number
  avgDurationMs: number
  errorRate: number
}

export interface ConnectionStats {
  activeConnections: number
  totalConnectionsOpened: number
  peakConnections: number
  connectionErrors: number
  avgConnectionDurationMs: number
  idleConnections: number
  poolUtilization: number
  websocketConnections: number
  httpConnections: number
  avgWaitTimeMs: number
  uptimeMs: number
}

export interface StorageTierSnapshot {
  hot: TierStats
  warm: TierStats
  cold: TierStats
  promotions: { coldToWarm: number; warmToHot: number }
  demotions: { hotToWarm: number; warmToCold: number }
  estimatedCosts: { hot: number; warm: number; cold: number; total: number }
  tieringEfficiency: number
}

interface TierStats {
  hitRate: number
  bytesRead: number
  bytesWritten: number
  totalOperations: number
  avgLatencyMs: number
  errors: number
  usageBytes: number
  healthStatus: 'healthy' | 'degraded' | 'unhealthy'
}

export interface HealthCheckResult {
  status: 'healthy' | 'degraded' | 'unhealthy'
  service?: string
  uptimeMs?: number
  checks: Record<string, ComponentHealth>
  responseTimeMs?: number
}

interface ComponentHealth {
  status: 'healthy' | 'degraded' | 'unhealthy'
  error?: string
  details?: any
}

export interface MetricsDashboard {
  queries: QueryMetricsSnapshot
  connections: ConnectionStats
  storage: StorageTierSnapshot
  health: HealthCheckResult
  service: { name: string; doId: string; uptimeMs: number; version?: string }
  memory: { heapUsedBytes: number; heapTotalBytes: number }
  alerts: Alert[]
  timestamp: number
}

export interface AlertThreshold {
  name: string
  type: string
  threshold: number
  windowMs?: number
  severity: 'warning' | 'critical'
}

interface Alert {
  name?: string
  type: string
  severity: 'warning' | 'critical'
  message: string
  triggeredAt: number
  value?: number
}

export type MetricsExportFormat = 'prometheus' | 'json'

export interface PrometheusMetric {
  name: string
  help: string
  type: 'counter' | 'gauge' | 'histogram'
  value?: number
  labels?: Record<string, string>
}

export interface QueryDigest {
  pattern: string
  count: number
  avgDurationMs: number
  totalDurationMs: number
  lastSeen: number
}

export interface SlowQueryLog {
  sql: string
  durationMs: number
  rowsReturned: number
  timestamp: number
}

// =============================================================================
// Internal State Types
// =============================================================================

interface QueryEntry {
  durationMs: number
  success: boolean
  timestamp: number
  operation: string
  rowsReturned: number
}

interface ConnectionEntry {
  type: 'websocket' | 'http' | 'unknown'
  openedAt: number
  idle: boolean
}

interface StorageOp {
  tier: 'hot' | 'warm' | 'cold'
  operation: 'read' | 'write'
  hit?: boolean
  bytes: number
  durationMs?: number
  timestamp: number
}

// =============================================================================
// Utility Functions
// =============================================================================

/** Extracts the SQL operation type (SELECT, INSERT, UPDATE, DELETE, OTHER) from a query string */
function extractOperation(sql: string): string {
  const trimmed = sql.trim().toUpperCase()
  if (trimmed.startsWith('SELECT')) return 'SELECT'
  if (trimmed.startsWith('INSERT')) return 'INSERT'
  if (trimmed.startsWith('UPDATE')) return 'UPDATE'
  if (trimmed.startsWith('DELETE')) return 'DELETE'
  return 'OTHER'
}

/** Normalizes a query by replacing numeric literals with placeholders for digest grouping */
function normalizeQuery(sql: string): string {
  return sql.replace(/\b\d+\b/g, '$N')
}

/**
 * Calculates the p-th percentile from a pre-sorted array using the nearest-rank method.
 * Returns 0 for empty arrays.
 */
function percentile(sorted: number[], p: number): number {
  if (sorted.length === 0) return 0
  if (sorted.length === 1) return sorted[0]
  const idx = Math.ceil((p / 100) * sorted.length)
  return sorted[Math.min(idx, sorted.length - 1)]
}

// =============================================================================
// ProductionMetrics Class
// =============================================================================

/**
 * Collects and exposes production metrics for PostgreSQL Durable Objects.
 * Tracks query performance, connection statistics, storage tier operations,
 * and provides health checks, Prometheus export, and alerting.
 */
export class ProductionMetrics {
  private config: ProductionMetricsConfig
  private startTime: number

  // Query metrics state
  private queries: QueryEntry[] = []
  private slowQueries: SlowQueryLog[] = []
  private digestMap: Map<string, QueryDigest> = new Map()
  private totalErrors = 0
  private totalRowsReturned = 0

  // Connection state
  private connections: ConnectionEntry[] = []
  private totalConnectionsOpened = 0
  private peakConnections = 0
  private connectionErrors = 0
  private connectionDurations: number[] = []
  private waitTimes: number[] = []
  private idleConnections = 0

  // Storage state
  private storageOps: StorageOp[] = []
  private tierPromotions = { coldToWarm: 0, warmToHot: 0 }
  private tierDemotions = { hotToWarm: 0, warmToCold: 0 }
  private tierErrors: Record<string, number> = { hot: 0, warm: 0, cold: 0 }
  private tierUsage: Record<string, number> = { hot: 0, warm: 0, cold: 0 }
  private tierHealth: Record<string, string> = { hot: 'healthy', warm: 'healthy', cold: 'healthy' }

  // Dependencies
  private pgliteInstance: any = null
  private storageOrchestrator: any = null

  // Alert state
  private activeAlerts: Alert[] = []
  private customThresholds: AlertThreshold[] = []

  constructor(config: ProductionMetricsConfig) {
    this.config = config
    this.startTime = Date.now()
  }

  // ===========================================================================
  // Query Metrics
  // ===========================================================================

  /** Records a completed query, updating metrics, digests, slow query log, and alerts */
  recordQuery(record: QueryRecord): void {
    const operation = extractOperation(record.sql)
    const entry: QueryEntry = {
      durationMs: Math.max(0, record.durationMs),
      success: record.success,
      timestamp: Date.now(),
      operation,
      rowsReturned: record.rowsReturned,
    }

    this.queries.push(entry)
    this.totalRowsReturned += record.rowsReturned

    if (!record.success) {
      this.totalErrors++
    }

    this.trackSlowQuery(record)
    this.updateQueryDigest(record)

    // Evaluate alerts periodically to avoid performance overhead on high-throughput workloads
    if (this.queries.length <= ALERT_EVALUATION_INTERVAL || this.queries.length % ALERT_EVALUATION_INTERVAL === 0) {
      this.evaluateAlerts()
    }
  }

  /** Adds a query to the slow query log if it exceeds the configured threshold */
  private trackSlowQuery(record: QueryRecord): void {
    const threshold = this.config.slowQueryThresholdMs || DEFAULT_SLOW_QUERY_THRESHOLD_MS
    if (record.durationMs >= threshold) {
      this.slowQueries.push({
        sql: record.sql,
        durationMs: record.durationMs,
        rowsReturned: record.rowsReturned,
        timestamp: Date.now(),
      })
      if (this.slowQueries.length > MAX_SLOW_QUERY_LOG_SIZE) {
        this.slowQueries = this.slowQueries.slice(-MAX_SLOW_QUERY_LOG_SIZE)
      }
    }
  }

  /** Updates the query digest map with a new query record */
  private updateQueryDigest(record: QueryRecord): void {
    const pattern = normalizeQuery(record.sql)
    const existing = this.digestMap.get(pattern)
    if (existing) {
      existing.count++
      existing.totalDurationMs += record.durationMs
      existing.avgDurationMs = existing.totalDurationMs / existing.count
      existing.lastSeen = Date.now()
    } else {
      const maxDigests = this.config.maxQueryDigests || DEFAULT_MAX_QUERY_DIGESTS
      if (this.digestMap.size < maxDigests) {
        this.digestMap.set(pattern, {
          pattern,
          count: 1,
          avgDurationMs: record.durationMs,
          totalDurationMs: record.durationMs,
          lastSeen: Date.now(),
        })
      }
    }
  }

  /** Returns a comprehensive snapshot of query performance metrics */
  getQueryMetrics(): QueryMetricsSnapshot {
    const total = this.queries.length
    const durations = this.getSortedDurationsSample(total)
    const totalDurationSum = total > 0 ? this.queries.reduce((s, q) => s + q.durationMs, 0) : 0
    const avgDuration = total > 0 ? totalDurationSum / total : 0
    const errors = this.queries.filter((q) => !q.success).length
    const errorRate = total > 0 ? errors / total : 0

    const now = Date.now()
    const elapsedSeconds = Math.max(1, (now - this.startTime) / 1000)
    const queriesPerSecond = total / elapsedSeconds

    const byOperation = this.computeOperationBreakdown()

    const windows = {
      oneMinute: this.getWindowMetrics(now - ONE_MINUTE_MS),
      fiveMinutes: this.getWindowMetrics(now - FIVE_MINUTES_MS),
      fifteenMinutes: this.getWindowMetrics(now - FIFTEEN_MINUTES_MS),
    }

    return {
      totalQueries: total,
      avgDurationMs: avgDuration,
      p50DurationMs: percentile(durations, 50),
      p95DurationMs: percentile(durations, 95),
      p99DurationMs: percentile(durations, 99),
      queriesPerSecond,
      errorRate,
      totalErrors: errors,
      totalRowsReturned: this.totalRowsReturned,
      byOperation,
      windows,
    }
  }

  /** Returns sorted duration samples, using reservoir sampling for large datasets */
  private getSortedDurationsSample(total: number): number[] {
    if (total > RESERVOIR_SAMPLING_THRESHOLD) {
      const sample: number[] = []
      for (let i = 0; i < total; i++) {
        if (i < PERCENTILE_SAMPLE_SIZE) {
          sample.push(this.queries[i].durationMs)
        } else {
          const j = Math.floor(Math.random() * (i + 1))
          if (j < PERCENTILE_SAMPLE_SIZE) {
            sample[j] = this.queries[i].durationMs
          }
        }
      }
      return sample.sort((a, b) => a - b)
    }
    return this.queries.map((q) => q.durationMs).sort((a, b) => a - b)
  }

  /** Computes per-operation count and average duration breakdown */
  private computeOperationBreakdown(): Record<string, { count: number; avgDurationMs: number }> {
    const byOperation: Record<string, { count: number; avgDurationMs: number }> = {}
    for (const q of this.queries) {
      if (!byOperation[q.operation]) {
        byOperation[q.operation] = { count: 0, avgDurationMs: 0 }
      }
      byOperation[q.operation].count++
    }
    for (const op of Object.keys(byOperation)) {
      const opQueries = this.queries.filter((q) => q.operation === op)
      const sum = opQueries.reduce((s, q) => s + q.durationMs, 0)
      byOperation[op].avgDurationMs = opQueries.length > 0 ? sum / opQueries.length : 0
    }
    return byOperation
  }

  /** Returns a copy of the slow query log */
  getSlowQueryLog(): SlowQueryLog[] {
    return [...this.slowQueries]
  }

  /** Returns all tracked query digest patterns */
  getQueryDigests(): QueryDigest[] {
    return Array.from(this.digestMap.values())
  }

  /** Resets all query-related metrics, including digests and slow query log */
  resetQueryMetrics(): void {
    this.queries = []
    this.slowQueries = []
    this.digestMap.clear()
    this.totalErrors = 0
    this.totalRowsReturned = 0
  }

  // ===========================================================================
  // Connection Stats
  // ===========================================================================

  /** Records a new connection being opened */
  recordConnectionOpen(options?: { type?: 'websocket' | 'http' }): void {
    const conn: ConnectionEntry = {
      type: options?.type || 'unknown',
      openedAt: Date.now(),
      idle: false,
    }
    this.connections.push(conn)
    this.totalConnectionsOpened++
    if (this.connections.length > this.peakConnections) {
      this.peakConnections = this.connections.length
    }
  }

  /** Records a connection being closed, optionally with its total duration */
  recordConnectionClose(options?: { durationMs?: number }): void {
    if (this.connections.length > 0) {
      const conn = this.connections.pop()!
      if (conn.idle) {
        this.idleConnections = Math.max(0, this.idleConnections - 1)
      }
      if (options?.durationMs !== undefined) {
        this.connectionDurations.push(options.durationMs)
      }
    }
  }

  /** Records a connection error */
  recordConnectionError(_message: string): void {
    this.connectionErrors++
  }

  /** Records a connection transitioning to idle state */
  recordConnectionIdle(): void {
    this.idleConnections++
  }

  /** Records a connection being acquired from the pool, with wait time */
  recordConnectionAcquired(options: { waitTimeMs: number }): void {
    this.waitTimes.push(options.waitTimeMs)
  }

  /** Returns a snapshot of connection statistics */
  getConnectionStats(): ConnectionStats {
    const wsCount = this.connections.filter((c) => c.type === 'websocket').length
    const httpCount = this.connections.filter((c) => c.type === 'http').length

    const avgDuration = this.connectionDurations.length > 0
      ? this.connectionDurations.reduce((s, d) => s + d, 0) / this.connectionDurations.length
      : 0

    const avgWait = this.waitTimes.length > 0
      ? this.waitTimes.reduce((s, w) => s + w, 0) / this.waitTimes.length
      : 0

    return {
      activeConnections: this.connections.length,
      totalConnectionsOpened: this.totalConnectionsOpened,
      peakConnections: this.peakConnections,
      connectionErrors: this.connectionErrors,
      avgConnectionDurationMs: avgDuration,
      idleConnections: this.idleConnections,
      poolUtilization: Math.min(1.0, this.connections.length), // max_connections=1 in DO model
      websocketConnections: wsCount,
      httpConnections: httpCount,
      avgWaitTimeMs: avgWait,
      uptimeMs: Date.now() - this.startTime,
    }
  }

  // ===========================================================================
  // Storage Tier Stats
  // ===========================================================================

  /** Records a storage tier operation (read/write) with optional hit/miss and timing */
  recordStorageOperation(
    tier: 'hot' | 'warm' | 'cold',
    operation: 'read' | 'write',
    options: { hit?: boolean; bytes: number; durationMs?: number }
  ): void {
    this.storageOps.push({
      tier,
      operation,
      hit: options.hit,
      bytes: options.bytes,
      durationMs: options.durationMs,
      timestamp: Date.now(),
    })
  }

  /** Records a data promotion between storage tiers */
  recordTierPromotion(from: string, to: string, _details: { key: string; bytes: number }): void {
    if (from === 'cold' && to === 'warm') this.tierPromotions.coldToWarm++
    if (from === 'warm' && to === 'hot') this.tierPromotions.warmToHot++
  }

  /** Records a data demotion between storage tiers */
  recordTierDemotion(from: string, to: string, _details: { key: string; bytes: number }): void {
    if (from === 'hot' && to === 'warm') this.tierDemotions.hotToWarm++
    if (from === 'warm' && to === 'cold') this.tierDemotions.warmToCold++
  }

  /** Records a storage error and triggers alert if threshold is exceeded */
  recordStorageError(tier: string, _message: string): void {
    this.tierErrors[tier] = (this.tierErrors[tier] || 0) + 1

    // Check alert threshold
    const totalStorageErrors = Object.values(this.tierErrors).reduce((s, e) => s + e, 0)
    const threshold = this.config.alertThresholds?.storageErrorRate
    if (threshold !== undefined && totalStorageErrors > threshold) {
      this.addAlert({
        type: 'storage_error',
        severity: 'critical',
        message: `Storage error rate exceeded threshold: ${totalStorageErrors} errors`,
        triggeredAt: Date.now(),
        value: totalStorageErrors,
      })
    }
  }

  /** Records the current storage usage for a tier */
  recordStorageUsage(tier: 'hot' | 'warm' | 'cold', bytes: number): void {
    this.tierUsage[tier] = bytes
  }

  /** Records a change in tier health status */
  recordTierHealthChange(tier: string, status: 'healthy' | 'degraded' | 'unhealthy'): void {
    this.tierHealth[tier] = status
  }

  /** Returns a comprehensive snapshot of storage tier statistics including costs */
  getStorageTierStats(): StorageTierSnapshot {
    const getTierStats = (tier: 'hot' | 'warm' | 'cold'): TierStats => {
      const ops = this.storageOps.filter((o) => o.tier === tier)
      const reads = ops.filter((o) => o.operation === 'read')
      const hits = reads.filter((o) => o.hit === true).length
      const hitRate = reads.length > 0 ? hits / reads.length : 0

      const bytesRead = ops.filter((o) => o.operation === 'read').reduce((s, o) => s + o.bytes, 0)
      const bytesWritten = ops.filter((o) => o.operation === 'write').reduce((s, o) => s + o.bytes, 0)

      const durationsWithValues = ops.filter((o) => o.durationMs !== undefined)
      const avgLatencyMs = durationsWithValues.length > 0
        ? durationsWithValues.reduce((s, o) => s + (o.durationMs || 0), 0) / durationsWithValues.length
        : 0

      return {
        hitRate,
        bytesRead,
        bytesWritten,
        totalOperations: ops.length,
        avgLatencyMs,
        errors: this.tierErrors[tier] || 0,
        usageBytes: this.tierUsage[tier] || 0,
        healthStatus: (this.tierHealth[tier] || 'healthy') as 'healthy' | 'degraded' | 'unhealthy',
      }
    }

    const hotStats = getTierStats('hot')
    const warmStats = getTierStats('warm')
    const coldStats = getTierStats('cold')

    // Estimate costs based on bytes stored
    const hotCost = (this.tierUsage['hot'] || 0) * COST_PER_BYTE_HOT
    const warmCost = (this.tierUsage['warm'] || 0) * COST_PER_BYTE_WARM
    const coldCost = (this.tierUsage['cold'] || 0) * COST_PER_BYTE_COLD

    // Calculate tiering efficiency (hot hit rate as primary metric)
    const hotReads = this.storageOps.filter((o) => o.tier === 'hot' && o.operation === 'read')
    const hotHits = hotReads.filter((o) => o.hit === true).length
    const tieringEfficiency = hotReads.length > 0 ? hotHits / hotReads.length : 0

    return {
      hot: hotStats,
      warm: warmStats,
      cold: coldStats,
      promotions: { ...this.tierPromotions },
      demotions: { ...this.tierDemotions },
      estimatedCosts: {
        hot: hotCost,
        warm: warmCost,
        cold: coldCost,
        total: hotCost + warmCost + coldCost,
      },
      tieringEfficiency,
    }
  }

  // ===========================================================================
  // Health Checks
  // ===========================================================================

  /** Sets the PGLite instance for health check queries */
  setPGLiteInstance(pglite: any): void {
    this.pgliteInstance = pglite
  }

  /** Sets the storage orchestrator for health check tier status */
  setStorageOrchestrator(orchestrator: any): void {
    this.storageOrchestrator = orchestrator
  }

  /** Returns a simple liveness probe result (always healthy if the service is running) */
  liveness(): { status: 'healthy'; service: string; uptimeMs: number } {
    return {
      status: 'healthy',
      service: this.config.serviceName,
      uptimeMs: Date.now() - this.startTime,
    }
  }

  /** Performs a readiness check including PGLite, storage, and memory health */
  async readiness(_options?: { timeoutMs?: number }): Promise<HealthCheckResult> {
    const startTime = Date.now()
    const checks: Record<string, ComponentHealth> = {}

    checks.pglite = await this.checkPGLiteHealth()
    checks.storage = this.checkStorageHealth()
    checks.memory = {
      status: 'healthy',
      details: {
        heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
        heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
      },
    }

    const overallStatus = this.determineOverallStatus(checks)

    return {
      status: overallStatus,
      checks,
      responseTimeMs: Date.now() - startTime,
    }
  }

  /** Checks PGLite health with a microtask-based timeout */
  private async checkPGLiteHealth(): Promise<ComponentHealth> {
    if (!this.pgliteInstance) {
      return { status: 'unhealthy', error: 'PGLite not initialized' }
    }

    try {
      const queryPromise = this.pgliteInstance.query('SELECT 1 as result')
      let settled = false
      const wrappedQuery = queryPromise.then(
        (v: any) => { settled = true; return v },
        (e: any) => { settled = true; throw e }
      )
      const timeoutCheck = new Promise<never>(async (_, reject) => {
        for (let i = 0; i < HEALTH_CHECK_TIMEOUT_YIELD_COUNT; i++) {
          await Promise.resolve()
          if (settled) return
        }
        if (!settled) {
          reject(new Error('Health check timeout'))
        }
      })
      await Promise.race([wrappedQuery, timeoutCheck])
      return { status: 'healthy' }
    } catch (e) {
      const errorMsg = e instanceof Error ? e.message : 'Unknown health check error'
      return {
        status: 'unhealthy',
        error: errorMsg.includes('timeout') ? 'Health check timeout' : errorMsg,
      }
    }
  }

  /** Checks storage orchestrator health */
  private checkStorageHealth(): ComponentHealth {
    if (!this.storageOrchestrator) {
      return { status: 'healthy', details: { message: 'No orchestrator configured' } }
    }

    const tierHealth = this.storageOrchestrator.getTierHealth()
    const allHealthy = Object.values(tierHealth).every((t: any) => t.status === 'healthy')
    const anyDegraded = Object.values(tierHealth).some((t: any) => t.status === 'degraded')

    return {
      status: allHealthy ? 'healthy' : (anyDegraded ? 'degraded' : 'unhealthy'),
      details: tierHealth,
    }
  }

  /** Determines the worst overall status from all component checks */
  private determineOverallStatus(checks: Record<string, ComponentHealth>): 'healthy' | 'degraded' | 'unhealthy' {
    const statuses = Object.values(checks).map((c) => c.status)
    if (statuses.includes('unhealthy')) return 'unhealthy'
    if (statuses.includes('degraded')) return 'degraded'
    return 'healthy'
  }

  /** Performs a deep health check including WAL status */
  async deepCheck(): Promise<HealthCheckResult> {
    const startTime = Date.now()
    const readinessResult = await this.readiness()

    // Add WAL check
    readinessResult.checks.wal = {
      status: 'healthy',
      details: { lastArchive: 'N/A' },
    }

    readinessResult.responseTimeMs = Date.now() - startTime
    return readinessResult
  }

  // ===========================================================================
  // Metrics Export
  // ===========================================================================

  /** Exports all metrics in Prometheus text exposition format */
  exportPrometheus(): string {
    const lines: string[] = []
    const labels = `service="${this.config.serviceName}",do_id="${this.config.doId}"`
    const queryMetrics = this.getQueryMetrics()

    // Query total counter
    lines.push('# HELP postgres_query_total Total number of queries executed')
    lines.push('# TYPE postgres_query_total counter')
    lines.push(`postgres_query_total{${labels}} ${queryMetrics.totalQueries}`)

    // Query errors
    lines.push('# HELP postgres_query_errors_total Total number of query errors')
    lines.push('# TYPE postgres_query_errors_total counter')
    lines.push(`postgres_query_errors_total{${labels}} ${queryMetrics.totalErrors}`)

    // Query duration histogram
    lines.push('# HELP postgres_query_duration_seconds Query execution time in seconds')
    lines.push('# TYPE postgres_query_duration_seconds histogram')

    const boundaries = this.config.histogramBoundaries || DEFAULT_HISTOGRAM_BOUNDARIES
    const durations = this.queries.map((q) => q.durationMs / 1000).sort((a, b) => a - b)
    let sum = 0

    for (const boundary of boundaries) {
      const count = durations.filter((d) => d <= boundary).length
      lines.push(`postgres_query_duration_seconds_bucket{${labels},le="${boundary}"} ${count}`)
    }
    lines.push(`postgres_query_duration_seconds_bucket{${labels},le="+Inf"} ${durations.length}`)

    sum = durations.reduce((s, d) => s + d, 0)
    lines.push(`postgres_query_duration_seconds_count{${labels}} ${durations.length}`)
    lines.push(`postgres_query_duration_seconds_sum{${labels}} ${sum}`)

    // Connections
    const connStats = this.getConnectionStats()
    lines.push('# HELP postgres_connections_active Current active connections')
    lines.push('# TYPE postgres_connections_active gauge')
    lines.push(`postgres_connections_active{${labels}} ${connStats.activeConnections}`)

    // Storage operations
    lines.push('# HELP postgres_storage_operations_total Total storage operations')
    lines.push('# TYPE postgres_storage_operations_total counter')
    for (const tier of ['hot', 'warm', 'cold'] as const) {
      const count = this.storageOps.filter((o) => o.tier === tier).length
      lines.push(`postgres_storage_operations_total{${labels},tier="${tier}"} ${count}`)
    }

    return lines.join('\n')
  }

  /** Exports all metrics as a structured JSON object */
  exportJSON(): { metrics: any; timestamp: number; service: string } {
    return {
      metrics: {
        queries: this.getQueryMetrics(),
        connections: this.getConnectionStats(),
        storage: this.getStorageTierStats(),
      },
      timestamp: Date.now(),
      service: this.config.serviceName,
    }
  }

  /** Creates an HTTP request handler that serves metrics in Prometheus or JSON format */
  createMetricsHandler(): (request: Request) => Promise<Response> {
    return async (request: Request) => {
      const accept = request.headers.get('Accept') || 'text/plain'

      if (accept.includes('application/json')) {
        const json = this.exportJSON()
        return new Response(JSON.stringify(json), {
          status: 200,
          headers: { 'content-type': 'application/json' },
        })
      }

      // Default: Prometheus format
      const prometheus = this.exportPrometheus()
      return new Response(prometheus, {
        status: 200,
        headers: { 'content-type': 'text/plain; charset=utf-8' },
      })
    }
  }

  // ===========================================================================
  // Dashboard
  // ===========================================================================

  /** Returns a complete metrics dashboard snapshot for display */
  getDashboard(): MetricsDashboard {
    return {
      queries: this.getQueryMetrics(),
      connections: this.getConnectionStats(),
      storage: this.getStorageTierStats(),
      health: {
        status: 'healthy',
        checks: {},
      },
      service: {
        name: this.config.serviceName,
        doId: this.config.doId,
        uptimeMs: Date.now() - this.startTime,
        version: this.config.serviceVersion,
      },
      memory: {
        heapUsedBytes: SIMULATED_HEAP_USED_BYTES,
        heapTotalBytes: WORKER_MEMORY_LIMIT_BYTES,
      },
      alerts: this.getActiveAlerts(),
      timestamp: Date.now(),
    }
  }

  // ===========================================================================
  // Alerts
  // ===========================================================================

  /** Evaluates all alert conditions against current metrics, adding/removing alerts as needed */
  evaluateAlerts(): void {
    const queryMetrics = this.getQueryMetrics()

    const errorThreshold = this.config.alertThresholds?.errorRatePercent ?? DEFAULT_ERROR_RATE_THRESHOLD_PERCENT
    const errorRatePercent = queryMetrics.errorRate * 100
    if (errorRatePercent > errorThreshold) {
      this.addAlert({
        type: 'error_rate',
        severity: 'critical',
        message: `Error rate ${errorRatePercent.toFixed(1)}% exceeds threshold ${errorThreshold}%`,
        triggeredAt: Date.now(),
        value: errorRatePercent,
      })
    } else {
      // Resolve error rate alerts
      this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'error_rate')
    }

    const p99Threshold = this.config.alertThresholds?.p99LatencyMs ?? DEFAULT_P99_LATENCY_THRESHOLD_MS
    if (queryMetrics.p99DurationMs > p99Threshold) {
      this.addAlert({
        type: 'high_latency',
        severity: 'warning',
        message: `P99 latency ${queryMetrics.p99DurationMs}ms exceeds threshold ${p99Threshold}ms`,
        triggeredAt: Date.now(),
        value: queryMetrics.p99DurationMs,
      })
    } else {
      this.activeAlerts = this.activeAlerts.filter((a) => a.type !== 'high_latency')
    }

    // Custom thresholds (e.g. slow query count)
    for (const threshold of this.customThresholds) {
      if (threshold.type === 'slow_query_count') {
        const slowCount = this.slowQueries.length
        if (slowCount > threshold.threshold) {
          this.addAlert({
            name: threshold.name,
            type: threshold.type,
            severity: threshold.severity,
            message: `Slow query count ${slowCount} exceeds threshold ${threshold.threshold}`,
            triggeredAt: Date.now(),
            value: slowCount,
          })
        }
      }
    }
  }

  /** Returns a copy of all currently active alerts */
  getActiveAlerts(): Alert[] {
    return [...this.activeAlerts]
  }

  /** Registers a custom alert threshold for evaluation */
  registerAlertThreshold(threshold: AlertThreshold): void {
    this.customThresholds.push(threshold)
  }

  // ===========================================================================
  // Reset
  // ===========================================================================

  /** Resets all metrics state (queries, connections, storage, alerts) */
  resetAll(): void {
    this.resetQueryMetrics()
    this.connections = []
    this.totalConnectionsOpened = 0
    this.peakConnections = 0
    this.connectionErrors = 0
    this.connectionDurations = []
    this.waitTimes = []
    this.idleConnections = 0
    this.storageOps = []
    this.tierPromotions = { coldToWarm: 0, warmToHot: 0 }
    this.tierDemotions = { hotToWarm: 0, warmToCold: 0 }
    this.tierErrors = { hot: 0, warm: 0, cold: 0 }
    this.tierUsage = { hot: 0, warm: 0, cold: 0 }
    this.activeAlerts = []
  }

  // ===========================================================================
  // Private Helpers
  // ===========================================================================

  private getWindowMetrics(sinceTimestamp: number): WindowMetrics {
    const windowQueries = this.queries.filter((q) => q.timestamp >= sinceTimestamp)
    const total = windowQueries.length
    const errors = windowQueries.filter((q) => !q.success).length
    const avgDuration = total > 0
      ? windowQueries.reduce((s, q) => s + q.durationMs, 0) / total
      : 0

    return {
      totalQueries: total,
      avgDurationMs: avgDuration,
      errorRate: total > 0 ? errors / total : 0,
    }
  }

  private addAlert(alert: Alert): void {
    // Don't add duplicate alerts of same type
    const existing = this.activeAlerts.find((a) =>
      a.type === alert.type && (alert.name ? a.name === alert.name : true)
    )
    if (!existing) {
      this.activeAlerts.push(alert)
    }
  }
}

// =============================================================================
// Factory Function
// =============================================================================

/** Creates a ProductionMetrics instance, validating required configuration */
export function createProductionMetrics(config: ProductionMetricsConfig): ProductionMetrics {
  if (!config.serviceName) {
    throw new Error('ProductionMetrics requires a non-empty serviceName')
  }
  return new ProductionMetrics(config)
}