@arcote.tech/arc-cli 0.7.19 → 0.7.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +767 -160
- package/package.json +9 -9
- package/src/builder/access-extractor.ts +11 -15
- package/src/builder/module-builder.ts +210 -35
- package/src/deploy/bootstrap.ts +8 -3
- package/src/deploy/caddyfile.ts +43 -8
- package/src/deploy/compose.ts +73 -0
- package/src/deploy/config.ts +15 -0
- package/src/deploy/observability-configs.ts +688 -48
- package/src/platform/server.ts +3 -0
- package/src/platform/shared.ts +34 -73
- package/src/platform/startup.ts +2 -2
|
@@ -35,8 +35,9 @@ function pickRetention(o: DeployObservability | undefined) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
/** OpenTelemetry Collector — receives OTLP from app containers + browser,
|
|
38
|
-
*
|
|
39
|
-
*
|
|
38
|
+
* derives span-metrics + service-graph BEFORE tail sampling (no sampling
|
|
39
|
+
* bias), scrapes host + per-container resource usage, fans out to Tempo
|
|
40
|
+
* (traces), Loki (logs), Prometheus remote-write (metrics). */
|
|
40
41
|
export function generateOtelCollectorConfig(cfg: DeployConfig): string {
|
|
41
42
|
const envNames = Object.keys(cfg.envs);
|
|
42
43
|
return `# Generated by \`arc platform deploy\` — do not edit by hand.
|
|
@@ -55,6 +56,69 @@ ${envNames.map((name) => ` - "https://${cfg.envs[name]!.domain}"`).jo
|
|
|
55
56
|
- tracestate
|
|
56
57
|
- content-type
|
|
57
58
|
|
|
59
|
+
# Host-level CPU / memory / load / disk / filesystem / network metrics.
|
|
60
|
+
# The host root is bind-mounted read-only at /hostfs (see compose).
|
|
61
|
+
hostmetrics:
|
|
62
|
+
collection_interval: 30s
|
|
63
|
+
root_path: /hostfs
|
|
64
|
+
scrapers:
|
|
65
|
+
cpu:
|
|
66
|
+
metrics:
|
|
67
|
+
system.cpu.utilization:
|
|
68
|
+
enabled: true
|
|
69
|
+
memory:
|
|
70
|
+
metrics:
|
|
71
|
+
system.memory.utilization:
|
|
72
|
+
enabled: true
|
|
73
|
+
load: {}
|
|
74
|
+
disk: {}
|
|
75
|
+
filesystem:
|
|
76
|
+
metrics:
|
|
77
|
+
system.filesystem.utilization:
|
|
78
|
+
enabled: true
|
|
79
|
+
exclude_fs_types:
|
|
80
|
+
fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
|
|
81
|
+
match_type: strict
|
|
82
|
+
exclude_mount_points:
|
|
83
|
+
mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
|
|
84
|
+
match_type: regexp
|
|
85
|
+
network: {}
|
|
86
|
+
paging: {}
|
|
87
|
+
|
|
88
|
+
# Per-container CPU / memory / network / block-IO + restarts straight from
|
|
89
|
+
# the Docker daemon (socket bind-mounted read-only, see compose).
|
|
90
|
+
# api_version pinned: the receiver defaults to Docker API 1.25, which modern
|
|
91
|
+
# daemons (Engine 25+ require >= 1.40) reject — without this the receiver
|
|
92
|
+
# fails to start and takes the whole collector down. Quoted so YAML doesn't
|
|
93
|
+
# parse 1.40 → 1.4. Must be <= the daemon's max; 1.40 is the safe floor.
|
|
94
|
+
docker_stats:
|
|
95
|
+
endpoint: unix:///var/run/docker.sock
|
|
96
|
+
api_version: "1.40"
|
|
97
|
+
collection_interval: 30s
|
|
98
|
+
metrics:
|
|
99
|
+
container.restarts:
|
|
100
|
+
enabled: true
|
|
101
|
+
container.uptime:
|
|
102
|
+
enabled: true
|
|
103
|
+
|
|
104
|
+
connectors:
|
|
105
|
+
# Span→metrics computed from 100% of spans (pipeline runs BEFORE tail
|
|
106
|
+
# sampling) — lowering the sampling policy later never skews dashboards.
|
|
107
|
+
spanmetrics:
|
|
108
|
+
histogram:
|
|
109
|
+
unit: ms
|
|
110
|
+
explicit:
|
|
111
|
+
buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
|
|
112
|
+
metrics_flush_interval: 15s
|
|
113
|
+
# Emits traces_service_graph_* (same metric names Tempo's generator would).
|
|
114
|
+
servicegraph:
|
|
115
|
+
metrics_flush_interval: 15s
|
|
116
|
+
store:
|
|
117
|
+
ttl: 5s
|
|
118
|
+
max_items: 5000
|
|
119
|
+
# Joins the raw-trace pipeline to the sampled-storage pipeline.
|
|
120
|
+
forward: {}
|
|
121
|
+
|
|
58
122
|
processors:
|
|
59
123
|
batch:
|
|
60
124
|
timeout: 5s
|
|
@@ -65,7 +129,8 @@ processors:
|
|
|
65
129
|
# Errors + slow traces zachowywane w 100%, normalne traces również 100%
|
|
66
130
|
# przy obecnej skali (boostrap produkcji). Tail sampling matchuje OR po
|
|
67
131
|
# policies — bez "always" policy WSZYSTKIE OK traces byłyby droppowane.
|
|
68
|
-
# Obniż 'random_100pct' do np. 10% gdy ruch eksploduje
|
|
132
|
+
# Obniż 'random_100pct' do np. 10% gdy ruch eksploduje — span-metrics są
|
|
133
|
+
# liczone przed samplingiem, więc dashboardy pozostaną dokładne.
|
|
69
134
|
tail_sampling:
|
|
70
135
|
decision_wait: 10s
|
|
71
136
|
num_traces: 50000
|
|
@@ -90,6 +155,34 @@ processors:
|
|
|
90
155
|
- key: http.request.header.cookie
|
|
91
156
|
action: delete
|
|
92
157
|
|
|
158
|
+
# Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
|
|
159
|
+
# so raw span names (one per bot-scanned URL) would explode Prometheus
|
|
160
|
+
# series. Static assets collapse to "<METHOD> static", /route/* to
|
|
161
|
+
# "<METHOD> /route", anything else outside the known API surface to
|
|
162
|
+
# "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
|
|
163
|
+
# literal "$" (collector env expansion), RE2 has no lookahead → IsMatch+not.
|
|
164
|
+
transform/span_names:
|
|
165
|
+
error_mode: ignore
|
|
166
|
+
trace_statements:
|
|
167
|
+
- context: span
|
|
168
|
+
statements:
|
|
169
|
+
- set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
|
|
170
|
+
- replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
|
|
171
|
+
- set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
|
|
172
|
+
|
|
173
|
+
# Stable service.name for infra metric streams (becomes the service_name
|
|
174
|
+
# label after resource_to_telemetry_conversion).
|
|
175
|
+
resource/host:
|
|
176
|
+
attributes:
|
|
177
|
+
- key: service.name
|
|
178
|
+
value: arc-host
|
|
179
|
+
action: upsert
|
|
180
|
+
resource/docker:
|
|
181
|
+
attributes:
|
|
182
|
+
- key: service.name
|
|
183
|
+
value: arc-docker
|
|
184
|
+
action: upsert
|
|
185
|
+
|
|
93
186
|
exporters:
|
|
94
187
|
otlp/tempo:
|
|
95
188
|
endpoint: tempo:4317
|
|
@@ -105,6 +198,10 @@ exporters:
|
|
|
105
198
|
endpoint: http://prometheus:9090/api/v1/write
|
|
106
199
|
tls:
|
|
107
200
|
insecure: true
|
|
201
|
+
# Copy resource attributes (service.name, deployment.environment, …)
|
|
202
|
+
# onto every series — dashboards filter by service_name.
|
|
203
|
+
resource_to_telemetry_conversion:
|
|
204
|
+
enabled: true
|
|
108
205
|
|
|
109
206
|
extensions:
|
|
110
207
|
health_check: {}
|
|
@@ -112,19 +209,42 @@ extensions:
|
|
|
112
209
|
|
|
113
210
|
service:
|
|
114
211
|
extensions: [health_check, zpages]
|
|
212
|
+
# Collector self-metrics. Since 0.111 the default bind is localhost only —
|
|
213
|
+
# Prometheus scrapes otel-collector:8888, so listen on all interfaces.
|
|
214
|
+
telemetry:
|
|
215
|
+
metrics:
|
|
216
|
+
level: detailed
|
|
217
|
+
readers:
|
|
218
|
+
- pull:
|
|
219
|
+
exporter:
|
|
220
|
+
prometheus:
|
|
221
|
+
host: "0.0.0.0"
|
|
222
|
+
port: 8888
|
|
115
223
|
pipelines:
|
|
116
|
-
traces:
|
|
224
|
+
traces/in:
|
|
117
225
|
receivers: [otlp]
|
|
118
|
-
processors: [
|
|
226
|
+
processors: [attributes, transform/span_names]
|
|
227
|
+
exporters: [spanmetrics, servicegraph, forward]
|
|
228
|
+
traces/sampled:
|
|
229
|
+
receivers: [forward]
|
|
230
|
+
processors: [tail_sampling, batch]
|
|
119
231
|
exporters: [otlp/tempo]
|
|
120
232
|
logs:
|
|
121
233
|
receivers: [otlp]
|
|
122
234
|
processors: [attributes, batch]
|
|
123
235
|
exporters: [otlphttp/loki]
|
|
124
236
|
metrics:
|
|
125
|
-
receivers: [otlp]
|
|
237
|
+
receivers: [otlp, spanmetrics, servicegraph]
|
|
126
238
|
processors: [batch]
|
|
127
239
|
exporters: [prometheusremotewrite]
|
|
240
|
+
metrics/host:
|
|
241
|
+
receivers: [hostmetrics]
|
|
242
|
+
processors: [resource/host, batch]
|
|
243
|
+
exporters: [prometheusremotewrite]
|
|
244
|
+
metrics/docker:
|
|
245
|
+
receivers: [docker_stats]
|
|
246
|
+
processors: [resource/docker, batch]
|
|
247
|
+
exporters: [prometheusremotewrite]
|
|
128
248
|
`;
|
|
129
249
|
}
|
|
130
250
|
|
|
@@ -162,20 +282,9 @@ storage:
|
|
|
162
282
|
wal:
|
|
163
283
|
path: /var/tempo/wal
|
|
164
284
|
|
|
165
|
-
metrics_generator
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
source: tempo
|
|
169
|
-
storage:
|
|
170
|
-
path: /var/tempo/generator/wal
|
|
171
|
-
remote_write:
|
|
172
|
-
- url: http://prometheus:9090/api/v1/write
|
|
173
|
-
send_exemplars: true
|
|
174
|
-
|
|
175
|
-
overrides:
|
|
176
|
-
defaults:
|
|
177
|
-
metrics_generator:
|
|
178
|
-
processors: [service-graphs, span-metrics]
|
|
285
|
+
# NOTE: no metrics_generator — span-metrics + service-graph are produced by
|
|
286
|
+
# the otel-collector connectors BEFORE tail sampling (accurate rates even
|
|
287
|
+
# when sampling is later tightened) and remote-written to Prometheus there.
|
|
179
288
|
`;
|
|
180
289
|
}
|
|
181
290
|
|
|
@@ -237,12 +346,248 @@ scrape_configs:
|
|
|
237
346
|
- job_name: otel-collector
|
|
238
347
|
static_configs:
|
|
239
348
|
- targets: [otel-collector:8888]
|
|
349
|
+
- job_name: caddy
|
|
350
|
+
static_configs:
|
|
351
|
+
- targets: [caddy:2020]
|
|
352
|
+
- job_name: loki
|
|
353
|
+
static_configs:
|
|
354
|
+
- targets: [loki:3100]
|
|
355
|
+
- job_name: tempo
|
|
356
|
+
static_configs:
|
|
357
|
+
- targets: [tempo:3200]
|
|
358
|
+
- job_name: grafana
|
|
359
|
+
static_configs:
|
|
360
|
+
- targets: [grafana:3000]
|
|
361
|
+
- job_name: alloy
|
|
362
|
+
static_configs:
|
|
363
|
+
- targets: [alloy:12345]
|
|
240
364
|
|
|
241
365
|
# remote-write inbound is enabled via the --web.enable-remote-write-receiver
|
|
242
366
|
# command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
|
|
243
367
|
`;
|
|
244
368
|
}
|
|
245
369
|
|
|
370
|
+
/** Grafana Alloy — tails stdout/stderr of every container on the host via
|
|
371
|
+
* the Docker API and ships it to Loki. Complements the in-app console→OTLP
|
|
372
|
+
* bridge: infra containers (caddy, postgres, tempo, …) and app crash output
|
|
373
|
+
* (OOM, Bun panic — anything printed before/outside the OTel SDK) all land
|
|
374
|
+
* in Loki under the `container` / `compose_service` labels. */
|
|
375
|
+
export function generateAlloyConfig(): string {
|
|
376
|
+
return `// Generated by \`arc platform deploy\` — do not edit by hand.
|
|
377
|
+
discovery.docker "containers" {
|
|
378
|
+
host = "unix:///var/run/docker.sock"
|
|
379
|
+
refresh_interval = "15s"
|
|
380
|
+
|
|
381
|
+
// Only containers managed by a compose project (our stack). Ad-hoc / rogue
|
|
382
|
+
// containers (manual debug runs, other stacks) are excluded — one bad
|
|
383
|
+
// stream (e.g. log entries older than Loki's reject window) otherwise 400s
|
|
384
|
+
// the whole loki.write batch and drops good app logs with it.
|
|
385
|
+
filter {
|
|
386
|
+
name = "label"
|
|
387
|
+
values = ["com.docker.compose.project"]
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
discovery.relabel "containers" {
|
|
392
|
+
targets = discovery.docker.containers.targets
|
|
393
|
+
|
|
394
|
+
rule {
|
|
395
|
+
source_labels = ["__meta_docker_container_name"]
|
|
396
|
+
regex = "/(.*)"
|
|
397
|
+
target_label = "container"
|
|
398
|
+
}
|
|
399
|
+
rule {
|
|
400
|
+
source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
|
|
401
|
+
target_label = "compose_service"
|
|
402
|
+
}
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
loki.source.docker "containers" {
|
|
406
|
+
host = "unix:///var/run/docker.sock"
|
|
407
|
+
targets = discovery.docker.containers.targets
|
|
408
|
+
relabel_rules = discovery.relabel.containers.rules
|
|
409
|
+
labels = { source = "docker" }
|
|
410
|
+
forward_to = [loki.write.loki.receiver]
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
loki.write "loki" {
|
|
414
|
+
endpoint {
|
|
415
|
+
url = "http://loki:3100/loki/api/v1/push"
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
`;
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
/** Grafana alerting provisioning — a starter rule pack covering the failure
|
|
422
|
+
* modes that matter on a single-host deploy: error rate, latency, disk,
|
|
423
|
+
* memory, container restarts, app silence, telemetry export failures and
|
|
424
|
+
* scrape-target health. Notification routing (webhook) is only emitted when
|
|
425
|
+
* `observability.alertWebhookUrl` is configured — without it the rules are
|
|
426
|
+
* still visible/firing in the Grafana UI. */
|
|
427
|
+
export function generateGrafanaAlerting(cfg: DeployConfig): string {
|
|
428
|
+
const webhookUrl = cfg.observability?.alertWebhookUrl;
|
|
429
|
+
|
|
430
|
+
interface AlertRule {
|
|
431
|
+
uid: string;
|
|
432
|
+
title: string;
|
|
433
|
+
expr: string;
|
|
434
|
+
/** Threshold for the C (threshold) expression node. */
|
|
435
|
+
threshold: number;
|
|
436
|
+
/** Comparison operator. Default "gt". */
|
|
437
|
+
op?: "gt" | "lt";
|
|
438
|
+
/** Pending period, e.g. "5m". "0s" fires immediately. */
|
|
439
|
+
pendingFor: string;
|
|
440
|
+
summary: string;
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
const rules: AlertRule[] = [
|
|
444
|
+
{
|
|
445
|
+
uid: "arc-high-error-rate",
|
|
446
|
+
title: "High server error rate (>5%)",
|
|
447
|
+
expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
|
|
448
|
+
threshold: 0.05,
|
|
449
|
+
pendingFor: "5m",
|
|
450
|
+
summary: "More than 5% of server spans are errors over the last 5 minutes.",
|
|
451
|
+
},
|
|
452
|
+
{
|
|
453
|
+
uid: "arc-high-latency-p95",
|
|
454
|
+
title: "High p95 latency (>1s)",
|
|
455
|
+
expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
|
|
456
|
+
threshold: 1000,
|
|
457
|
+
pendingFor: "10m",
|
|
458
|
+
summary: "Server p95 latency above 1s for 10 minutes.",
|
|
459
|
+
},
|
|
460
|
+
{
|
|
461
|
+
uid: "arc-host-disk-high",
|
|
462
|
+
title: "Host disk usage >85%",
|
|
463
|
+
expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
|
|
464
|
+
threshold: 0.85,
|
|
465
|
+
pendingFor: "15m",
|
|
466
|
+
summary: "A host filesystem is more than 85% full.",
|
|
467
|
+
},
|
|
468
|
+
{
|
|
469
|
+
uid: "arc-host-memory-high",
|
|
470
|
+
title: "Host memory usage >90%",
|
|
471
|
+
expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
|
|
472
|
+
threshold: 0.9,
|
|
473
|
+
pendingFor: "10m",
|
|
474
|
+
summary: "Host memory usage above 90% for 10 minutes.",
|
|
475
|
+
},
|
|
476
|
+
{
|
|
477
|
+
uid: "arc-container-restarts",
|
|
478
|
+
title: "Container restarted",
|
|
479
|
+
expr: 'sum by (container_name) (increase(container_restarts_total[15m]))',
|
|
480
|
+
threshold: 0,
|
|
481
|
+
pendingFor: "0s",
|
|
482
|
+
summary: "A container restarted within the last 15 minutes.",
|
|
483
|
+
},
|
|
484
|
+
{
|
|
485
|
+
uid: "arc-app-silent",
|
|
486
|
+
title: "App stopped reporting metrics",
|
|
487
|
+
expr: "absent(arc_commands_total)",
|
|
488
|
+
threshold: 0,
|
|
489
|
+
pendingFor: "10m",
|
|
490
|
+
summary: "No arc_commands_total series for 10 minutes — app down or telemetry broken.",
|
|
491
|
+
},
|
|
492
|
+
{
|
|
493
|
+
uid: "arc-collector-export-failures",
|
|
494
|
+
title: "Telemetry export failures",
|
|
495
|
+
expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)"}[15m]))',
|
|
496
|
+
threshold: 0,
|
|
497
|
+
pendingFor: "0s",
|
|
498
|
+
summary: "The otel-collector failed to export telemetry within the last 15 minutes.",
|
|
499
|
+
},
|
|
500
|
+
{
|
|
501
|
+
uid: "arc-target-down",
|
|
502
|
+
title: "Scrape target down",
|
|
503
|
+
expr: "min(up)",
|
|
504
|
+
threshold: 1,
|
|
505
|
+
op: "lt",
|
|
506
|
+
pendingFor: "5m",
|
|
507
|
+
summary: "A Prometheus scrape target has been down for 5 minutes.",
|
|
508
|
+
},
|
|
509
|
+
];
|
|
510
|
+
|
|
511
|
+
const ruleYaml = rules
|
|
512
|
+
.map((rule) => {
|
|
513
|
+
const op = rule.op ?? "gt";
|
|
514
|
+
return ` - uid: ${rule.uid}
|
|
515
|
+
title: ${JSON.stringify(rule.title)}
|
|
516
|
+
condition: C
|
|
517
|
+
for: ${rule.pendingFor}
|
|
518
|
+
noDataState: OK
|
|
519
|
+
execErrState: OK
|
|
520
|
+
annotations:
|
|
521
|
+
summary: ${JSON.stringify(rule.summary)}
|
|
522
|
+
labels:
|
|
523
|
+
source: arc
|
|
524
|
+
data:
|
|
525
|
+
- refId: A
|
|
526
|
+
relativeTimeRange: { from: 600, to: 0 }
|
|
527
|
+
datasourceUid: prometheus
|
|
528
|
+
model:
|
|
529
|
+
expr: ${JSON.stringify(rule.expr)}
|
|
530
|
+
instant: true
|
|
531
|
+
intervalMs: 1000
|
|
532
|
+
maxDataPoints: 43200
|
|
533
|
+
refId: A
|
|
534
|
+
- refId: B
|
|
535
|
+
relativeTimeRange: { from: 0, to: 0 }
|
|
536
|
+
datasourceUid: __expr__
|
|
537
|
+
model:
|
|
538
|
+
type: reduce
|
|
539
|
+
expression: A
|
|
540
|
+
reducer: last
|
|
541
|
+
refId: B
|
|
542
|
+
- refId: C
|
|
543
|
+
relativeTimeRange: { from: 0, to: 0 }
|
|
544
|
+
datasourceUid: __expr__
|
|
545
|
+
model:
|
|
546
|
+
type: threshold
|
|
547
|
+
expression: B
|
|
548
|
+
refId: C
|
|
549
|
+
conditions:
|
|
550
|
+
- evaluator:
|
|
551
|
+
type: ${op}
|
|
552
|
+
params: [${rule.threshold}]`;
|
|
553
|
+
})
|
|
554
|
+
.join("\n");
|
|
555
|
+
|
|
556
|
+
const contactSection = webhookUrl
|
|
557
|
+
? `
|
|
558
|
+
contactPoints:
|
|
559
|
+
- orgId: 1
|
|
560
|
+
name: arc-webhook
|
|
561
|
+
receivers:
|
|
562
|
+
- uid: arc-webhook
|
|
563
|
+
type: webhook
|
|
564
|
+
settings:
|
|
565
|
+
url: ${JSON.stringify(webhookUrl)}
|
|
566
|
+
httpMethod: POST
|
|
567
|
+
|
|
568
|
+
policies:
|
|
569
|
+
- orgId: 1
|
|
570
|
+
receiver: arc-webhook
|
|
571
|
+
group_by: ["grafana_folder", "alertname"]
|
|
572
|
+
group_wait: 30s
|
|
573
|
+
group_interval: 5m
|
|
574
|
+
repeat_interval: 4h
|
|
575
|
+
`
|
|
576
|
+
: "";
|
|
577
|
+
|
|
578
|
+
return `# Generated by \`arc platform deploy\` — do not edit by hand.
|
|
579
|
+
apiVersion: 1
|
|
580
|
+
|
|
581
|
+
groups:
|
|
582
|
+
- orgId: 1
|
|
583
|
+
name: arc-alerts
|
|
584
|
+
folder: Arc
|
|
585
|
+
interval: 1m
|
|
586
|
+
rules:
|
|
587
|
+
${ruleYaml}
|
|
588
|
+
${contactSection}`;
|
|
589
|
+
}
|
|
590
|
+
|
|
246
591
|
/** Grafana datasource provisioning — Tempo + Loki + Prometheus, all pre-wired. */
|
|
247
592
|
export function generateGrafanaDatasources(): string {
|
|
248
593
|
return `# Generated by \`arc platform deploy\` — do not edit by hand.
|
|
@@ -267,10 +612,17 @@ datasources:
|
|
|
267
612
|
uid: loki
|
|
268
613
|
jsonData:
|
|
269
614
|
derivedFields:
|
|
615
|
+
# Plain-text logs that happen to contain "trace_id=<id>".
|
|
270
616
|
- datasourceUid: tempo
|
|
271
617
|
matcherRegex: "trace_id=(\\\\w+)"
|
|
272
618
|
name: TraceID
|
|
273
619
|
url: $\${__value.raw}
|
|
620
|
+
# OTLP-ingested logs — trace_id arrives as structured metadata.
|
|
621
|
+
- datasourceUid: tempo
|
|
622
|
+
matcherType: label
|
|
623
|
+
matcherRegex: trace_id
|
|
624
|
+
name: TraceID (OTLP)
|
|
625
|
+
url: $\${__value.raw}
|
|
274
626
|
- name: Prometheus
|
|
275
627
|
type: prometheus
|
|
276
628
|
access: proxy
|
|
@@ -322,7 +674,7 @@ export function generateArcOverviewDashboard(): string {
|
|
|
322
674
|
label: "Service",
|
|
323
675
|
type: "query",
|
|
324
676
|
datasource: { type: "prometheus", uid: "prometheus" },
|
|
325
|
-
query: "label_values(
|
|
677
|
+
query: "label_values(traces_span_metrics_calls_total, service_name)",
|
|
326
678
|
refresh: 2,
|
|
327
679
|
includeAll: false,
|
|
328
680
|
multi: false,
|
|
@@ -335,20 +687,20 @@ export function generateArcOverviewDashboard(): string {
|
|
|
335
687
|
panelStat(
|
|
336
688
|
"Request rate (req/s)",
|
|
337
689
|
{ x: 0, y: 0, w: 6, h: 4 },
|
|
338
|
-
'sum(rate(
|
|
690
|
+
'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
|
|
339
691
|
"reqps",
|
|
340
692
|
),
|
|
341
693
|
panelStat(
|
|
342
694
|
"Error rate (%)",
|
|
343
695
|
{ x: 6, y: 0, w: 6, h: 4 },
|
|
344
|
-
'sum(rate(
|
|
696
|
+
'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
|
|
345
697
|
"percent",
|
|
346
698
|
{ red: 1, orange: 0.1 },
|
|
347
699
|
),
|
|
348
700
|
panelStat(
|
|
349
701
|
"P99 latency",
|
|
350
702
|
{ x: 12, y: 0, w: 6, h: 4 },
|
|
351
|
-
'histogram_quantile(0.99, sum(rate(
|
|
703
|
+
'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
352
704
|
"ms",
|
|
353
705
|
{ red: 1000, orange: 300 },
|
|
354
706
|
),
|
|
@@ -363,7 +715,7 @@ export function generateArcOverviewDashboard(): string {
|
|
|
363
715
|
panelTimeseries(
|
|
364
716
|
"Request rate by route",
|
|
365
717
|
{ x: 0, y: 4, w: 12, h: 8 },
|
|
366
|
-
'sum by (span_name) (rate(
|
|
718
|
+
'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
|
|
367
719
|
"{{span_name}}",
|
|
368
720
|
"reqps",
|
|
369
721
|
),
|
|
@@ -372,18 +724,19 @@ export function generateArcOverviewDashboard(): string {
|
|
|
372
724
|
{ x: 12, y: 4, w: 12, h: 8 },
|
|
373
725
|
[
|
|
374
726
|
{
|
|
375
|
-
expr: 'histogram_quantile(0.5, sum(rate(
|
|
727
|
+
expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
376
728
|
legend: "p50",
|
|
377
729
|
},
|
|
378
730
|
{
|
|
379
|
-
expr: 'histogram_quantile(0.95, sum(rate(
|
|
731
|
+
expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
380
732
|
legend: "p95",
|
|
381
733
|
},
|
|
382
734
|
{
|
|
383
|
-
expr: 'histogram_quantile(0.99, sum(rate(
|
|
735
|
+
expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
384
736
|
legend: "p99",
|
|
385
737
|
},
|
|
386
738
|
],
|
|
739
|
+
"",
|
|
387
740
|
"ms",
|
|
388
741
|
),
|
|
389
742
|
|
|
@@ -398,24 +751,24 @@ export function generateArcOverviewDashboard(): string {
|
|
|
398
751
|
panelTimeseries(
|
|
399
752
|
"Command p95 latency",
|
|
400
753
|
{ x: 12, y: 12, w: 12, h: 8 },
|
|
401
|
-
'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(
|
|
754
|
+
'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))',
|
|
402
755
|
"{{arc_command_name}}",
|
|
403
756
|
"ms",
|
|
404
757
|
),
|
|
405
758
|
|
|
406
759
|
// Row: database
|
|
407
760
|
panelTimeseries(
|
|
408
|
-
"DB
|
|
761
|
+
"DB ops/sec by collection",
|
|
409
762
|
{ x: 0, y: 20, w: 12, h: 8 },
|
|
410
|
-
'sum by (db_collection_name) (rate(
|
|
411
|
-
"{{db_collection_name}}",
|
|
763
|
+
'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))',
|
|
764
|
+
"{{db_collection_name}} {{db_operation_name}}",
|
|
412
765
|
"ops",
|
|
413
766
|
),
|
|
414
767
|
panelTimeseries(
|
|
415
|
-
"DB
|
|
768
|
+
"DB p95 latency by operation",
|
|
416
769
|
{ x: 12, y: 20, w: 12, h: 8 },
|
|
417
|
-
'histogram_quantile(0.95, sum by (
|
|
418
|
-
"{{
|
|
770
|
+
'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))',
|
|
771
|
+
"{{db_operation_name}}",
|
|
419
772
|
"ms",
|
|
420
773
|
),
|
|
421
774
|
|
|
@@ -427,7 +780,9 @@ export function generateArcOverviewDashboard(): string {
|
|
|
427
780
|
datasource: { type: "loki", uid: "loki" },
|
|
428
781
|
targets: [
|
|
429
782
|
{
|
|
430
|
-
|
|
783
|
+
// severity_text is OTLP structured metadata — filter via the
|
|
784
|
+
// pipeline stage, NOT the stream selector (not an index label).
|
|
785
|
+
expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
|
|
431
786
|
refId: "A",
|
|
432
787
|
},
|
|
433
788
|
],
|
|
@@ -464,7 +819,7 @@ export function generateArcTracesDashboard(): string {
|
|
|
464
819
|
label: "Service",
|
|
465
820
|
type: "query",
|
|
466
821
|
datasource: { type: "prometheus", uid: "prometheus" },
|
|
467
|
-
query: "label_values(
|
|
822
|
+
query: "label_values(traces_span_metrics_calls_total, service_name)",
|
|
468
823
|
refresh: 2,
|
|
469
824
|
current: { text: "arc-prod", value: "arc-prod" },
|
|
470
825
|
},
|
|
@@ -587,23 +942,38 @@ export function generateArcLogsDashboard(): string {
|
|
|
587
942
|
query: "",
|
|
588
943
|
current: { text: "", value: "" },
|
|
589
944
|
},
|
|
945
|
+
{
|
|
946
|
+
name: "container",
|
|
947
|
+
label: "Container",
|
|
948
|
+
type: "query",
|
|
949
|
+
datasource: { type: "loki", uid: "loki" },
|
|
950
|
+
query: "label_values(container)",
|
|
951
|
+
refresh: 2,
|
|
952
|
+
includeAll: true,
|
|
953
|
+
multi: false,
|
|
954
|
+
current: { text: "All", value: "$__all" },
|
|
955
|
+
},
|
|
590
956
|
],
|
|
591
957
|
},
|
|
592
958
|
panels: [
|
|
593
|
-
// Stat: total log lines, last hour
|
|
959
|
+
// Stat: total log lines, last hour. severity_text is OTLP structured
|
|
960
|
+
// metadata — usable only in pipeline stages, never in stream selectors.
|
|
594
961
|
panelStat(
|
|
595
962
|
"Logs ingested (1h)",
|
|
596
963
|
{ x: 0, y: 0, w: 6, h: 4 },
|
|
597
|
-
'sum(
|
|
964
|
+
'sum(count_over_time({service_name="$service"}[1h]))',
|
|
598
965
|
"short",
|
|
966
|
+
undefined,
|
|
967
|
+
LOKI_DS,
|
|
599
968
|
),
|
|
600
969
|
// Stat: errors (last hour)
|
|
601
970
|
panelStat(
|
|
602
971
|
"Errors (1h)",
|
|
603
972
|
{ x: 6, y: 0, w: 6, h: 4 },
|
|
604
|
-
'sum(
|
|
973
|
+
'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))',
|
|
605
974
|
"short",
|
|
606
975
|
{ orange: 1, red: 50 },
|
|
976
|
+
LOKI_DS,
|
|
607
977
|
),
|
|
608
978
|
// Time series: log volume by severity
|
|
609
979
|
{
|
|
@@ -657,6 +1027,30 @@ export function generateArcLogsDashboard(): string {
|
|
|
657
1027
|
sortOrder: "Descending",
|
|
658
1028
|
},
|
|
659
1029
|
},
|
|
1030
|
+
// Container stdout/stderr (collected by Alloy from the Docker API) —
|
|
1031
|
+
// caddy access logs, postgres, the observability stack itself, plus
|
|
1032
|
+
// app crash output that never reached the OTLP pipeline.
|
|
1033
|
+
{
|
|
1034
|
+
title: "Container logs ($container)",
|
|
1035
|
+
type: "logs",
|
|
1036
|
+
gridPos: { x: 0, y: 26, w: 24, h: 14 },
|
|
1037
|
+
datasource: { type: "loki", uid: "loki" },
|
|
1038
|
+
targets: [
|
|
1039
|
+
{
|
|
1040
|
+
expr: '{container=~"$container"} |~ "$search"',
|
|
1041
|
+
refId: "A",
|
|
1042
|
+
},
|
|
1043
|
+
],
|
|
1044
|
+
options: {
|
|
1045
|
+
showTime: true,
|
|
1046
|
+
showLabels: true,
|
|
1047
|
+
showCommonLabels: false,
|
|
1048
|
+
wrapLogMessage: true,
|
|
1049
|
+
enableLogDetails: true,
|
|
1050
|
+
dedupStrategy: "none",
|
|
1051
|
+
sortOrder: "Descending",
|
|
1052
|
+
},
|
|
1053
|
+
},
|
|
660
1054
|
],
|
|
661
1055
|
};
|
|
662
1056
|
return JSON.stringify(dashboard, null, 2);
|
|
@@ -722,6 +1116,7 @@ export function generateArcSamplingDashboard(): string {
|
|
|
722
1116
|
legend: "exported",
|
|
723
1117
|
},
|
|
724
1118
|
],
|
|
1119
|
+
"",
|
|
725
1120
|
"ops",
|
|
726
1121
|
),
|
|
727
1122
|
|
|
@@ -790,20 +1185,20 @@ export function generateArcCommandDashboard(): string {
|
|
|
790
1185
|
panelStat(
|
|
791
1186
|
"P50 latency",
|
|
792
1187
|
{ x: 6, y: 0, w: 6, h: 4 },
|
|
793
|
-
'histogram_quantile(0.5, sum by (le) (rate(
|
|
1188
|
+
'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
794
1189
|
"ms",
|
|
795
1190
|
),
|
|
796
1191
|
panelStat(
|
|
797
1192
|
"P95 latency",
|
|
798
1193
|
{ x: 12, y: 0, w: 6, h: 4 },
|
|
799
|
-
'histogram_quantile(0.95, sum by (le) (rate(
|
|
1194
|
+
'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
800
1195
|
"ms",
|
|
801
1196
|
{ orange: 200, red: 1000 },
|
|
802
1197
|
),
|
|
803
1198
|
panelStat(
|
|
804
1199
|
"P99 latency",
|
|
805
1200
|
{ x: 18, y: 0, w: 6, h: 4 },
|
|
806
|
-
'histogram_quantile(0.99, sum by (le) (rate(
|
|
1201
|
+
'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
807
1202
|
"ms",
|
|
808
1203
|
{ orange: 500, red: 2000 },
|
|
809
1204
|
),
|
|
@@ -820,18 +1215,19 @@ export function generateArcCommandDashboard(): string {
|
|
|
820
1215
|
{ x: 12, y: 4, w: 12, h: 8 },
|
|
821
1216
|
[
|
|
822
1217
|
{
|
|
823
|
-
expr: 'histogram_quantile(0.5, sum by (le) (rate(
|
|
1218
|
+
expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
824
1219
|
legend: "p50",
|
|
825
1220
|
},
|
|
826
1221
|
{
|
|
827
|
-
expr: 'histogram_quantile(0.95, sum by (le) (rate(
|
|
1222
|
+
expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
828
1223
|
legend: "p95",
|
|
829
1224
|
},
|
|
830
1225
|
{
|
|
831
|
-
expr: 'histogram_quantile(0.99, sum by (le) (rate(
|
|
1226
|
+
expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
832
1227
|
legend: "p99",
|
|
833
1228
|
},
|
|
834
1229
|
],
|
|
1230
|
+
"",
|
|
835
1231
|
"ms",
|
|
836
1232
|
),
|
|
837
1233
|
|
|
@@ -855,6 +1251,240 @@ export function generateArcCommandDashboard(): string {
|
|
|
855
1251
|
return JSON.stringify(dashboard, null, 2);
|
|
856
1252
|
}
|
|
857
1253
|
|
|
1254
|
+
/**
 * Builds the "Arc Infrastructure" Grafana dashboard.
 *
 * Two groups of panels:
 *  - host CPU / memory / disk / network, fed by the hostmetrics receiver
 *    (service_name="arc-host");
 *  - per-container resource usage and restarts, fed by the docker_stats
 *    receiver (service_name="arc-docker").
 *
 * It answers "is the box healthy / which container is eating the host".
 *
 * @returns the dashboard model serialized as JSON with 2-space indentation.
 */
export function generateArcInfraDashboard(): string {
  // Shared by the top-line stat and the CPU timeseries so both always agree.
  const hostCpuUsedExpr =
    '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))';

  const panels = [
    // Row: host top-line stats
    panelStat(
      "Host CPU used",
      { x: 0, y: 0, w: 6, h: 4 },
      hostCpuUsedExpr,
      "percent",
      { orange: 70, red: 90 },
    ),
    panelStat(
      "Host memory used",
      { x: 6, y: 0, w: 6, h: 4 },
      '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
      "percent",
      { orange: 80, red: 90 },
    ),
    panelStat(
      "Disk used (worst mount)",
      { x: 12, y: 0, w: 6, h: 4 },
      '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
      "percent",
      { orange: 75, red: 85 },
    ),
    panelStat(
      "Load (1m)",
      { x: 18, y: 0, w: 6, h: 4 },
      "system_cpu_load_average_1m",
      "short",
    ),

    // Row: host CPU + memory over time
    panelTimeseries(
      "Host CPU utilization",
      { x: 0, y: 4, w: 12, h: 8 },
      [
        { expr: hostCpuUsedExpr, legend: "used %" },
        { expr: "system_cpu_load_average_1m", legend: "load 1m" },
        { expr: "system_cpu_load_average_5m", legend: "load 5m" },
        { expr: "system_cpu_load_average_15m", legend: "load 15m" },
      ],
      "",
      "short",
    ),
    panelTimeseries(
      "Host memory by state",
      { x: 12, y: 4, w: 12, h: 8 },
      "sum by (state) (system_memory_usage_bytes)",
      "{{state}}",
      "bytes",
    ),

    // Row: disk
    panelTimeseries(
      "Filesystem usage by mount",
      { x: 0, y: 12, w: 12, h: 8 },
      '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)',
      "{{mountpoint}}",
      "percent",
    ),
    panelTimeseries(
      "Disk I/O",
      { x: 12, y: 12, w: 12, h: 8 },
      "sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))",
      "{{device}} {{direction}}",
      "Bps",
    ),

    // Row: network + container restarts
    panelTimeseries(
      "Network I/O",
      { x: 0, y: 20, w: 12, h: 8 },
      'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))',
      "{{device}} {{direction}}",
      "Bps",
    ),
    panelTimeseries(
      "Container restarts (24h)",
      { x: 12, y: 20, w: 12, h: 8 },
      "sum by (container_name) (increase(container_restarts_total[24h]))",
      "{{container_name}}",
      "short",
    ),

    // Row: per-container resources
    panelTimeseries(
      "Container CPU",
      { x: 0, y: 28, w: 12, h: 8 },
      "container_cpu_utilization_ratio",
      "{{container_name}}",
      "percent",
    ),
    panelTimeseries(
      "Container memory",
      { x: 12, y: 28, w: 12, h: 8 },
      "container_memory_usage_total_bytes",
      "{{container_name}}",
      "bytes",
    ),
    panelTimeseries(
      "Container network RX",
      { x: 0, y: 36, w: 12, h: 8 },
      "sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))",
      "{{container_name}}",
      "Bps",
    ),
    panelTimeseries(
      "Container network TX",
      { x: 12, y: 36, w: 12, h: 8 },
      "sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))",
      "{{container_name}}",
      "Bps",
    ),
  ];

  // Property order is kept stable so the serialized JSON does not churn.
  const dashboard = {
    title: "Arc Infrastructure",
    uid: "arc-infra",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-3h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    panels,
  };

  return JSON.stringify(dashboard, null, 2);
}
|
|
1386
|
+
|
|
1387
|
+
/**
 * Builds the "Arc Caddy / Edge" Grafana dashboard.
 *
 * Combines Caddy reverse-proxy metrics (request rate per host, status
 * codes, latency, in-flight requests) with the JSON access logs that Alloy
 * ships to Loki. First stop for "is traffic reaching us / who is 404ing".
 *
 * Both Loki-backed panels reference the shared `LOKI_DS` constant, so the
 * datasource uid is defined in exactly one place.
 *
 * @returns the dashboard model serialized as JSON with 2-space indentation.
 */
export function generateArcEdgeDashboard(): string {
  // Raw access-log viewer. Not built through panelStat/panelTimeseries
  // because it is a "logs" panel with its own option set.
  const accessLogPanel = {
    title: "Access log (live)",
    type: "logs",
    gridPos: { x: 0, y: 20, w: 24, h: 12 },
    datasource: LOKI_DS,
    targets: [
      {
        expr: '{compose_service="caddy"}',
        refId: "A",
      },
    ],
    options: {
      showTime: true,
      showLabels: false,
      showCommonLabels: false,
      wrapLogMessage: true,
      enableLogDetails: true,
      dedupStrategy: "none",
      sortOrder: "Descending",
    },
  };

  const panels = [
    // Row: top-line stats
    panelStat(
      "Requests/s",
      { x: 0, y: 0, w: 6, h: 4 },
      "sum(rate(caddy_http_request_duration_seconds_count[1m]))",
      "reqps",
    ),
    panelStat(
      "In-flight requests",
      { x: 6, y: 0, w: 6, h: 4 },
      "sum(caddy_http_requests_in_flight)",
      "short",
    ),
    panelStat(
      "Handler errors/s",
      { x: 12, y: 0, w: 6, h: 4 },
      "sum(rate(caddy_http_request_errors_total[5m]))",
      "ops",
      { orange: 0.1, red: 1 },
    ),
    panelStat(
      "P95 latency",
      { x: 18, y: 0, w: 6, h: 4 },
      "histogram_quantile(0.95, sum by (le) (rate(caddy_http_request_duration_seconds_bucket[5m])))",
      "s",
      { orange: 0.3, red: 1 },
    ),

    // Row: traffic breakdown
    panelTimeseries(
      "Request rate by host",
      { x: 0, y: 4, w: 12, h: 8 },
      "sum by (host) (rate(caddy_http_request_duration_seconds_count[1m]))",
      "{{host}}",
      "reqps",
    ),
    panelTimeseries(
      "Responses by status code",
      { x: 12, y: 4, w: 12, h: 8 },
      "sum by (code) (rate(caddy_http_request_duration_seconds_count[1m]))",
      "{{code}}",
      "reqps",
    ),

    // Row: latency + error log volume
    panelTimeseries(
      "P95 latency by host",
      { x: 0, y: 12, w: 12, h: 8 },
      "histogram_quantile(0.95, sum by (host, le) (rate(caddy_http_request_duration_seconds_bucket[5m])))",
      "{{host}}",
      "s",
    ),
    // LogQL, not PromQL: counts access-log lines whose parsed status is >= 400.
    panelTimeseries(
      "4xx/5xx responses (access log)",
      { x: 12, y: 12, w: 12, h: 8 },
      'sum by (status) (count_over_time({compose_service="caddy"} | json | status >= 400 [$__interval]))',
      "{{status}}",
      "short",
      LOKI_DS,
    ),

    // Row: live access log
    accessLogPanel,
  ];

  // Property order is kept stable so the serialized JSON does not churn.
  const dashboard = {
    title: "Arc Caddy / Edge",
    uid: "arc-edge",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-1h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    panels,
  };

  return JSON.stringify(dashboard, null, 2);
}
|
|
1487
|
+
|
|
858
1488
|
/** All config files needed on the host. Returns map of relative-path → contents
|
|
859
1489
|
* so bootstrap can write+upload them in one pass. */
|
|
860
1490
|
export function generateObservabilityConfigs(
|
|
@@ -865,14 +1495,18 @@ export function generateObservabilityConfigs(
|
|
|
865
1495
|
"observability/tempo.yaml": generateTempoConfig(cfg),
|
|
866
1496
|
"observability/loki-config.yaml": generateLokiConfig(cfg),
|
|
867
1497
|
"observability/prometheus.yml": generatePrometheusConfig(cfg),
|
|
1498
|
+
"observability/alloy-config.alloy": generateAlloyConfig(),
|
|
868
1499
|
"observability/grafana-datasources.yaml": generateGrafanaDatasources(),
|
|
869
1500
|
"observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
|
|
1501
|
+
"observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
|
|
870
1502
|
"observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
|
|
871
1503
|
"observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
|
|
872
1504
|
"observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
|
|
873
1505
|
"observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
|
|
874
1506
|
"observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
|
|
875
1507
|
"observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
|
|
1508
|
+
"observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
|
|
1509
|
+
"observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard(),
|
|
876
1510
|
};
|
|
877
1511
|
}
|
|
878
1512
|
|
|
@@ -882,6 +1516,10 @@ export function generateObservabilityConfigs(
|
|
|
882
1516
|
|
|
883
1517
|
/** Placement of a panel on the dashboard grid, passed through as `gridPos`. */
interface GridPos {
  x: number;
  y: number;
  w: number;
  h: number;
}

/** Optional values at which a stat panel switches to orange / red. */
interface Threshold {
  orange?: number;
  red?: number;
}
|
|
1519
|
+
/** Datasource reference embedded in a panel (`type` plus datasource `uid`). */
interface PanelDatasource {
  type: string;
  uid: string;
}

/** Default datasource for `panelStat` / `panelTimeseries`. */
const PROMETHEUS_DS: PanelDatasource = {
  type: "prometheus",
  uid: "prometheus",
};

/** Datasource for panels whose queries target Loki. */
const LOKI_DS: PanelDatasource = {
  type: "loki",
  uid: "loki",
};
|
|
885
1523
|
|
|
886
1524
|
function panelStat(
|
|
887
1525
|
title: string,
|
|
@@ -889,6 +1527,7 @@ function panelStat(
|
|
|
889
1527
|
expr: string,
|
|
890
1528
|
unit: string,
|
|
891
1529
|
thresholds?: Threshold,
|
|
1530
|
+
datasource: PanelDatasource = PROMETHEUS_DS,
|
|
892
1531
|
) {
|
|
893
1532
|
const steps: { color: string; value: number | null }[] = [
|
|
894
1533
|
{ color: "green", value: null },
|
|
@@ -903,7 +1542,7 @@ function panelStat(
|
|
|
903
1542
|
title,
|
|
904
1543
|
type: "stat",
|
|
905
1544
|
gridPos,
|
|
906
|
-
datasource
|
|
1545
|
+
datasource,
|
|
907
1546
|
targets: [{ expr, refId: "A", legendFormat: title }],
|
|
908
1547
|
fieldConfig: {
|
|
909
1548
|
defaults: {
|
|
@@ -928,6 +1567,7 @@ function panelTimeseries(
|
|
|
928
1567
|
query: string | { expr: string; legend: string }[],
|
|
929
1568
|
legend: string,
|
|
930
1569
|
unit: string,
|
|
1570
|
+
datasource: PanelDatasource = PROMETHEUS_DS,
|
|
931
1571
|
) {
|
|
932
1572
|
const targets = Array.isArray(query)
|
|
933
1573
|
? query.map((q, i) => ({
|
|
@@ -940,7 +1580,7 @@ function panelTimeseries(
|
|
|
940
1580
|
title,
|
|
941
1581
|
type: "timeseries",
|
|
942
1582
|
gridPos,
|
|
943
|
-
datasource
|
|
1583
|
+
datasource,
|
|
944
1584
|
targets,
|
|
945
1585
|
fieldConfig: {
|
|
946
1586
|
defaults: {
|