@arcote.tech/arc-cli 0.7.18 → 0.7.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +651 -105
- package/package.json +9 -9
- package/src/deploy/bootstrap.ts +8 -3
- package/src/deploy/caddyfile.ts +43 -8
- package/src/deploy/compose.ts +73 -0
- package/src/deploy/config.ts +15 -0
- package/src/deploy/observability-configs.ts +674 -48
- package/src/platform/server.ts +3 -0
|
@@ -35,8 +35,9 @@ function pickRetention(o: DeployObservability | undefined) {
|
|
|
35
35
|
}
|
|
36
36
|
|
|
37
37
|
/** OpenTelemetry Collector — receives OTLP from app containers + browser,
|
|
38
|
-
*
|
|
39
|
-
*
|
|
38
|
+
* derives span-metrics + service-graph BEFORE tail sampling (no sampling
|
|
39
|
+
* bias), scrapes host + per-container resource usage, fans out to Tempo
|
|
40
|
+
* (traces), Loki (logs), Prometheus remote-write (metrics). */
|
|
40
41
|
export function generateOtelCollectorConfig(cfg: DeployConfig): string {
|
|
41
42
|
const envNames = Object.keys(cfg.envs);
|
|
42
43
|
return `# Generated by \`arc platform deploy\` — do not edit by hand.
|
|
@@ -55,6 +56,64 @@ ${envNames.map((name) => ` - "https://${cfg.envs[name]!.domain}"`).jo
|
|
|
55
56
|
- tracestate
|
|
56
57
|
- content-type
|
|
57
58
|
|
|
59
|
+
# Host-level CPU / memory / load / disk / filesystem / network metrics.
|
|
60
|
+
# The host root is bind-mounted read-only at /hostfs (see compose).
|
|
61
|
+
hostmetrics:
|
|
62
|
+
collection_interval: 30s
|
|
63
|
+
root_path: /hostfs
|
|
64
|
+
scrapers:
|
|
65
|
+
cpu:
|
|
66
|
+
metrics:
|
|
67
|
+
system.cpu.utilization:
|
|
68
|
+
enabled: true
|
|
69
|
+
memory:
|
|
70
|
+
metrics:
|
|
71
|
+
system.memory.utilization:
|
|
72
|
+
enabled: true
|
|
73
|
+
load: {}
|
|
74
|
+
disk: {}
|
|
75
|
+
filesystem:
|
|
76
|
+
metrics:
|
|
77
|
+
system.filesystem.utilization:
|
|
78
|
+
enabled: true
|
|
79
|
+
exclude_fs_types:
|
|
80
|
+
fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
|
|
81
|
+
match_type: strict
|
|
82
|
+
exclude_mount_points:
|
|
83
|
+
mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
|
|
84
|
+
match_type: regexp
|
|
85
|
+
network: {}
|
|
86
|
+
paging: {}
|
|
87
|
+
|
|
88
|
+
# Per-container CPU / memory / network / block-IO + restarts straight from
|
|
89
|
+
# the Docker daemon (socket bind-mounted read-only, see compose).
|
|
90
|
+
docker_stats:
|
|
91
|
+
endpoint: unix:///var/run/docker.sock
|
|
92
|
+
collection_interval: 30s
|
|
93
|
+
metrics:
|
|
94
|
+
container.restarts:
|
|
95
|
+
enabled: true
|
|
96
|
+
container.uptime:
|
|
97
|
+
enabled: true
|
|
98
|
+
|
|
99
|
+
connectors:
|
|
100
|
+
# Span→metrics computed from 100% of spans (pipeline runs BEFORE tail
|
|
101
|
+
# sampling) — lowering the sampling policy later never skews dashboards.
|
|
102
|
+
spanmetrics:
|
|
103
|
+
histogram:
|
|
104
|
+
unit: ms
|
|
105
|
+
explicit:
|
|
106
|
+
buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
|
|
107
|
+
metrics_flush_interval: 15s
|
|
108
|
+
# Emits traces_service_graph_* (same metric names Tempo's generator would).
|
|
109
|
+
servicegraph:
|
|
110
|
+
metrics_flush_interval: 15s
|
|
111
|
+
store:
|
|
112
|
+
ttl: 5s
|
|
113
|
+
max_items: 5000
|
|
114
|
+
# Joins the raw-trace pipeline to the sampled-storage pipeline.
|
|
115
|
+
forward: {}
|
|
116
|
+
|
|
58
117
|
processors:
|
|
59
118
|
batch:
|
|
60
119
|
timeout: 5s
|
|
@@ -65,7 +124,8 @@ processors:
|
|
|
65
124
|
# Errors + slow traces zachowywane w 100%, normalne traces również 100%
|
|
66
125
|
# przy obecnej skali (boostrap produkcji). Tail sampling matchuje OR po
|
|
67
126
|
# policies — bez "always" policy WSZYSTKIE OK traces byłyby droppowane.
|
|
68
|
-
# Obniż 'random_100pct' do np. 10% gdy ruch eksploduje
|
|
127
|
+
# Obniż 'random_100pct' do np. 10% gdy ruch eksploduje — span-metrics są
|
|
128
|
+
# liczone przed samplingiem, więc dashboardy pozostaną dokładne.
|
|
69
129
|
tail_sampling:
|
|
70
130
|
decision_wait: 10s
|
|
71
131
|
num_traces: 50000
|
|
@@ -90,6 +150,34 @@ processors:
|
|
|
90
150
|
- key: http.request.header.cookie
|
|
91
151
|
action: delete
|
|
92
152
|
|
|
153
|
+
# Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
|
|
154
|
+
# so raw span names (one per bot-scanned URL) would explode Prometheus
|
|
155
|
+
# series. Static assets collapse to "<METHOD> static", /route/* to
|
|
156
|
+
# "<METHOD> /route", anything else outside the known API surface to
|
|
157
|
+
# "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
|
|
158
|
+
# literal "$" (collector env expansion), RE2 has no lookahead → IsMatch+not.
|
|
159
|
+
transform/span_names:
|
|
160
|
+
error_mode: ignore
|
|
161
|
+
trace_statements:
|
|
162
|
+
- context: span
|
|
163
|
+
statements:
|
|
164
|
+
- set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
|
|
165
|
+
- replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
|
|
166
|
+
- set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
|
|
167
|
+
|
|
168
|
+
# Stable service.name for infra metric streams (becomes the service_name
|
|
169
|
+
# label after resource_to_telemetry_conversion).
|
|
170
|
+
resource/host:
|
|
171
|
+
attributes:
|
|
172
|
+
- key: service.name
|
|
173
|
+
value: arc-host
|
|
174
|
+
action: upsert
|
|
175
|
+
resource/docker:
|
|
176
|
+
attributes:
|
|
177
|
+
- key: service.name
|
|
178
|
+
value: arc-docker
|
|
179
|
+
action: upsert
|
|
180
|
+
|
|
93
181
|
exporters:
|
|
94
182
|
otlp/tempo:
|
|
95
183
|
endpoint: tempo:4317
|
|
@@ -105,6 +193,10 @@ exporters:
|
|
|
105
193
|
endpoint: http://prometheus:9090/api/v1/write
|
|
106
194
|
tls:
|
|
107
195
|
insecure: true
|
|
196
|
+
# Copy resource attributes (service.name, deployment.environment, …)
|
|
197
|
+
# onto every series — dashboards filter by service_name.
|
|
198
|
+
resource_to_telemetry_conversion:
|
|
199
|
+
enabled: true
|
|
108
200
|
|
|
109
201
|
extensions:
|
|
110
202
|
health_check: {}
|
|
@@ -112,19 +204,42 @@ extensions:
|
|
|
112
204
|
|
|
113
205
|
service:
|
|
114
206
|
extensions: [health_check, zpages]
|
|
207
|
+
# Collector self-metrics. Since 0.111 the default bind is localhost only —
|
|
208
|
+
# Prometheus scrapes otel-collector:8888, so listen on all interfaces.
|
|
209
|
+
telemetry:
|
|
210
|
+
metrics:
|
|
211
|
+
level: detailed
|
|
212
|
+
readers:
|
|
213
|
+
- pull:
|
|
214
|
+
exporter:
|
|
215
|
+
prometheus:
|
|
216
|
+
host: "0.0.0.0"
|
|
217
|
+
port: 8888
|
|
115
218
|
pipelines:
|
|
116
|
-
traces:
|
|
219
|
+
traces/in:
|
|
117
220
|
receivers: [otlp]
|
|
118
|
-
processors: [
|
|
221
|
+
processors: [attributes, transform/span_names]
|
|
222
|
+
exporters: [spanmetrics, servicegraph, forward]
|
|
223
|
+
traces/sampled:
|
|
224
|
+
receivers: [forward]
|
|
225
|
+
processors: [tail_sampling, batch]
|
|
119
226
|
exporters: [otlp/tempo]
|
|
120
227
|
logs:
|
|
121
228
|
receivers: [otlp]
|
|
122
229
|
processors: [attributes, batch]
|
|
123
230
|
exporters: [otlphttp/loki]
|
|
124
231
|
metrics:
|
|
125
|
-
receivers: [otlp]
|
|
232
|
+
receivers: [otlp, spanmetrics, servicegraph]
|
|
126
233
|
processors: [batch]
|
|
127
234
|
exporters: [prometheusremotewrite]
|
|
235
|
+
metrics/host:
|
|
236
|
+
receivers: [hostmetrics]
|
|
237
|
+
processors: [resource/host, batch]
|
|
238
|
+
exporters: [prometheusremotewrite]
|
|
239
|
+
metrics/docker:
|
|
240
|
+
receivers: [docker_stats]
|
|
241
|
+
processors: [resource/docker, batch]
|
|
242
|
+
exporters: [prometheusremotewrite]
|
|
128
243
|
`;
|
|
129
244
|
}
|
|
130
245
|
|
|
@@ -162,20 +277,9 @@ storage:
|
|
|
162
277
|
wal:
|
|
163
278
|
path: /var/tempo/wal
|
|
164
279
|
|
|
165
|
-
metrics_generator
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
source: tempo
|
|
169
|
-
storage:
|
|
170
|
-
path: /var/tempo/generator/wal
|
|
171
|
-
remote_write:
|
|
172
|
-
- url: http://prometheus:9090/api/v1/write
|
|
173
|
-
send_exemplars: true
|
|
174
|
-
|
|
175
|
-
overrides:
|
|
176
|
-
defaults:
|
|
177
|
-
metrics_generator:
|
|
178
|
-
processors: [service-graphs, span-metrics]
|
|
280
|
+
# NOTE: no metrics_generator — span-metrics + service-graph are produced by
|
|
281
|
+
# the otel-collector connectors BEFORE tail sampling (accurate rates even
|
|
282
|
+
# when sampling is later tightened) and remote-written to Prometheus there.
|
|
179
283
|
`;
|
|
180
284
|
}
|
|
181
285
|
|
|
@@ -237,12 +341,239 @@ scrape_configs:
|
|
|
237
341
|
- job_name: otel-collector
|
|
238
342
|
static_configs:
|
|
239
343
|
- targets: [otel-collector:8888]
|
|
344
|
+
- job_name: caddy
|
|
345
|
+
static_configs:
|
|
346
|
+
- targets: [caddy:2020]
|
|
347
|
+
- job_name: loki
|
|
348
|
+
static_configs:
|
|
349
|
+
- targets: [loki:3100]
|
|
350
|
+
- job_name: tempo
|
|
351
|
+
static_configs:
|
|
352
|
+
- targets: [tempo:3200]
|
|
353
|
+
- job_name: grafana
|
|
354
|
+
static_configs:
|
|
355
|
+
- targets: [grafana:3000]
|
|
356
|
+
- job_name: alloy
|
|
357
|
+
static_configs:
|
|
358
|
+
- targets: [alloy:12345]
|
|
240
359
|
|
|
241
360
|
# remote-write inbound is enabled via the --web.enable-remote-write-receiver
|
|
242
361
|
# command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
|
|
243
362
|
`;
|
|
244
363
|
}
|
|
245
364
|
|
|
365
|
+
/**
 * Builds the Grafana Alloy configuration that collects container logs.
 *
 * The emitted pipeline has four components:
 *  - `discovery.docker`   lists containers through the Docker socket (15s refresh),
 *  - `discovery.relabel`  derives the `container` label (container name without
 *                         its leading "/") and the `compose_service` label,
 *  - `loki.source.docker` reads the discovered containers' logs, tagging every
 *                         entry with `source="docker"`,
 *  - `loki.write`         pushes the entries to Loki's HTTP push endpoint.
 *
 * @returns The Alloy configuration text, terminated by a newline.
 */
export function generateAlloyConfig(): string {
  // One entry per output line; empty strings are the blank separator lines.
  const configLines: string[] = [
    "// Generated by `arc platform deploy` — do not edit by hand.",
    'discovery.docker "containers" {',
    '  host = "unix:///var/run/docker.sock"',
    '  refresh_interval = "15s"',
    "}",
    "",
    'discovery.relabel "containers" {',
    "  targets = discovery.docker.containers.targets",
    "",
    "  rule {",
    '    source_labels = ["__meta_docker_container_name"]',
    '    regex = "/(.*)"',
    '    target_label = "container"',
    "  }",
    "  rule {",
    '    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]',
    '    target_label = "compose_service"',
    "  }",
    "}",
    "",
    'loki.source.docker "containers" {',
    '  host = "unix:///var/run/docker.sock"',
    "  targets = discovery.docker.containers.targets",
    "  relabel_rules = discovery.relabel.containers.rules",
    '  labels = { source = "docker" }',
    "  forward_to = [loki.write.loki.receiver]",
    "}",
    "",
    'loki.write "loki" {',
    "  endpoint {",
    '    url = "http://loki:3100/loki/api/v1/push"',
    "  }",
    "}",
  ];

  // The generated file ends with a trailing newline.
  return configLines.join("\n") + "\n";
}
|
|
406
|
+
|
|
407
|
+
/**
 * Builds the Grafana alerting provisioning file (YAML).
 *
 * Emits one rule group (`arc-alerts`, folder `Arc`, evaluated every minute)
 * holding a fixed starter pack of rules: server error rate, p95 latency, host
 * disk, host memory, container restarts, app silence, collector export
 * failures and scrape-target health. Every rule is rendered as the same
 * three-node pipeline: A = instant Prometheus query, B = `reduce(last)`,
 * C = threshold compared with the rule's operator.
 *
 * A webhook contact point and a notification policy are appended only when
 * `cfg.observability.alertWebhookUrl` is set (non-empty). Without it the
 * output contains the rule group only.
 *
 * @param cfg Deploy configuration; only `observability.alertWebhookUrl` is read.
 * @returns The provisioning YAML as a string.
 */
export function generateGrafanaAlerting(cfg: DeployConfig): string {
  const webhookUrl = cfg.observability?.alertWebhookUrl;

  interface AlertRule {
    uid: string;
    title: string;
    expr: string;
    /** Threshold for the C (threshold) expression node. */
    threshold: number;
    /** Comparison operator. Default "gt". */
    op?: "gt" | "lt";
    /** Pending period, e.g. "5m". "0s" fires immediately. */
    pendingFor: string;
    summary: string;
  }

  const rules: AlertRule[] = [
    {
      uid: "arc-high-error-rate",
      title: "High server error rate (>5%)",
      expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
      threshold: 0.05,
      pendingFor: "5m",
      summary: "More than 5% of server spans are errors over the last 5 minutes.",
    },
    {
      uid: "arc-high-latency-p95",
      title: "High p95 latency (>1s)",
      expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
      threshold: 1000,
      pendingFor: "10m",
      summary: "Server p95 latency above 1s for 10 minutes.",
    },
    {
      uid: "arc-host-disk-high",
      title: "Host disk usage >85%",
      expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
      threshold: 0.85,
      pendingFor: "15m",
      summary: "A host filesystem is more than 85% full.",
    },
    {
      uid: "arc-host-memory-high",
      title: "Host memory usage >90%",
      expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
      threshold: 0.9,
      pendingFor: "10m",
      summary: "Host memory usage above 90% for 10 minutes.",
    },
    {
      uid: "arc-container-restarts",
      title: "Container restarted",
      expr: 'sum by (container_name) (increase(container_restarts_total[15m]))',
      threshold: 0,
      pendingFor: "0s",
      summary: "A container restarted within the last 15 minutes.",
    },
    {
      uid: "arc-app-silent",
      title: "App stopped reporting metrics",
      expr: "absent(arc_commands_total)",
      threshold: 0,
      pendingFor: "10m",
      summary: "No arc_commands_total series for 10 minutes — app down or telemetry broken.",
    },
    {
      uid: "arc-collector-export-failures",
      title: "Telemetry export failures",
      // PromQL regex matchers are fully anchored. A collector whose Prometheus
      // reader is configured explicitly (without `without_type_suffix`) exposes
      // these counters with a `_total` suffix, so both spellings are accepted;
      // otherwise the selector matches nothing and the rule can never fire.
      expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)(_total)?"}[15m]))',
      threshold: 0,
      pendingFor: "0s",
      summary: "The otel-collector failed to export telemetry within the last 15 minutes.",
    },
    {
      uid: "arc-target-down",
      title: "Scrape target down",
      expr: "min(up)",
      threshold: 1,
      op: "lt",
      pendingFor: "5m",
      summary: "A Prometheus scrape target has been down for 5 minutes.",
    },
  ];

  // Free-text fields go through JSON.stringify so quotes and braces inside
  // PromQL stay valid as YAML double-quoted scalars.
  const ruleYaml = rules
    .map((rule) => {
      const op = rule.op ?? "gt";
      return `      - uid: ${rule.uid}
        title: ${JSON.stringify(rule.title)}
        condition: C
        for: ${rule.pendingFor}
        noDataState: OK
        execErrState: OK
        annotations:
          summary: ${JSON.stringify(rule.summary)}
        labels:
          source: arc
        data:
          - refId: A
            relativeTimeRange: { from: 600, to: 0 }
            datasourceUid: prometheus
            model:
              expr: ${JSON.stringify(rule.expr)}
              instant: true
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          - refId: B
            relativeTimeRange: { from: 0, to: 0 }
            datasourceUid: __expr__
            model:
              type: reduce
              expression: A
              reducer: last
              refId: B
          - refId: C
            relativeTimeRange: { from: 0, to: 0 }
            datasourceUid: __expr__
            model:
              type: threshold
              expression: B
              refId: C
              conditions:
                - evaluator:
                    type: ${op}
                    params: [${rule.threshold}]`;
    })
    .join("\n");

  // Notification routing is optional: with no webhook URL nothing is emitted.
  const contactSection = webhookUrl
    ? `
contactPoints:
  - orgId: 1
    name: arc-webhook
    receivers:
      - uid: arc-webhook
        type: webhook
        settings:
          url: ${JSON.stringify(webhookUrl)}
          httpMethod: POST

policies:
  - orgId: 1
    receiver: arc-webhook
    group_by: ["grafana_folder", "alertname"]
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 4h
`
    : "";

  return `# Generated by \`arc platform deploy\` — do not edit by hand.
apiVersion: 1

groups:
  - orgId: 1
    name: arc-alerts
    folder: Arc
    interval: 1m
    rules:
${ruleYaml}
${contactSection}`;
}
|
|
576
|
+
|
|
246
577
|
/** Grafana datasource provisioning — Tempo + Loki + Prometheus, all pre-wired. */
|
|
247
578
|
export function generateGrafanaDatasources(): string {
|
|
248
579
|
return `# Generated by \`arc platform deploy\` — do not edit by hand.
|
|
@@ -267,10 +598,17 @@ datasources:
|
|
|
267
598
|
uid: loki
|
|
268
599
|
jsonData:
|
|
269
600
|
derivedFields:
|
|
601
|
+
# Plain-text logs that happen to contain "trace_id=<id>".
|
|
270
602
|
- datasourceUid: tempo
|
|
271
603
|
matcherRegex: "trace_id=(\\\\w+)"
|
|
272
604
|
name: TraceID
|
|
273
605
|
url: $\${__value.raw}
|
|
606
|
+
# OTLP-ingested logs — trace_id arrives as structured metadata.
|
|
607
|
+
- datasourceUid: tempo
|
|
608
|
+
matcherType: label
|
|
609
|
+
matcherRegex: trace_id
|
|
610
|
+
name: TraceID (OTLP)
|
|
611
|
+
url: $\${__value.raw}
|
|
274
612
|
- name: Prometheus
|
|
275
613
|
type: prometheus
|
|
276
614
|
access: proxy
|
|
@@ -322,7 +660,7 @@ export function generateArcOverviewDashboard(): string {
|
|
|
322
660
|
label: "Service",
|
|
323
661
|
type: "query",
|
|
324
662
|
datasource: { type: "prometheus", uid: "prometheus" },
|
|
325
|
-
query: "label_values(
|
|
663
|
+
query: "label_values(traces_span_metrics_calls_total, service_name)",
|
|
326
664
|
refresh: 2,
|
|
327
665
|
includeAll: false,
|
|
328
666
|
multi: false,
|
|
@@ -335,20 +673,20 @@ export function generateArcOverviewDashboard(): string {
|
|
|
335
673
|
panelStat(
|
|
336
674
|
"Request rate (req/s)",
|
|
337
675
|
{ x: 0, y: 0, w: 6, h: 4 },
|
|
338
|
-
'sum(rate(
|
|
676
|
+
'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
|
|
339
677
|
"reqps",
|
|
340
678
|
),
|
|
341
679
|
panelStat(
|
|
342
680
|
"Error rate (%)",
|
|
343
681
|
{ x: 6, y: 0, w: 6, h: 4 },
|
|
344
|
-
'sum(rate(
|
|
682
|
+
'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
|
|
345
683
|
"percent",
|
|
346
684
|
{ red: 1, orange: 0.1 },
|
|
347
685
|
),
|
|
348
686
|
panelStat(
|
|
349
687
|
"P99 latency",
|
|
350
688
|
{ x: 12, y: 0, w: 6, h: 4 },
|
|
351
|
-
'histogram_quantile(0.99, sum(rate(
|
|
689
|
+
'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
352
690
|
"ms",
|
|
353
691
|
{ red: 1000, orange: 300 },
|
|
354
692
|
),
|
|
@@ -363,7 +701,7 @@ export function generateArcOverviewDashboard(): string {
|
|
|
363
701
|
panelTimeseries(
|
|
364
702
|
"Request rate by route",
|
|
365
703
|
{ x: 0, y: 4, w: 12, h: 8 },
|
|
366
|
-
'sum by (span_name) (rate(
|
|
704
|
+
'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
|
|
367
705
|
"{{span_name}}",
|
|
368
706
|
"reqps",
|
|
369
707
|
),
|
|
@@ -372,18 +710,19 @@ export function generateArcOverviewDashboard(): string {
|
|
|
372
710
|
{ x: 12, y: 4, w: 12, h: 8 },
|
|
373
711
|
[
|
|
374
712
|
{
|
|
375
|
-
expr: 'histogram_quantile(0.5, sum(rate(
|
|
713
|
+
expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
376
714
|
legend: "p50",
|
|
377
715
|
},
|
|
378
716
|
{
|
|
379
|
-
expr: 'histogram_quantile(0.95, sum(rate(
|
|
717
|
+
expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
380
718
|
legend: "p95",
|
|
381
719
|
},
|
|
382
720
|
{
|
|
383
|
-
expr: 'histogram_quantile(0.99, sum(rate(
|
|
721
|
+
expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
384
722
|
legend: "p99",
|
|
385
723
|
},
|
|
386
724
|
],
|
|
725
|
+
"",
|
|
387
726
|
"ms",
|
|
388
727
|
),
|
|
389
728
|
|
|
@@ -398,24 +737,24 @@ export function generateArcOverviewDashboard(): string {
|
|
|
398
737
|
panelTimeseries(
|
|
399
738
|
"Command p95 latency",
|
|
400
739
|
{ x: 12, y: 12, w: 12, h: 8 },
|
|
401
|
-
'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(
|
|
740
|
+
'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))',
|
|
402
741
|
"{{arc_command_name}}",
|
|
403
742
|
"ms",
|
|
404
743
|
),
|
|
405
744
|
|
|
406
745
|
// Row: database
|
|
407
746
|
panelTimeseries(
|
|
408
|
-
"DB
|
|
747
|
+
"DB ops/sec by collection",
|
|
409
748
|
{ x: 0, y: 20, w: 12, h: 8 },
|
|
410
|
-
'sum by (db_collection_name) (rate(
|
|
411
|
-
"{{db_collection_name}}",
|
|
749
|
+
'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))',
|
|
750
|
+
"{{db_collection_name}} {{db_operation_name}}",
|
|
412
751
|
"ops",
|
|
413
752
|
),
|
|
414
753
|
panelTimeseries(
|
|
415
|
-
"DB
|
|
754
|
+
"DB p95 latency by operation",
|
|
416
755
|
{ x: 12, y: 20, w: 12, h: 8 },
|
|
417
|
-
'histogram_quantile(0.95, sum by (
|
|
418
|
-
"{{
|
|
756
|
+
'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))',
|
|
757
|
+
"{{db_operation_name}}",
|
|
419
758
|
"ms",
|
|
420
759
|
),
|
|
421
760
|
|
|
@@ -427,7 +766,9 @@ export function generateArcOverviewDashboard(): string {
|
|
|
427
766
|
datasource: { type: "loki", uid: "loki" },
|
|
428
767
|
targets: [
|
|
429
768
|
{
|
|
430
|
-
|
|
769
|
+
// severity_text is OTLP structured metadata — filter via the
|
|
770
|
+
// pipeline stage, NOT the stream selector (not an index label).
|
|
771
|
+
expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
|
|
431
772
|
refId: "A",
|
|
432
773
|
},
|
|
433
774
|
],
|
|
@@ -464,7 +805,7 @@ export function generateArcTracesDashboard(): string {
|
|
|
464
805
|
label: "Service",
|
|
465
806
|
type: "query",
|
|
466
807
|
datasource: { type: "prometheus", uid: "prometheus" },
|
|
467
|
-
query: "label_values(
|
|
808
|
+
query: "label_values(traces_span_metrics_calls_total, service_name)",
|
|
468
809
|
refresh: 2,
|
|
469
810
|
current: { text: "arc-prod", value: "arc-prod" },
|
|
470
811
|
},
|
|
@@ -587,23 +928,38 @@ export function generateArcLogsDashboard(): string {
|
|
|
587
928
|
query: "",
|
|
588
929
|
current: { text: "", value: "" },
|
|
589
930
|
},
|
|
931
|
+
{
|
|
932
|
+
name: "container",
|
|
933
|
+
label: "Container",
|
|
934
|
+
type: "query",
|
|
935
|
+
datasource: { type: "loki", uid: "loki" },
|
|
936
|
+
query: "label_values(container)",
|
|
937
|
+
refresh: 2,
|
|
938
|
+
includeAll: true,
|
|
939
|
+
multi: false,
|
|
940
|
+
current: { text: "All", value: "$__all" },
|
|
941
|
+
},
|
|
590
942
|
],
|
|
591
943
|
},
|
|
592
944
|
panels: [
|
|
593
|
-
// Stat: total log lines, last hour
|
|
945
|
+
// Stat: total log lines, last hour. severity_text is OTLP structured
|
|
946
|
+
// metadata — usable only in pipeline stages, never in stream selectors.
|
|
594
947
|
panelStat(
|
|
595
948
|
"Logs ingested (1h)",
|
|
596
949
|
{ x: 0, y: 0, w: 6, h: 4 },
|
|
597
|
-
'sum(
|
|
950
|
+
'sum(count_over_time({service_name="$service"}[1h]))',
|
|
598
951
|
"short",
|
|
952
|
+
undefined,
|
|
953
|
+
LOKI_DS,
|
|
599
954
|
),
|
|
600
955
|
// Stat: errors (last hour)
|
|
601
956
|
panelStat(
|
|
602
957
|
"Errors (1h)",
|
|
603
958
|
{ x: 6, y: 0, w: 6, h: 4 },
|
|
604
|
-
'sum(
|
|
959
|
+
'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))',
|
|
605
960
|
"short",
|
|
606
961
|
{ orange: 1, red: 50 },
|
|
962
|
+
LOKI_DS,
|
|
607
963
|
),
|
|
608
964
|
// Time series: log volume by severity
|
|
609
965
|
{
|
|
@@ -657,6 +1013,30 @@ export function generateArcLogsDashboard(): string {
|
|
|
657
1013
|
sortOrder: "Descending",
|
|
658
1014
|
},
|
|
659
1015
|
},
|
|
1016
|
+
// Container stdout/stderr (collected by Alloy from the Docker API) —
|
|
1017
|
+
// caddy access logs, postgres, the observability stack itself, plus
|
|
1018
|
+
// app crash output that never reached the OTLP pipeline.
|
|
1019
|
+
{
|
|
1020
|
+
title: "Container logs ($container)",
|
|
1021
|
+
type: "logs",
|
|
1022
|
+
gridPos: { x: 0, y: 26, w: 24, h: 14 },
|
|
1023
|
+
datasource: { type: "loki", uid: "loki" },
|
|
1024
|
+
targets: [
|
|
1025
|
+
{
|
|
1026
|
+
expr: '{container=~"$container"} |~ "$search"',
|
|
1027
|
+
refId: "A",
|
|
1028
|
+
},
|
|
1029
|
+
],
|
|
1030
|
+
options: {
|
|
1031
|
+
showTime: true,
|
|
1032
|
+
showLabels: true,
|
|
1033
|
+
showCommonLabels: false,
|
|
1034
|
+
wrapLogMessage: true,
|
|
1035
|
+
enableLogDetails: true,
|
|
1036
|
+
dedupStrategy: "none",
|
|
1037
|
+
sortOrder: "Descending",
|
|
1038
|
+
},
|
|
1039
|
+
},
|
|
660
1040
|
],
|
|
661
1041
|
};
|
|
662
1042
|
return JSON.stringify(dashboard, null, 2);
|
|
@@ -722,6 +1102,7 @@ export function generateArcSamplingDashboard(): string {
|
|
|
722
1102
|
legend: "exported",
|
|
723
1103
|
},
|
|
724
1104
|
],
|
|
1105
|
+
"",
|
|
725
1106
|
"ops",
|
|
726
1107
|
),
|
|
727
1108
|
|
|
@@ -790,20 +1171,20 @@ export function generateArcCommandDashboard(): string {
|
|
|
790
1171
|
panelStat(
|
|
791
1172
|
"P50 latency",
|
|
792
1173
|
{ x: 6, y: 0, w: 6, h: 4 },
|
|
793
|
-
'histogram_quantile(0.5, sum by (le) (rate(
|
|
1174
|
+
'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
794
1175
|
"ms",
|
|
795
1176
|
),
|
|
796
1177
|
panelStat(
|
|
797
1178
|
"P95 latency",
|
|
798
1179
|
{ x: 12, y: 0, w: 6, h: 4 },
|
|
799
|
-
'histogram_quantile(0.95, sum by (le) (rate(
|
|
1180
|
+
'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
800
1181
|
"ms",
|
|
801
1182
|
{ orange: 200, red: 1000 },
|
|
802
1183
|
),
|
|
803
1184
|
panelStat(
|
|
804
1185
|
"P99 latency",
|
|
805
1186
|
{ x: 18, y: 0, w: 6, h: 4 },
|
|
806
|
-
'histogram_quantile(0.99, sum by (le) (rate(
|
|
1187
|
+
'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
807
1188
|
"ms",
|
|
808
1189
|
{ orange: 500, red: 2000 },
|
|
809
1190
|
),
|
|
@@ -820,18 +1201,19 @@ export function generateArcCommandDashboard(): string {
|
|
|
820
1201
|
{ x: 12, y: 4, w: 12, h: 8 },
|
|
821
1202
|
[
|
|
822
1203
|
{
|
|
823
|
-
expr: 'histogram_quantile(0.5, sum by (le) (rate(
|
|
1204
|
+
expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
824
1205
|
legend: "p50",
|
|
825
1206
|
},
|
|
826
1207
|
{
|
|
827
|
-
expr: 'histogram_quantile(0.95, sum by (le) (rate(
|
|
1208
|
+
expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
828
1209
|
legend: "p95",
|
|
829
1210
|
},
|
|
830
1211
|
{
|
|
831
|
-
expr: 'histogram_quantile(0.99, sum by (le) (rate(
|
|
1212
|
+
expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
832
1213
|
legend: "p99",
|
|
833
1214
|
},
|
|
834
1215
|
],
|
|
1216
|
+
"",
|
|
835
1217
|
"ms",
|
|
836
1218
|
),
|
|
837
1219
|
|
|
@@ -855,6 +1237,240 @@ export function generateArcCommandDashboard(): string {
|
|
|
855
1237
|
return JSON.stringify(dashboard, null, 2);
|
|
856
1238
|
}
|
|
857
1239
|
|
|
1240
|
+
/**
 * Builds the "Arc Infrastructure" Grafana dashboard (uid `arc-infra`).
 *
 * Two groups of panels are produced:
 *  - host CPU / RAM / disk / network, fed by the hostmetrics receiver
 *    (service_name="arc-host");
 *  - per-container resource usage and restarts, fed by the docker_stats
 *    receiver (service_name="arc-docker").
 *
 * Intended to answer "is the box healthy / which container is eating the
 * host".
 *
 * @returns the dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcInfraDashboard(): string {
  // Places a panel on the dashboard grid: x/y offset, then width and height.
  const cell = (x: number, y: number, w: number, h: number) => ({ x, y, w, h });

  // Shared by the top-line stat and the CPU time series so both stay in sync.
  const cpuUsedExpr =
    '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))';
  const containerLegend = "{{container_name}}";
  const deviceDirectionLegend = "{{device}} {{direction}}";

  // One series per load-average window exposed by the host metrics.
  const loadSeries = ["1m", "5m", "15m"].map((window) => ({
    expr: `system_cpu_load_average_${window}`,
    legend: `load ${window}`,
  }));

  // Row: host top-line stats
  const hostStats = [
    panelStat("Host CPU used", cell(0, 0, 6, 4), cpuUsedExpr, "percent", {
      orange: 70,
      red: 90,
    }),
    panelStat(
      "Host memory used",
      cell(6, 0, 6, 4),
      '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
      "percent",
      { orange: 80, red: 90 },
    ),
    panelStat(
      "Disk used (worst mount)",
      cell(12, 0, 6, 4),
      '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
      "percent",
      { orange: 75, red: 85 },
    ),
    panelStat("Load (1m)", cell(18, 0, 6, 4), "system_cpu_load_average_1m", "short"),
  ];

  // Row: host CPU + memory over time
  const hostTrends = [
    panelTimeseries(
      "Host CPU utilization",
      cell(0, 4, 12, 8),
      [{ expr: cpuUsedExpr, legend: "used %" }, ...loadSeries],
      "",
      "short",
    ),
    panelTimeseries(
      "Host memory by state",
      cell(12, 4, 12, 8),
      "sum by (state) (system_memory_usage_bytes)",
      "{{state}}",
      "bytes",
    ),
  ];

  // Row: disk
  const diskPanels = [
    panelTimeseries(
      "Filesystem usage by mount",
      cell(0, 12, 12, 8),
      '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)',
      "{{mountpoint}}",
      "percent",
    ),
    panelTimeseries(
      "Disk I/O",
      cell(12, 12, 12, 8),
      "sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))",
      deviceDirectionLegend,
      "Bps",
    ),
  ];

  // Row: network + container restarts
  const networkAndRestarts = [
    panelTimeseries(
      "Network I/O",
      cell(0, 20, 12, 8),
      'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))',
      deviceDirectionLegend,
      "Bps",
    ),
    panelTimeseries(
      "Container restarts (24h)",
      cell(12, 20, 12, 8),
      "sum by (container_name) (increase(container_restarts_total[24h]))",
      containerLegend,
      "short",
    ),
  ];

  // Row: per-container resources
  const containerPanels = [
    panelTimeseries(
      "Container CPU",
      cell(0, 28, 12, 8),
      "container_cpu_utilization_ratio",
      containerLegend,
      "percent",
    ),
    panelTimeseries(
      "Container memory",
      cell(12, 28, 12, 8),
      "container_memory_usage_total_bytes",
      containerLegend,
      "bytes",
    ),
    panelTimeseries(
      "Container network RX",
      cell(0, 36, 12, 8),
      "sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))",
      containerLegend,
      "Bps",
    ),
    panelTimeseries(
      "Container network TX",
      cell(12, 36, 12, 8),
      "sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))",
      containerLegend,
      "Bps",
    ),
  ];

  const dashboard = {
    title: "Arc Infrastructure",
    uid: "arc-infra",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-3h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    panels: [
      ...hostStats,
      ...hostTrends,
      ...diskPanels,
      ...networkAndRestarts,
      ...containerPanels,
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
|
|
1372
|
+
|
|
1373
|
+
/**
 * Builds the "Arc Caddy / Edge" Grafana dashboard (uid `arc-edge`).
 *
 * Combines Caddy reverse-proxy metrics (per-host request rate, status codes,
 * latency, in-flight requests) with the JSON access logs that Alloy ships to
 * Loki. First stop for "is traffic reaching us / who is 404ing".
 *
 * @returns the dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcEdgeDashboard(): string {
  // Places a panel on the dashboard grid: x/y offset, then width and height.
  const cell = (x: number, y: number, w: number, h: number) => ({ x, y, w, h });

  // Log stream selector shared by the error-volume panel and the live log.
  const caddyLogs = '{compose_service="caddy"}';

  // Request rate over 1m, grouped by a single label.
  const requestRateBy = (label: string) =>
    `sum by (${label}) (rate(caddy_http_request_duration_seconds_count[1m]))`;

  // 95th-percentile request duration over 5m; `groupBy` must include `le`.
  const p95By = (groupBy: string) =>
    `histogram_quantile(0.95, sum by (${groupBy}) (rate(caddy_http_request_duration_seconds_bucket[5m])))`;

  // Row: top-line stats
  const topLine = [
    panelStat(
      "Requests/s",
      cell(0, 0, 6, 4),
      "sum(rate(caddy_http_request_duration_seconds_count[1m]))",
      "reqps",
    ),
    panelStat(
      "In-flight requests",
      cell(6, 0, 6, 4),
      "sum(caddy_http_requests_in_flight)",
      "short",
    ),
    panelStat(
      "Handler errors/s",
      cell(12, 0, 6, 4),
      "sum(rate(caddy_http_request_errors_total[5m]))",
      "ops",
      { orange: 0.1, red: 1 },
    ),
    panelStat("P95 latency", cell(18, 0, 6, 4), p95By("le"), "s", {
      orange: 0.3,
      red: 1,
    }),
  ];

  // Row: traffic breakdown
  const traffic = [
    panelTimeseries(
      "Request rate by host",
      cell(0, 4, 12, 8),
      requestRateBy("host"),
      "{{host}}",
      "reqps",
    ),
    panelTimeseries(
      "Responses by status code",
      cell(12, 4, 12, 8),
      requestRateBy("code"),
      "{{code}}",
      "reqps",
    ),
  ];

  // Row: latency + error log volume (the latter is a Loki query)
  const latencyAndErrors = [
    panelTimeseries(
      "P95 latency by host",
      cell(0, 12, 12, 8),
      p95By("host, le"),
      "{{host}}",
      "s",
    ),
    panelTimeseries(
      "4xx/5xx responses (access log)",
      cell(12, 12, 12, 8),
      `sum by (status) (count_over_time(${caddyLogs} | json | status >= 400 [$__interval]))`,
      "{{status}}",
      "short",
      LOKI_DS,
    ),
  ];

  // Row: live access log, newest entries first
  const liveAccessLog = {
    title: "Access log (live)",
    type: "logs",
    gridPos: cell(0, 20, 24, 12),
    datasource: LOKI_DS,
    targets: [{ expr: caddyLogs, refId: "A" }],
    options: {
      showTime: true,
      showLabels: false,
      showCommonLabels: false,
      wrapLogMessage: true,
      enableLogDetails: true,
      dedupStrategy: "none",
      sortOrder: "Descending",
    },
  };

  const dashboard = {
    title: "Arc Caddy / Edge",
    uid: "arc-edge",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-1h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    panels: [...topLine, ...traffic, ...latencyAndErrors, liveAccessLog],
  };
  return JSON.stringify(dashboard, null, 2);
}
|
|
1473
|
+
|
|
858
1474
|
/** All config files needed on the host. Returns map of relative-path → contents
|
|
859
1475
|
* so bootstrap can write+upload them in one pass. */
|
|
860
1476
|
export function generateObservabilityConfigs(
|
|
@@ -865,14 +1481,18 @@ export function generateObservabilityConfigs(
|
|
|
865
1481
|
"observability/tempo.yaml": generateTempoConfig(cfg),
|
|
866
1482
|
"observability/loki-config.yaml": generateLokiConfig(cfg),
|
|
867
1483
|
"observability/prometheus.yml": generatePrometheusConfig(cfg),
|
|
1484
|
+
"observability/alloy-config.alloy": generateAlloyConfig(),
|
|
868
1485
|
"observability/grafana-datasources.yaml": generateGrafanaDatasources(),
|
|
869
1486
|
"observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
|
|
1487
|
+
"observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
|
|
870
1488
|
"observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
|
|
871
1489
|
"observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
|
|
872
1490
|
"observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
|
|
873
1491
|
"observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
|
|
874
1492
|
"observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
|
|
875
1493
|
"observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
|
|
1494
|
+
"observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
|
|
1495
|
+
"observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard(),
|
|
876
1496
|
};
|
|
877
1497
|
}
|
|
878
1498
|
|
|
@@ -882,6 +1502,10 @@ export function generateObservabilityConfigs(
|
|
|
882
1502
|
|
|
883
1503
|
/** Panel placement on the dashboard grid: x/y offset plus width and height. */
interface GridPos {
  x: number;
  y: number;
  w: number;
  h: number;
}
|
|
884
1504
|
/**
 * Optional numeric cut-offs for a panel's colour steps.
 * NOTE(review): presumably the values at which a stat turns orange / red —
 * confirm against how `panelStat` builds its threshold steps.
 */
interface Threshold {
  orange?: number;
  red?: number;
}
|
|
1505
|
+
/** Datasource reference attached to a panel: a type string plus a uid. */
interface PanelDatasource {
  type: string;
  uid: string;
}
|
|
1506
|
+
|
|
1507
|
+
/** Prometheus datasource reference; the default for the panel helpers. */
const PROMETHEUS_DS: PanelDatasource = {
  type: "prometheus",
  uid: "prometheus",
};
|
|
1508
|
+
/** Loki datasource reference, passed explicitly to panels that query logs. */
const LOKI_DS: PanelDatasource = {
  type: "loki",
  uid: "loki",
};
|
|
885
1509
|
|
|
886
1510
|
function panelStat(
|
|
887
1511
|
title: string,
|
|
@@ -889,6 +1513,7 @@ function panelStat(
|
|
|
889
1513
|
expr: string,
|
|
890
1514
|
unit: string,
|
|
891
1515
|
thresholds?: Threshold,
|
|
1516
|
+
datasource: PanelDatasource = PROMETHEUS_DS,
|
|
892
1517
|
) {
|
|
893
1518
|
const steps: { color: string; value: number | null }[] = [
|
|
894
1519
|
{ color: "green", value: null },
|
|
@@ -903,7 +1528,7 @@ function panelStat(
|
|
|
903
1528
|
title,
|
|
904
1529
|
type: "stat",
|
|
905
1530
|
gridPos,
|
|
906
|
-
datasource
|
|
1531
|
+
datasource,
|
|
907
1532
|
targets: [{ expr, refId: "A", legendFormat: title }],
|
|
908
1533
|
fieldConfig: {
|
|
909
1534
|
defaults: {
|
|
@@ -928,6 +1553,7 @@ function panelTimeseries(
|
|
|
928
1553
|
query: string | { expr: string; legend: string }[],
|
|
929
1554
|
legend: string,
|
|
930
1555
|
unit: string,
|
|
1556
|
+
datasource: PanelDatasource = PROMETHEUS_DS,
|
|
931
1557
|
) {
|
|
932
1558
|
const targets = Array.isArray(query)
|
|
933
1559
|
? query.map((q, i) => ({
|
|
@@ -940,7 +1566,7 @@ function panelTimeseries(
|
|
|
940
1566
|
title,
|
|
941
1567
|
type: "timeseries",
|
|
942
1568
|
gridPos,
|
|
943
|
-
datasource
|
|
1569
|
+
datasource,
|
|
944
1570
|
targets,
|
|
945
1571
|
fieldConfig: {
|
|
946
1572
|
defaults: {
|