@arcote.tech/arc-cli 0.7.19 → 0.7.21

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -35,8 +35,9 @@ function pickRetention(o: DeployObservability | undefined) {
35
35
  }
36
36
 
37
37
  /** OpenTelemetry Collector — receives OTLP from app containers + browser,
38
- * applies tail sampling, fans out to Tempo (traces), Loki (logs),
39
- * Prometheus remote-write (metrics). */
38
+ * derives span-metrics + service-graph BEFORE tail sampling (no sampling
39
+ * bias), scrapes host + per-container resource usage, fans out to Tempo
40
+ * (traces), Loki (logs), Prometheus remote-write (metrics). */
40
41
  export function generateOtelCollectorConfig(cfg: DeployConfig): string {
41
42
  const envNames = Object.keys(cfg.envs);
42
43
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
@@ -55,6 +56,69 @@ ${envNames.map((name) => ` - "https://${cfg.envs[name]!.domain}"`).jo
55
56
  - tracestate
56
57
  - content-type
57
58
 
59
+ # Host-level CPU / memory / load / disk / filesystem / network metrics.
60
+ # The host root is bind-mounted read-only at /hostfs (see compose).
61
+ hostmetrics:
62
+ collection_interval: 30s
63
+ root_path: /hostfs
64
+ scrapers:
65
+ cpu:
66
+ metrics:
67
+ system.cpu.utilization:
68
+ enabled: true
69
+ memory:
70
+ metrics:
71
+ system.memory.utilization:
72
+ enabled: true
73
+ load: {}
74
+ disk: {}
75
+ filesystem:
76
+ metrics:
77
+ system.filesystem.utilization:
78
+ enabled: true
79
+ exclude_fs_types:
80
+ fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
81
+ match_type: strict
82
+ exclude_mount_points:
83
+ mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
84
+ match_type: regexp
85
+ network: {}
86
+ paging: {}
87
+
88
+ # Per-container CPU / memory / network / block-IO + restarts straight from
89
+ # the Docker daemon (socket bind-mounted read-only, see compose).
90
+ # api_version pinned: the receiver defaults to Docker API 1.25, which modern
91
+ # daemons (Engine 25+ require >= 1.40) reject — without this the receiver
92
+ # fails to start and takes the whole collector down. Quoted so YAML doesn't
93
+ # parse 1.40 → 1.4. Must be <= the daemon's max; 1.40 is the safe floor.
94
+ docker_stats:
95
+ endpoint: unix:///var/run/docker.sock
96
+ api_version: "1.40"
97
+ collection_interval: 30s
98
+ metrics:
99
+ container.restarts:
100
+ enabled: true
101
+ container.uptime:
102
+ enabled: true
103
+
104
+ connectors:
105
+ # Span→metrics computed from 100% of spans (pipeline runs BEFORE tail
106
+ # sampling) — lowering the sampling policy later never skews dashboards.
107
+ spanmetrics:
108
+ histogram:
109
+ unit: ms
110
+ explicit:
111
+ buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
112
+ metrics_flush_interval: 15s
113
+ # Emits traces_service_graph_* (same metric names Tempo's generator would).
114
+ servicegraph:
115
+ metrics_flush_interval: 15s
116
+ store:
117
+ ttl: 5s
118
+ max_items: 5000
119
+ # Joins the raw-trace pipeline to the sampled-storage pipeline.
120
+ forward: {}
121
+
58
122
  processors:
59
123
  batch:
60
124
  timeout: 5s
@@ -65,7 +129,8 @@ processors:
65
129
  # Errors + slow traces zachowywane w 100%, normalne traces również 100%
66
130
  # przy obecnej skali (boostrap produkcji). Tail sampling matchuje OR po
67
131
  # policies — bez "always" policy WSZYSTKIE OK traces byłyby droppowane.
68
- # Obniż 'random_100pct' do np. 10% gdy ruch eksploduje.
132
+ # Obniż 'random_100pct' do np. 10% gdy ruch eksploduje — span-metrics są
133
+ # liczone przed samplingiem, więc dashboardy pozostaną dokładne.
69
134
  tail_sampling:
70
135
  decision_wait: 10s
71
136
  num_traces: 50000
@@ -90,6 +155,34 @@ processors:
90
155
  - key: http.request.header.cookie
91
156
  action: delete
92
157
 
158
+ # Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
159
+ # so raw span names (one per bot-scanned URL) would explode Prometheus
160
+ # series. Static assets collapse to "<METHOD> static", /route/* to
161
+ # "<METHOD> /route", anything else outside the known API surface to
162
+ # "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
163
+ # literal "$" (collector env expansion), RE2 has no lookahead → IsMatch+not.
164
+ transform/span_names:
165
+ error_mode: ignore
166
+ trace_statements:
167
+ - context: span
168
+ statements:
169
+ - set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
170
+ - replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
171
+ - set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
172
+
173
+ # Stable service.name for infra metric streams (becomes the service_name
174
+ # label after resource_to_telemetry_conversion).
175
+ resource/host:
176
+ attributes:
177
+ - key: service.name
178
+ value: arc-host
179
+ action: upsert
180
+ resource/docker:
181
+ attributes:
182
+ - key: service.name
183
+ value: arc-docker
184
+ action: upsert
185
+
93
186
  exporters:
94
187
  otlp/tempo:
95
188
  endpoint: tempo:4317
@@ -105,6 +198,10 @@ exporters:
105
198
  endpoint: http://prometheus:9090/api/v1/write
106
199
  tls:
107
200
  insecure: true
201
+ # Copy resource attributes (service.name, deployment.environment, …)
202
+ # onto every series — dashboards filter by service_name.
203
+ resource_to_telemetry_conversion:
204
+ enabled: true
108
205
 
109
206
  extensions:
110
207
  health_check: {}
@@ -112,19 +209,42 @@ extensions:
112
209
 
113
210
  service:
114
211
  extensions: [health_check, zpages]
212
+ # Collector self-metrics. Since 0.111 the default bind is localhost only —
213
+ # Prometheus scrapes otel-collector:8888, so listen on all interfaces.
214
+ telemetry:
215
+ metrics:
216
+ level: detailed
217
+ readers:
218
+ - pull:
219
+ exporter:
220
+ prometheus:
221
+ host: "0.0.0.0"
222
+ port: 8888
115
223
  pipelines:
116
- traces:
224
+ traces/in:
117
225
  receivers: [otlp]
118
- processors: [tail_sampling, attributes, batch]
226
+ processors: [attributes, transform/span_names]
227
+ exporters: [spanmetrics, servicegraph, forward]
228
+ traces/sampled:
229
+ receivers: [forward]
230
+ processors: [tail_sampling, batch]
119
231
  exporters: [otlp/tempo]
120
232
  logs:
121
233
  receivers: [otlp]
122
234
  processors: [attributes, batch]
123
235
  exporters: [otlphttp/loki]
124
236
  metrics:
125
- receivers: [otlp]
237
+ receivers: [otlp, spanmetrics, servicegraph]
126
238
  processors: [batch]
127
239
  exporters: [prometheusremotewrite]
240
+ metrics/host:
241
+ receivers: [hostmetrics]
242
+ processors: [resource/host, batch]
243
+ exporters: [prometheusremotewrite]
244
+ metrics/docker:
245
+ receivers: [docker_stats]
246
+ processors: [resource/docker, batch]
247
+ exporters: [prometheusremotewrite]
128
248
  `;
129
249
  }
130
250
 
@@ -162,20 +282,9 @@ storage:
162
282
  wal:
163
283
  path: /var/tempo/wal
164
284
 
165
- metrics_generator:
166
- registry:
167
- external_labels:
168
- source: tempo
169
- storage:
170
- path: /var/tempo/generator/wal
171
- remote_write:
172
- - url: http://prometheus:9090/api/v1/write
173
- send_exemplars: true
174
-
175
- overrides:
176
- defaults:
177
- metrics_generator:
178
- processors: [service-graphs, span-metrics]
285
+ # NOTE: no metrics_generator — span-metrics + service-graph are produced by
286
+ # the otel-collector connectors BEFORE tail sampling (accurate rates even
287
+ # when sampling is later tightened) and remote-written to Prometheus there.
179
288
  `;
180
289
  }
181
290
 
@@ -237,12 +346,248 @@ scrape_configs:
237
346
  - job_name: otel-collector
238
347
  static_configs:
239
348
  - targets: [otel-collector:8888]
349
+ - job_name: caddy
350
+ static_configs:
351
+ - targets: [caddy:2020]
352
+ - job_name: loki
353
+ static_configs:
354
+ - targets: [loki:3100]
355
+ - job_name: tempo
356
+ static_configs:
357
+ - targets: [tempo:3200]
358
+ - job_name: grafana
359
+ static_configs:
360
+ - targets: [grafana:3000]
361
+ - job_name: alloy
362
+ static_configs:
363
+ - targets: [alloy:12345]
240
364
 
241
365
  # remote-write inbound is enabled via the --web.enable-remote-write-receiver
242
366
  # command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
243
367
  `;
244
368
  }
245
369
 
370
+ /** Grafana Alloy — tails stdout/stderr of every container on the host via
371
+ * the Docker API and ships it to Loki. Complements the in-app console→OTLP
372
+ * bridge: infra containers (caddy, postgres, tempo, …) and app crash output
373
+ * (OOM, Bun panic — anything printed before/outside the OTel SDK) all land
374
+ * in Loki under the `container` / `compose_service` labels. */
375
+ export function generateAlloyConfig(): string {
376
+ return `// Generated by \`arc platform deploy\` — do not edit by hand.
377
+ discovery.docker "containers" {
378
+ host = "unix:///var/run/docker.sock"
379
+ refresh_interval = "15s"
380
+
381
+ // Only containers managed by a compose project (our stack). Ad-hoc / rogue
382
+ // containers (manual debug runs, other stacks) are excluded — one bad
383
+ // stream (e.g. log entries older than Loki's reject window) otherwise 400s
384
+ // the whole loki.write batch and drops good app logs with it.
385
+ filter {
386
+ name = "label"
387
+ values = ["com.docker.compose.project"]
388
+ }
389
+ }
390
+
391
+ discovery.relabel "containers" {
392
+ targets = discovery.docker.containers.targets
393
+
394
+ rule {
395
+ source_labels = ["__meta_docker_container_name"]
396
+ regex = "/(.*)"
397
+ target_label = "container"
398
+ }
399
+ rule {
400
+ source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
401
+ target_label = "compose_service"
402
+ }
403
+ }
404
+
405
+ loki.source.docker "containers" {
406
+ host = "unix:///var/run/docker.sock"
407
+ targets = discovery.docker.containers.targets
408
+ relabel_rules = discovery.relabel.containers.rules
409
+ labels = { source = "docker" }
410
+ forward_to = [loki.write.loki.receiver]
411
+ }
412
+
413
+ loki.write "loki" {
414
+ endpoint {
415
+ url = "http://loki:3100/loki/api/v1/push"
416
+ }
417
+ }
418
+ `;
419
+ }
420
+
421
+ /** Grafana alerting provisioning — a starter rule pack covering the failure
422
+ * modes that matter on a single-host deploy: error rate, latency, disk,
423
+ * memory, container restarts, app silence, telemetry export failures and
424
+ * scrape-target health. Notification routing (webhook) is only emitted when
425
+ * `observability.alertWebhookUrl` is configured — without it the rules are
426
+ * still visible/firing in the Grafana UI. */
427
+ export function generateGrafanaAlerting(cfg: DeployConfig): string {
428
+ const webhookUrl = cfg.observability?.alertWebhookUrl;
429
+
430
+ interface AlertRule {
431
+ uid: string;
432
+ title: string;
433
+ expr: string;
434
+ /** Threshold for the C (threshold) expression node. */
435
+ threshold: number;
436
+ /** Comparison operator. Default "gt". */
437
+ op?: "gt" | "lt";
438
+ /** Pending period, e.g. "5m". "0s" fires immediately. */
439
+ pendingFor: string;
440
+ summary: string;
441
+ }
442
+
443
+ const rules: AlertRule[] = [
444
+ {
445
+ uid: "arc-high-error-rate",
446
+ title: "High server error rate (>5%)",
447
+ expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
448
+ threshold: 0.05,
449
+ pendingFor: "5m",
450
+ summary: "More than 5% of server spans are errors over the last 5 minutes.",
451
+ },
452
+ {
453
+ uid: "arc-high-latency-p95",
454
+ title: "High p95 latency (>1s)",
455
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
456
+ threshold: 1000,
457
+ pendingFor: "10m",
458
+ summary: "Server p95 latency above 1s for 10 minutes.",
459
+ },
460
+ {
461
+ uid: "arc-host-disk-high",
462
+ title: "Host disk usage >85%",
463
+ expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
464
+ threshold: 0.85,
465
+ pendingFor: "15m",
466
+ summary: "A host filesystem is more than 85% full.",
467
+ },
468
+ {
469
+ uid: "arc-host-memory-high",
470
+ title: "Host memory usage >90%",
471
+ expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
472
+ threshold: 0.9,
473
+ pendingFor: "10m",
474
+ summary: "Host memory usage above 90% for 10 minutes.",
475
+ },
476
+ {
477
+ uid: "arc-container-restarts",
478
+ title: "Container restarted",
479
+ expr: 'sum by (container_name) (increase(container_restarts_total[15m]))',
480
+ threshold: 0,
481
+ pendingFor: "0s",
482
+ summary: "A container restarted within the last 15 minutes.",
483
+ },
484
+ {
485
+ uid: "arc-app-silent",
486
+ title: "App stopped reporting metrics",
487
+ expr: "absent(arc_commands_total)",
488
+ threshold: 0,
489
+ pendingFor: "10m",
490
+ summary: "No arc_commands_total series for 10 minutes — app down or telemetry broken.",
491
+ },
492
+ {
493
+ uid: "arc-collector-export-failures",
494
+ title: "Telemetry export failures",
495
+ expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)"}[15m]))',
496
+ threshold: 0,
497
+ pendingFor: "0s",
498
+ summary: "The otel-collector failed to export telemetry within the last 15 minutes.",
499
+ },
500
+ {
501
+ uid: "arc-target-down",
502
+ title: "Scrape target down",
503
+ expr: "min(up)",
504
+ threshold: 1,
505
+ op: "lt",
506
+ pendingFor: "5m",
507
+ summary: "A Prometheus scrape target has been down for 5 minutes.",
508
+ },
509
+ ];
510
+
511
+ const ruleYaml = rules
512
+ .map((rule) => {
513
+ const op = rule.op ?? "gt";
514
+ return ` - uid: ${rule.uid}
515
+ title: ${JSON.stringify(rule.title)}
516
+ condition: C
517
+ for: ${rule.pendingFor}
518
+ noDataState: OK
519
+ execErrState: OK
520
+ annotations:
521
+ summary: ${JSON.stringify(rule.summary)}
522
+ labels:
523
+ source: arc
524
+ data:
525
+ - refId: A
526
+ relativeTimeRange: { from: 600, to: 0 }
527
+ datasourceUid: prometheus
528
+ model:
529
+ expr: ${JSON.stringify(rule.expr)}
530
+ instant: true
531
+ intervalMs: 1000
532
+ maxDataPoints: 43200
533
+ refId: A
534
+ - refId: B
535
+ relativeTimeRange: { from: 0, to: 0 }
536
+ datasourceUid: __expr__
537
+ model:
538
+ type: reduce
539
+ expression: A
540
+ reducer: last
541
+ refId: B
542
+ - refId: C
543
+ relativeTimeRange: { from: 0, to: 0 }
544
+ datasourceUid: __expr__
545
+ model:
546
+ type: threshold
547
+ expression: B
548
+ refId: C
549
+ conditions:
550
+ - evaluator:
551
+ type: ${op}
552
+ params: [${rule.threshold}]`;
553
+ })
554
+ .join("\n");
555
+
556
+ const contactSection = webhookUrl
557
+ ? `
558
+ contactPoints:
559
+ - orgId: 1
560
+ name: arc-webhook
561
+ receivers:
562
+ - uid: arc-webhook
563
+ type: webhook
564
+ settings:
565
+ url: ${JSON.stringify(webhookUrl)}
566
+ httpMethod: POST
567
+
568
+ policies:
569
+ - orgId: 1
570
+ receiver: arc-webhook
571
+ group_by: ["grafana_folder", "alertname"]
572
+ group_wait: 30s
573
+ group_interval: 5m
574
+ repeat_interval: 4h
575
+ `
576
+ : "";
577
+
578
+ return `# Generated by \`arc platform deploy\` — do not edit by hand.
579
+ apiVersion: 1
580
+
581
+ groups:
582
+ - orgId: 1
583
+ name: arc-alerts
584
+ folder: Arc
585
+ interval: 1m
586
+ rules:
587
+ ${ruleYaml}
588
+ ${contactSection}`;
589
+ }
590
+
246
591
  /** Grafana datasource provisioning — Tempo + Loki + Prometheus, all pre-wired. */
247
592
  export function generateGrafanaDatasources(): string {
248
593
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
@@ -267,10 +612,17 @@ datasources:
267
612
  uid: loki
268
613
  jsonData:
269
614
  derivedFields:
615
+ # Plain-text logs that happen to contain "trace_id=<id>".
270
616
  - datasourceUid: tempo
271
617
  matcherRegex: "trace_id=(\\\\w+)"
272
618
  name: TraceID
273
619
  url: $\${__value.raw}
620
+ # OTLP-ingested logs — trace_id arrives as structured metadata.
621
+ - datasourceUid: tempo
622
+ matcherType: label
623
+ matcherRegex: trace_id
624
+ name: TraceID (OTLP)
625
+ url: $\${__value.raw}
274
626
  - name: Prometheus
275
627
  type: prometheus
276
628
  access: proxy
@@ -322,7 +674,7 @@ export function generateArcOverviewDashboard(): string {
322
674
  label: "Service",
323
675
  type: "query",
324
676
  datasource: { type: "prometheus", uid: "prometheus" },
325
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
677
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
326
678
  refresh: 2,
327
679
  includeAll: false,
328
680
  multi: false,
@@ -335,20 +687,20 @@ export function generateArcOverviewDashboard(): string {
335
687
  panelStat(
336
688
  "Request rate (req/s)",
337
689
  { x: 0, y: 0, w: 6, h: 4 },
338
- 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
690
+ 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
339
691
  "reqps",
340
692
  ),
341
693
  panelStat(
342
694
  "Error rate (%)",
343
695
  { x: 6, y: 0, w: 6, h: 4 },
344
- 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
696
+ 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
345
697
  "percent",
346
698
  { red: 1, orange: 0.1 },
347
699
  ),
348
700
  panelStat(
349
701
  "P99 latency",
350
702
  { x: 12, y: 0, w: 6, h: 4 },
351
- 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
703
+ 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
352
704
  "ms",
353
705
  { red: 1000, orange: 300 },
354
706
  ),
@@ -363,7 +715,7 @@ export function generateArcOverviewDashboard(): string {
363
715
  panelTimeseries(
364
716
  "Request rate by route",
365
717
  { x: 0, y: 4, w: 12, h: 8 },
366
- 'sum by (span_name) (rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
718
+ 'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
367
719
  "{{span_name}}",
368
720
  "reqps",
369
721
  ),
@@ -372,18 +724,19 @@ export function generateArcOverviewDashboard(): string {
372
724
  { x: 12, y: 4, w: 12, h: 8 },
373
725
  [
374
726
  {
375
- expr: 'histogram_quantile(0.5, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
727
+ expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
376
728
  legend: "p50",
377
729
  },
378
730
  {
379
- expr: 'histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
731
+ expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
380
732
  legend: "p95",
381
733
  },
382
734
  {
383
- expr: 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
735
+ expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
384
736
  legend: "p99",
385
737
  },
386
738
  ],
739
+ "",
387
740
  "ms",
388
741
  ),
389
742
 
@@ -398,24 +751,24 @@ export function generateArcOverviewDashboard(): string {
398
751
  panelTimeseries(
399
752
  "Command p95 latency",
400
753
  { x: 12, y: 12, w: 12, h: 8 },
401
- 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))',
754
+ 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))',
402
755
  "{{arc_command_name}}",
403
756
  "ms",
404
757
  ),
405
758
 
406
759
  // Row: database
407
760
  panelTimeseries(
408
- "DB find ops/sec by collection",
761
+ "DB ops/sec by collection",
409
762
  { x: 0, y: 20, w: 12, h: 8 },
410
- 'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))',
411
- "{{db_collection_name}}",
763
+ 'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))',
764
+ "{{db_collection_name}} {{db_operation_name}}",
412
765
  "ops",
413
766
  ),
414
767
  panelTimeseries(
415
- "DB find p95 latency",
768
+ "DB p95 latency by operation",
416
769
  { x: 12, y: 20, w: 12, h: 8 },
417
- 'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))',
418
- "{{db_collection_name}}",
770
+ 'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))',
771
+ "{{db_operation_name}}",
419
772
  "ms",
420
773
  ),
421
774
 
@@ -427,7 +780,9 @@ export function generateArcOverviewDashboard(): string {
427
780
  datasource: { type: "loki", uid: "loki" },
428
781
  targets: [
429
782
  {
430
- expr: '{service_name="$service"} |= `ERROR`',
783
+ // severity_text is OTLP structured metadata — filter via the
784
+ // pipeline stage, NOT the stream selector (not an index label).
785
+ expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
431
786
  refId: "A",
432
787
  },
433
788
  ],
@@ -464,7 +819,7 @@ export function generateArcTracesDashboard(): string {
464
819
  label: "Service",
465
820
  type: "query",
466
821
  datasource: { type: "prometheus", uid: "prometheus" },
467
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
822
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
468
823
  refresh: 2,
469
824
  current: { text: "arc-prod", value: "arc-prod" },
470
825
  },
@@ -587,23 +942,38 @@ export function generateArcLogsDashboard(): string {
587
942
  query: "",
588
943
  current: { text: "", value: "" },
589
944
  },
945
+ {
946
+ name: "container",
947
+ label: "Container",
948
+ type: "query",
949
+ datasource: { type: "loki", uid: "loki" },
950
+ query: "label_values(container)",
951
+ refresh: 2,
952
+ includeAll: true,
953
+ multi: false,
954
+ current: { text: "All", value: "$__all" },
955
+ },
590
956
  ],
591
957
  },
592
958
  panels: [
593
- // Stat: total log lines, last hour
959
+ // Stat: total log lines, last hour. severity_text is OTLP structured
960
+ // metadata — usable only in pipeline stages, never in stream selectors.
594
961
  panelStat(
595
962
  "Logs ingested (1h)",
596
963
  { x: 0, y: 0, w: 6, h: 4 },
597
- 'sum(increase({service_name="$service"}[1h]))',
964
+ 'sum(count_over_time({service_name="$service"}[1h]))',
598
965
  "short",
966
+ undefined,
967
+ LOKI_DS,
599
968
  ),
600
969
  // Stat: errors (last hour)
601
970
  panelStat(
602
971
  "Errors (1h)",
603
972
  { x: 6, y: 0, w: 6, h: 4 },
604
- 'sum(increase({service_name="$service", severity_text=~"ERROR|FATAL"}[1h]))',
973
+ 'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))',
605
974
  "short",
606
975
  { orange: 1, red: 50 },
976
+ LOKI_DS,
607
977
  ),
608
978
  // Time series: log volume by severity
609
979
  {
@@ -657,6 +1027,30 @@ export function generateArcLogsDashboard(): string {
657
1027
  sortOrder: "Descending",
658
1028
  },
659
1029
  },
1030
+ // Container stdout/stderr (collected by Alloy from the Docker API) —
1031
+ // caddy access logs, postgres, the observability stack itself, plus
1032
+ // app crash output that never reached the OTLP pipeline.
1033
+ {
1034
+ title: "Container logs ($container)",
1035
+ type: "logs",
1036
+ gridPos: { x: 0, y: 26, w: 24, h: 14 },
1037
+ datasource: { type: "loki", uid: "loki" },
1038
+ targets: [
1039
+ {
1040
+ expr: '{container=~"$container"} |~ "$search"',
1041
+ refId: "A",
1042
+ },
1043
+ ],
1044
+ options: {
1045
+ showTime: true,
1046
+ showLabels: true,
1047
+ showCommonLabels: false,
1048
+ wrapLogMessage: true,
1049
+ enableLogDetails: true,
1050
+ dedupStrategy: "none",
1051
+ sortOrder: "Descending",
1052
+ },
1053
+ },
660
1054
  ],
661
1055
  };
662
1056
  return JSON.stringify(dashboard, null, 2);
@@ -722,6 +1116,7 @@ export function generateArcSamplingDashboard(): string {
722
1116
  legend: "exported",
723
1117
  },
724
1118
  ],
1119
+ "",
725
1120
  "ops",
726
1121
  ),
727
1122
 
@@ -790,20 +1185,20 @@ export function generateArcCommandDashboard(): string {
790
1185
  panelStat(
791
1186
  "P50 latency",
792
1187
  { x: 6, y: 0, w: 6, h: 4 },
793
- 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1188
+ 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
794
1189
  "ms",
795
1190
  ),
796
1191
  panelStat(
797
1192
  "P95 latency",
798
1193
  { x: 12, y: 0, w: 6, h: 4 },
799
- 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1194
+ 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
800
1195
  "ms",
801
1196
  { orange: 200, red: 1000 },
802
1197
  ),
803
1198
  panelStat(
804
1199
  "P99 latency",
805
1200
  { x: 18, y: 0, w: 6, h: 4 },
806
- 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1201
+ 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
807
1202
  "ms",
808
1203
  { orange: 500, red: 2000 },
809
1204
  ),
@@ -820,18 +1215,19 @@ export function generateArcCommandDashboard(): string {
820
1215
  { x: 12, y: 4, w: 12, h: 8 },
821
1216
  [
822
1217
  {
823
- expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1218
+ expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
824
1219
  legend: "p50",
825
1220
  },
826
1221
  {
827
- expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1222
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
828
1223
  legend: "p95",
829
1224
  },
830
1225
  {
831
- expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1226
+ expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
832
1227
  legend: "p99",
833
1228
  },
834
1229
  ],
1230
+ "",
835
1231
  "ms",
836
1232
  ),
837
1233
 
@@ -855,6 +1251,240 @@ export function generateArcCommandDashboard(): string {
855
1251
  return JSON.stringify(dashboard, null, 2);
856
1252
  }
857
1253
 
1254
/**
 * Builds the "Arc Infrastructure" Grafana dashboard (uid `arc-infra`) and
 * returns it serialized as pretty-printed JSON.
 *
 * Covers host CPU / RAM / disk / network (hostmetrics receiver,
 * service_name="arc-host") and per-container resource usage and restarts
 * (docker_stats receiver, service_name="arc-docker"). Intended to answer
 * "is the box healthy?" and "which container is eating the host?".
 *
 * @returns The dashboard definition as a 2-space-indented JSON string.
 */
export function generateArcInfraDashboard(): string {
  // Shared by the top-line stat and the CPU trend panel: share of CPU time
  // that was not idle over the last 5 minutes, as a percentage.
  const cpuUsedPercent =
    '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))';
  const perDeviceLegend = "{{device}} {{direction}}";
  const perContainerLegend = "{{container_name}}";

  // Row: host top-line stats
  const hostStats = [
    panelStat(
      "Host CPU used",
      { x: 0, y: 0, w: 6, h: 4 },
      cpuUsedPercent,
      "percent",
      { orange: 70, red: 90 },
    ),
    panelStat(
      "Host memory used",
      { x: 6, y: 0, w: 6, h: 4 },
      '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
      "percent",
      { orange: 80, red: 90 },
    ),
    panelStat(
      "Disk used (worst mount)",
      { x: 12, y: 0, w: 6, h: 4 },
      '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
      "percent",
      { orange: 75, red: 85 },
    ),
    panelStat(
      "Load (1m)",
      { x: 18, y: 0, w: 6, h: 4 },
      "system_cpu_load_average_1m",
      "short",
    ),
  ];

  // One series per load-average window, plotted next to the CPU percentage.
  const loadAverages = ["1m", "5m", "15m"].map((window) => ({
    expr: `system_cpu_load_average_${window}`,
    legend: `load ${window}`,
  }));

  // Row: host CPU + memory over time
  const hostTrends = [
    panelTimeseries(
      "Host CPU utilization",
      { x: 0, y: 4, w: 12, h: 8 },
      [{ expr: cpuUsedPercent, legend: "used %" }, ...loadAverages],
      "",
      "short",
    ),
    panelTimeseries(
      "Host memory by state",
      { x: 12, y: 4, w: 12, h: 8 },
      "sum by (state) (system_memory_usage_bytes)",
      "{{state}}",
      "bytes",
    ),
  ];

  // Row: disk
  const disk = [
    panelTimeseries(
      "Filesystem usage by mount",
      { x: 0, y: 12, w: 12, h: 8 },
      '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)',
      "{{mountpoint}}",
      "percent",
    ),
    panelTimeseries(
      "Disk I/O",
      { x: 12, y: 12, w: 12, h: 8 },
      "sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))",
      perDeviceLegend,
      "Bps",
    ),
  ];

  // Row: network + container restarts
  const networkAndRestarts = [
    panelTimeseries(
      "Network I/O",
      { x: 0, y: 20, w: 12, h: 8 },
      'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))',
      perDeviceLegend,
      "Bps",
    ),
    panelTimeseries(
      "Container restarts (24h)",
      { x: 12, y: 20, w: 12, h: 8 },
      "sum by (container_name) (increase(container_restarts_total[24h]))",
      perContainerLegend,
      "short",
    ),
  ];

  // Row: per-container resources
  const containers = [
    panelTimeseries(
      "Container CPU",
      { x: 0, y: 28, w: 12, h: 8 },
      "container_cpu_utilization_ratio",
      perContainerLegend,
      "percent",
    ),
    panelTimeseries(
      "Container memory",
      { x: 12, y: 28, w: 12, h: 8 },
      "container_memory_usage_total_bytes",
      perContainerLegend,
      "bytes",
    ),
    panelTimeseries(
      "Container network RX",
      { x: 0, y: 36, w: 12, h: 8 },
      "sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))",
      perContainerLegend,
      "Bps",
    ),
    panelTimeseries(
      "Container network TX",
      { x: 12, y: 36, w: 12, h: 8 },
      "sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))",
      perContainerLegend,
      "Bps",
    ),
  ];

  return JSON.stringify(
    {
      title: "Arc Infrastructure",
      uid: "arc-infra",
      schemaVersion: 39,
      version: 1,
      refresh: "30s",
      time: { from: "now-3h", to: "now" },
      tags: ["arc", "auto-provisioned"],
      panels: [
        ...hostStats,
        ...hostTrends,
        ...disk,
        ...networkAndRestarts,
        ...containers,
      ],
    },
    null,
    2,
  );
}
1386
+
1387
/**
 * Builds the "Arc Caddy / Edge" Grafana dashboard (uid `arc-edge`) and
 * returns it serialized as pretty-printed JSON.
 *
 * Combines Caddy reverse-proxy metrics (per-host request rate, status
 * codes, latency, in-flight requests) with the JSON access logs that Alloy
 * ships to Loki. First stop for "is traffic reaching us?" and "who is
 * 404ing?".
 *
 * @returns The dashboard definition as a 2-space-indented JSON string.
 */
export function generateArcEdgeDashboard(): string {
  const requestCount = "caddy_http_request_duration_seconds_count";
  const durationBucket = "caddy_http_request_duration_seconds_bucket";
  // Loki stream selector for Caddy's access log lines.
  const caddyLogs = '{compose_service="caddy"}';

  // 95th-percentile request duration, aggregated by the given label list
  // (which must include `le` for histogram_quantile to work).
  const p95By = (labels: string): string =>
    `histogram_quantile(0.95, sum by (${labels}) (rate(${durationBucket}[5m])))`;

  // Row: top-line stats
  const topLine = [
    panelStat(
      "Requests/s",
      { x: 0, y: 0, w: 6, h: 4 },
      `sum(rate(${requestCount}[1m]))`,
      "reqps",
    ),
    panelStat(
      "In-flight requests",
      { x: 6, y: 0, w: 6, h: 4 },
      "sum(caddy_http_requests_in_flight)",
      "short",
    ),
    panelStat(
      "Handler errors/s",
      { x: 12, y: 0, w: 6, h: 4 },
      "sum(rate(caddy_http_request_errors_total[5m]))",
      "ops",
      { orange: 0.1, red: 1 },
    ),
    panelStat(
      "P95 latency",
      { x: 18, y: 0, w: 6, h: 4 },
      p95By("le"),
      "s",
      { orange: 0.3, red: 1 },
    ),
  ];

  // Row: traffic breakdown
  const traffic = [
    panelTimeseries(
      "Request rate by host",
      { x: 0, y: 4, w: 12, h: 8 },
      `sum by (host) (rate(${requestCount}[1m]))`,
      "{{host}}",
      "reqps",
    ),
    panelTimeseries(
      "Responses by status code",
      { x: 12, y: 4, w: 12, h: 8 },
      `sum by (code) (rate(${requestCount}[1m]))`,
      "{{code}}",
      "reqps",
    ),
  ];

  // Row: latency + error log volume
  const latencyAndErrors = [
    panelTimeseries(
      "P95 latency by host",
      { x: 0, y: 12, w: 12, h: 8 },
      p95By("host, le"),
      "{{host}}",
      "s",
    ),
    // Counted from the access log in Loki rather than from Prometheus.
    panelTimeseries(
      "4xx/5xx responses (access log)",
      { x: 12, y: 12, w: 12, h: 8 },
      `sum by (status) (count_over_time(${caddyLogs} | json | status >= 400 [$__interval]))`,
      "{{status}}",
      "short",
      LOKI_DS,
    ),
  ];

  // Row: live access log. Built by hand because it is a "logs" panel, not
  // one of the stat / timeseries shapes the panel helpers produce.
  const accessLog = {
    title: "Access log (live)",
    type: "logs",
    gridPos: { x: 0, y: 20, w: 24, h: 12 },
    datasource: LOKI_DS,
    targets: [{ expr: caddyLogs, refId: "A" }],
    options: {
      showTime: true,
      showLabels: false,
      showCommonLabels: false,
      wrapLogMessage: true,
      enableLogDetails: true,
      dedupStrategy: "none",
      sortOrder: "Descending",
    },
  };

  return JSON.stringify(
    {
      title: "Arc Caddy / Edge",
      uid: "arc-edge",
      schemaVersion: 39,
      version: 1,
      refresh: "30s",
      time: { from: "now-1h", to: "now" },
      tags: ["arc", "auto-provisioned"],
      panels: [...topLine, ...traffic, ...latencyAndErrors, accessLog],
    },
    null,
    2,
  );
}
1487
+
858
1488
  /** All config files needed on the host. Returns map of relative-path → contents
859
1489
  * so bootstrap can write+upload them in one pass. */
860
1490
  export function generateObservabilityConfigs(
@@ -865,14 +1495,18 @@ export function generateObservabilityConfigs(
865
1495
  "observability/tempo.yaml": generateTempoConfig(cfg),
866
1496
  "observability/loki-config.yaml": generateLokiConfig(cfg),
867
1497
  "observability/prometheus.yml": generatePrometheusConfig(cfg),
1498
+ "observability/alloy-config.alloy": generateAlloyConfig(),
868
1499
  "observability/grafana-datasources.yaml": generateGrafanaDatasources(),
869
1500
  "observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
1501
+ "observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
870
1502
  "observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
871
1503
  "observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
872
1504
  "observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
873
1505
  "observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
874
1506
  "observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
875
1507
  "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
1508
+ "observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
1509
+ "observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard(),
876
1510
  };
877
1511
  }
878
1512
 
@@ -882,6 +1516,10 @@ export function generateObservabilityConfigs(
882
1516
 
883
1517
/** Placement of a panel; passed through unchanged as the panel's `gridPos`. */
interface GridPos {
  x: number;
  y: number;
  w: number;
  h: number;
}

/** Optional orange / red threshold values accepted by `panelStat`. */
interface Threshold {
  orange?: number;
  red?: number;
}

/** Datasource reference embedded in a panel: plugin `type` plus `uid`. */
interface PanelDatasource {
  type: string;
  uid: string;
}

/** Default datasource for `panelStat` and `panelTimeseries`. */
const PROMETHEUS_DS: PanelDatasource = {
  type: "prometheus",
  uid: "prometheus",
};

/** Loki datasource, passed explicitly by panels that query logs. */
const LOKI_DS: PanelDatasource = {
  type: "loki",
  uid: "loki",
};
885
1523
 
886
1524
  function panelStat(
887
1525
  title: string,
@@ -889,6 +1527,7 @@ function panelStat(
889
1527
  expr: string,
890
1528
  unit: string,
891
1529
  thresholds?: Threshold,
1530
+ datasource: PanelDatasource = PROMETHEUS_DS,
892
1531
  ) {
893
1532
  const steps: { color: string; value: number | null }[] = [
894
1533
  { color: "green", value: null },
@@ -903,7 +1542,7 @@ function panelStat(
903
1542
  title,
904
1543
  type: "stat",
905
1544
  gridPos,
906
- datasource: { type: "prometheus", uid: "prometheus" },
1545
+ datasource,
907
1546
  targets: [{ expr, refId: "A", legendFormat: title }],
908
1547
  fieldConfig: {
909
1548
  defaults: {
@@ -928,6 +1567,7 @@ function panelTimeseries(
928
1567
  query: string | { expr: string; legend: string }[],
929
1568
  legend: string,
930
1569
  unit: string,
1570
+ datasource: PanelDatasource = PROMETHEUS_DS,
931
1571
  ) {
932
1572
  const targets = Array.isArray(query)
933
1573
  ? query.map((q, i) => ({
@@ -940,7 +1580,7 @@ function panelTimeseries(
940
1580
  title,
941
1581
  type: "timeseries",
942
1582
  gridPos,
943
- datasource: { type: "prometheus", uid: "prometheus" },
1583
+ datasource,
944
1584
  targets,
945
1585
  fieldConfig: {
946
1586
  defaults: {