@arcote.tech/arc-cli 0.7.19 → 0.7.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -35,8 +35,9 @@ function pickRetention(o: DeployObservability | undefined) {
35
35
  }
36
36
 
37
37
  /** OpenTelemetry Collector — receives OTLP from app containers + browser,
38
- * applies tail sampling, fans out to Tempo (traces), Loki (logs),
39
- * Prometheus remote-write (metrics). */
38
+ * derives span-metrics + service-graph BEFORE tail sampling (no sampling
39
+ * bias), scrapes host + per-container resource usage, fans out to Tempo
40
+ * (traces), Loki (logs), Prometheus remote-write (metrics). */
40
41
  export function generateOtelCollectorConfig(cfg: DeployConfig): string {
41
42
  const envNames = Object.keys(cfg.envs);
42
43
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
@@ -55,6 +56,64 @@ ${envNames.map((name) => ` - "https://${cfg.envs[name]!.domain}"`).jo
55
56
  - tracestate
56
57
  - content-type
57
58
 
59
+ # Host-level CPU / memory / load / disk / filesystem / network metrics.
60
+ # The host root is bind-mounted read-only at /hostfs (see compose).
61
+ hostmetrics:
62
+ collection_interval: 30s
63
+ root_path: /hostfs
64
+ scrapers:
65
+ cpu:
66
+ metrics:
67
+ system.cpu.utilization:
68
+ enabled: true
69
+ memory:
70
+ metrics:
71
+ system.memory.utilization:
72
+ enabled: true
73
+ load: {}
74
+ disk: {}
75
+ filesystem:
76
+ metrics:
77
+ system.filesystem.utilization:
78
+ enabled: true
79
+ exclude_fs_types:
80
+ fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
81
+ match_type: strict
82
+ exclude_mount_points:
83
+ mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
84
+ match_type: regexp
85
+ network: {}
86
+ paging: {}
87
+
88
+ # Per-container CPU / memory / network / block-IO + restarts straight from
89
+ # the Docker daemon (socket bind-mounted read-only, see compose).
90
+ docker_stats:
91
+ endpoint: unix:///var/run/docker.sock
92
+ collection_interval: 30s
93
+ metrics:
94
+ container.restarts:
95
+ enabled: true
96
+ container.uptime:
97
+ enabled: true
98
+
99
+ connectors:
100
+ # Span→metrics computed from 100% of spans (pipeline runs BEFORE tail
101
+ # sampling) — lowering the sampling policy later never skews dashboards.
102
+ spanmetrics:
103
+ histogram:
104
+ unit: ms
105
+ explicit:
106
+ buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
107
+ metrics_flush_interval: 15s
108
+ # Emits traces_service_graph_* (same metric names Tempo's generator would).
109
+ servicegraph:
110
+ metrics_flush_interval: 15s
111
+ store:
112
+ ttl: 5s
113
+ max_items: 5000
114
+ # Joins the raw-trace pipeline to the sampled-storage pipeline.
115
+ forward: {}
116
+
58
117
  processors:
59
118
  batch:
60
119
  timeout: 5s
@@ -65,7 +124,8 @@ processors:
65
124
  # Errors + slow traces zachowywane w 100%, normalne traces również 100%
66
125
  # przy obecnej skali (boostrap produkcji). Tail sampling matchuje OR po
67
126
  # policies — bez "always" policy WSZYSTKIE OK traces byłyby droppowane.
68
- # Obniż 'random_100pct' do np. 10% gdy ruch eksploduje.
127
+ # Obniż 'random_100pct' do np. 10% gdy ruch eksploduje — span-metrics są
128
+ # liczone przed samplingiem, więc dashboardy pozostaną dokładne.
69
129
  tail_sampling:
70
130
  decision_wait: 10s
71
131
  num_traces: 50000
@@ -90,6 +150,34 @@ processors:
90
150
  - key: http.request.header.cookie
91
151
  action: delete
92
152
 
153
+ # Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
154
+ # so raw span names (one per bot-scanned URL) would explode Prometheus
155
+ # series. Static assets collapse to "<METHOD> static", /route/* to
156
+ # "<METHOD> /route", anything else outside the known API surface to
157
+ # "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
158
+ # literal "$" (collector env expansion), RE2 has no lookahead → IsMatch+not.
159
+ transform/span_names:
160
+ error_mode: ignore
161
+ trace_statements:
162
+ - context: span
163
+ statements:
164
+ - set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
165
+ - replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
166
+ - set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
167
+
168
+ # Stable service.name for infra metric streams (becomes the service_name
169
+ # label after resource_to_telemetry_conversion).
170
+ resource/host:
171
+ attributes:
172
+ - key: service.name
173
+ value: arc-host
174
+ action: upsert
175
+ resource/docker:
176
+ attributes:
177
+ - key: service.name
178
+ value: arc-docker
179
+ action: upsert
180
+
93
181
  exporters:
94
182
  otlp/tempo:
95
183
  endpoint: tempo:4317
@@ -105,6 +193,10 @@ exporters:
105
193
  endpoint: http://prometheus:9090/api/v1/write
106
194
  tls:
107
195
  insecure: true
196
+ # Copy resource attributes (service.name, deployment.environment, …)
197
+ # onto every series — dashboards filter by service_name.
198
+ resource_to_telemetry_conversion:
199
+ enabled: true
108
200
 
109
201
  extensions:
110
202
  health_check: {}
@@ -112,19 +204,42 @@ extensions:
112
204
 
113
205
  service:
114
206
  extensions: [health_check, zpages]
207
+ # Collector self-metrics. Since 0.111 the default bind is localhost only —
208
+ # Prometheus scrapes otel-collector:8888, so listen on all interfaces.
209
+ telemetry:
210
+ metrics:
211
+ level: detailed
212
+ readers:
213
+ - pull:
214
+ exporter:
215
+ prometheus:
216
+ host: "0.0.0.0"
217
+ port: 8888
115
218
  pipelines:
116
- traces:
219
+ traces/in:
117
220
  receivers: [otlp]
118
- processors: [tail_sampling, attributes, batch]
221
+ processors: [attributes, transform/span_names]
222
+ exporters: [spanmetrics, servicegraph, forward]
223
+ traces/sampled:
224
+ receivers: [forward]
225
+ processors: [tail_sampling, batch]
119
226
  exporters: [otlp/tempo]
120
227
  logs:
121
228
  receivers: [otlp]
122
229
  processors: [attributes, batch]
123
230
  exporters: [otlphttp/loki]
124
231
  metrics:
125
- receivers: [otlp]
232
+ receivers: [otlp, spanmetrics, servicegraph]
126
233
  processors: [batch]
127
234
  exporters: [prometheusremotewrite]
235
+ metrics/host:
236
+ receivers: [hostmetrics]
237
+ processors: [resource/host, batch]
238
+ exporters: [prometheusremotewrite]
239
+ metrics/docker:
240
+ receivers: [docker_stats]
241
+ processors: [resource/docker, batch]
242
+ exporters: [prometheusremotewrite]
128
243
  `;
129
244
  }
130
245
 
@@ -162,20 +277,9 @@ storage:
162
277
  wal:
163
278
  path: /var/tempo/wal
164
279
 
165
- metrics_generator:
166
- registry:
167
- external_labels:
168
- source: tempo
169
- storage:
170
- path: /var/tempo/generator/wal
171
- remote_write:
172
- - url: http://prometheus:9090/api/v1/write
173
- send_exemplars: true
174
-
175
- overrides:
176
- defaults:
177
- metrics_generator:
178
- processors: [service-graphs, span-metrics]
280
+ # NOTE: no metrics_generator — span-metrics + service-graph are produced by
281
+ # the otel-collector connectors BEFORE tail sampling (accurate rates even
282
+ # when sampling is later tightened) and remote-written to Prometheus there.
179
283
  `;
180
284
  }
181
285
 
@@ -237,12 +341,239 @@ scrape_configs:
237
341
  - job_name: otel-collector
238
342
  static_configs:
239
343
  - targets: [otel-collector:8888]
344
+ - job_name: caddy
345
+ static_configs:
346
+ - targets: [caddy:2020]
347
+ - job_name: loki
348
+ static_configs:
349
+ - targets: [loki:3100]
350
+ - job_name: tempo
351
+ static_configs:
352
+ - targets: [tempo:3200]
353
+ - job_name: grafana
354
+ static_configs:
355
+ - targets: [grafana:3000]
356
+ - job_name: alloy
357
+ static_configs:
358
+ - targets: [alloy:12345]
240
359
 
241
360
  # remote-write inbound is enabled via the --web.enable-remote-write-receiver
242
361
  # command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
243
362
  `;
244
363
  }
245
364
 
365
+ /** Grafana Alloy — tails stdout/stderr of every container on the host via
366
+ * the Docker API and ships it to Loki. Complements the in-app console→OTLP
367
+ * bridge: infra containers (caddy, postgres, tempo, …) and app crash output
368
+ * (OOM, Bun panic — anything printed before/outside the OTel SDK) all land
369
+ * in Loki under the `container` / `compose_service` labels. */
370
+ export function generateAlloyConfig(): string {
371
+ return `// Generated by \`arc platform deploy\` — do not edit by hand.
372
+ discovery.docker "containers" {
373
+ host = "unix:///var/run/docker.sock"
374
+ refresh_interval = "15s"
375
+ }
376
+
377
+ discovery.relabel "containers" {
378
+ targets = discovery.docker.containers.targets
379
+
380
+ rule {
381
+ source_labels = ["__meta_docker_container_name"]
382
+ regex = "/(.*)"
383
+ target_label = "container"
384
+ }
385
+ rule {
386
+ source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
387
+ target_label = "compose_service"
388
+ }
389
+ }
390
+
391
+ loki.source.docker "containers" {
392
+ host = "unix:///var/run/docker.sock"
393
+ targets = discovery.docker.containers.targets
394
+ relabel_rules = discovery.relabel.containers.rules
395
+ labels = { source = "docker" }
396
+ forward_to = [loki.write.loki.receiver]
397
+ }
398
+
399
+ loki.write "loki" {
400
+ endpoint {
401
+ url = "http://loki:3100/loki/api/v1/push"
402
+ }
403
+ }
404
+ `;
405
+ }
406
+
407
+ /** Grafana alerting provisioning — a starter rule pack covering the failure
408
+ * modes that matter on a single-host deploy: error rate, latency, disk,
409
+ * memory, container restarts, app silence, telemetry export failures and
410
+ * scrape-target health. Notification routing (webhook) is only emitted when
411
+ * `observability.alertWebhookUrl` is configured — without it the rules are
412
+ * still visible/firing in the Grafana UI. */
413
+ export function generateGrafanaAlerting(cfg: DeployConfig): string {
414
+ const webhookUrl = cfg.observability?.alertWebhookUrl;
415
+
416
+ interface AlertRule {
417
+ uid: string;
418
+ title: string;
419
+ expr: string;
420
+ /** Threshold for the C (threshold) expression node. */
421
+ threshold: number;
422
+ /** Comparison operator. Default "gt". */
423
+ op?: "gt" | "lt";
424
+ /** Pending period, e.g. "5m". "0s" fires immediately. */
425
+ pendingFor: string;
426
+ summary: string;
427
+ }
428
+
429
+ const rules: AlertRule[] = [
430
+ {
431
+ uid: "arc-high-error-rate",
432
+ title: "High server error rate (>5%)",
433
+ expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
434
+ threshold: 0.05,
435
+ pendingFor: "5m",
436
+ summary: "More than 5% of server spans are errors over the last 5 minutes.",
437
+ },
438
+ {
439
+ uid: "arc-high-latency-p95",
440
+ title: "High p95 latency (>1s)",
441
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
442
+ threshold: 1000,
443
+ pendingFor: "10m",
444
+ summary: "Server p95 latency above 1s for 10 minutes.",
445
+ },
446
+ {
447
+ uid: "arc-host-disk-high",
448
+ title: "Host disk usage >85%",
449
+ expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
450
+ threshold: 0.85,
451
+ pendingFor: "15m",
452
+ summary: "A host filesystem is more than 85% full.",
453
+ },
454
+ {
455
+ uid: "arc-host-memory-high",
456
+ title: "Host memory usage >90%",
457
+ expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
458
+ threshold: 0.9,
459
+ pendingFor: "10m",
460
+ summary: "Host memory usage above 90% for 10 minutes.",
461
+ },
462
+ {
463
+ uid: "arc-container-restarts",
464
+ title: "Container restarted",
465
+ expr: 'sum by (container_name) (increase(container_restarts_total[15m]))',
466
+ threshold: 0,
467
+ pendingFor: "0s",
468
+ summary: "A container restarted within the last 15 minutes.",
469
+ },
470
+ {
471
+ uid: "arc-app-silent",
472
+ title: "App stopped reporting metrics",
473
+ expr: "absent(arc_commands_total)",
474
+ threshold: 0,
475
+ pendingFor: "10m",
476
+ summary: "No arc_commands_total series for 10 minutes — app down or telemetry broken.",
477
+ },
478
+ {
479
+ uid: "arc-collector-export-failures",
480
+ title: "Telemetry export failures",
481
+ expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)"}[15m]))',
482
+ threshold: 0,
483
+ pendingFor: "0s",
484
+ summary: "The otel-collector failed to export telemetry within the last 15 minutes.",
485
+ },
486
+ {
487
+ uid: "arc-target-down",
488
+ title: "Scrape target down",
489
+ expr: "min(up)",
490
+ threshold: 1,
491
+ op: "lt",
492
+ pendingFor: "5m",
493
+ summary: "A Prometheus scrape target has been down for 5 minutes.",
494
+ },
495
+ ];
496
+
497
+ const ruleYaml = rules
498
+ .map((rule) => {
499
+ const op = rule.op ?? "gt";
500
+ return ` - uid: ${rule.uid}
501
+ title: ${JSON.stringify(rule.title)}
502
+ condition: C
503
+ for: ${rule.pendingFor}
504
+ noDataState: OK
505
+ execErrState: OK
506
+ annotations:
507
+ summary: ${JSON.stringify(rule.summary)}
508
+ labels:
509
+ source: arc
510
+ data:
511
+ - refId: A
512
+ relativeTimeRange: { from: 600, to: 0 }
513
+ datasourceUid: prometheus
514
+ model:
515
+ expr: ${JSON.stringify(rule.expr)}
516
+ instant: true
517
+ intervalMs: 1000
518
+ maxDataPoints: 43200
519
+ refId: A
520
+ - refId: B
521
+ relativeTimeRange: { from: 0, to: 0 }
522
+ datasourceUid: __expr__
523
+ model:
524
+ type: reduce
525
+ expression: A
526
+ reducer: last
527
+ refId: B
528
+ - refId: C
529
+ relativeTimeRange: { from: 0, to: 0 }
530
+ datasourceUid: __expr__
531
+ model:
532
+ type: threshold
533
+ expression: B
534
+ refId: C
535
+ conditions:
536
+ - evaluator:
537
+ type: ${op}
538
+ params: [${rule.threshold}]`;
539
+ })
540
+ .join("\n");
541
+
542
+ const contactSection = webhookUrl
543
+ ? `
544
+ contactPoints:
545
+ - orgId: 1
546
+ name: arc-webhook
547
+ receivers:
548
+ - uid: arc-webhook
549
+ type: webhook
550
+ settings:
551
+ url: ${JSON.stringify(webhookUrl)}
552
+ httpMethod: POST
553
+
554
+ policies:
555
+ - orgId: 1
556
+ receiver: arc-webhook
557
+ group_by: ["grafana_folder", "alertname"]
558
+ group_wait: 30s
559
+ group_interval: 5m
560
+ repeat_interval: 4h
561
+ `
562
+ : "";
563
+
564
+ return `# Generated by \`arc platform deploy\` — do not edit by hand.
565
+ apiVersion: 1
566
+
567
+ groups:
568
+ - orgId: 1
569
+ name: arc-alerts
570
+ folder: Arc
571
+ interval: 1m
572
+ rules:
573
+ ${ruleYaml}
574
+ ${contactSection}`;
575
+ }
576
+
246
577
  /** Grafana datasource provisioning — Tempo + Loki + Prometheus, all pre-wired. */
247
578
  export function generateGrafanaDatasources(): string {
248
579
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
@@ -267,10 +598,17 @@ datasources:
267
598
  uid: loki
268
599
  jsonData:
269
600
  derivedFields:
601
+ # Plain-text logs that happen to contain "trace_id=<id>".
270
602
  - datasourceUid: tempo
271
603
  matcherRegex: "trace_id=(\\\\w+)"
272
604
  name: TraceID
273
605
  url: $\${__value.raw}
606
+ # OTLP-ingested logs — trace_id arrives as structured metadata.
607
+ - datasourceUid: tempo
608
+ matcherType: label
609
+ matcherRegex: trace_id
610
+ name: TraceID (OTLP)
611
+ url: $\${__value.raw}
274
612
  - name: Prometheus
275
613
  type: prometheus
276
614
  access: proxy
@@ -322,7 +660,7 @@ export function generateArcOverviewDashboard(): string {
322
660
  label: "Service",
323
661
  type: "query",
324
662
  datasource: { type: "prometheus", uid: "prometheus" },
325
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
663
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
326
664
  refresh: 2,
327
665
  includeAll: false,
328
666
  multi: false,
@@ -335,20 +673,20 @@ export function generateArcOverviewDashboard(): string {
335
673
  panelStat(
336
674
  "Request rate (req/s)",
337
675
  { x: 0, y: 0, w: 6, h: 4 },
338
- 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
676
+ 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
339
677
  "reqps",
340
678
  ),
341
679
  panelStat(
342
680
  "Error rate (%)",
343
681
  { x: 6, y: 0, w: 6, h: 4 },
344
- 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
682
+ 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
345
683
  "percent",
346
684
  { red: 1, orange: 0.1 },
347
685
  ),
348
686
  panelStat(
349
687
  "P99 latency",
350
688
  { x: 12, y: 0, w: 6, h: 4 },
351
- 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
689
+ 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
352
690
  "ms",
353
691
  { red: 1000, orange: 300 },
354
692
  ),
@@ -363,7 +701,7 @@ export function generateArcOverviewDashboard(): string {
363
701
  panelTimeseries(
364
702
  "Request rate by route",
365
703
  { x: 0, y: 4, w: 12, h: 8 },
366
- 'sum by (span_name) (rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
704
+ 'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
367
705
  "{{span_name}}",
368
706
  "reqps",
369
707
  ),
@@ -372,18 +710,19 @@ export function generateArcOverviewDashboard(): string {
372
710
  { x: 12, y: 4, w: 12, h: 8 },
373
711
  [
374
712
  {
375
- expr: 'histogram_quantile(0.5, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
713
+ expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
376
714
  legend: "p50",
377
715
  },
378
716
  {
379
- expr: 'histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
717
+ expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
380
718
  legend: "p95",
381
719
  },
382
720
  {
383
- expr: 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
721
+ expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
384
722
  legend: "p99",
385
723
  },
386
724
  ],
725
+ "",
387
726
  "ms",
388
727
  ),
389
728
 
@@ -398,24 +737,24 @@ export function generateArcOverviewDashboard(): string {
398
737
  panelTimeseries(
399
738
  "Command p95 latency",
400
739
  { x: 12, y: 12, w: 12, h: 8 },
401
- 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))',
740
+ 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))',
402
741
  "{{arc_command_name}}",
403
742
  "ms",
404
743
  ),
405
744
 
406
745
  // Row: database
407
746
  panelTimeseries(
408
- "DB find ops/sec by collection",
747
+ "DB ops/sec by collection",
409
748
  { x: 0, y: 20, w: 12, h: 8 },
410
- 'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))',
411
- "{{db_collection_name}}",
749
+ 'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))',
750
+ "{{db_collection_name}} {{db_operation_name}}",
412
751
  "ops",
413
752
  ),
414
753
  panelTimeseries(
415
- "DB find p95 latency",
754
+ "DB p95 latency by operation",
416
755
  { x: 12, y: 20, w: 12, h: 8 },
417
- 'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))',
418
- "{{db_collection_name}}",
756
+ 'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))',
757
+ "{{db_operation_name}}",
419
758
  "ms",
420
759
  ),
421
760
 
@@ -427,7 +766,9 @@ export function generateArcOverviewDashboard(): string {
427
766
  datasource: { type: "loki", uid: "loki" },
428
767
  targets: [
429
768
  {
430
- expr: '{service_name="$service"} |= `ERROR`',
769
+ // severity_text is OTLP structured metadata — filter via the
770
+ // pipeline stage, NOT the stream selector (not an index label).
771
+ expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
431
772
  refId: "A",
432
773
  },
433
774
  ],
@@ -464,7 +805,7 @@ export function generateArcTracesDashboard(): string {
464
805
  label: "Service",
465
806
  type: "query",
466
807
  datasource: { type: "prometheus", uid: "prometheus" },
467
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
808
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
468
809
  refresh: 2,
469
810
  current: { text: "arc-prod", value: "arc-prod" },
470
811
  },
@@ -587,23 +928,38 @@ export function generateArcLogsDashboard(): string {
587
928
  query: "",
588
929
  current: { text: "", value: "" },
589
930
  },
931
+ {
932
+ name: "container",
933
+ label: "Container",
934
+ type: "query",
935
+ datasource: { type: "loki", uid: "loki" },
936
+ query: "label_values(container)",
937
+ refresh: 2,
938
+ includeAll: true,
939
+ multi: false,
940
+ current: { text: "All", value: "$__all" },
941
+ },
590
942
  ],
591
943
  },
592
944
  panels: [
593
- // Stat: total log lines, last hour
945
+ // Stat: total log lines, last hour. severity_text is OTLP structured
946
+ // metadata — usable only in pipeline stages, never in stream selectors.
594
947
  panelStat(
595
948
  "Logs ingested (1h)",
596
949
  { x: 0, y: 0, w: 6, h: 4 },
597
- 'sum(increase({service_name="$service"}[1h]))',
950
+ 'sum(count_over_time({service_name="$service"}[1h]))',
598
951
  "short",
952
+ undefined,
953
+ LOKI_DS,
599
954
  ),
600
955
  // Stat: errors (last hour)
601
956
  panelStat(
602
957
  "Errors (1h)",
603
958
  { x: 6, y: 0, w: 6, h: 4 },
604
- 'sum(increase({service_name="$service", severity_text=~"ERROR|FATAL"}[1h]))',
959
+ 'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))',
605
960
  "short",
606
961
  { orange: 1, red: 50 },
962
+ LOKI_DS,
607
963
  ),
608
964
  // Time series: log volume by severity
609
965
  {
@@ -657,6 +1013,30 @@ export function generateArcLogsDashboard(): string {
657
1013
  sortOrder: "Descending",
658
1014
  },
659
1015
  },
1016
+ // Container stdout/stderr (collected by Alloy from the Docker API) —
1017
+ // caddy access logs, postgres, the observability stack itself, plus
1018
+ // app crash output that never reached the OTLP pipeline.
1019
+ {
1020
+ title: "Container logs ($container)",
1021
+ type: "logs",
1022
+ gridPos: { x: 0, y: 26, w: 24, h: 14 },
1023
+ datasource: { type: "loki", uid: "loki" },
1024
+ targets: [
1025
+ {
1026
+ expr: '{container=~"$container"} |~ "$search"',
1027
+ refId: "A",
1028
+ },
1029
+ ],
1030
+ options: {
1031
+ showTime: true,
1032
+ showLabels: true,
1033
+ showCommonLabels: false,
1034
+ wrapLogMessage: true,
1035
+ enableLogDetails: true,
1036
+ dedupStrategy: "none",
1037
+ sortOrder: "Descending",
1038
+ },
1039
+ },
660
1040
  ],
661
1041
  };
662
1042
  return JSON.stringify(dashboard, null, 2);
@@ -722,6 +1102,7 @@ export function generateArcSamplingDashboard(): string {
722
1102
  legend: "exported",
723
1103
  },
724
1104
  ],
1105
+ "",
725
1106
  "ops",
726
1107
  ),
727
1108
 
@@ -790,20 +1171,20 @@ export function generateArcCommandDashboard(): string {
790
1171
  panelStat(
791
1172
  "P50 latency",
792
1173
  { x: 6, y: 0, w: 6, h: 4 },
793
- 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1174
+ 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
794
1175
  "ms",
795
1176
  ),
796
1177
  panelStat(
797
1178
  "P95 latency",
798
1179
  { x: 12, y: 0, w: 6, h: 4 },
799
- 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1180
+ 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
800
1181
  "ms",
801
1182
  { orange: 200, red: 1000 },
802
1183
  ),
803
1184
  panelStat(
804
1185
  "P99 latency",
805
1186
  { x: 18, y: 0, w: 6, h: 4 },
806
- 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1187
+ 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
807
1188
  "ms",
808
1189
  { orange: 500, red: 2000 },
809
1190
  ),
@@ -820,18 +1201,19 @@ export function generateArcCommandDashboard(): string {
820
1201
  { x: 12, y: 4, w: 12, h: 8 },
821
1202
  [
822
1203
  {
823
- expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1204
+ expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
824
1205
  legend: "p50",
825
1206
  },
826
1207
  {
827
- expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1208
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
828
1209
  legend: "p95",
829
1210
  },
830
1211
  {
831
- expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
1212
+ expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
832
1213
  legend: "p99",
833
1214
  },
834
1215
  ],
1216
+ "",
835
1217
  "ms",
836
1218
  ),
837
1219
 
@@ -855,6 +1237,240 @@ export function generateArcCommandDashboard(): string {
855
1237
  return JSON.stringify(dashboard, null, 2);
856
1238
  }
857
1239
 
1240
+ /** Infrastructure dashboard — host CPU/RAM/disk/network (hostmetrics
1241
+ * receiver, service_name="arc-host") + per-container resource usage and
1242
+ * restarts (docker_stats receiver, service_name="arc-docker"). The panels
1243
+ * answering "is the box healthy / which container is eating the host". */
1244
+ export function generateArcInfraDashboard(): string {
1245
+ const dashboard = {
1246
+ title: "Arc Infrastructure",
1247
+ uid: "arc-infra",
1248
+ schemaVersion: 39,
1249
+ version: 1,
1250
+ refresh: "30s",
1251
+ time: { from: "now-3h", to: "now" },
1252
+ tags: ["arc", "auto-provisioned"],
1253
+ panels: [
1254
+ // Row: host top-line stats
1255
+ panelStat(
1256
+ "Host CPU used",
1257
+ { x: 0, y: 0, w: 6, h: 4 },
1258
+ '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))',
1259
+ "percent",
1260
+ { orange: 70, red: 90 },
1261
+ ),
1262
+ panelStat(
1263
+ "Host memory used",
1264
+ { x: 6, y: 0, w: 6, h: 4 },
1265
+ '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
1266
+ "percent",
1267
+ { orange: 80, red: 90 },
1268
+ ),
1269
+ panelStat(
1270
+ "Disk used (worst mount)",
1271
+ { x: 12, y: 0, w: 6, h: 4 },
1272
+ '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
1273
+ "percent",
1274
+ { orange: 75, red: 85 },
1275
+ ),
1276
+ panelStat(
1277
+ "Load (1m)",
1278
+ { x: 18, y: 0, w: 6, h: 4 },
1279
+ "system_cpu_load_average_1m",
1280
+ "short",
1281
+ ),
1282
+
1283
+ // Row: host CPU + memory over time
1284
+ panelTimeseries(
1285
+ "Host CPU utilization",
1286
+ { x: 0, y: 4, w: 12, h: 8 },
1287
+ [
1288
+ {
1289
+ expr: '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))',
1290
+ legend: "used %",
1291
+ },
1292
+ { expr: "system_cpu_load_average_1m", legend: "load 1m" },
1293
+ { expr: "system_cpu_load_average_5m", legend: "load 5m" },
1294
+ { expr: "system_cpu_load_average_15m", legend: "load 15m" },
1295
+ ],
1296
+ "",
1297
+ "short",
1298
+ ),
1299
+ panelTimeseries(
1300
+ "Host memory by state",
1301
+ { x: 12, y: 4, w: 12, h: 8 },
1302
+ 'sum by (state) (system_memory_usage_bytes)',
1303
+ "{{state}}",
1304
+ "bytes",
1305
+ ),
1306
+
1307
+ // Row: disk
1308
+ panelTimeseries(
1309
+ "Filesystem usage by mount",
1310
+ { x: 0, y: 12, w: 12, h: 8 },
1311
+ '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)',
1312
+ "{{mountpoint}}",
1313
+ "percent",
1314
+ ),
1315
+ panelTimeseries(
1316
+ "Disk I/O",
1317
+ { x: 12, y: 12, w: 12, h: 8 },
1318
+ 'sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))',
1319
+ "{{device}} {{direction}}",
1320
+ "Bps",
1321
+ ),
1322
+
1323
+ // Row: network + paging
1324
+ panelTimeseries(
1325
+ "Network I/O",
1326
+ { x: 0, y: 20, w: 12, h: 8 },
1327
+ 'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))',
1328
+ "{{device}} {{direction}}",
1329
+ "Bps",
1330
+ ),
1331
+ panelTimeseries(
1332
+ "Container restarts (24h)",
1333
+ { x: 12, y: 20, w: 12, h: 8 },
1334
+ 'sum by (container_name) (increase(container_restarts_total[24h]))',
1335
+ "{{container_name}}",
1336
+ "short",
1337
+ ),
1338
+
1339
+ // Row: per-container resources
1340
+ panelTimeseries(
1341
+ "Container CPU",
1342
+ { x: 0, y: 28, w: 12, h: 8 },
1343
+ 'container_cpu_utilization_ratio',
1344
+ "{{container_name}}",
1345
+ "percent",
1346
+ ),
1347
+ panelTimeseries(
1348
+ "Container memory",
1349
+ { x: 12, y: 28, w: 12, h: 8 },
1350
+ 'container_memory_usage_total_bytes',
1351
+ "{{container_name}}",
1352
+ "bytes",
1353
+ ),
1354
+ panelTimeseries(
1355
+ "Container network RX",
1356
+ { x: 0, y: 36, w: 12, h: 8 },
1357
+ 'sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))',
1358
+ "{{container_name}}",
1359
+ "Bps",
1360
+ ),
1361
+ panelTimeseries(
1362
+ "Container network TX",
1363
+ { x: 12, y: 36, w: 12, h: 8 },
1364
+ 'sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))',
1365
+ "{{container_name}}",
1366
+ "Bps",
1367
+ ),
1368
+ ],
1369
+ };
1370
+ return JSON.stringify(dashboard, null, 2);
1371
+ }
1372
+
1373
+ /** Edge dashboard — Caddy reverse-proxy metrics (per-host request rate,
1374
+ * status codes, latency, in-flight) plus the JSON access logs that Alloy
1375
+ * ships to Loki. First stop for "is traffic reaching us / who is 404ing". */
1376
+ export function generateArcEdgeDashboard(): string {
1377
+ const dashboard = {
1378
+ title: "Arc Caddy / Edge",
1379
+ uid: "arc-edge",
1380
+ schemaVersion: 39,
1381
+ version: 1,
1382
+ refresh: "30s",
1383
+ time: { from: "now-1h", to: "now" },
1384
+ tags: ["arc", "auto-provisioned"],
1385
+ panels: [
1386
+ // Row: top-line stats
1387
+ panelStat(
1388
+ "Requests/s",
1389
+ { x: 0, y: 0, w: 6, h: 4 },
1390
+ "sum(rate(caddy_http_request_duration_seconds_count[1m]))",
1391
+ "reqps",
1392
+ ),
1393
+ panelStat(
1394
+ "In-flight requests",
1395
+ { x: 6, y: 0, w: 6, h: 4 },
1396
+ "sum(caddy_http_requests_in_flight)",
1397
+ "short",
1398
+ ),
1399
+ panelStat(
1400
+ "Handler errors/s",
1401
+ { x: 12, y: 0, w: 6, h: 4 },
1402
+ "sum(rate(caddy_http_request_errors_total[5m]))",
1403
+ "ops",
1404
+ { orange: 0.1, red: 1 },
1405
+ ),
1406
+ panelStat(
1407
+ "P95 latency",
1408
+ { x: 18, y: 0, w: 6, h: 4 },
1409
+ "histogram_quantile(0.95, sum by (le) (rate(caddy_http_request_duration_seconds_bucket[5m])))",
1410
+ "s",
1411
+ { orange: 0.3, red: 1 },
1412
+ ),
1413
+
1414
+ // Row: traffic breakdown
1415
+ panelTimeseries(
1416
+ "Request rate by host",
1417
+ { x: 0, y: 4, w: 12, h: 8 },
1418
+ "sum by (host) (rate(caddy_http_request_duration_seconds_count[1m]))",
1419
+ "{{host}}",
1420
+ "reqps",
1421
+ ),
1422
+ panelTimeseries(
1423
+ "Responses by status code",
1424
+ { x: 12, y: 4, w: 12, h: 8 },
1425
+ "sum by (code) (rate(caddy_http_request_duration_seconds_count[1m]))",
1426
+ "{{code}}",
1427
+ "reqps",
1428
+ ),
1429
+
1430
+ // Row: latency + error log volume
1431
+ panelTimeseries(
1432
+ "P95 latency by host",
1433
+ { x: 0, y: 12, w: 12, h: 8 },
1434
+ "histogram_quantile(0.95, sum by (host, le) (rate(caddy_http_request_duration_seconds_bucket[5m])))",
1435
+ "{{host}}",
1436
+ "s",
1437
+ ),
1438
+ panelTimeseries(
1439
+ "4xx/5xx responses (access log)",
1440
+ { x: 12, y: 12, w: 12, h: 8 },
1441
+ 'sum by (status) (count_over_time({compose_service="caddy"} | json | status >= 400 [$__interval]))',
1442
+ "{{status}}",
1443
+ "short",
1444
+ LOKI_DS,
1445
+ ),
1446
+
1447
+ // Row: live access log
1448
+ {
1449
+ title: "Access log (live)",
1450
+ type: "logs",
1451
+ gridPos: { x: 0, y: 20, w: 24, h: 12 },
1452
+ datasource: { type: "loki", uid: "loki" },
1453
+ targets: [
1454
+ {
1455
+ expr: '{compose_service="caddy"}',
1456
+ refId: "A",
1457
+ },
1458
+ ],
1459
+ options: {
1460
+ showTime: true,
1461
+ showLabels: false,
1462
+ showCommonLabels: false,
1463
+ wrapLogMessage: true,
1464
+ enableLogDetails: true,
1465
+ dedupStrategy: "none",
1466
+ sortOrder: "Descending",
1467
+ },
1468
+ },
1469
+ ],
1470
+ };
1471
+ return JSON.stringify(dashboard, null, 2);
1472
+ }
1473
+
858
1474
  /** All config files needed on the host. Returns map of relative-path → contents
859
1475
  * so bootstrap can write+upload them in one pass. */
860
1476
  export function generateObservabilityConfigs(
@@ -865,14 +1481,18 @@ export function generateObservabilityConfigs(
865
1481
  "observability/tempo.yaml": generateTempoConfig(cfg),
866
1482
  "observability/loki-config.yaml": generateLokiConfig(cfg),
867
1483
  "observability/prometheus.yml": generatePrometheusConfig(cfg),
1484
+ "observability/alloy-config.alloy": generateAlloyConfig(),
868
1485
  "observability/grafana-datasources.yaml": generateGrafanaDatasources(),
869
1486
  "observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
1487
+ "observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
870
1488
  "observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
871
1489
  "observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
872
1490
  "observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
873
1491
  "observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
874
1492
  "observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
875
1493
  "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
1494
+ "observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
1495
+ "observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard(),
876
1496
  };
877
1497
  }
878
1498
 
@@ -882,6 +1502,10 @@ export function generateObservabilityConfigs(
882
1502
 
883
1503
  interface GridPos { x: number; y: number; w: number; h: number; } // shape of the `gridPos` value handed to every panel: x/y position plus w/h size
884
1504
  interface Threshold { orange?: number; red?: number; } // optional orange/red threshold values; accepted by panelStat as its `thresholds` argument
1505
+ interface PanelDatasource { type: string; uid: string; } // { type, uid } pair assigned to a panel's `datasource` field
1506
+
1507
+ const PROMETHEUS_DS: PanelDatasource = { type: "prometheus", uid: "prometheus" }; // default `datasource` argument of panelStat and panelTimeseries
1508
+ const LOKI_DS: PanelDatasource = { type: "loki", uid: "loki" }; // passed explicitly as `datasource` by panels whose query targets Loki instead of the Prometheus default
885
1509
 
886
1510
  function panelStat(
887
1511
  title: string,
@@ -889,6 +1513,7 @@ function panelStat(
889
1513
  expr: string,
890
1514
  unit: string,
891
1515
  thresholds?: Threshold,
1516
+ datasource: PanelDatasource = PROMETHEUS_DS,
892
1517
  ) {
893
1518
  const steps: { color: string; value: number | null }[] = [
894
1519
  { color: "green", value: null },
@@ -903,7 +1528,7 @@ function panelStat(
903
1528
  title,
904
1529
  type: "stat",
905
1530
  gridPos,
906
- datasource: { type: "prometheus", uid: "prometheus" },
1531
+ datasource,
907
1532
  targets: [{ expr, refId: "A", legendFormat: title }],
908
1533
  fieldConfig: {
909
1534
  defaults: {
@@ -928,6 +1553,7 @@ function panelTimeseries(
928
1553
  query: string | { expr: string; legend: string }[],
929
1554
  legend: string,
930
1555
  unit: string,
1556
+ datasource: PanelDatasource = PROMETHEUS_DS,
931
1557
  ) {
932
1558
  const targets = Array.isArray(query)
933
1559
  ? query.map((q, i) => ({
@@ -940,7 +1566,7 @@ function panelTimeseries(
940
1566
  title,
941
1567
  type: "timeseries",
942
1568
  gridPos,
943
- datasource: { type: "prometheus", uid: "prometheus" },
1569
+ datasource,
944
1570
  targets,
945
1571
  fieldConfig: {
946
1572
  defaults: {