npm - @arcote.tech/arc-cli - Versions diffs - 0.7.18 → 0.7.20 - Mend

@arcote.tech/arc-cli 0.7.18 → 0.7.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/index.js +651 -105
package/package.json +9 -9
package/src/deploy/bootstrap.ts +8 -3
package/src/deploy/caddyfile.ts +43 -8
package/src/deploy/compose.ts +73 -0
package/src/deploy/config.ts +15 -0
package/src/deploy/observability-configs.ts +674 -48
package/src/platform/server.ts +3 -0

package/src/deploy/observability-configs.ts CHANGED Viewed

@@ -35,8 +35,9 @@ function pickRetention(o: DeployObservability | undefined) {
 }
 /** OpenTelemetry Collector — receives OTLP from app containers + browser,
- *  applies tail sampling, fans out to Tempo (traces), Loki (logs),
- *  Prometheus remote-write (metrics). */
+ *  derives span-metrics + service-graph BEFORE tail sampling (no sampling
+ *  bias), scrapes host + per-container resource usage, fans out to Tempo
+ *  (traces), Loki (logs), Prometheus remote-write (metrics). */
 export function generateOtelCollectorConfig(cfg: DeployConfig): string {
   const envNames = Object.keys(cfg.envs);
   return `# Generated by \`arc platform deploy\` — do not edit by hand.
@@ -55,6 +56,64 @@ ${envNames.map((name) => `            - "https://${cfg.envs[name]!.domain}"`).jo
             - tracestate
             - content-type
+  # Host-level CPU / memory / load / disk / filesystem / network metrics.
+  # The host root is bind-mounted read-only at /hostfs (see compose).
+  hostmetrics:
+    collection_interval: 30s
+    root_path: /hostfs
+    scrapers:
+      cpu:
+        metrics:
+          system.cpu.utilization:
+            enabled: true
+      memory:
+        metrics:
+          system.memory.utilization:
+            enabled: true
+      load: {}
+      disk: {}
+      filesystem:
+        metrics:
+          system.filesystem.utilization:
+            enabled: true
+        exclude_fs_types:
+          fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
+          match_type: strict
+        exclude_mount_points:
+          mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
+          match_type: regexp
+      network: {}
+      paging: {}
+  # Per-container CPU / memory / network / block-IO + restarts straight from
+  # the Docker daemon (socket bind-mounted read-only, see compose).
+  docker_stats:
+    endpoint: unix:///var/run/docker.sock
+    collection_interval: 30s
+    metrics:
+      container.restarts:
+        enabled: true
+      container.uptime:
+        enabled: true
+connectors:
+  # Span→metrics computed from 100% of spans (pipeline runs BEFORE tail
+  # sampling) — lowering the sampling policy later never skews dashboards.
+  spanmetrics:
+    histogram:
+      unit: ms
+      explicit:
+        buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
+    metrics_flush_interval: 15s
+  # Emits traces_service_graph_* (same metric names Tempo's generator would).
+  servicegraph:
+    metrics_flush_interval: 15s
+    store:
+      ttl: 5s
+      max_items: 5000
+  # Joins the raw-trace pipeline to the sampled-storage pipeline.
+  forward: {}
 processors:
   batch:
     timeout: 5s
@@ -65,7 +124,8 @@ processors:
   # Errors + slow traces zachowywane w 100%, normalne traces również 100%
   # przy obecnej skali (bootstrap produkcji). Tail sampling matchuje OR po
   # policies — bez "always" policy WSZYSTKIE OK traces byłyby droppowane.
-  # Obniż 'random_100pct' do np. 10% gdy ruch eksploduje.
+  # Obniż 'random_100pct' do np. 10% gdy ruch eksploduje — span-metrics są
+  # liczone przed samplingiem, więc dashboardy pozostaną dokładne.
   tail_sampling:
     decision_wait: 10s
     num_traces: 50000
@@ -90,6 +150,34 @@ processors:
       - key: http.request.header.cookie
         action: delete
+  # Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
+  # so raw span names (one per bot-scanned URL) would explode Prometheus
+  # series. Static assets collapse to "<METHOD> static", /route/* to
+  # "<METHOD> /route", anything else outside the known API surface to
+  # "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
+  # literal "$" (collector env expansion), RE2 has no lookahead → IsMatch+not.
+  transform/span_names:
+    error_mode: ignore
+    trace_statements:
+      - context: span
+        statements:
+          - set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
+          - replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
+          - set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
+  # Stable service.name for infra metric streams (becomes the service_name
+  # label after resource_to_telemetry_conversion).
+  resource/host:
+    attributes:
+      - key: service.name
+        value: arc-host
+        action: upsert
+  resource/docker:
+    attributes:
+      - key: service.name
+        value: arc-docker
+        action: upsert
 exporters:
   otlp/tempo:
     endpoint: tempo:4317
@@ -105,6 +193,10 @@ exporters:
     endpoint: http://prometheus:9090/api/v1/write
     tls:
       insecure: true
+    # Copy resource attributes (service.name, deployment.environment, …)
+    # onto every series — dashboards filter by service_name.
+    resource_to_telemetry_conversion:
+      enabled: true
 extensions:
   health_check: {}
@@ -112,19 +204,42 @@ extensions:
 service:
   extensions: [health_check, zpages]
+  # Collector self-metrics. Since 0.111 the default bind is localhost only —
+  # Prometheus scrapes otel-collector:8888, so listen on all interfaces.
+  telemetry:
+    metrics:
+      level: detailed
+      readers:
+        - pull:
+            exporter:
+              prometheus:
+                host: "0.0.0.0"
+                port: 8888
   pipelines:
-    traces:
+    traces/in:
       receivers: [otlp]
-      processors: [tail_sampling, attributes, batch]
+      processors: [attributes, transform/span_names]
+      exporters: [spanmetrics, servicegraph, forward]
+    traces/sampled:
+      receivers: [forward]
+      processors: [tail_sampling, batch]
       exporters: [otlp/tempo]
     logs:
       receivers: [otlp]
       processors: [attributes, batch]
       exporters: [otlphttp/loki]
     metrics:
-      receivers: [otlp]
+      receivers: [otlp, spanmetrics, servicegraph]
       processors: [batch]
       exporters: [prometheusremotewrite]
+    metrics/host:
+      receivers: [hostmetrics]
+      processors: [resource/host, batch]
+      exporters: [prometheusremotewrite]
+    metrics/docker:
+      receivers: [docker_stats]
+      processors: [resource/docker, batch]
+      exporters: [prometheusremotewrite]
 `;
 }
@@ -162,20 +277,9 @@ storage:
     wal:
       path: /var/tempo/wal
-metrics_generator:
-  registry:
-    external_labels:
-      source: tempo
-  storage:
-    path: /var/tempo/generator/wal
-    remote_write:
-      - url: http://prometheus:9090/api/v1/write
-        send_exemplars: true
-overrides:
-  defaults:
-    metrics_generator:
-      processors: [service-graphs, span-metrics]
+# NOTE: no metrics_generator — span-metrics + service-graph are produced by
+# the otel-collector connectors BEFORE tail sampling (accurate rates even
+# when sampling is later tightened) and remote-written to Prometheus there.
 `;
 }
@@ -237,12 +341,239 @@ scrape_configs:
   - job_name: otel-collector
     static_configs:
       - targets: [otel-collector:8888]
+  - job_name: caddy
+    static_configs:
+      - targets: [caddy:2020]
+  - job_name: loki
+    static_configs:
+      - targets: [loki:3100]
+  - job_name: tempo
+    static_configs:
+      - targets: [tempo:3200]
+  - job_name: grafana
+    static_configs:
+      - targets: [grafana:3000]
+  - job_name: alloy
+    static_configs:
+      - targets: [alloy:12345]
 # remote-write inbound is enabled via the --web.enable-remote-write-receiver
 # command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
 `;
 }
+/** Grafana Alloy — tails stdout/stderr of every container on the host via
+ *  the Docker API and ships it to Loki. Complements the in-app console→OTLP
+ *  bridge: infra containers (caddy, postgres, tempo, …) and app crash output
+ *  (OOM, Bun panic — anything printed before/outside the OTel SDK) all land
+ *  in Loki under the `container` / `compose_service` labels.
+ *
+ *  The emitted config wires three components together:
+ *  - `discovery.docker` lists containers through the Docker socket
+ *    (`unix:///var/run/docker.sock`), refreshing every 15s;
+ *  - `discovery.relabel` writes the part of the container name that follows
+ *    the leading "/" into `container`, and the compose service label into
+ *    `compose_service`;
+ *  - `loki.source.docker` reads the logs, tags them `source = "docker"`, and
+ *    forwards them to `loki.write`, which pushes to
+ *    `http://loki:3100/loki/api/v1/push`.
+ *
+ *  @returns The complete Alloy configuration as a string. The function takes
+ *  no arguments, so the output is the same for every deploy. */
+export function generateAlloyConfig(): string {
+  return `// Generated by \`arc platform deploy\` — do not edit by hand.
+discovery.docker "containers" {
+  host             = "unix:///var/run/docker.sock"
+  refresh_interval = "15s"
+}
+discovery.relabel "containers" {
+  targets = discovery.docker.containers.targets
+  rule {
+    source_labels = ["__meta_docker_container_name"]
+    regex         = "/(.*)"
+    target_label  = "container"
+  }
+  rule {
+    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
+    target_label  = "compose_service"
+  }
+}
+loki.source.docker "containers" {
+  host          = "unix:///var/run/docker.sock"
+  targets       = discovery.docker.containers.targets
+  relabel_rules = discovery.relabel.containers.rules
+  labels        = { source = "docker" }
+  forward_to    = [loki.write.loki.receiver]
+}
+loki.write "loki" {
+  endpoint {
+    url = "http://loki:3100/loki/api/v1/push"
+  }
+}
+`;
+}
+/** Grafana alerting provisioning — a starter rule pack covering the failure
+ *  modes that matter on a single-host deploy: error rate, latency, disk,
+ *  memory, container restarts, app silence, telemetry export failures and
+ *  scrape-target health. Notification routing (webhook) is only emitted when
+ *  `observability.alertWebhookUrl` is configured — without it the rules are
+ *  still visible/firing in the Grafana UI.
+ *
+ *  Every rule is rendered as the same three-node chain: A runs the PromQL
+ *  expression as an instant query against the `prometheus` datasource, B
+ *  reduces A to its last value, and C compares B with the rule's threshold
+ *  (C is the alert condition). `noDataState` and `execErrState` are both set
+ *  to OK for every rule. All rules go into one group, `arc-alerts`, in the
+ *  `Arc` folder with a 1m evaluation interval, and carry the label
+ *  `source: arc`.
+ *
+ *  Titles, summaries, PromQL expressions and the webhook URL are written
+ *  through `JSON.stringify`, so they appear as double-quoted scalars in the
+ *  emitted YAML.
+ *
+ *  @param cfg Deploy configuration; only `observability.alertWebhookUrl` is
+ *  read here.
+ *  @returns Grafana alerting provisioning YAML: the rule group, followed by
+ *  a webhook contact point and a notification policy when a webhook URL is
+ *  configured. */
+export function generateGrafanaAlerting(cfg: DeployConfig): string {
+  const webhookUrl = cfg.observability?.alertWebhookUrl; // undefined when the observability section is missing
+  interface AlertRule {
+    uid: string; // emitted verbatim (unquoted) as the rule uid
+    title: string;
+    expr: string; // PromQL evaluated by query node A
+    /** Threshold for the C (threshold) expression node. */
+    threshold: number;
+    /** Comparison operator. Default "gt". */
+    op?: "gt" | "lt";
+    /** Pending period, e.g. "5m". "0s" fires immediately. */
+    pendingFor: string;
+    summary: string; // becomes the rule's `summary` annotation
+  }
+  const rules: AlertRule[] = [
+    {
+      uid: "arc-high-error-rate",
+      title: "High server error rate (>5%)",
+      expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
+      threshold: 0.05,
+      pendingFor: "5m",
+      summary: "More than 5% of server spans are errors over the last 5 minutes.",
+    },
+    {
+      uid: "arc-high-latency-p95",
+      title: "High p95 latency (>1s)",
+      expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
+      threshold: 1000,
+      pendingFor: "10m",
+      summary: "Server p95 latency above 1s for 10 minutes.",
+    },
+    {
+      uid: "arc-host-disk-high",
+      title: "Host disk usage >85%",
+      expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
+      threshold: 0.85,
+      pendingFor: "15m",
+      summary: "A host filesystem is more than 85% full.",
+    },
+    {
+      uid: "arc-host-memory-high",
+      title: "Host memory usage >90%",
+      expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
+      threshold: 0.9,
+      pendingFor: "10m",
+      summary: "Host memory usage above 90% for 10 minutes.",
+    },
+    {
+      uid: "arc-container-restarts",
+      title: "Container restarted",
+      expr: 'sum by (container_name) (increase(container_restarts_total[15m]))',
+      threshold: 0,
+      pendingFor: "0s",
+      summary: "A container restarted within the last 15 minutes.",
+    },
+    {
+      uid: "arc-app-silent",
+      title: "App stopped reporting metrics",
+      expr: "absent(arc_commands_total)",
+      threshold: 0,
+      pendingFor: "10m",
+      summary: "No arc_commands_total series for 10 minutes — app down or telemetry broken.",
+    },
+    {
+      uid: "arc-collector-export-failures",
+      title: "Telemetry export failures",
+      expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)"}[15m]))',
+      threshold: 0,
+      pendingFor: "0s",
+      summary: "The otel-collector failed to export telemetry within the last 15 minutes.",
+    },
+    {
+      uid: "arc-target-down",
+      title: "Scrape target down",
+      expr: "min(up)",
+      threshold: 1,
+      op: "lt",
+      pendingFor: "5m",
+      summary: "A Prometheus scrape target has been down for 5 minutes.",
+    },
+  ];
+  const ruleYaml = rules
+    .map((rule) => {
+      const op = rule.op ?? "gt"; // rules without an explicit op compare with "gt"
+      return `      - uid: ${rule.uid}
+        title: ${JSON.stringify(rule.title)}
+        condition: C
+        for: ${rule.pendingFor}
+        noDataState: OK
+        execErrState: OK
+        annotations:
+          summary: ${JSON.stringify(rule.summary)}
+        labels:
+          source: arc
+        data:
+          - refId: A
+            relativeTimeRange: { from: 600, to: 0 }
+            datasourceUid: prometheus
+            model:
+              expr: ${JSON.stringify(rule.expr)}
+              instant: true
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: A
+          - refId: B
+            relativeTimeRange: { from: 0, to: 0 }
+            datasourceUid: __expr__
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              refId: B
+          - refId: C
+            relativeTimeRange: { from: 0, to: 0 }
+            datasourceUid: __expr__
+            model:
+              type: threshold
+              expression: B
+              refId: C
+              conditions:
+                - evaluator:
+                    type: ${op}
+                    params: [${rule.threshold}]`;
+    })
+    .join("\n");
+  const contactSection = webhookUrl
+    ? `
+contactPoints:
+  - orgId: 1
+    name: arc-webhook
+    receivers:
+      - uid: arc-webhook
+        type: webhook
+        settings:
+          url: ${JSON.stringify(webhookUrl)}
+          httpMethod: POST
+policies:
+  - orgId: 1
+    receiver: arc-webhook
+    group_by: ["grafana_folder", "alertname"]
+    group_wait: 30s
+    group_interval: 5m
+    repeat_interval: 4h
+`
+    : ""; // no webhook URL: emit the rule group only
+  return `# Generated by \`arc platform deploy\` — do not edit by hand.
+apiVersion: 1
+groups:
+  - orgId: 1
+    name: arc-alerts
+    folder: Arc
+    interval: 1m
+    rules:
+${ruleYaml}
+${contactSection}`;
+}
 /** Grafana datasource provisioning — Tempo + Loki + Prometheus, all pre-wired. */
 export function generateGrafanaDatasources(): string {
   return `# Generated by \`arc platform deploy\` — do not edit by hand.
@@ -267,10 +598,17 @@ datasources:
     uid: loki
     jsonData:
       derivedFields:
+        # Plain-text logs that happen to contain "trace_id=<id>".
         - datasourceUid: tempo
           matcherRegex: "trace_id=(\\\\w+)"
           name: TraceID
           url: $\${__value.raw}
+        # OTLP-ingested logs — trace_id arrives as structured metadata.
+        - datasourceUid: tempo
+          matcherType: label
+          matcherRegex: trace_id
+          name: TraceID (OTLP)
+          url: $\${__value.raw}
   - name: Prometheus
     type: prometheus
     access: proxy
@@ -322,7 +660,7 @@ export function generateArcOverviewDashboard(): string {
           label: "Service",
           type: "query",
           datasource: { type: "prometheus", uid: "prometheus" },
-          query: "label_values(traces_spanmetrics_calls_total, service_name)",
+          query: "label_values(traces_span_metrics_calls_total, service_name)",
           refresh: 2,
           includeAll: false,
           multi: false,
@@ -335,20 +673,20 @@ export function generateArcOverviewDashboard(): string {
       panelStat(
         "Request rate (req/s)",
         { x: 0, y: 0, w: 6, h: 4 },
-        'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
+        'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
         "reqps",
       ),
       panelStat(
         "Error rate (%)",
         { x: 6, y: 0, w: 6, h: 4 },
-        'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
+        'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
         "percent",
         { red: 1, orange: 0.1 },
       ),
       panelStat(
         "P99 latency",
         { x: 12, y: 0, w: 6, h: 4 },
-        'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
+        'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
         "ms",
         { red: 1000, orange: 300 },
       ),
@@ -363,7 +701,7 @@ export function generateArcOverviewDashboard(): string {
       panelTimeseries(
         "Request rate by route",
         { x: 0, y: 4, w: 12, h: 8 },
-        'sum by (span_name) (rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
+        'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
         "{{span_name}}",
         "reqps",
       ),
@@ -372,18 +710,19 @@ export function generateArcOverviewDashboard(): string {
         { x: 12, y: 4, w: 12, h: 8 },
         [
           {
-            expr: 'histogram_quantile(0.5, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
+            expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
             legend: "p50",
           },
           {
-            expr: 'histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
+            expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
             legend: "p95",
           },
           {
-            expr: 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
+            expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
             legend: "p99",
           },
         ],
+        "",
         "ms",
       ),
@@ -398,24 +737,24 @@ export function generateArcOverviewDashboard(): string {
       panelTimeseries(
         "Command p95 latency",
         { x: 12, y: 12, w: 12, h: 8 },
-        'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))',
+        'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))',
         "{{arc_command_name}}",
         "ms",
       ),
       // Row: database
       panelTimeseries(
-        "DB find ops/sec by collection",
+        "DB ops/sec by collection",
         { x: 0, y: 20, w: 12, h: 8 },
-        'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))',
-        "{{db_collection_name}}",
+        'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))',
+        "{{db_collection_name}} {{db_operation_name}}",
         "ops",
       ),
       panelTimeseries(
-        "DB find p95 latency",
+        "DB p95 latency by operation",
         { x: 12, y: 20, w: 12, h: 8 },
-        'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))',
-        "{{db_collection_name}}",
+        'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))',
+        "{{db_operation_name}}",
         "ms",
       ),
@@ -427,7 +766,9 @@ export function generateArcOverviewDashboard(): string {
         datasource: { type: "loki", uid: "loki" },
         targets: [
           {
-            expr: '{service_name="$service"} |= `ERROR`',
+            // severity_text is OTLP structured metadata — filter via the
+            // pipeline stage, NOT the stream selector (not an index label).
+            expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
             refId: "A",
           },
         ],
@@ -464,7 +805,7 @@ export function generateArcTracesDashboard(): string {
           label: "Service",
           type: "query",
           datasource: { type: "prometheus", uid: "prometheus" },
-          query: "label_values(traces_spanmetrics_calls_total, service_name)",
+          query: "label_values(traces_span_metrics_calls_total, service_name)",
           refresh: 2,
           current: { text: "arc-prod", value: "arc-prod" },
         },
@@ -587,23 +928,38 @@ export function generateArcLogsDashboard(): string {
           query: "",
           current: { text: "", value: "" },
         },
+        {
+          name: "container",
+          label: "Container",
+          type: "query",
+          datasource: { type: "loki", uid: "loki" },
+          query: "label_values(container)",
+          refresh: 2,
+          includeAll: true,
+          multi: false,
+          current: { text: "All", value: "$__all" },
+        },
       ],
     },
     panels: [
-      // Stat: total log lines, last hour
+      // Stat: total log lines, last hour. severity_text is OTLP structured
+      // metadata — usable only in pipeline stages, never in stream selectors.
       panelStat(
         "Logs ingested (1h)",
         { x: 0, y: 0, w: 6, h: 4 },
-        'sum(increase({service_name="$service"}[1h]))',
+        'sum(count_over_time({service_name="$service"}[1h]))',
         "short",
+        undefined,
+        LOKI_DS,
       ),
       // Stat: errors (last hour)
       panelStat(
         "Errors (1h)",
         { x: 6, y: 0, w: 6, h: 4 },
-        'sum(increase({service_name="$service", severity_text=~"ERROR|FATAL"}[1h]))',
+        'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))',
         "short",
         { orange: 1, red: 50 },
+        LOKI_DS,
       ),
       // Time series: log volume by severity
       {
@@ -657,6 +1013,30 @@ export function generateArcLogsDashboard(): string {
           sortOrder: "Descending",
         },
       },
+      // Container stdout/stderr (collected by Alloy from the Docker API) —
+      // caddy access logs, postgres, the observability stack itself, plus
+      // app crash output that never reached the OTLP pipeline.
+      {
+        title: "Container logs ($container)",
+        type: "logs",
+        gridPos: { x: 0, y: 26, w: 24, h: 14 },
+        datasource: { type: "loki", uid: "loki" },
+        targets: [
+          {
+            expr: '{container=~"$container"} |~ "$search"',
+            refId: "A",
+          },
+        ],
+        options: {
+          showTime: true,
+          showLabels: true,
+          showCommonLabels: false,
+          wrapLogMessage: true,
+          enableLogDetails: true,
+          dedupStrategy: "none",
+          sortOrder: "Descending",
+        },
+      },
     ],
   };
   return JSON.stringify(dashboard, null, 2);
@@ -722,6 +1102,7 @@ export function generateArcSamplingDashboard(): string {
             legend: "exported",
           },
         ],
+        "",
         "ops",
       ),
@@ -790,20 +1171,20 @@ export function generateArcCommandDashboard(): string {
       panelStat(
         "P50 latency",
         { x: 6, y: 0, w: 6, h: 4 },
-        'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
+        'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
         "ms",
       ),
       panelStat(
         "P95 latency",
         { x: 12, y: 0, w: 6, h: 4 },
-        'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
+        'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
         "ms",
         { orange: 200, red: 1000 },
       ),
       panelStat(
         "P99 latency",
         { x: 18, y: 0, w: 6, h: 4 },
-        'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
+        'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
         "ms",
         { orange: 500, red: 2000 },
       ),
@@ -820,18 +1201,19 @@ export function generateArcCommandDashboard(): string {
         { x: 12, y: 4, w: 12, h: 8 },
         [
           {
-            expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
+            expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
             legend: "p50",
           },
           {
-            expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
+            expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
             legend: "p95",
           },
           {
-            expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
+            expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
             legend: "p99",
           },
         ],
+        "",
         "ms",
       ),
@@ -855,6 +1237,240 @@ export function generateArcCommandDashboard(): string {
   return JSON.stringify(dashboard, null, 2);
 }
+/** Infrastructure dashboard — host CPU/RAM/disk/network (hostmetrics
+ *  receiver, service_name="arc-host") + per-container resource usage and
+ *  restarts (docker_stats receiver, service_name="arc-docker"). These panels
+ *  answer "is the box healthy / which container is eating the host".
+ *
+ *  Layout: one row of four stat panels at y=0, then pairs of 12-wide
+ *  timeseries panels stacked below (x + w never exceeds 24). No datasource
+ *  argument is passed to the panel helpers, so every panel uses their
+ *  default (PROMETHEUS_DS).
+ *
+ *  @returns The dashboard model serialized as JSON with 2-space indentation. */
+export function generateArcInfraDashboard(): string {
+  const dashboard = {
+    title: "Arc Infrastructure",
+    uid: "arc-infra",
+    schemaVersion: 39,
+    version: 1,
+    refresh: "30s",
+    time: { from: "now-3h", to: "now" },
+    tags: ["arc", "auto-provisioned"],
+    panels: [
+      // Row: host top-line stats (CPU, memory, worst-mount disk, 1m load)
+      panelStat(
+        "Host CPU used",
+        { x: 0, y: 0, w: 6, h: 4 },
+        '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))', // used % = 100 * (1 - averaged idle rate)
+        "percent",
+        { orange: 70, red: 90 },
+      ),
+      panelStat(
+        "Host memory used",
+        { x: 6, y: 0, w: 6, h: 4 },
+        '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)', // "used" bytes over bytes in all states
+        "percent",
+        { orange: 80, red: 90 },
+      ),
+      panelStat(
+        "Disk used (worst mount)",
+        { x: 12, y: 0, w: 6, h: 4 },
+        '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))', // max() keeps the fullest device/mountpoint pair
+        "percent",
+        { orange: 75, red: 85 },
+      ),
+      panelStat(
+        "Load (1m)",
+        { x: 18, y: 0, w: 6, h: 4 },
+        "system_cpu_load_average_1m",
+        "short", // no thresholds passed for this panel
+      ),
+      // Row: host CPU + memory over time
+      panelTimeseries(
+        "Host CPU utilization",
+        { x: 0, y: 4, w: 12, h: 8 },
+        [
+          {
+            expr: '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))',
+            legend: "used %",
+          },
+          { expr: "system_cpu_load_average_1m", legend: "load 1m" },
+          { expr: "system_cpu_load_average_5m", legend: "load 5m" },
+          { expr: "system_cpu_load_average_15m", legend: "load 15m" },
+        ],
+        "", // legend argument left empty; each query above carries its own legend
+        "short", // plain numeric unit; the series mix a percentage with load averages
+      ),
+      panelTimeseries(
+        "Host memory by state",
+        { x: 12, y: 4, w: 12, h: 8 },
+        'sum by (state) (system_memory_usage_bytes)',
+        "{{state}}",
+        "bytes",
+      ),
+      // Row: disk (per-mount usage % and per-device I/O throughput)
+      panelTimeseries(
+        "Filesystem usage by mount",
+        { x: 0, y: 12, w: 12, h: 8 },
+        '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)',
+        "{{mountpoint}}",
+        "percent",
+      ),
+      panelTimeseries(
+        "Disk I/O",
+        { x: 12, y: 12, w: 12, h: 8 },
+        'sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))',
+        "{{device}} {{direction}}",
+        "Bps",
+      ),
+      // Row: host network + container restarts
+      panelTimeseries(
+        "Network I/O",
+        { x: 0, y: 20, w: 12, h: 8 },
+        'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))', // device!="lo" excludes the "lo" device
+        "{{device}} {{direction}}",
+        "Bps",
+      ),
+      panelTimeseries(
+        "Container restarts (24h)",
+        { x: 12, y: 20, w: 12, h: 8 },
+        'sum by (container_name) (increase(container_restarts_total[24h]))',
+        "{{container_name}}",
+        "short",
+      ),
+      // Rows: per-container resources (CPU + memory, then network RX + TX)
+      panelTimeseries(
+        "Container CPU",
+        { x: 0, y: 28, w: 12, h: 8 },
+        'container_cpu_utilization_ratio', // NOTE(review): metric is named *_ratio but is shown with unit "percent" — confirm the value scale
+        "{{container_name}}",
+        "percent",
+      ),
+      panelTimeseries(
+        "Container memory",
+        { x: 12, y: 28, w: 12, h: 8 },
+        'container_memory_usage_total_bytes',
+        "{{container_name}}",
+        "bytes",
+      ),
+      panelTimeseries(
+        "Container network RX",
+        { x: 0, y: 36, w: 12, h: 8 },
+        'sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))',
+        "{{container_name}}",
+        "Bps",
+      ),
+      panelTimeseries(
+        "Container network TX",
+        { x: 12, y: 36, w: 12, h: 8 },
+        'sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))',
+        "{{container_name}}",
+        "Bps",
+      ),
+    ],
+  };
+  return JSON.stringify(dashboard, null, 2);
+}
+/** Edge dashboard — Caddy reverse-proxy metrics (per-host request rate,
+ *  status codes, latency, in-flight) plus the JSON access logs that Alloy
+ *  ships to Loki. First stop for "is traffic reaching us / who is 404ing".
+ *
+ *  Panels built on the caddy_http_* metrics pass no datasource and so use
+ *  the panel helpers' default (PROMETHEUS_DS). The 4xx/5xx timeseries and
+ *  the live log panel query Loki instead, both selecting streams with
+ *  compose_service="caddy".
+ *
+ *  @returns The dashboard model serialized as JSON with 2-space indentation. */
+export function generateArcEdgeDashboard(): string {
+  const dashboard = {
+    title: "Arc Caddy / Edge",
+    uid: "arc-edge",
+    schemaVersion: 39,
+    version: 1,
+    refresh: "30s",
+    time: { from: "now-1h", to: "now" },
+    tags: ["arc", "auto-provisioned"],
+    panels: [
+      // Row: top-line stats (request rate, in-flight, handler errors, p95 latency)
+      panelStat(
+        "Requests/s",
+        { x: 0, y: 0, w: 6, h: 4 },
+        "sum(rate(caddy_http_request_duration_seconds_count[1m]))", // request rate taken from the duration histogram's _count series
+        "reqps",
+      ),
+      panelStat(
+        "In-flight requests",
+        { x: 6, y: 0, w: 6, h: 4 },
+        "sum(caddy_http_requests_in_flight)",
+        "short",
+      ),
+      panelStat(
+        "Handler errors/s",
+        { x: 12, y: 0, w: 6, h: 4 },
+        "sum(rate(caddy_http_request_errors_total[5m]))",
+        "ops",
+        { orange: 0.1, red: 1 },
+      ),
+      panelStat(
+        "P95 latency",
+        { x: 18, y: 0, w: 6, h: 4 },
+        "histogram_quantile(0.95, sum by (le) (rate(caddy_http_request_duration_seconds_bucket[5m])))",
+        "s",
+        { orange: 0.3, red: 1 },
+      ),
+      // Row: traffic breakdown (by host, by status code)
+      panelTimeseries(
+        "Request rate by host",
+        { x: 0, y: 4, w: 12, h: 8 },
+        "sum by (host) (rate(caddy_http_request_duration_seconds_count[1m]))",
+        "{{host}}",
+        "reqps",
+      ),
+      panelTimeseries(
+        "Responses by status code",
+        { x: 12, y: 4, w: 12, h: 8 },
+        "sum by (code) (rate(caddy_http_request_duration_seconds_count[1m]))",
+        "{{code}}",
+        "reqps",
+      ),
+      // Row: per-host latency + 4xx/5xx volume counted from the access log
+      panelTimeseries(
+        "P95 latency by host",
+        { x: 0, y: 12, w: 12, h: 8 },
+        "histogram_quantile(0.95, sum by (host, le) (rate(caddy_http_request_duration_seconds_bucket[5m])))",
+        "{{host}}",
+        "s",
+      ),
+      panelTimeseries(
+        "4xx/5xx responses (access log)",
+        { x: 12, y: 12, w: 12, h: 8 },
+        'sum by (status) (count_over_time({compose_service="caddy"} | json | status >= 400 [$__interval]))', // log query: parse JSON lines, keep status >= 400, count per interval
+        "{{status}}",
+        "short",
+        LOKI_DS, // overrides the helper's default datasource
+      ),
+      // Row: live access log (hand-written "logs" panel; no helper is used for this panel type here)
+      {
+        title: "Access log (live)",
+        type: "logs",
+        gridPos: { x: 0, y: 20, w: 24, h: 12 },
+        datasource: { type: "loki", uid: "loki" }, // same value as the LOKI_DS constant
+        targets: [
+          {
+            expr: '{compose_service="caddy"}',
+            refId: "A",
+          },
+        ],
+        options: {
+          showTime: true,
+          showLabels: false,
+          showCommonLabels: false,
+          wrapLogMessage: true,
+          enableLogDetails: true,
+          dedupStrategy: "none",
+          sortOrder: "Descending",
+        },
+      },
+    ],
+  };
+  return JSON.stringify(dashboard, null, 2);
+}
 /** All config files needed on the host. Returns map of relative-path → contents
  *  so bootstrap can write+upload them in one pass. */
 export function generateObservabilityConfigs(
@@ -865,14 +1481,18 @@ export function generateObservabilityConfigs(
     "observability/tempo.yaml": generateTempoConfig(cfg),
     "observability/loki-config.yaml": generateLokiConfig(cfg),
     "observability/prometheus.yml": generatePrometheusConfig(cfg),
+    "observability/alloy-config.alloy": generateAlloyConfig(),
     "observability/grafana-datasources.yaml": generateGrafanaDatasources(),
     "observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
+    "observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
     "observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
     "observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
     "observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
     "observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
     "observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
     "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
+    "observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
+    "observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard(),
   };
 }
@@ -882,6 +1502,10 @@ export function generateObservabilityConfigs(
 interface GridPos { x: number; y: number; w: number; h: number; } // value assigned to a panel's gridPos
 interface Threshold { orange?: number; red?: number; } // optional orange/red threshold values accepted by panelStat
+interface PanelDatasource { type: string; uid: string; } // datasource reference placed on a panel
+const PROMETHEUS_DS: PanelDatasource = { type: "prometheus", uid: "prometheus" }; // default datasource of panelStat and panelTimeseries
+const LOKI_DS: PanelDatasource = { type: "loki", uid: "loki" }; // passed explicitly by panels that query Loki
 function panelStat(
   title: string,
@@ -889,6 +1513,7 @@ function panelStat(
   expr: string,
   unit: string,
   thresholds?: Threshold,
+  datasource: PanelDatasource = PROMETHEUS_DS,
 ) {
   const steps: { color: string; value: number | null }[] = [
     { color: "green", value: null },
@@ -903,7 +1528,7 @@ function panelStat(
     title,
     type: "stat",
     gridPos,
-    datasource: { type: "prometheus", uid: "prometheus" },
+    datasource,
     targets: [{ expr, refId: "A", legendFormat: title }],
     fieldConfig: {
       defaults: {
@@ -928,6 +1553,7 @@ function panelTimeseries(
   query: string | { expr: string; legend: string }[],
   legend: string,
   unit: string,
+  datasource: PanelDatasource = PROMETHEUS_DS,
 ) {
   const targets = Array.isArray(query)
     ? query.map((q, i) => ({
@@ -940,7 +1566,7 @@ function panelTimeseries(
     title,
     type: "timeseries",
     gridPos,
-    datasource: { type: "prometheus", uid: "prometheus" },
+    datasource,
     targets,
     fieldConfig: {
       defaults: {