npm - @arcote.tech/arc-cli - Versions diffs - 0.7.5 → 0.7.7 - Mend

@arcote.tech/arc-cli 0.7.5 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/index.js +1614 -165
package/package.json +22 -9
package/src/builder/dependency-collector.ts +34 -1
package/src/commands/platform-deploy.ts +16 -0
package/src/deploy/bootstrap.ts +94 -2
package/src/deploy/caddyfile.ts +45 -2
package/src/deploy/compose.ts +147 -2
package/src/deploy/config.ts +55 -0
package/src/deploy/env-file.ts +14 -8
package/src/deploy/htpasswd.ts +20 -0
package/src/deploy/observability-configs.ts +958 -0
package/src/platform/server.ts +65 -4

package/src/deploy/observability-configs.ts ADDED Viewed

@@ -0,0 +1,958 @@
+import type { DeployConfig, DeployObservability } from "./config";
+// ---------------------------------------------------------------------------
+// Observability stack config templates.
+//
+// All strings are deterministic for the inputs (cfg + retention) — no random
+// IDs, no timestamps — so re-running deploy with unchanged config is a no-op
+// at the file-write level. Bootstrap diffs filesystem before bouncing
+// services, so this matters.
+//
+// Defaults:
+//   - traces:  7d  retention (Tempo block storage on local disk)
+//   - logs:    7d  retention (Loki chunks on local disk)
+//   - metrics: 30d retention (Prometheus TSDB on local disk)
+//
+// Tail sampling: every trace containing an error + every trace slower than
+// 500ms + a random 10% of all traces. Decided in the collector so
+// per-service SDKs can be left at always-on without flooding the backend.
+// ---------------------------------------------------------------------------
+/** Fallback retention windows, used for any signal the deploy config leaves unset. */
+const DEFAULT_RETENTION = {
+  traces: "168h", // 7 days
+  logs: "168h", // 7 days
+  metrics: "30d",
+} as const;
+/**
+ * Resolves the effective retention for traces, logs and metrics.
+ *
+ * @param o Observability section of the deploy config; may be undefined.
+ * @returns One retention string per signal — the configured value when
+ *   present, otherwise the matching entry of `DEFAULT_RETENTION`.
+ */
+function pickRetention(o: DeployObservability | undefined) {
+  const overrides = o?.retention;
+  const { traces, logs, metrics } = DEFAULT_RETENTION;
+  return {
+    traces: overrides?.traces ?? traces,
+    logs: overrides?.logs ?? logs,
+    metrics: overrides?.metrics ?? metrics,
+  };
+}
+/**
+ * Renders the OpenTelemetry Collector configuration as YAML.
+ *
+ * The collector receives OTLP (gRPC on 4317, HTTP on 4318) from app
+ * containers and the browser, applies tail sampling to traces, and fans out
+ * to Tempo (traces), Loki (logs) and Prometheus remote-write (metrics).
+ *
+ * @param cfg Deploy config. Only `cfg.envs[*].domain` is read here, to build
+ *   the CORS allow-list of the OTLP/HTTP receiver.
+ * @returns The complete collector YAML document.
+ */
+export function generateOtelCollectorConfig(cfg: DeployConfig): string {
+  // One CORS origin per environment, in config order.
+  const originLines = Object.values(cfg.envs).map(
+    (env) => `            - "https://${env.domain}"`,
+  );
+  // With no environments, emit an explicit empty list rather than a bare key
+  // (which YAML parses as null).
+  const allowedOrigins =
+    originLines.length > 0 ? `\n${originLines.join("\n")}` : " []";
+  return `# Generated by \`arc platform deploy\` — do not edit by hand.
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+        cors:
+          allowed_origins:${allowedOrigins}
+          allowed_headers:
+            - traceparent
+            - tracestate
+            - content-type
+processors:
+  batch:
+    timeout: 5s
+    send_batch_size: 512
+    send_batch_max_size: 1024
+  # Tail-based sampling — applied after a full trace has been assembled.
+  # Errors and slow traces are kept 100%, everything else at 10%.
+  tail_sampling:
+    decision_wait: 10s
+    num_traces: 50000
+    expected_new_traces_per_sec: 100
+    policies:
+      - name: errors
+        type: status_code
+        status_code: { status_codes: [ERROR] }
+      - name: slow
+        type: latency
+        latency: { threshold_ms: 500 }
+      - name: random_10pct
+        type: probabilistic
+        probabilistic: { sampling_percentage: 10 }
+  # Drop high-cardinality / PII attributes that might slip past app-side
+  # sanitization. Belt-and-suspenders before they hit long-term storage.
+  attributes:
+    actions:
+      - key: http.request.header.authorization
+        action: delete
+      - key: http.request.header.cookie
+        action: delete
+exporters:
+  otlp/tempo:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+  otlphttp/loki:
+    endpoint: http://loki:3100/otlp
+    tls:
+      insecure: true
+  prometheusremotewrite:
+    endpoint: http://prometheus:9090/api/v1/write
+    tls:
+      insecure: true
+    # Copy resource attributes onto every series (service.name becomes the
+    # service_name label). The provisioned dashboards filter arc_* metrics
+    # by service_name; without this the exporter only emits job/instance.
+    resource_to_telemetry_conversion:
+      enabled: true
+extensions:
+  health_check: {}
+  zpages: {}
+service:
+  extensions: [health_check, zpages]
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [tail_sampling, attributes, batch]
+      exporters: [otlp/tempo]
+    logs:
+      receivers: [otlp]
+      processors: [attributes, batch]
+      exporters: [otlphttp/loki]
+    metrics:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [prometheusremotewrite]
+`;
+}
+/**
+ * Renders the Grafana Tempo configuration (single binary, local block storage).
+ *
+ * Tempo listens on 3200 (HTTP) / 9095 (gRPC), accepts OTLP on 4317/4318,
+ * keeps blocks and WAL under /var/tempo, and runs the metrics generator
+ * (service-graphs + span-metrics) with remote-write to Prometheus.
+ *
+ * @param cfg Deploy config; `cfg.observability` supplies the trace retention.
+ * @returns The complete Tempo YAML document.
+ */
+export function generateTempoConfig(cfg: DeployConfig): string {
+  // Only the trace retention is relevant here; it becomes block_retention.
+  const { traces: blockRetention } = pickRetention(cfg.observability);
+  const yaml = `# Generated by \`arc platform deploy\` — do not edit by hand.
+server:
+  http_listen_port: 3200
+  grpc_listen_port: 9095
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+ingester:
+  trace_idle_period: 10s
+  max_block_bytes: 1048576
+  max_block_duration: 5m
+compactor:
+  compaction:
+    block_retention: ${blockRetention}
+storage:
+  trace:
+    backend: local
+    local:
+      path: /var/tempo/blocks
+    wal:
+      path: /var/tempo/wal
+metrics_generator:
+  registry:
+    external_labels:
+      source: tempo
+  storage:
+    path: /var/tempo/generator/wal
+    remote_write:
+      - url: http://prometheus:9090/api/v1/write
+        send_exemplars: true
+overrides:
+  defaults:
+    metrics_generator:
+      processors: [service-graphs, span-metrics]
+`;
+  return yaml;
+}
+/**
+ * Renders the Loki configuration (single binary, filesystem chunks, TSDB index).
+ *
+ * Auth is disabled, data lives under /loki, and the compactor enforces the
+ * configured log retention.
+ *
+ * @param cfg Deploy config; `cfg.observability` supplies the log retention.
+ * @returns The complete Loki YAML document.
+ */
+export function generateLokiConfig(cfg: DeployConfig): string {
+  // Only the log retention is relevant here; it becomes retention_period.
+  const { logs: retentionPeriod } = pickRetention(cfg.observability);
+  const yaml = `# Generated by \`arc platform deploy\` — do not edit by hand.
+auth_enabled: false
+server:
+  http_listen_port: 3100
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+schema_config:
+  configs:
+    - from: 2024-01-01
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+limits_config:
+  retention_period: ${retentionPeriod}
+  allow_structured_metadata: true
+compactor:
+  working_directory: /loki/compactor
+  retention_enabled: true
+  delete_request_store: filesystem
+`;
+  return yaml;
+}
+/**
+ * Renders the Prometheus configuration.
+ *
+ * Prometheus scrapes itself and the collector's metrics endpoint every 15s.
+ * Inbound remote-write and TSDB retention are not part of this file: per the
+ * comments emitted below, both are command-line flags set in compose.ts.
+ *
+ * @param _cfg Deploy config; currently unused, kept for a uniform signature.
+ * @returns The complete Prometheus YAML document.
+ */
+export function generatePrometheusConfig(_cfg: DeployConfig): string {
+  const yaml = `# Generated by \`arc platform deploy\` — do not edit by hand.
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+scrape_configs:
+  - job_name: prometheus
+    static_configs:
+      - targets: [localhost:9090]
+  - job_name: otel-collector
+    static_configs:
+      - targets: [otel-collector:8888]
+# remote-write inbound is enabled via the --web.enable-remote-write-receiver
+# command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
+`;
+  return yaml;
+}
+/**
+ * Renders the Grafana datasource provisioning file.
+ *
+ * Declares Tempo, Loki and Prometheus (the default) with fixed UIDs so the
+ * dashboards can reference them, and cross-links them: Tempo → Loki for
+ * trace-to-logs, Tempo → Prometheus for the service map, and a Loki derived
+ * field that turns `trace_id=...` in a log line into a Tempo link.
+ *
+ * @returns The complete datasources YAML document.
+ */
+export function generateGrafanaDatasources(): string {
+  // The escapes below are deliberate: the file ends up containing
+  // "trace_id=(\\w+)" and $${__value.raw}.
+  const yaml = `# Generated by \`arc platform deploy\` — do not edit by hand.
+apiVersion: 1
+datasources:
+  - name: Tempo
+    type: tempo
+    access: proxy
+    url: http://tempo:3200
+    uid: tempo
+    jsonData:
+      tracesToLogsV2:
+        datasourceUid: loki
+        spanStartTimeShift: -5m
+        spanEndTimeShift: 5m
+      serviceMap:
+        datasourceUid: prometheus
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    uid: loki
+    jsonData:
+      derivedFields:
+        - datasourceUid: tempo
+          matcherRegex: "trace_id=(\\\\w+)"
+          name: TraceID
+          url: $\${__value.raw}
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    uid: prometheus
+    isDefault: true
+`;
+  return yaml;
+}
+/**
+ * Renders the Grafana dashboard-provider file.
+ *
+ * Registers a single file-based provider named "arc" (folder "Arc") that
+ * reads dashboards from /etc/grafana/provisioning/dashboards/arc and
+ * re-scans that directory every 30 seconds.
+ *
+ * @returns The complete dashboard-provider YAML document.
+ */
+export function generateGrafanaDashboardsProvider(): string {
+  const yaml = `# Generated by \`arc platform deploy\` — do not edit by hand.
+apiVersion: 1
+providers:
+  - name: arc
+    orgId: 1
+    folder: Arc
+    type: file
+    disableDeletion: false
+    editable: true
+    updateIntervalSeconds: 30
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/provisioning/dashboards/arc
+      foldersFromFilesStructure: false
+`;
+  return yaml;
+}
+/**
+ * Builds the top-level "Arc Service Overview" dashboard as pretty-printed JSON.
+ *
+ * Panels cover traffic volume, error rate, latency percentiles, command
+ * throughput and latency, DB find activity, and recent error logs.
+ * Designed for Grafana 11+; references the auto-provisioned Prometheus /
+ * Loki / Tempo datasources by UID so it works without manual setup.
+ *
+ * Two different metric families are queried, with different label names:
+ *   - `traces_spanmetrics_*` come from Tempo's metrics generator, which
+ *     labels series with `service` and records latency in seconds;
+ *   - `arc_*` metrics are filtered by `service_name`, as are the Loki logs.
+ * The `$service` variable holds the plain service name and is used for both.
+ *
+ * @returns The dashboard JSON document (2-space indented).
+ */
+export function generateArcOverviewDashboard(): string {
+  // Selector shared by every span-metrics query: server spans of the service.
+  const serverSpans = 'service="$service", span_kind="SPAN_KIND_SERVER"';
+  // Span-metrics latency buckets are in seconds; `* 1000` converts to the
+  // milliseconds the panels (unit "ms") and thresholds assume.
+  const latencyQuantileMs = (q: number): string =>
+    `histogram_quantile(${q}, sum(rate(traces_spanmetrics_latency_bucket{${serverSpans}}[5m])) by (le)) * 1000`;
+  const dashboard = {
+    title: "Arc Service Overview",
+    uid: "arc-overview",
+    schemaVersion: 39,
+    version: 1,
+    refresh: "30s",
+    time: { from: "now-1h", to: "now" },
+    timepicker: {},
+    tags: ["arc", "auto-provisioned"],
+    templating: {
+      list: [
+        {
+          name: "service",
+          label: "Service",
+          type: "query",
+          datasource: { type: "prometheus", uid: "prometheus" },
+          query: "label_values(traces_spanmetrics_calls_total, service)",
+          refresh: 2,
+          includeAll: false,
+          multi: false,
+          current: { text: "arc-prod", value: "arc-prod" },
+        },
+      ],
+    },
+    panels: [
+      // Row: top-line stats
+      panelStat(
+        "Request rate (req/s)",
+        { x: 0, y: 0, w: 6, h: 4 },
+        `sum(rate(traces_spanmetrics_calls_total{${serverSpans}}[5m]))`,
+        "reqps",
+      ),
+      panelStat(
+        "Error rate (%)",
+        { x: 6, y: 0, w: 6, h: 4 },
+        // clamp_min keeps the ratio finite when there is no traffic.
+        `sum(rate(traces_spanmetrics_calls_total{${serverSpans}, status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{${serverSpans}}[5m])), 0.001) * 100`,
+        "percent",
+        { red: 1, orange: 0.1 },
+      ),
+      panelStat(
+        "P99 latency",
+        { x: 12, y: 0, w: 6, h: 4 },
+        latencyQuantileMs(0.99),
+        "ms",
+        { red: 1000, orange: 300 },
+      ),
+      panelStat(
+        "Active commands/sec",
+        { x: 18, y: 0, w: 6, h: 4 },
+        'sum(rate(arc_commands_total{service_name="$service"}[5m]))',
+        "ops",
+      ),
+      // Row: request volume + latency over time
+      panelTimeseries(
+        "Request rate by route",
+        { x: 0, y: 4, w: 12, h: 8 },
+        `sum by (span_name) (rate(traces_spanmetrics_calls_total{${serverSpans}}[1m]))`,
+        "{{span_name}}",
+        "reqps",
+      ),
+      panelTimeseries(
+        "Latency percentiles",
+        { x: 12, y: 4, w: 12, h: 8 },
+        [
+          { expr: latencyQuantileMs(0.5), legend: "p50" },
+          { expr: latencyQuantileMs(0.95), legend: "p95" },
+          { expr: latencyQuantileMs(0.99), legend: "p99" },
+        ],
+        // Each series carries its own legend; this slot is unused for query
+        // arrays but must be filled so "ms" lands in the `unit` parameter.
+        "",
+        "ms",
+      ),
+      // Row: commands
+      panelTimeseries(
+        "Commands per second",
+        { x: 0, y: 12, w: 12, h: 8 },
+        'sum by (arc_command_name) (rate(arc_commands_total{service_name="$service"}[1m]))',
+        "{{arc_command_name}}",
+        "ops",
+      ),
+      panelTimeseries(
+        "Command p95 latency",
+        { x: 12, y: 12, w: 12, h: 8 },
+        'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))',
+        "{{arc_command_name}}",
+        "ms",
+      ),
+      // Row: database
+      panelTimeseries(
+        "DB find ops/sec by collection",
+        { x: 0, y: 20, w: 12, h: 8 },
+        'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))',
+        "{{db_collection_name}}",
+        "ops",
+      ),
+      panelTimeseries(
+        "DB find p95 latency",
+        { x: 12, y: 20, w: 12, h: 8 },
+        'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))',
+        "{{db_collection_name}}",
+        "ms",
+      ),
+      // Row: logs panel for the service (Loki)
+      {
+        title: "Recent error logs",
+        type: "logs",
+        gridPos: { x: 0, y: 28, w: 24, h: 8 },
+        datasource: { type: "loki", uid: "loki" },
+        targets: [
+          {
+            expr: '{service_name="$service"} |= `ERROR`',
+            refId: "A",
+          },
+        ],
+        options: {
+          showTime: true,
+          showLabels: false,
+          showCommonLabels: false,
+          wrapLogMessage: true,
+          enableLogDetails: true,
+          dedupStrategy: "none",
+          sortOrder: "Descending",
+        },
+      },
+    ],
+  };
+  return JSON.stringify(dashboard, null, 2);
+}
+/**
+ * Builds the "Arc Recent Traces" dashboard as pretty-printed JSON.
+ *
+ * Two Tempo panels driven by TraceQL for the selected service: traces whose
+ * duration exceeds 500ms, and traces with error status. Each is limited to
+ * 20 results.
+ *
+ * The `$service` variable is populated from Tempo's span metrics, whose
+ * service label is `service` (not `service_name`).
+ *
+ * @returns The dashboard JSON document (2-space indented).
+ */
+export function generateArcTracesDashboard(): string {
+  const tempo = { type: "tempo", uid: "tempo" };
+  const dashboard = {
+    title: "Arc Recent Traces",
+    uid: "arc-traces",
+    schemaVersion: 39,
+    version: 1,
+    refresh: "1m",
+    time: { from: "now-1h", to: "now" },
+    tags: ["arc", "auto-provisioned"],
+    templating: {
+      list: [
+        {
+          name: "service",
+          label: "Service",
+          type: "query",
+          datasource: { type: "prometheus", uid: "prometheus" },
+          query: "label_values(traces_spanmetrics_calls_total, service)",
+          refresh: 2,
+          current: { text: "arc-prod", value: "arc-prod" },
+        },
+      ],
+    },
+    panels: [
+      {
+        // The query is a plain duration filter, not a percentile, so the
+        // title states exactly what is selected.
+        title: "Slow traces (duration > 500ms)",
+        type: "traces",
+        gridPos: { x: 0, y: 0, w: 24, h: 14 },
+        datasource: tempo,
+        targets: [
+          {
+            queryType: "traceql",
+            query: '{resource.service.name = "$service" && duration > 500ms}',
+            refId: "A",
+            limit: 20,
+          },
+        ],
+      },
+      {
+        title: "Recent errors",
+        type: "traces",
+        gridPos: { x: 0, y: 14, w: 24, h: 14 },
+        datasource: tempo,
+        targets: [
+          {
+            queryType: "traceql",
+            query: '{resource.service.name = "$service" && status = error}',
+            refId: "A",
+            limit: 20,
+          },
+        ],
+      },
+    ],
+  };
+  return JSON.stringify(dashboard, null, 2);
+}
+/**
+ * Builds the "Arc Service Map" dashboard as pretty-printed JSON.
+ *
+ * All three panels read Tempo's service-graph metrics
+ * (`traces_service_graph_request_*`) from Prometheus: per-edge request rate,
+ * per-edge p95 latency (seconds converted to ms), and a table of request
+ * counts per client → server edge over the last 5 minutes, sorted descending.
+ *
+ * @returns The dashboard JSON document (2-space indented).
+ */
+export function generateArcServiceMapDashboard(): string {
+  const edgeLegend = "{{client}} → {{server}}";
+
+  // Service-to-service call rate
+  const requestRate = panelTimeseries(
+    "Service-to-service request rate",
+    { x: 0, y: 0, w: 24, h: 9 },
+    'sum by (client, server) (rate(traces_service_graph_request_total[1m]))',
+    edgeLegend,
+    "reqps",
+  );
+
+  // Edge p95 latency
+  const edgeLatency = panelTimeseries(
+    "Inter-service p95 latency",
+    { x: 0, y: 9, w: 24, h: 9 },
+    'histogram_quantile(0.95, sum by (client, server, le) (rate(traces_service_graph_request_server_seconds_bucket[5m]))) * 1000',
+    edgeLegend,
+    "ms",
+  );
+
+  // Service graph node-degree table (which services talk to which)
+  const edgeTable = {
+    title: "Service-graph edges (last 5m)",
+    type: "table",
+    gridPos: { x: 0, y: 18, w: 24, h: 8 },
+    datasource: { type: "prometheus", uid: "prometheus" },
+    targets: [
+      {
+        expr: "sum by (client, server) (increase(traces_service_graph_request_total[5m]))",
+        refId: "A",
+        instant: true,
+        format: "table",
+      },
+    ],
+    transformations: [
+      { id: "organize", options: { excludeByName: { Time: true } } },
+      { id: "sortBy", options: { sort: [{ field: "Value", desc: true }] } },
+    ],
+  };
+
+  return JSON.stringify(
+    {
+      title: "Arc Service Map",
+      uid: "arc-service-map",
+      schemaVersion: 39,
+      version: 1,
+      refresh: "30s",
+      time: { from: "now-1h", to: "now" },
+      tags: ["arc", "auto-provisioned"],
+      panels: [requestRate, edgeLatency, edgeTable],
+    },
+    null,
+    2,
+  );
+}
+/**
+ * Builds the "Arc Logs Explorer" dashboard as pretty-printed JSON.
+ *
+ * Every panel queries Loki: two hourly counters (all lines, error/fatal
+ * lines), a stacked log-volume-by-severity chart, and a log view filtered by
+ * the free-text `$search` variable.
+ *
+ * The stat panels reuse `panelStat` for layout but override its datasource
+ * and target, because `panelStat` is wired to Prometheus and these are LogQL
+ * queries. LogQL has no `increase()`; line counts use `count_over_time`.
+ *
+ * @returns The dashboard JSON document (2-space indented).
+ */
+export function generateArcLogsDashboard(): string {
+  const loki = { type: "loki", uid: "loki" };
+  // Stat panel backed by a LogQL instant query instead of PromQL.
+  const lokiStat = (
+    title: string,
+    gridPos: GridPos,
+    expr: string,
+    thresholds?: Threshold,
+  ) => ({
+    ...panelStat(title, gridPos, expr, "short", thresholds),
+    datasource: loki,
+    targets: [{ expr, refId: "A", legendFormat: title, queryType: "instant" }],
+  });
+  const dashboard = {
+    title: "Arc Logs Explorer",
+    uid: "arc-logs",
+    schemaVersion: 39,
+    version: 1,
+    refresh: "30s",
+    time: { from: "now-1h", to: "now" },
+    tags: ["arc", "auto-provisioned"],
+    templating: {
+      list: [
+        {
+          name: "service",
+          label: "Service",
+          type: "query",
+          datasource: loki,
+          query: "label_values(service_name)",
+          refresh: 2,
+          current: { text: "arc-prod", value: "arc-prod" },
+        },
+        {
+          name: "search",
+          label: "Filter",
+          type: "textbox",
+          query: "",
+          current: { text: "", value: "" },
+        },
+      ],
+    },
+    panels: [
+      // Stat: total log lines, last hour
+      lokiStat(
+        "Logs ingested (1h)",
+        { x: 0, y: 0, w: 6, h: 4 },
+        'sum(count_over_time({service_name="$service"}[1h]))',
+      ),
+      // Stat: errors (last hour). severity_text is filtered after the stream
+      // selector — NOTE(review): assumes it arrives as OTLP structured
+      // metadata rather than an index label; confirm against the Loki setup.
+      lokiStat(
+        "Errors (1h)",
+        { x: 6, y: 0, w: 6, h: 4 },
+        'sum(count_over_time({service_name="$service"} | severity_text=~"ERROR|FATAL" [1h]))',
+        { orange: 1, red: 50 },
+      ),
+      // Time series: log volume by severity
+      {
+        title: "Log volume by severity",
+        type: "timeseries",
+        gridPos: { x: 12, y: 0, w: 12, h: 8 },
+        datasource: loki,
+        targets: [
+          {
+            expr: 'sum by (severity_text) (count_over_time({service_name="$service"} |~ "$search" [$__interval]))',
+            refId: "A",
+            legendFormat: "{{severity_text}}",
+          },
+        ],
+        fieldConfig: {
+          defaults: {
+            unit: "short",
+            custom: {
+              drawStyle: "bars",
+              fillOpacity: 50,
+              lineWidth: 0,
+              stacking: { mode: "normal", group: "A" },
+            },
+          },
+          overrides: [],
+        },
+        options: {
+          legend: { displayMode: "list", placement: "bottom", showLegend: true },
+          tooltip: { mode: "multi", sort: "desc" },
+        },
+      },
+      // Tail: live logs (filtered)
+      {
+        title: "Live tail (filtered by $search)",
+        type: "logs",
+        gridPos: { x: 0, y: 8, w: 24, h: 18 },
+        datasource: loki,
+        targets: [
+          {
+            expr: '{service_name="$service"} |~ "$search"',
+            refId: "A",
+          },
+        ],
+        options: {
+          showTime: true,
+          showLabels: false,
+          showCommonLabels: false,
+          wrapLogMessage: true,
+          enableLogDetails: true,
+          dedupStrategy: "none",
+          sortOrder: "Descending",
+        },
+      },
+    ],
+  };
+  return JSON.stringify(dashboard, null, 2);
+}
+/**
+ * Builds the "Arc Tail Sampling & Collector Health" dashboard as
+ * pretty-printed JSON.
+ *
+ * Panels read the collector's self-metrics (`otelcol_*`, scraped under
+ * job "otel-collector") to show how aggressively traces are sampled, which
+ * tail-sampling policies fire most, and whether the exporter keeps up.
+ *
+ * @returns The dashboard JSON document (2-space indented).
+ */
+export function generateArcSamplingDashboard(): string {
+  const dashboard = {
+    title: "Arc Tail Sampling & Collector Health",
+    uid: "arc-sampling",
+    schemaVersion: 39,
+    version: 1,
+    refresh: "30s",
+    time: { from: "now-3h", to: "now" },
+    tags: ["arc", "auto-provisioned"],
+    panels: [
+      panelStat(
+        "Spans received/sec",
+        { x: 0, y: 0, w: 6, h: 4 },
+        "sum(rate(otelcol_receiver_accepted_spans[5m]))",
+        "ops",
+      ),
+      panelStat(
+        "Spans exported/sec (sampled)",
+        { x: 6, y: 0, w: 6, h: 4 },
+        "sum(rate(otelcol_exporter_sent_spans[5m]))",
+        "ops",
+      ),
+      panelStat(
+        "Spans dropped (refused) / 5m",
+        { x: 12, y: 0, w: 6, h: 4 },
+        "sum(increase(otelcol_receiver_refused_spans[5m]))",
+        "short",
+        { orange: 1, red: 100 },
+      ),
+      panelStat(
+        "Export failures / 5m",
+        { x: 18, y: 0, w: 6, h: 4 },
+        "sum(increase(otelcol_exporter_send_failed_spans[5m]))",
+        "short",
+        { orange: 1, red: 50 },
+      ),
+      panelTimeseries(
+        "Tail-sampling policy decisions",
+        { x: 0, y: 4, w: 12, h: 8 },
+        'sum by (policy) (rate(otelcol_processor_tail_sampling_count_traces_sampled{sampled="true"}[1m]))',
+        "{{policy}} sampled",
+        "ops",
+      ),
+      panelTimeseries(
+        "Receiver vs Exporter (effective sampling rate)",
+        { x: 12, y: 4, w: 12, h: 8 },
+        [
+          {
+            expr: "sum(rate(otelcol_receiver_accepted_spans[1m]))",
+            legend: "received",
+          },
+          {
+            expr: "sum(rate(otelcol_exporter_sent_spans[1m]))",
+            legend: "exported",
+          },
+        ],
+        // Each series carries its own legend; this slot is unused for query
+        // arrays but must be filled so "ops" lands in the `unit` parameter.
+        "",
+        "ops",
+      ),
+      panelTimeseries(
+        // The query is sum/count of the batch processor's send-size
+        // histogram, i.e. an average batch size — not a queue length.
+        "Average batch send size (batch processor)",
+        { x: 0, y: 12, w: 12, h: 8 },
+        "otelcol_processor_batch_batch_send_size_sum / clamp_min(otelcol_processor_batch_batch_send_size_count, 1)",
+        "avg batch size",
+        "short",
+      ),
+      panelTimeseries(
+        "Collector process memory",
+        { x: 12, y: 12, w: 12, h: 8 },
+        "process_resident_memory_bytes{job=\"otel-collector\"}",
+        "RSS",
+        "bytes",
+      ),
+    ],
+  };
+  return JSON.stringify(dashboard, null, 2);
+}
+/**
+ * Builds the "Arc Command Drill-Down" dashboard as pretty-printed JSON.
+ *
+ * A service selector plus a dependent command selector drive call rate,
+ * p50/p95/p99 latency (stats and time series) and a Tempo panel listing
+ * traces named `command.<command>` for that service.
+ *
+ * NOTE(review): the original comment says this dashboard is linked from the
+ * "Arc Service Overview" panels; no such panel link is defined in this file
+ * — confirm where the link is configured.
+ *
+ * @returns The dashboard JSON document (2-space indented).
+ */
+export function generateArcCommandDashboard(): string {
+  // Selector shared by every metric query on this dashboard.
+  const commandSelector = 'service_name="$service", arc_command_name="$command"';
+  const callRate = (window: string): string =>
+    `sum(rate(arc_commands_total{${commandSelector}}[${window}]))`;
+  const durationQuantile = (q: number): string =>
+    `histogram_quantile(${q}, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{${commandSelector}}[5m])))`;
+  const dashboard = {
+    title: "Arc Command Drill-Down",
+    uid: "arc-command",
+    schemaVersion: 39,
+    version: 1,
+    refresh: "30s",
+    time: { from: "now-3h", to: "now" },
+    tags: ["arc", "auto-provisioned"],
+    templating: {
+      list: [
+        {
+          name: "service",
+          label: "Service",
+          type: "query",
+          datasource: { type: "prometheus", uid: "prometheus" },
+          query: "label_values(arc_commands_total, service_name)",
+          refresh: 2,
+          current: { text: "arc-prod", value: "arc-prod" },
+        },
+        {
+          name: "command",
+          label: "Command",
+          type: "query",
+          datasource: { type: "prometheus", uid: "prometheus" },
+          query:
+            'label_values(arc_commands_total{service_name="$service"}, arc_command_name)',
+          refresh: 2,
+          includeAll: false,
+          multi: false,
+        },
+      ],
+    },
+    panels: [
+      panelStat("Call rate", { x: 0, y: 0, w: 6, h: 4 }, callRate("5m"), "ops"),
+      panelStat(
+        "P50 latency",
+        { x: 6, y: 0, w: 6, h: 4 },
+        durationQuantile(0.5),
+        "ms",
+      ),
+      panelStat(
+        "P95 latency",
+        { x: 12, y: 0, w: 6, h: 4 },
+        durationQuantile(0.95),
+        "ms",
+        { orange: 200, red: 1000 },
+      ),
+      panelStat(
+        "P99 latency",
+        { x: 18, y: 0, w: 6, h: 4 },
+        durationQuantile(0.99),
+        "ms",
+        { orange: 500, red: 2000 },
+      ),
+      panelTimeseries(
+        "Call rate over time",
+        { x: 0, y: 4, w: 12, h: 8 },
+        callRate("1m"),
+        "calls/s",
+        "ops",
+      ),
+      panelTimeseries(
+        "Latency percentiles",
+        { x: 12, y: 4, w: 12, h: 8 },
+        [
+          { expr: durationQuantile(0.5), legend: "p50" },
+          { expr: durationQuantile(0.95), legend: "p95" },
+          { expr: durationQuantile(0.99), legend: "p99" },
+        ],
+        // Each series carries its own legend; this slot is unused for query
+        // arrays but must be filled so "ms" lands in the `unit` parameter.
+        "",
+        "ms",
+      ),
+      // Tempo traces for this specific command
+      {
+        title: "Recent traces (sampled)",
+        type: "traces",
+        gridPos: { x: 0, y: 12, w: 24, h: 14 },
+        datasource: { type: "tempo", uid: "tempo" },
+        targets: [
+          {
+            queryType: "traceql",
+            query: '{resource.service.name = "$service" && name = "command.$command"}',
+            refId: "A",
+            limit: 20,
+          },
+        ],
+      },
+    ],
+  };
+  return JSON.stringify(dashboard, null, 2);
+}
+/**
+ * Generates every observability config file needed on the host.
+ *
+ * @param cfg Deploy config, forwarded to the generators that depend on it.
+ * @returns Map of relative path → file contents, so bootstrap can write and
+ *   upload everything in one pass.
+ */
+export function generateObservabilityConfigs(
+  cfg: DeployConfig,
+): Record<string, string> {
+  const files: Record<string, string> = {};
+
+  // Backend / collector configuration.
+  files["observability/otel-collector-config.yaml"] = generateOtelCollectorConfig(cfg);
+  files["observability/tempo.yaml"] = generateTempoConfig(cfg);
+  files["observability/loki-config.yaml"] = generateLokiConfig(cfg);
+  files["observability/prometheus.yml"] = generatePrometheusConfig(cfg);
+
+  // Grafana provisioning.
+  files["observability/grafana-datasources.yaml"] = generateGrafanaDatasources();
+  files["observability/grafana-dashboards.yaml"] = generateGrafanaDashboardsProvider();
+
+  // Dashboards, one JSON file each.
+  files["observability/grafana-dashboards/arc-overview.json"] = generateArcOverviewDashboard();
+  files["observability/grafana-dashboards/arc-traces.json"] = generateArcTracesDashboard();
+  files["observability/grafana-dashboards/arc-service-map.json"] = generateArcServiceMapDashboard();
+  files["observability/grafana-dashboards/arc-logs.json"] = generateArcLogsDashboard();
+  files["observability/grafana-dashboards/arc-sampling.json"] = generateArcSamplingDashboard();
+  files["observability/grafana-dashboards/arc-command.json"] = generateArcCommandDashboard();
+
+  return files;
+}
+// ---------------------------------------------------------------------------
+// Panel builders — keep dashboards readable. Returns Grafana panel JSON.
+// ---------------------------------------------------------------------------
+/** Panel placement on the dashboard grid: top-left corner plus width and height. */
+interface GridPos { x: number; y: number; w: number; h: number; }
+/** Optional values at which a stat panel turns orange and red. */
+interface Threshold { orange?: number; red?: number; }
+/**
+ * Builds a Grafana "stat" panel backed by a single Prometheus query.
+ *
+ * @param title Panel title; also used as the target's legend.
+ * @param gridPos Panel placement on the dashboard grid.
+ * @param expr Query expression for target "A".
+ * @param unit Grafana unit id for the value.
+ * @param thresholds Optional orange/red step values. The base step is always
+ *   green; orange is appended before red, in that order.
+ * @returns The panel object, ready to embed in a dashboard's `panels` array.
+ */
+function panelStat(
+  title: string,
+  gridPos: GridPos,
+  expr: string,
+  unit: string,
+  thresholds?: Threshold,
+) {
+  const orange = thresholds?.orange;
+  const red = thresholds?.red;
+  const steps: { color: string; value: number | null }[] = [
+    { color: "green", value: null },
+    ...(orange !== undefined ? [{ color: "orange", value: orange }] : []),
+    ...(red !== undefined ? [{ color: "red", value: red }] : []),
+  ];
+
+  const target = { expr, refId: "A", legendFormat: title };
+  const fieldConfig = {
+    defaults: {
+      unit,
+      thresholds: { mode: "absolute", steps },
+    },
+    overrides: [],
+  };
+  const options = {
+    colorMode: "value",
+    graphMode: "area",
+    justifyMode: "auto",
+    reduceOptions: { calcs: ["lastNotNull"], fields: "", values: false },
+    textMode: "auto",
+  };
+
+  return {
+    title,
+    type: "stat",
+    gridPos,
+    datasource: { type: "prometheus", uid: "prometheus" },
+    targets: [target],
+    fieldConfig,
+    options,
+  };
+}
+/**
+ * Builds a Grafana "timeseries" panel backed by Prometheus.
+ *
+ * Two call shapes are supported:
+ *   - single query:  `(title, gridPos, expr, legend, unit)`
+ *   - multi query:   `(title, gridPos, [{ expr, legend }, ...], unit)` or
+ *                    `(title, gridPos, [{ expr, legend }, ...], legend, unit)`
+ *
+ * For a query array each entry supplies its own legend, so the `legend`
+ * argument is never used as a legend there. Callers in this file use the
+ * four-argument form with the unit in that position; previously this left
+ * `unit` undefined and the panel lost its unit. When `unit` is omitted for a
+ * query array, `legend` is therefore taken as the unit.
+ *
+ * @param title Panel title.
+ * @param gridPos Panel placement on the dashboard grid.
+ * @param query One expression, or a list of expressions with legends.
+ *   List entries get refIds "A", "B", "C", … in order.
+ * @param legend Legend format for a single query; see above for arrays.
+ * @param unit Grafana unit id for the Y axis.
+ * @returns The panel object, ready to embed in a dashboard's `panels` array.
+ */
+function panelTimeseries(
+  title: string,
+  gridPos: GridPos,
+  query: string | { expr: string; legend: string }[],
+  legend: string,
+  unit?: string,
+) {
+  const targets = Array.isArray(query)
+    ? query.map((q, i) => ({
+        expr: q.expr,
+        // 65 is "A": refIds run A, B, C, … in query order.
+        refId: String.fromCharCode(65 + i),
+        legendFormat: q.legend,
+      }))
+    : [{ expr: query, refId: "A", legendFormat: legend }];
+  // Four-argument multi-query calls carry the unit in the `legend` slot.
+  const resolvedUnit = unit ?? (Array.isArray(query) ? legend : undefined);
+  return {
+    title,
+    type: "timeseries",
+    gridPos,
+    datasource: { type: "prometheus", uid: "prometheus" },
+    targets,
+    fieldConfig: {
+      defaults: {
+        unit: resolvedUnit,
+        custom: {
+          drawStyle: "line",
+          lineInterpolation: "smooth",
+          lineWidth: 1.5,
+          fillOpacity: 10,
+          showPoints: "never",
+        },
+      },
+      overrides: [],
+    },
+    options: {
+      legend: { displayMode: "list", placement: "bottom", showLegend: true },
+      tooltip: { mode: "multi", sort: "desc" },
+    },
+  };
+}