@arcote.tech/arc-cli 0.7.5 → 0.7.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,958 @@
1
+ import type { DeployConfig, DeployObservability } from "./config";
2
+
3
+ // ---------------------------------------------------------------------------
4
+ // Observability stack config templates.
5
+ //
6
+ // All strings are deterministic for the inputs (cfg + retention) — no random
7
+ // IDs, no timestamps — so re-running deploy with unchanged config is a no-op
8
+ // at the file-write level. Bootstrap diffs filesystem before bouncing
9
+ // services, so this matters.
10
+ //
11
+ // Defaults:
12
+ // - traces: 7d retention (Tempo block storage on local disk)
13
+ // - logs: 7d retention (Loki chunks on local disk)
14
+ // - metrics: 30d retention (Prometheus TSDB on local disk)
15
+ //
16
+ // Tail sampling: every error + every span >500ms + 10% random. Decided in
17
+ // the collector so per-service SDKs can be left at always-on without
18
+ // flooding the backend.
19
+ // ---------------------------------------------------------------------------
20
+
21
/** Fallback retention windows used when the deploy config does not override them. */
const DEFAULT_RETENTION = {
  traces: "168h", // 7d
  logs: "168h",
  metrics: "30d",
} as const;

/**
 * Resolves the effective retention for each signal.
 *
 * Each signal independently falls back to its entry in DEFAULT_RETENTION
 * when the observability section, its `retention` object, or the individual
 * field is null/undefined.
 */
function pickRetention(o: DeployObservability | undefined) {
  const overrides = o?.retention;
  const traces = overrides?.traces ?? DEFAULT_RETENTION.traces;
  const logs = overrides?.logs ?? DEFAULT_RETENTION.logs;
  const metrics = overrides?.metrics ?? DEFAULT_RETENTION.metrics;
  return { traces, logs, metrics };
}
34
+
35
/**
 * OpenTelemetry Collector — receives OTLP from app containers + browser,
 * applies tail sampling, fans out to Tempo (traces), Loki (logs),
 * Prometheus remote-write (metrics).
 *
 * @param cfg Deploy config; every entry in `cfg.envs` contributes its
 *   `domain` as an allowed CORS origin (`https://<domain>`) on the OTLP/HTTP
 *   receiver.
 * @returns The collector config as YAML text. Output depends only on `cfg`.
 */
export function generateOtelCollectorConfig(cfg: DeployConfig): string {
  // One YAML list item per environment. JSON.stringify yields a double-quoted
  // scalar (identical to plain quoting for ordinary hostnames) and escapes
  // anything that would otherwise break out of the quotes.
  const originItems = Object.values(cfg.envs).map(
    (env) => `            - ${JSON.stringify(`https://${env.domain}`)}`,
  );
  // With no environments, emit an explicit empty list instead of a bare key
  // (which YAML would parse as null).
  const allowedOrigins =
    originItems.length > 0 ? `\n${originItems.join("\n")}` : " []";

  return `# Generated by \`arc platform deploy\` — do not edit by hand.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317
      http:
        endpoint: 0.0.0.0:4318
        cors:
          allowed_origins:${allowedOrigins}
          allowed_headers:
            - traceparent
            - tracestate
            - content-type

processors:
  batch:
    timeout: 5s
    send_batch_size: 512
    send_batch_max_size: 1024

  # Tail-based sampling — applied after a full trace has been assembled.
  # Errors and slow traces are kept 100%, everything else at 10%.
  tail_sampling:
    decision_wait: 10s
    num_traces: 50000
    expected_new_traces_per_sec: 100
    policies:
      - name: errors
        type: status_code
        status_code: { status_codes: [ERROR] }
      - name: slow
        type: latency
        latency: { threshold_ms: 500 }
      - name: random_10pct
        type: probabilistic
        probabilistic: { sampling_percentage: 10 }

  # Drop high-cardinality / PII attributes that might slip past app-side
  # sanitization. Belt-and-suspenders before they hit long-term storage.
  attributes:
    actions:
      - key: http.request.header.authorization
        action: delete
      - key: http.request.header.cookie
        action: delete

exporters:
  otlp/tempo:
    endpoint: tempo:4317
    tls:
      insecure: true

  otlphttp/loki:
    endpoint: http://loki:3100/otlp
    tls:
      insecure: true

  prometheusremotewrite:
    endpoint: http://prometheus:9090/api/v1/write
    tls:
      insecure: true

extensions:
  health_check: {}
  zpages: {}

service:
  extensions: [health_check, zpages]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [tail_sampling, attributes, batch]
      exporters: [otlp/tempo]
    logs:
      receivers: [otlp]
      processors: [attributes, batch]
      exporters: [otlphttp/loki]
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheusremotewrite]
`;
}
125
+
126
/**
 * Grafana Tempo — single-binary mode with local block storage.
 *
 * @param cfg Deploy config; only `cfg.observability` is read, to resolve the
 *   trace retention (via pickRetention) that becomes the compactor's
 *   `block_retention`.
 * @returns The Tempo config as YAML text.
 */
export function generateTempoConfig(cfg: DeployConfig): string {
  const { traces: traceRetention } = pickRetention(cfg.observability);
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
server:
  http_listen_port: 3200
  grpc_listen_port: 9095

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

ingester:
  trace_idle_period: 10s
  max_block_bytes: 1048576
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: ${traceRetention}

storage:
  trace:
    backend: local
    local:
      path: /var/tempo/blocks
    wal:
      path: /var/tempo/wal

metrics_generator:
  registry:
    external_labels:
      source: tempo
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true

overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics]
`;
}
176
+
177
/**
 * Loki — single-binary mode, filesystem chunks.
 *
 * @param cfg Deploy config; only `cfg.observability` is read, to resolve the
 *   log retention (via pickRetention) that becomes
 *   `limits_config.retention_period`.
 * @returns The Loki config as YAML text.
 */
export function generateLokiConfig(cfg: DeployConfig): string {
  const { logs: logRetention } = pickRetention(cfg.observability);
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
auth_enabled: false

server:
  http_listen_port: 3100

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  retention_period: ${logRetention}
  allow_structured_metadata: true

compactor:
  working_directory: /loki/compactor
  retention_enabled: true
  delete_request_store: filesystem
`;
}
218
+
219
/**
 * Prometheus — accepts remote_write from the collector, scrapes itself.
 * Retention is set via the `--storage.tsdb.retention.time` command-line
 * flag in compose.ts (Prometheus rejects retention inside the YAML).
 *
 * @param _cfg Unused; present so the signature matches the other
 *   config generators.
 * @returns The Prometheus config as YAML text (constant output).
 */
export function generatePrometheusConfig(_cfg: DeployConfig): string {
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: [localhost:9090]
  - job_name: otel-collector
    static_configs:
      - targets: [otel-collector:8888]

# remote-write inbound is enabled via the --web.enable-remote-write-receiver
# command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
`;
}
240
+
241
/**
 * Grafana datasource provisioning — Tempo + Loki + Prometheus, all pre-wired.
 *
 * The datasources get fixed UIDs (`tempo`, `loki`, `prometheus`) which the
 * generated dashboards in this file reference. Prometheus is marked default.
 *
 * @returns The provisioning file as YAML text (constant output).
 */
export function generateGrafanaDatasources(): string {
  // Escapes inside the template, kept exactly as authored:
  //  - `\\\\w` emits two backslashes, so the YAML double-quoted scalar
  //    decodes to the regex `\w`.
  //  - `$\${` emits `$${`, i.e. a doubled dollar in the provisioning file —
  //    presumably so Grafana's env-var expansion leaves `${__value.raw}`
  //    intact; confirm against the Grafana provisioning docs.
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
apiVersion: 1
datasources:
  - name: Tempo
    type: tempo
    access: proxy
    url: http://tempo:3200
    uid: tempo
    jsonData:
      tracesToLogsV2:
        datasourceUid: loki
        spanStartTimeShift: -5m
        spanEndTimeShift: 5m
      serviceMap:
        datasourceUid: prometheus
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    uid: loki
    jsonData:
      derivedFields:
        - datasourceUid: tempo
          matcherRegex: "trace_id=(\\\\w+)"
          name: TraceID
          url: $\${__value.raw}
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    uid: prometheus
    isDefault: true
`;
}
277
+
278
/**
 * Grafana dashboard-provider config — points Grafana at the bind-mounted
 * dashboards directory and tells it to refresh on file change.
 *
 * @returns The provider file as YAML text (constant output): a single file
 *   provider named `arc`, folder `Arc`, re-scanned every 30 seconds.
 */
export function generateGrafanaDashboardsProvider(): string {
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
apiVersion: 1
providers:
  - name: arc
    orgId: 1
    folder: Arc
    type: file
    disableDeletion: false
    editable: true
    updateIntervalSeconds: 30
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards/arc
      foldersFromFilesStructure: false
`;
}
297
+
298
/**
 * Top-level "Arc Service Overview" dashboard. Single comprehensive panel
 * set that answers the 80% of operator questions: traffic volume, error
 * rate, latency percentiles, slowest commands, DB activity, recent logs.
 * Designed for Grafana 11+; uses the auto-provisioned Prometheus / Loki /
 * Tempo datasources by UID so it works without manual setup.
 *
 * @returns The dashboard model serialized as pretty-printed JSON.
 */
export function generateArcOverviewDashboard(): string {
  // Label matcher shared by every server-span query.
  const serverSpans = 'service_name="$service", span_kind="SPAN_KIND_SERVER"';

  // Tempo's span-metrics latency histogram is recorded in seconds (per the
  // Tempo metrics-generator docs), while these panels use unit "ms" and
  // millisecond thresholds — hence the `* 1000`, matching what the service
  // map dashboard does for the service-graph histogram.
  const spanLatencyMs = (quantile: number): string =>
    `histogram_quantile(${quantile}, sum(rate(traces_spanmetrics_latency_bucket{${serverSpans}}[5m])) by (le)) * 1000`;

  const dashboard = {
    title: "Arc Service Overview",
    uid: "arc-overview",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-1h", to: "now" },
    timepicker: {},
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: { type: "prometheus", uid: "prometheus" },
          query: "label_values(traces_spanmetrics_calls_total, service_name)",
          refresh: 2,
          includeAll: false,
          multi: false,
          current: { text: "arc-prod", value: "arc-prod" },
        },
      ],
    },
    panels: [
      // Row: top-line stats
      panelStat(
        "Request rate (req/s)",
        { x: 0, y: 0, w: 6, h: 4 },
        `sum(rate(traces_spanmetrics_calls_total{${serverSpans}}[5m]))`,
        "reqps",
      ),
      panelStat(
        "Error rate (%)",
        { x: 6, y: 0, w: 6, h: 4 },
        // clamp_min keeps the ratio finite when there is no traffic.
        `sum(rate(traces_spanmetrics_calls_total{${serverSpans}, status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{${serverSpans}}[5m])), 0.001) * 100`,
        "percent",
        { red: 1, orange: 0.1 },
      ),
      panelStat(
        "P99 latency",
        { x: 12, y: 0, w: 6, h: 4 },
        spanLatencyMs(0.99),
        "ms",
        { red: 1000, orange: 300 },
      ),
      panelStat(
        "Active commands/sec",
        { x: 18, y: 0, w: 6, h: 4 },
        'sum(rate(arc_commands_total{service_name="$service"}[5m]))',
        "ops",
      ),

      // Row: request volume + latency over time
      panelTimeseries(
        "Request rate by route",
        { x: 0, y: 4, w: 12, h: 8 },
        `sum by (span_name) (rate(traces_spanmetrics_calls_total{${serverSpans}}[1m]))`,
        "{{span_name}}",
        "reqps",
      ),
      panelTimeseries(
        "Latency percentiles",
        { x: 12, y: 4, w: 12, h: 8 },
        [
          { expr: spanLatencyMs(0.5), legend: "p50" },
          { expr: spanLatencyMs(0.95), legend: "p95" },
          { expr: spanLatencyMs(0.99), legend: "p99" },
        ],
        // Shared legend is unused for multi-query panels (each query carries
        // its own); it is passed explicitly so "ms" lands in the unit slot.
        "",
        "ms",
      ),

      // Row: commands
      panelTimeseries(
        "Commands per second",
        { x: 0, y: 12, w: 12, h: 8 },
        'sum by (arc_command_name) (rate(arc_commands_total{service_name="$service"}[1m]))',
        "{{arc_command_name}}",
        "ops",
      ),
      panelTimeseries(
        "Command p95 latency",
        { x: 12, y: 12, w: 12, h: 8 },
        'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))',
        "{{arc_command_name}}",
        "ms",
      ),

      // Row: database
      panelTimeseries(
        "DB find ops/sec by collection",
        { x: 0, y: 20, w: 12, h: 8 },
        'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))',
        "{{db_collection_name}}",
        "ops",
      ),
      panelTimeseries(
        "DB find p95 latency",
        { x: 12, y: 20, w: 12, h: 8 },
        'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))',
        "{{db_collection_name}}",
        "ms",
      ),

      // Row: logs panel for the service (Loki)
      {
        title: "Recent error logs",
        type: "logs",
        gridPos: { x: 0, y: 28, w: 24, h: 8 },
        datasource: { type: "loki", uid: "loki" },
        targets: [
          {
            expr: '{service_name="$service"} |= `ERROR`',
            refId: "A",
          },
        ],
        options: {
          showTime: true,
          showLabels: false,
          showCommonLabels: false,
          wrapLogMessage: true,
          enableLogDetails: true,
          dedupStrategy: "none",
          sortOrder: "Descending",
        },
      },
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
443
+
444
/**
 * Tempo Search dashboard — convenience entry point listing recent slow and
 * failed traces for the selected service via TraceQL searches.
 *
 * The search results are rendered as tables: Grafana's `traces`
 * visualization draws a single trace and expects a trace-ID query, so it
 * cannot display a TraceQL result list.
 *
 * @returns The dashboard model serialized as pretty-printed JSON.
 */
export function generateArcTracesDashboard(): string {
  /** Builds one full-width TraceQL search table for `$service`. */
  const traceSearchTable = (title: string, y: number, condition: string) => ({
    title,
    type: "table",
    gridPos: { x: 0, y, w: 24, h: 14 },
    datasource: { type: "tempo", uid: "tempo" },
    targets: [
      {
        queryType: "traceql",
        query: `{resource.service.name = "$service" && ${condition}}`,
        tableType: "traces",
        refId: "A",
        limit: 20,
      },
    ],
  });

  const dashboard = {
    title: "Arc Recent Traces",
    uid: "arc-traces",
    schemaVersion: 39,
    version: 1,
    refresh: "1m",
    time: { from: "now-1h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: { type: "prometheus", uid: "prometheus" },
          query: "label_values(traces_spanmetrics_calls_total, service_name)",
          refresh: 2,
          current: { text: "arc-prod", value: "arc-prod" },
        },
      ],
    },
    panels: [
      // The filter is a plain duration cut-off, not a percentile.
      traceSearchTable("Slow traces (duration > 500ms)", 0, "duration > 500ms"),
      traceSearchTable("Recent errors", 14, "status = error"),
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
501
+
502
/**
 * Service Map / Topology dashboard, built on the service-graph series
 * (`traces_service_graph_request_*`) that Tempo's metrics_generator writes
 * to Prometheus. Two time series (edge request rate, edge p95 latency) plus
 * a table of edges seen in the last five minutes.
 *
 * @returns The dashboard model serialized as pretty-printed JSON.
 */
export function generateArcServiceMapDashboard(): string {
  // Service-to-service call rate
  const edgeRatePanel = panelTimeseries(
    "Service-to-service request rate",
    { x: 0, y: 0, w: 24, h: 9 },
    "sum by (client, server) (rate(traces_service_graph_request_total[1m]))",
    "{{client}} → {{server}}",
    "reqps",
  );

  // Edge p95 latency (histogram is in seconds, hence the * 1000 for "ms")
  const edgeLatencyPanel = panelTimeseries(
    "Inter-service p95 latency",
    { x: 0, y: 9, w: 24, h: 9 },
    "histogram_quantile(0.95, sum by (client, server, le) (rate(traces_service_graph_request_server_seconds_bucket[5m]))) * 1000",
    "{{client}} → {{server}}",
    "ms",
  );

  // Which services talk to which, busiest edge first
  const edgeTablePanel = {
    title: "Service-graph edges (last 5m)",
    type: "table",
    gridPos: { x: 0, y: 18, w: 24, h: 8 },
    datasource: { type: "prometheus", uid: "prometheus" },
    targets: [
      {
        expr: "sum by (client, server) (increase(traces_service_graph_request_total[5m]))",
        refId: "A",
        instant: true,
        format: "table",
      },
    ],
    transformations: [
      { id: "organize", options: { excludeByName: { Time: true } } },
      { id: "sortBy", options: { sort: [{ field: "Value", desc: true }] } },
    ],
  };

  return JSON.stringify(
    {
      title: "Arc Service Map",
      uid: "arc-service-map",
      schemaVersion: 39,
      version: 1,
      refresh: "30s",
      time: { from: "now-1h", to: "now" },
      tags: ["arc", "auto-provisioned"],
      panels: [edgeRatePanel, edgeLatencyPanel, edgeTablePanel],
    },
    null,
    2,
  );
}
554
+
555
/**
 * Logs Explorer — Loki-focused dashboard. Ingest and error counts for the
 * selected service, log volume by severity, and an ad-hoc filtered log view
 * (trace links come from the derivedFields configured in datasources.yaml).
 *
 * @returns The dashboard model serialized as pretty-printed JSON.
 */
export function generateArcLogsDashboard(): string {
  const loki = { type: "loki", uid: "loki" };

  // panelStat always targets the Prometheus datasource; these stats are
  // LogQL queries, so re-point the finished panel at Loki.
  const lokiStat = (...args: Parameters<typeof panelStat>) => ({
    ...panelStat(...args),
    datasource: loki,
  });

  const logPanelOptions = {
    showTime: true,
    showLabels: false,
    showCommonLabels: false,
    wrapLogMessage: true,
    enableLogDetails: true,
    dedupStrategy: "none",
    sortOrder: "Descending",
  };

  const dashboard = {
    title: "Arc Logs Explorer",
    uid: "arc-logs",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-1h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: loki,
          query: "label_values(service_name)",
          refresh: 2,
          current: { text: "arc-prod", value: "arc-prod" },
        },
        {
          name: "search",
          label: "Filter",
          type: "textbox",
          query: "",
          current: { text: "", value: "" },
        },
      ],
    },
    panels: [
      // Stat: total log lines, last hour. LogQL counts lines with
      // count_over_time; increase() only exists in PromQL.
      lokiStat(
        "Logs ingested (1h)",
        { x: 0, y: 0, w: 6, h: 4 },
        'sum(count_over_time({service_name="$service"}[1h]))',
        "short",
      ),
      // Stat: errors (last hour). severity_text is filtered in the pipeline
      // rather than the stream selector: Loki's OTLP ingestion stores it as
      // structured metadata, not as an index label.
      lokiStat(
        "Errors (1h)",
        { x: 6, y: 0, w: 6, h: 4 },
        'sum(count_over_time({service_name="$service"} | severity_text=~"ERROR|FATAL" [1h]))',
        "short",
        { orange: 1, red: 50 },
      ),
      // Time series: log volume by severity
      {
        title: "Log volume by severity",
        type: "timeseries",
        gridPos: { x: 12, y: 0, w: 12, h: 8 },
        datasource: loki,
        targets: [
          {
            expr: 'sum by (severity_text) (count_over_time({service_name="$service"} |~ "$search" [$__interval]))',
            refId: "A",
            legendFormat: "{{severity_text}}",
          },
        ],
        fieldConfig: {
          defaults: {
            unit: "short",
            custom: {
              drawStyle: "bars",
              fillOpacity: 50,
              lineWidth: 0,
              stacking: { mode: "normal", group: "A" },
            },
          },
          overrides: [],
        },
        options: {
          legend: { displayMode: "list", placement: "bottom", showLegend: true },
          tooltip: { mode: "multi", sort: "desc" },
        },
      },
      // Tail: live logs (filtered)
      {
        title: "Live tail (filtered by $search)",
        type: "logs",
        gridPos: { x: 0, y: 8, w: 24, h: 18 },
        datasource: loki,
        targets: [
          {
            expr: '{service_name="$service"} |~ "$search"',
            refId: "A",
          },
        ],
        options: logPanelOptions,
      },
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
659
+
660
/**
 * Tail Sampling Insights — otel-collector self-metrics that tell us
 * how aggressively we're sampling, which policies fire most, and
 * whether the exporter is keeping up.
 *
 * NOTE(review): the `otelcol_*` metric names are used as written; some
 * collector versions expose counters with a `_total` suffix — confirm
 * against the deployed collector image.
 *
 * @returns The dashboard model serialized as pretty-printed JSON.
 */
export function generateArcSamplingDashboard(): string {
  const dashboard = {
    title: "Arc Tail Sampling & Collector Health",
    uid: "arc-sampling",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-3h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    panels: [
      panelStat(
        "Spans received/sec",
        { x: 0, y: 0, w: 6, h: 4 },
        "sum(rate(otelcol_receiver_accepted_spans[5m]))",
        "ops",
      ),
      panelStat(
        "Spans exported/sec (sampled)",
        { x: 6, y: 0, w: 6, h: 4 },
        "sum(rate(otelcol_exporter_sent_spans[5m]))",
        "ops",
      ),
      panelStat(
        "Spans dropped (refused) / 5m",
        { x: 12, y: 0, w: 6, h: 4 },
        "sum(increase(otelcol_receiver_refused_spans[5m]))",
        "short",
        { orange: 1, red: 100 },
      ),
      panelStat(
        "Export failures / 5m",
        { x: 18, y: 0, w: 6, h: 4 },
        "sum(increase(otelcol_exporter_send_failed_spans[5m]))",
        "short",
        { orange: 1, red: 50 },
      ),

      panelTimeseries(
        "Tail-sampling policy decisions",
        { x: 0, y: 4, w: 12, h: 8 },
        'sum by (policy) (rate(otelcol_processor_tail_sampling_count_traces_sampled{sampled="true"}[1m]))',
        "{{policy}} sampled",
        "ops",
      ),
      panelTimeseries(
        "Receiver vs Exporter (effective sampling rate)",
        { x: 12, y: 4, w: 12, h: 8 },
        [
          {
            expr: "sum(rate(otelcol_receiver_accepted_spans[1m]))",
            legend: "received",
          },
          {
            expr: "sum(rate(otelcol_exporter_sent_spans[1m]))",
            legend: "exported",
          },
        ],
        // Shared legend is unused for multi-query panels (each query carries
        // its own); it is passed explicitly so "ops" lands in the unit slot.
        "",
        "ops",
      ),

      // sum / count of the batch processor's send-size histogram is the mean
      // batch size, so the panel is titled accordingly.
      panelTimeseries(
        "Collector average batch send size",
        { x: 0, y: 12, w: 12, h: 8 },
        "otelcol_processor_batch_batch_send_size_sum / clamp_min(otelcol_processor_batch_batch_send_size_count, 1)",
        "avg batch size",
        "short",
      ),
      panelTimeseries(
        "Collector process memory",
        { x: 12, y: 12, w: 12, h: 8 },
        'process_resident_memory_bytes{job="otel-collector"}',
        "RSS",
        "bytes",
      ),
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
741
+
742
/**
 * Per-Command drill-down — service + single command selectors, with call
 * rate, latency percentiles and recent traces for the selected command.
 *
 * The trace list is a table panel: Grafana's `traces` visualization draws
 * one trace from a trace-ID query and cannot display TraceQL search results.
 *
 * @returns The dashboard model serialized as pretty-printed JSON.
 */
export function generateArcCommandDashboard(): string {
  // Label matcher shared by every per-command query.
  const selected = 'service_name="$service", arc_command_name="$command"';

  // The command duration histogram is used as-is with unit "ms".
  const commandLatency = (quantile: number): string =>
    `histogram_quantile(${quantile}, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{${selected}}[5m])))`;

  const dashboard = {
    title: "Arc Command Drill-Down",
    uid: "arc-command",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-3h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: { type: "prometheus", uid: "prometheus" },
          query: "label_values(arc_commands_total, service_name)",
          refresh: 2,
          current: { text: "arc-prod", value: "arc-prod" },
        },
        {
          name: "command",
          label: "Command",
          type: "query",
          datasource: { type: "prometheus", uid: "prometheus" },
          query:
            'label_values(arc_commands_total{service_name="$service"}, arc_command_name)',
          refresh: 2,
          includeAll: false,
          multi: false,
        },
      ],
    },
    panels: [
      panelStat(
        "Call rate",
        { x: 0, y: 0, w: 6, h: 4 },
        `sum(rate(arc_commands_total{${selected}}[5m]))`,
        "ops",
      ),
      panelStat(
        "P50 latency",
        { x: 6, y: 0, w: 6, h: 4 },
        commandLatency(0.5),
        "ms",
      ),
      panelStat(
        "P95 latency",
        { x: 12, y: 0, w: 6, h: 4 },
        commandLatency(0.95),
        "ms",
        { orange: 200, red: 1000 },
      ),
      panelStat(
        "P99 latency",
        { x: 18, y: 0, w: 6, h: 4 },
        commandLatency(0.99),
        "ms",
        { orange: 500, red: 2000 },
      ),

      panelTimeseries(
        "Call rate over time",
        { x: 0, y: 4, w: 12, h: 8 },
        `sum(rate(arc_commands_total{${selected}}[1m]))`,
        "calls/s",
        "ops",
      ),
      panelTimeseries(
        "Latency percentiles",
        { x: 12, y: 4, w: 12, h: 8 },
        [
          { expr: commandLatency(0.5), legend: "p50" },
          { expr: commandLatency(0.95), legend: "p95" },
          { expr: commandLatency(0.99), legend: "p99" },
        ],
        // Shared legend is unused for multi-query panels (each query carries
        // its own); it is passed explicitly so "ms" lands in the unit slot.
        "",
        "ms",
      ),

      // Tempo traces for this specific command
      {
        title: "Recent traces (sampled)",
        type: "table",
        gridPos: { x: 0, y: 12, w: 24, h: 14 },
        datasource: { type: "tempo", uid: "tempo" },
        targets: [
          {
            queryType: "traceql",
            query: '{resource.service.name = "$service" && name = "command.$command"}',
            tableType: "traces",
            refId: "A",
            limit: 20,
          },
        ],
      },
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
852
+
853
/**
 * Renders every observability file the host needs in one call.
 *
 * @param cfg Deploy config, forwarded to the generators that take one.
 * @returns Map of relative path (under `observability/`) → file contents,
 *   so bootstrap can write+upload them in one pass.
 */
export function generateObservabilityConfigs(
  cfg: DeployConfig,
): Record<string, string> {
  const root = "observability";
  const dashboards = `${root}/grafana-dashboards`;

  const files: Record<string, string> = {};

  // Backend + collector configs
  files[`${root}/otel-collector-config.yaml`] = generateOtelCollectorConfig(cfg);
  files[`${root}/tempo.yaml`] = generateTempoConfig(cfg);
  files[`${root}/loki-config.yaml`] = generateLokiConfig(cfg);
  files[`${root}/prometheus.yml`] = generatePrometheusConfig(cfg);

  // Grafana provisioning
  files[`${root}/grafana-datasources.yaml`] = generateGrafanaDatasources();
  files[`${root}/grafana-dashboards.yaml`] = generateGrafanaDashboardsProvider();

  // Grafana dashboards (JSON)
  files[`${dashboards}/arc-overview.json`] = generateArcOverviewDashboard();
  files[`${dashboards}/arc-traces.json`] = generateArcTracesDashboard();
  files[`${dashboards}/arc-service-map.json`] = generateArcServiceMapDashboard();
  files[`${dashboards}/arc-logs.json`] = generateArcLogsDashboard();
  files[`${dashboards}/arc-sampling.json`] = generateArcSamplingDashboard();
  files[`${dashboards}/arc-command.json`] = generateArcCommandDashboard();

  return files;
}
873
+
874
+ // ---------------------------------------------------------------------------
875
+ // Panel builders — keep dashboards readable. Returns Grafana panel JSON.
876
+ // ---------------------------------------------------------------------------
877
+
878
/** Position and size of a panel, passed through as Grafana's `gridPos`. */
interface GridPos { x: number; y: number; w: number; h: number; }
/** Optional colour breakpoints for a stat panel; omitted colours add no step. */
interface Threshold { orange?: number; red?: number; }

/**
 * Builds a single-query "stat" panel against the Prometheus datasource.
 *
 * @param title Panel title; also used as the query's legend.
 * @param gridPos Panel placement.
 * @param expr Query expression for the single target (refId "A").
 * @param unit Grafana unit id applied to the value (e.g. "ms", "percent").
 * @param thresholds Optional orange/red breakpoints. The base step is always
 *   green; the supplied breakpoints follow it in ascending value order.
 * @returns The panel as a plain object ready for JSON serialization.
 */
function panelStat(
  title: string,
  gridPos: GridPos,
  expr: string,
  unit: string,
  thresholds?: Threshold,
) {
  const breakpoints: { color: string; value: number }[] = [];
  if (thresholds?.orange !== undefined) {
    breakpoints.push({ color: "orange", value: thresholds.orange });
  }
  if (thresholds?.red !== undefined) {
    breakpoints.push({ color: "red", value: thresholds.red });
  }
  // Threshold steps are expected in ascending order; sort so a caller that
  // supplies red below orange still produces a well-formed list. The sort is
  // stable, so equal values keep the orange-then-red order.
  breakpoints.sort((a, b) => a.value - b.value);

  const steps: { color: string; value: number | null }[] = [
    { color: "green", value: null }, // base step: applies below every breakpoint
    ...breakpoints,
  ];

  return {
    title,
    type: "stat",
    gridPos,
    datasource: { type: "prometheus", uid: "prometheus" },
    targets: [{ expr, refId: "A", legendFormat: title }],
    fieldConfig: {
      defaults: {
        unit,
        thresholds: { mode: "absolute", steps },
      },
      overrides: [],
    },
    options: {
      colorMode: "value",
      graphMode: "area",
      justifyMode: "auto",
      reduceOptions: { calcs: ["lastNotNull"], fields: "", values: false },
      textMode: "auto",
    },
  };
}
919
+
920
/**
 * Builds a "timeseries" panel against the Prometheus datasource.
 *
 * Two call shapes are accepted:
 *  - single query: `(title, gridPos, expr, legend, unit)`
 *  - multi query:  `(title, gridPos, [{ expr, legend }, ...], unit)` — each
 *    query carries its own legend, so the fourth argument is the unit. The
 *    five-argument form `(…, queries, legend, unit)` also works; `legend` is
 *    then ignored.
 *
 * @param title Panel title.
 * @param gridPos Panel placement.
 * @param query One expression, or a list of expressions with per-series legends.
 * @param legend Legend format for the single-query form; for the
 *   four-argument multi-query form this slot holds the unit instead.
 * @param unit Grafana unit id for the Y axis. When omitted with a list of
 *   queries, `legend` is used as the unit.
 * @returns The panel as a plain object ready for JSON serialization.
 */
function panelTimeseries(
  title: string,
  gridPos: GridPos,
  query: string | { expr: string; legend: string }[],
  legend: string,
  unit?: string,
) {
  const isMulti = Array.isArray(query);

  // Multi-query targets get refIds "A", "B", "C", … in input order.
  const targets = isMulti
    ? query.map((q, i) => ({
        expr: q.expr,
        refId: String.fromCharCode(65 + i),
        legendFormat: q.legend,
      }))
    : [{ expr: query, refId: "A", legendFormat: legend }];

  // Existing multi-query call sites pass the unit as the fourth argument;
  // without this fallback it was silently dropped from the panel.
  const resolvedUnit = unit ?? (isMulti ? legend : undefined);

  return {
    title,
    type: "timeseries",
    gridPos,
    datasource: { type: "prometheus", uid: "prometheus" },
    targets,
    fieldConfig: {
      defaults: {
        unit: resolvedUnit,
        custom: {
          drawStyle: "line",
          lineInterpolation: "smooth",
          lineWidth: 1.5,
          fillOpacity: 10,
          showPoints: "never",
        },
      },
      overrides: [],
    },
    options: {
      legend: { displayMode: "list", placement: "bottom", showLegend: true },
      tooltip: { mode: "multi", sort: "desc" },
    },
  };
}