@arcote.tech/arc-cli 0.7.6 → 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -216,9 +216,10 @@ compactor:
216
216
  `;
217
217
  }
218
218
 
219
- /** Prometheus — accepts remote_write from the collector, scrapes itself. */
220
- export function generatePrometheusConfig(cfg: DeployConfig): string {
221
- const retention = pickRetention(cfg.observability);
219
+ /** Prometheus — accepts remote_write from the collector, scrapes itself.
220
+ * Retention is set via the `--storage.tsdb.retention.time` command-line
221
+ * flag in compose.ts (Prometheus rejects retention inside the YAML). */
222
+ export function generatePrometheusConfig(_cfg: DeployConfig): string {
222
223
  return `# Generated by \`arc platform deploy\` — do not edit by hand.
223
224
  global:
224
225
  scrape_interval: 15s
@@ -232,12 +233,8 @@ scrape_configs:
232
233
  static_configs:
233
234
  - targets: [otel-collector:8888]
234
235
 
235
- storage:
236
- tsdb:
237
- retention.time: ${retention.metrics}
238
-
239
- # Note: remote-write inbound is enabled via the --web.enable-remote-write-receiver
240
- # command-line flag (set in docker-compose), not here.
236
+ # remote-write inbound is enabled via the --web.enable-remote-write-receiver
237
+ # command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
241
238
  `;
242
239
  }
243
240
 
@@ -278,6 +275,581 @@ datasources:
278
275
  `;
279
276
  }
280
277
 
278
/**
 * Renders the Grafana dashboard-provider YAML.
 *
 * Declares a single file-based provider named `arc` that loads dashboards
 * from `/etc/grafana/provisioning/dashboards/arc` into the "Arc" folder and
 * re-reads them every 30 seconds.
 *
 * @returns The provider config as YAML text, terminated by a newline.
 */
export function generateGrafanaDashboardsProvider(): string {
  const lines = [
    "# Generated by `arc platform deploy` — do not edit by hand.",
    "apiVersion: 1",
    "providers:",
    "  - name: arc",
    "    orgId: 1",
    "    folder: Arc",
    "    type: file",
    "    disableDeletion: false",
    "    editable: true",
    "    updateIntervalSeconds: 30",
    "    allowUiUpdates: true",
    "    options:",
    "      path: /etc/grafana/provisioning/dashboards/arc",
    "      foldersFromFilesStructure: false",
  ];
  return lines.join("\n") + "\n";
}
297
+
298
/**
 * Builds the "Arc Service Overview" dashboard (uid `arc-overview`).
 *
 * Panels, top to bottom: four headline stats (request rate, error rate,
 * p99 latency, command rate), request rate by route and latency
 * percentiles, per-command rate and p95 latency, DB find rate and p95
 * latency per collection, and a Loki panel of log lines containing `ERROR`.
 * Datasources are referenced by the fixed UIDs `prometheus` and `loki`, and
 * every query is scoped by the `$service` template variable.
 *
 * @returns The dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcOverviewDashboard(): string {
  const prometheus = { type: "prometheus", uid: "prometheus" };
  const byService = 'service_name="$service"';
  const serverSpans = `${byService}, span_kind="SPAN_KIND_SERVER"`;

  /** Per-second rate of server-side spans over the given range window. */
  const serverCallRate = (window: string): string =>
    `rate(traces_spanmetrics_calls_total{${serverSpans}}[${window}])`;

  /** Latency quantile of server-side spans over a 5m window. */
  const serverLatency = (quantile: string): string =>
    `histogram_quantile(${quantile}, sum(rate(traces_spanmetrics_latency_bucket{${serverSpans}}[5m])) by (le))`;

  const dashboard = {
    title: "Arc Service Overview",
    uid: "arc-overview",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-1h", to: "now" },
    timepicker: {},
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: prometheus,
          query: "label_values(traces_spanmetrics_calls_total, service_name)",
          refresh: 2,
          includeAll: false,
          multi: false,
          current: { text: "arc-prod", value: "arc-prod" },
        },
      ],
    },
    panels: [
      // Row: top-line stats
      panelStat(
        "Request rate (req/s)",
        { x: 0, y: 0, w: 6, h: 4 },
        `sum(${serverCallRate("5m")})`,
        "reqps",
      ),
      panelStat(
        "Error rate (%)",
        { x: 6, y: 0, w: 6, h: 4 },
        // clamp_min keeps the ratio defined when there is no traffic.
        `sum(rate(traces_spanmetrics_calls_total{${serverSpans}, status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(${serverCallRate("5m")}), 0.001) * 100`,
        "percent",
        { red: 1, orange: 0.1 },
      ),
      panelStat(
        "P99 latency",
        { x: 12, y: 0, w: 6, h: 4 },
        serverLatency("0.99"),
        "ms",
        { red: 1000, orange: 300 },
      ),
      panelStat(
        "Active commands/sec",
        { x: 18, y: 0, w: 6, h: 4 },
        `sum(rate(arc_commands_total{${byService}}[5m]))`,
        "ops",
      ),

      // Row: request volume + latency over time
      panelTimeseries(
        "Request rate by route",
        { x: 0, y: 4, w: 12, h: 8 },
        `sum by (span_name) (${serverCallRate("1m")})`,
        "{{span_name}}",
        "reqps",
      ),
      panelTimeseries(
        "Latency percentiles",
        { x: 12, y: 4, w: 12, h: 8 },
        [
          { expr: serverLatency("0.5"), legend: "p50" },
          { expr: serverLatency("0.95"), legend: "p95" },
          { expr: serverLatency("0.99"), legend: "p99" },
        ],
        // Multi-query panels take their legend from each query; the unit
        // must still be passed in the fifth position or it is dropped.
        "",
        "ms",
      ),

      // Row: commands
      panelTimeseries(
        "Commands per second",
        { x: 0, y: 12, w: 12, h: 8 },
        `sum by (arc_command_name) (rate(arc_commands_total{${byService}}[1m]))`,
        "{{arc_command_name}}",
        "ops",
      ),
      panelTimeseries(
        "Command p95 latency",
        { x: 12, y: 12, w: 12, h: 8 },
        `histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{${byService}}[5m])))`,
        "{{arc_command_name}}",
        "ms",
      ),

      // Row: database
      panelTimeseries(
        "DB find ops/sec by collection",
        { x: 0, y: 20, w: 12, h: 8 },
        `sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{${byService}}[1m]))`,
        "{{db_collection_name}}",
        "ops",
      ),
      panelTimeseries(
        "DB find p95 latency",
        { x: 12, y: 20, w: 12, h: 8 },
        `histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{${byService}}[5m])))`,
        "{{db_collection_name}}",
        "ms",
      ),

      // Row: logs panel for the service (Loki)
      {
        title: "Recent error logs",
        type: "logs",
        gridPos: { x: 0, y: 28, w: 24, h: 8 },
        datasource: { type: "loki", uid: "loki" },
        targets: [
          {
            expr: '{service_name="$service"} |= `ERROR`',
            refId: "A",
          },
        ],
        options: {
          showTime: true,
          showLabels: false,
          showCommonLabels: false,
          wrapLogMessage: true,
          enableLogDetails: true,
          dedupStrategy: "none",
          sortOrder: "Descending",
        },
      },
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
443
+
444
/**
 * Builds the "Arc Recent Traces" dashboard (uid `arc-traces`).
 *
 * Two Tempo-backed panels, each running a TraceQL search scoped to the
 * `$service` template variable: traces longer than 500ms, and traces with
 * error status. Both are rendered as tables because a TraceQL search
 * returns a list of traces, whereas the `traces` visualization renders a
 * single trace.
 *
 * @returns The dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcTracesDashboard(): string {
  const tempo = { type: "tempo", uid: "tempo" };

  /** Full-width table listing up to 20 traces that match `filter`. */
  const traceSearchPanel = (title: string, y: number, filter: string) => ({
    title,
    type: "table",
    gridPos: { x: 0, y, w: 24, h: 14 },
    datasource: tempo,
    targets: [
      {
        queryType: "traceql",
        query: `{resource.service.name = "$service" && ${filter}}`,
        refId: "A",
        limit: 20,
      },
    ],
  });

  const dashboard = {
    title: "Arc Recent Traces",
    uid: "arc-traces",
    schemaVersion: 39,
    version: 1,
    refresh: "1m",
    time: { from: "now-1h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: { type: "prometheus", uid: "prometheus" },
          query: "label_values(traces_spanmetrics_calls_total, service_name)",
          refresh: 2,
          current: { text: "arc-prod", value: "arc-prod" },
        },
      ],
    },
    panels: [
      // The filter is a plain duration cut-off, not a percentile, so the
      // title says exactly that.
      traceSearchPanel("Slow traces (duration > 500ms)", 0, "duration > 500ms"),
      traceSearchPanel("Recent errors", 14, "status = error"),
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
501
+
502
/**
 * Builds the "Arc Service Map" dashboard (uid `arc-service-map`).
 *
 * Everything is derived from the `traces_service_graph_request_*` metrics:
 * request rate per client → server edge, p95 latency per edge (converted
 * from seconds to milliseconds), and a table of edges ranked by request
 * count over the last five minutes.
 *
 * @returns The dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcServiceMapDashboard(): string {
  const edgeLegend = "{{client}} → {{server}}";

  // Service-to-service call rate
  const edgeRate = panelTimeseries(
    "Service-to-service request rate",
    { x: 0, y: 0, w: 24, h: 9 },
    "sum by (client, server) (rate(traces_service_graph_request_total[1m]))",
    edgeLegend,
    "reqps",
  );

  // Edge p95 latency
  const edgeLatency = panelTimeseries(
    "Inter-service p95 latency",
    { x: 0, y: 9, w: 24, h: 9 },
    "histogram_quantile(0.95, sum by (client, server, le) (rate(traces_service_graph_request_server_seconds_bucket[5m]))) * 1000",
    edgeLegend,
    "ms",
  );

  // Which services talk to which, busiest edge first
  const edgeTable = {
    title: "Service-graph edges (last 5m)",
    type: "table",
    gridPos: { x: 0, y: 18, w: 24, h: 8 },
    datasource: { type: "prometheus", uid: "prometheus" },
    targets: [
      {
        expr: "sum by (client, server) (increase(traces_service_graph_request_total[5m]))",
        refId: "A",
        instant: true,
        format: "table",
      },
    ],
    transformations: [
      { id: "organize", options: { excludeByName: { Time: true } } },
      { id: "sortBy", options: { sort: [{ field: "Value", desc: true }] } },
    ],
  };

  return JSON.stringify(
    {
      title: "Arc Service Map",
      uid: "arc-service-map",
      schemaVersion: 39,
      version: 1,
      refresh: "30s",
      time: { from: "now-1h", to: "now" },
      tags: ["arc", "auto-provisioned"],
      panels: [edgeRate, edgeLatency, edgeTable],
    },
    null,
    2,
  );
}
554
+
555
/**
 * Builds the "Arc Logs Explorer" dashboard (uid `arc-logs`).
 *
 * All panels query Loki and are scoped by the `$service` template variable:
 * two stats (total lines and ERROR/FATAL lines over the last hour), a
 * stacked bar chart of log volume by `severity_text`, and a log panel.
 * The chart and the log panel are additionally filtered by the free-text
 * `$search` variable, applied as a regex line filter.
 *
 * @returns The dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcLogsDashboard(): string {
  const loki = { type: "loki", uid: "loki" };
  const stream = '{service_name="$service"}';

  /**
   * Stat panel backed by Loki. `panelStat` always targets the Prometheus
   * datasource, so the datasource and query target are overridden here; the
   * query runs as an instant query because the stat shows a single total.
   */
  const lokiStat = (...args: Parameters<typeof panelStat>) => {
    const [title, , expr] = args;
    return {
      ...panelStat(...args),
      datasource: loki,
      targets: [
        { expr, refId: "A", legendFormat: title, queryType: "instant" },
      ],
    };
  };

  const dashboard = {
    title: "Arc Logs Explorer",
    uid: "arc-logs",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-1h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: loki,
          query: "label_values(service_name)",
          refresh: 2,
          current: { text: "arc-prod", value: "arc-prod" },
        },
        {
          name: "search",
          label: "Filter",
          type: "textbox",
          query: "",
          current: { text: "", value: "" },
        },
      ],
    },
    panels: [
      // Stat: total log lines, last hour. LogQL counts lines with
      // count_over_time; increase() does not accept a log stream selector.
      lokiStat(
        "Logs ingested (1h)",
        { x: 0, y: 0, w: 6, h: 4 },
        `sum(count_over_time(${stream}[1h]))`,
        "short",
      ),
      // Stat: errors (last hour). Severity is matched with a label filter
      // rather than inside the stream selector, so it works whether
      // severity_text is an indexed label or structured metadata.
      // NOTE(review): confirm how the collector maps severity into Loki.
      lokiStat(
        "Errors (1h)",
        { x: 6, y: 0, w: 6, h: 4 },
        `sum(count_over_time(${stream} | severity_text=~"ERROR|FATAL" [1h]))`,
        "short",
        { orange: 1, red: 50 },
      ),
      // Time series: log volume by severity
      {
        title: "Log volume by severity",
        type: "timeseries",
        gridPos: { x: 12, y: 0, w: 12, h: 8 },
        datasource: loki,
        targets: [
          {
            expr: 'sum by (severity_text) (count_over_time({service_name="$service"} |~ "$search" [$__interval]))',
            refId: "A",
            legendFormat: "{{severity_text}}",
          },
        ],
        fieldConfig: {
          defaults: {
            unit: "short",
            custom: {
              drawStyle: "bars",
              fillOpacity: 50,
              lineWidth: 0,
              stacking: { mode: "normal", group: "A" },
            },
          },
          overrides: [],
        },
        options: {
          legend: { displayMode: "list", placement: "bottom", showLegend: true },
          tooltip: { mode: "multi", sort: "desc" },
        },
      },
      // Tail: live logs (filtered)
      {
        title: "Live tail (filtered by $search)",
        type: "logs",
        gridPos: { x: 0, y: 8, w: 24, h: 18 },
        datasource: loki,
        targets: [
          {
            expr: '{service_name="$service"} |~ "$search"',
            refId: "A",
          },
        ],
        options: {
          showTime: true,
          showLabels: false,
          showCommonLabels: false,
          wrapLogMessage: true,
          enableLogDetails: true,
          dedupStrategy: "none",
          sortOrder: "Descending",
        },
      },
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
659
+
660
/**
 * Builds the "Arc Tail Sampling & Collector Health" dashboard
 * (uid `arc-sampling`).
 *
 * Driven by the collector's own `otelcol_*` metrics plus its process RSS:
 * spans received vs. exported, refused spans and export failures over the
 * last five minutes, tail-sampling decisions per policy, mean batch send
 * size, and resident memory.
 *
 * @returns The dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcSamplingDashboard(): string {
  const dashboard = {
    title: "Arc Tail Sampling & Collector Health",
    uid: "arc-sampling",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-3h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    panels: [
      panelStat(
        "Spans received/sec",
        { x: 0, y: 0, w: 6, h: 4 },
        "sum(rate(otelcol_receiver_accepted_spans[5m]))",
        "ops",
      ),
      panelStat(
        "Spans exported/sec (sampled)",
        { x: 6, y: 0, w: 6, h: 4 },
        "sum(rate(otelcol_exporter_sent_spans[5m]))",
        "ops",
      ),
      panelStat(
        "Spans dropped (refused) / 5m",
        { x: 12, y: 0, w: 6, h: 4 },
        "sum(increase(otelcol_receiver_refused_spans[5m]))",
        "short",
        { orange: 1, red: 100 },
      ),
      panelStat(
        "Export failures / 5m",
        { x: 18, y: 0, w: 6, h: 4 },
        "sum(increase(otelcol_exporter_send_failed_spans[5m]))",
        "short",
        { orange: 1, red: 50 },
      ),

      panelTimeseries(
        "Tail-sampling policy decisions",
        { x: 0, y: 4, w: 12, h: 8 },
        'sum by (policy) (rate(otelcol_processor_tail_sampling_count_traces_sampled{sampled="true"}[1m]))',
        "{{policy}} sampled",
        "ops",
      ),
      panelTimeseries(
        "Receiver vs Exporter (effective sampling rate)",
        { x: 12, y: 4, w: 12, h: 8 },
        [
          {
            expr: "sum(rate(otelcol_receiver_accepted_spans[1m]))",
            legend: "received",
          },
          {
            expr: "sum(rate(otelcol_exporter_sent_spans[1m]))",
            legend: "exported",
          },
        ],
        // Multi-query panels take their legend from each query; the unit
        // must still be passed in the fifth position or it is dropped.
        "",
        "ops",
      ),

      panelTimeseries(
        // The query divides the batch-size sum by the batch count, i.e. it
        // plots the mean batch size, not a queue depth.
        "Average batch send size (batch processor)",
        { x: 0, y: 12, w: 12, h: 8 },
        "otelcol_processor_batch_batch_send_size_sum / clamp_min(otelcol_processor_batch_batch_send_size_count, 1)",
        "avg batch size",
        "short",
      ),
      panelTimeseries(
        "Collector process memory",
        { x: 12, y: 12, w: 12, h: 8 },
        'process_resident_memory_bytes{job="otel-collector"}',
        "RSS",
        "bytes",
      ),
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
741
+
742
/**
 * Builds the "Arc Command Drill-Down" dashboard (uid `arc-command`).
 *
 * Scoped by two chained template variables, `$service` and `$command`
 * (the command list is filtered by the selected service). Shows call rate
 * and p50/p95/p99 latency as stats and as time series, plus a table of
 * Tempo traces whose span name is `command.$command`.
 *
 * @returns The dashboard model serialized as 2-space-indented JSON.
 */
export function generateArcCommandDashboard(): string {
  const prometheus = { type: "prometheus", uid: "prometheus" };
  const selector = 'service_name="$service", arc_command_name="$command"';

  /** Per-second call rate of the selected command over `window`. */
  const callRate = (window: string): string =>
    `sum(rate(arc_commands_total{${selector}}[${window}]))`;

  /** Latency quantile of the selected command over a 5m window. */
  const latency = (quantile: string): string =>
    `histogram_quantile(${quantile}, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{${selector}}[5m])))`;

  const dashboard = {
    title: "Arc Command Drill-Down",
    uid: "arc-command",
    schemaVersion: 39,
    version: 1,
    refresh: "30s",
    time: { from: "now-3h", to: "now" },
    tags: ["arc", "auto-provisioned"],
    templating: {
      list: [
        {
          name: "service",
          label: "Service",
          type: "query",
          datasource: prometheus,
          query: "label_values(arc_commands_total, service_name)",
          refresh: 2,
          current: { text: "arc-prod", value: "arc-prod" },
        },
        {
          name: "command",
          label: "Command",
          type: "query",
          datasource: prometheus,
          query:
            'label_values(arc_commands_total{service_name="$service"}, arc_command_name)',
          refresh: 2,
          includeAll: false,
          multi: false,
        },
      ],
    },
    panels: [
      panelStat("Call rate", { x: 0, y: 0, w: 6, h: 4 }, callRate("5m"), "ops"),
      panelStat("P50 latency", { x: 6, y: 0, w: 6, h: 4 }, latency("0.5"), "ms"),
      panelStat(
        "P95 latency",
        { x: 12, y: 0, w: 6, h: 4 },
        latency("0.95"),
        "ms",
        { orange: 200, red: 1000 },
      ),
      panelStat(
        "P99 latency",
        { x: 18, y: 0, w: 6, h: 4 },
        latency("0.99"),
        "ms",
        { orange: 500, red: 2000 },
      ),

      panelTimeseries(
        "Call rate over time",
        { x: 0, y: 4, w: 12, h: 8 },
        callRate("1m"),
        "calls/s",
        "ops",
      ),
      panelTimeseries(
        "Latency percentiles",
        { x: 12, y: 4, w: 12, h: 8 },
        [
          { expr: latency("0.5"), legend: "p50" },
          { expr: latency("0.95"), legend: "p95" },
          { expr: latency("0.99"), legend: "p99" },
        ],
        // Multi-query panels take their legend from each query; the unit
        // must still be passed in the fifth position or it is dropped.
        "",
        "ms",
      ),

      // Tempo traces for this specific command. A TraceQL search returns a
      // list of traces, so it is shown as a table; the `traces`
      // visualization renders a single trace.
      {
        title: "Recent traces (sampled)",
        type: "table",
        gridPos: { x: 0, y: 12, w: 24, h: 14 },
        datasource: { type: "tempo", uid: "tempo" },
        targets: [
          {
            queryType: "traceql",
            query: '{resource.service.name = "$service" && name = "command.$command"}',
            refId: "A",
            limit: 20,
          },
        ],
      },
    ],
  };
  return JSON.stringify(dashboard, null, 2);
}
852
+
281
853
  /** All config files needed on the host. Returns map of relative-path → contents
282
854
  * so bootstrap can write+upload them in one pass. */
283
855
  export function generateObservabilityConfigs(
@@ -289,5 +861,98 @@ export function generateObservabilityConfigs(
289
861
  "observability/loki-config.yaml": generateLokiConfig(cfg),
290
862
  "observability/prometheus.yml": generatePrometheusConfig(cfg),
291
863
  "observability/grafana-datasources.yaml": generateGrafanaDatasources(),
864
+ "observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
865
+ "observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
866
+ "observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
867
+ "observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
868
+ "observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
869
+ "observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
870
+ "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
871
+ };
872
+ }
873
+
874
+ // ---------------------------------------------------------------------------
875
+ // Panel builders — keep dashboards readable. Returns Grafana panel JSON.
876
+ // ---------------------------------------------------------------------------
877
+
878
/** Position (`x`, `y`) and size (`w`, `h`) copied into a panel's `gridPos`. */
interface GridPos {
  x: number;
  y: number;
  w: number;
  h: number;
}

/** Optional values at which a stat panel switches to orange and to red. */
interface Threshold {
  orange?: number;
  red?: number;
}
880
+
881
/**
 * Builds a Grafana `stat` panel backed by a single Prometheus query.
 *
 * @param title      Panel title; also used as the query's legend.
 * @param gridPos    Placement of the panel on the dashboard grid.
 * @param expr       Query expression for the single target (refId "A").
 * @param unit       Grafana unit id for the value (e.g. "ms", "percent").
 * @param thresholds Optional orange/red switch-over values. The base step
 *                   is always green.
 * @returns A plain object ready to be placed in a dashboard's `panels`.
 */
function panelStat(
  title: string,
  gridPos: GridPos,
  expr: string,
  unit: string,
  thresholds?: Threshold,
) {
  // Emit the coloured steps in ascending value order even when the caller
  // configures red below orange. The sort is stable, so the usual
  // orange-then-red order is kept when orange <= red.
  const coloured = [
    { color: "orange", value: thresholds?.orange },
    { color: "red", value: thresholds?.red },
  ]
    .filter(
      (step): step is { color: string; value: number } =>
        step.value !== undefined,
    )
    .sort((a, b) => a.value - b.value);

  const steps: { color: string; value: number | null }[] = [
    { color: "green", value: null },
    ...coloured,
  ];

  return {
    title,
    type: "stat",
    gridPos,
    datasource: { type: "prometheus", uid: "prometheus" },
    targets: [{ expr, refId: "A", legendFormat: title }],
    fieldConfig: {
      defaults: {
        unit,
        thresholds: { mode: "absolute", steps },
      },
      overrides: [],
    },
    options: {
      colorMode: "value",
      graphMode: "area",
      justifyMode: "auto",
      reduceOptions: { calcs: ["lastNotNull"], fields: "", values: false },
      textMode: "auto",
    },
  };
}
919
+
920
/**
 * Builds a Grafana `timeseries` panel backed by Prometheus.
 *
 * Two call shapes are supported:
 *  - single query: `(title, gridPos, expr, legend, unit)`
 *  - multi query:  `(title, gridPos, queries, unit)`, where each query
 *    carries its own legend. The five-argument form
 *    `(title, gridPos, queries, legend, unit)` is also accepted, and
 *    `legend` is then ignored.
 *
 * @param title   Panel title.
 * @param gridPos Placement of the panel on the dashboard grid.
 * @param query   One expression, or a list of `{ expr, legend }` targets.
 *                Targets get refIds "A", "B", … in list order.
 * @param legend  Legend format for the single-query form. For the
 *                multi-query form with no `unit`, this is read as the unit.
 * @param unit    Grafana unit id for the series values.
 * @returns A plain object ready to be placed in a dashboard's `panels`.
 */
function panelTimeseries(
  title: string,
  gridPos: GridPos,
  query: string | { expr: string; legend: string }[],
  legend: string,
  unit?: string,
) {
  const isMulti = Array.isArray(query);

  // Multi-query callers pass the unit as the fourth argument, because the
  // legends live on the queries. Without this fallback the unit would be
  // dropped from the generated JSON.
  const resolvedUnit = unit ?? (isMulti ? legend : undefined);

  const targets = Array.isArray(query)
    ? query.map((q, i) => ({
        expr: q.expr,
        // 65 is "A"; refIds follow the query order.
        refId: String.fromCharCode(65 + i),
        legendFormat: q.legend,
      }))
    : [{ expr: query, refId: "A", legendFormat: legend }];

  return {
    title,
    type: "timeseries",
    gridPos,
    datasource: { type: "prometheus", uid: "prometheus" },
    targets,
    fieldConfig: {
      defaults: {
        unit: resolvedUnit,
        custom: {
          drawStyle: "line",
          lineInterpolation: "smooth",
          lineWidth: 1.5,
          fillOpacity: 10,
          showPoints: "never",
        },
      },
      overrides: [],
    },
    options: {
      legend: { displayMode: "list", placement: "bottom", showLegend: true },
      tooltip: { mode: "multi", sort: "desc" },
    },
  };
}