@arcote.tech/arc-cli 0.7.6 → 0.7.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +11988 -28969
- package/package.json +22 -10
- package/src/builder/dependency-collector.ts +34 -1
- package/src/commands/platform-deploy.ts +6 -0
- package/src/deploy/bootstrap.ts +35 -7
- package/src/deploy/compose.ts +12 -0
- package/src/deploy/htpasswd.ts +20 -0
- package/src/deploy/observability-configs.ts +674 -9
- package/src/platform/server.ts +19 -1
|
@@ -216,9 +216,10 @@ compactor:
|
|
|
216
216
|
`;
|
|
217
217
|
}
|
|
218
218
|
|
|
219
|
-
/** Prometheus — accepts remote_write from the collector, scrapes itself.
|
|
220
|
-
|
|
221
|
-
|
|
219
|
+
/** Prometheus — accepts remote_write from the collector, scrapes itself.
|
|
220
|
+
* Retention is set via the `--storage.tsdb.retention.time` command-line
|
|
221
|
+
* flag in compose.ts (Prometheus rejects retention inside the YAML). */
|
|
222
|
+
export function generatePrometheusConfig(_cfg: DeployConfig): string {
|
|
222
223
|
return `# Generated by \`arc platform deploy\` — do not edit by hand.
|
|
223
224
|
global:
|
|
224
225
|
scrape_interval: 15s
|
|
@@ -232,12 +233,8 @@ scrape_configs:
|
|
|
232
233
|
static_configs:
|
|
233
234
|
- targets: [otel-collector:8888]
|
|
234
235
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
retention.time: ${retention.metrics}
|
|
238
|
-
|
|
239
|
-
# Note: remote-write inbound is enabled via the --web.enable-remote-write-receiver
|
|
240
|
-
# command-line flag (set in docker-compose), not here.
|
|
236
|
+
# remote-write inbound is enabled via the --web.enable-remote-write-receiver
|
|
237
|
+
# command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
|
|
241
238
|
`;
|
|
242
239
|
}
|
|
243
240
|
|
|
@@ -278,6 +275,581 @@ datasources:
|
|
|
278
275
|
`;
|
|
279
276
|
}
|
|
280
277
|
|
|
278
|
+
/** Grafana dashboard-provider config — points Grafana at the bind-mounted
|
|
279
|
+
* dashboards directory and tells it to re-scan for changed files every 30s. */
|
|
280
|
+
export function generateGrafanaDashboardsProvider(): string {
|
|
281
|
+
return `# Generated by \`arc platform deploy\` — do not edit by hand.
|
|
282
|
+
apiVersion: 1
|
|
283
|
+
providers:
|
|
284
|
+
- name: arc
|
|
285
|
+
orgId: 1
|
|
286
|
+
folder: Arc
|
|
287
|
+
type: file
|
|
288
|
+
disableDeletion: false
|
|
289
|
+
editable: true
|
|
290
|
+
updateIntervalSeconds: 30
|
|
291
|
+
allowUiUpdates: true
|
|
292
|
+
options:
|
|
293
|
+
path: /etc/grafana/provisioning/dashboards/arc
|
|
294
|
+
foldersFromFilesStructure: false
|
|
295
|
+
`;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
/** Top-level "Arc Service Overview" dashboard. Single comprehensive panel
|
|
299
|
+
* set that answers 80% of operator questions: traffic volume, error
|
|
300
|
+
* rate, latency percentiles, per-command rate/latency, DB activity, error logs.
|
|
301
|
+
* Designed for Grafana 11+; uses the auto-provisioned Prometheus / Loki /
|
|
302
|
+
* Tempo datasources by UID so it works without manual setup. */
|
|
303
|
+
export function generateArcOverviewDashboard(): string {
|
|
304
|
+
const dashboard = {
|
|
305
|
+
title: "Arc Service Overview",
|
|
306
|
+
uid: "arc-overview",
|
|
307
|
+
schemaVersion: 39,
|
|
308
|
+
version: 1,
|
|
309
|
+
refresh: "30s",
|
|
310
|
+
time: { from: "now-1h", to: "now" },
|
|
311
|
+
timepicker: {},
|
|
312
|
+
tags: ["arc", "auto-provisioned"],
|
|
313
|
+
templating: {
|
|
314
|
+
list: [
|
|
315
|
+
{
|
|
316
|
+
name: "service",
|
|
317
|
+
label: "Service",
|
|
318
|
+
type: "query",
|
|
319
|
+
datasource: { type: "prometheus", uid: "prometheus" },
|
|
320
|
+
query: "label_values(traces_spanmetrics_calls_total, service_name)",
|
|
321
|
+
refresh: 2,
|
|
322
|
+
includeAll: false,
|
|
323
|
+
multi: false,
|
|
324
|
+
current: { text: "arc-prod", value: "arc-prod" },
|
|
325
|
+
},
|
|
326
|
+
],
|
|
327
|
+
},
|
|
328
|
+
panels: [
|
|
329
|
+
// Row: top-line stats
|
|
330
|
+
panelStat(
|
|
331
|
+
"Request rate (req/s)",
|
|
332
|
+
{ x: 0, y: 0, w: 6, h: 4 },
|
|
333
|
+
'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))',
|
|
334
|
+
"reqps",
|
|
335
|
+
),
|
|
336
|
+
panelStat(
|
|
337
|
+
"Error rate (%)",
|
|
338
|
+
{ x: 6, y: 0, w: 6, h: 4 },
|
|
339
|
+
'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100',
|
|
340
|
+
"percent",
|
|
341
|
+
{ red: 1, orange: 0.1 },
|
|
342
|
+
),
|
|
343
|
+
panelStat(
|
|
344
|
+
"P99 latency",
|
|
345
|
+
{ x: 12, y: 0, w: 6, h: 4 },
|
|
346
|
+
'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
347
|
+
"ms",
|
|
348
|
+
{ red: 1000, orange: 300 },
|
|
349
|
+
),
|
|
350
|
+
panelStat(
|
|
351
|
+
"Active commands/sec",
|
|
352
|
+
{ x: 18, y: 0, w: 6, h: 4 },
|
|
353
|
+
'sum(rate(arc_commands_total{service_name="$service"}[5m]))',
|
|
354
|
+
"ops",
|
|
355
|
+
),
|
|
356
|
+
|
|
357
|
+
// Row: request volume + latency over time
|
|
358
|
+
panelTimeseries(
|
|
359
|
+
"Request rate by route",
|
|
360
|
+
{ x: 0, y: 4, w: 12, h: 8 },
|
|
361
|
+
'sum by (span_name) (rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))',
|
|
362
|
+
"{{span_name}}",
|
|
363
|
+
"reqps",
|
|
364
|
+
),
|
|
365
|
+
panelTimeseries(
|
|
366
|
+
"Latency percentiles",
|
|
367
|
+
{ x: 12, y: 4, w: 12, h: 8 },
|
|
368
|
+
[
|
|
369
|
+
{
|
|
370
|
+
expr: 'histogram_quantile(0.5, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
371
|
+
legend: "p50",
|
|
372
|
+
},
|
|
373
|
+
{
|
|
374
|
+
expr: 'histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
375
|
+
legend: "p95",
|
|
376
|
+
},
|
|
377
|
+
{
|
|
378
|
+
expr: 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
379
|
+
legend: "p99",
|
|
380
|
+
},
|
|
381
|
+
],
|
|
382
|
+
"ms",
|
|
383
|
+
),
|
|
384
|
+
|
|
385
|
+
// Row: commands
|
|
386
|
+
panelTimeseries(
|
|
387
|
+
"Commands per second",
|
|
388
|
+
{ x: 0, y: 12, w: 12, h: 8 },
|
|
389
|
+
'sum by (arc_command_name) (rate(arc_commands_total{service_name="$service"}[1m]))',
|
|
390
|
+
"{{arc_command_name}}",
|
|
391
|
+
"ops",
|
|
392
|
+
),
|
|
393
|
+
panelTimeseries(
|
|
394
|
+
"Command p95 latency",
|
|
395
|
+
{ x: 12, y: 12, w: 12, h: 8 },
|
|
396
|
+
'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))',
|
|
397
|
+
"{{arc_command_name}}",
|
|
398
|
+
"ms",
|
|
399
|
+
),
|
|
400
|
+
|
|
401
|
+
// Row: database
|
|
402
|
+
panelTimeseries(
|
|
403
|
+
"DB find ops/sec by collection",
|
|
404
|
+
{ x: 0, y: 20, w: 12, h: 8 },
|
|
405
|
+
'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))',
|
|
406
|
+
"{{db_collection_name}}",
|
|
407
|
+
"ops",
|
|
408
|
+
),
|
|
409
|
+
panelTimeseries(
|
|
410
|
+
"DB find p95 latency",
|
|
411
|
+
{ x: 12, y: 20, w: 12, h: 8 },
|
|
412
|
+
'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))',
|
|
413
|
+
"{{db_collection_name}}",
|
|
414
|
+
"ms",
|
|
415
|
+
),
|
|
416
|
+
|
|
417
|
+
// Row: logs panel for the service (Loki)
|
|
418
|
+
{
|
|
419
|
+
title: "Recent error logs",
|
|
420
|
+
type: "logs",
|
|
421
|
+
gridPos: { x: 0, y: 28, w: 24, h: 8 },
|
|
422
|
+
datasource: { type: "loki", uid: "loki" },
|
|
423
|
+
targets: [
|
|
424
|
+
{
|
|
425
|
+
expr: '{service_name="$service"} |= `ERROR`',
|
|
426
|
+
refId: "A",
|
|
427
|
+
},
|
|
428
|
+
],
|
|
429
|
+
options: {
|
|
430
|
+
showTime: true,
|
|
431
|
+
showLabels: false,
|
|
432
|
+
showCommonLabels: false,
|
|
433
|
+
wrapLogMessage: true,
|
|
434
|
+
enableLogDetails: true,
|
|
435
|
+
dedupStrategy: "none",
|
|
436
|
+
sortOrder: "Descending",
|
|
437
|
+
},
|
|
438
|
+
},
|
|
439
|
+
],
|
|
440
|
+
};
|
|
441
|
+
return JSON.stringify(dashboard, null, 2);
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
/** Recent-traces dashboard — convenience entry point. Two Tempo panels that
|
|
445
|
+
* run TraceQL searches for the selected service: slow traces and errors. */
|
|
446
|
+
export function generateArcTracesDashboard(): string {
|
|
447
|
+
const dashboard = {
|
|
448
|
+
title: "Arc Recent Traces",
|
|
449
|
+
uid: "arc-traces",
|
|
450
|
+
schemaVersion: 39,
|
|
451
|
+
version: 1,
|
|
452
|
+
refresh: "1m",
|
|
453
|
+
time: { from: "now-1h", to: "now" },
|
|
454
|
+
tags: ["arc", "auto-provisioned"],
|
|
455
|
+
templating: {
|
|
456
|
+
list: [
|
|
457
|
+
{
|
|
458
|
+
name: "service",
|
|
459
|
+
label: "Service",
|
|
460
|
+
type: "query",
|
|
461
|
+
datasource: { type: "prometheus", uid: "prometheus" },
|
|
462
|
+
query: "label_values(traces_spanmetrics_calls_total, service_name)",
|
|
463
|
+
refresh: 2,
|
|
464
|
+
current: { text: "arc-prod", value: "arc-prod" },
|
|
465
|
+
},
|
|
466
|
+
],
|
|
467
|
+
},
|
|
468
|
+
panels: [
|
|
469
|
+
{
|
|
470
|
+
title: "Slowest traces (p95 ≥ 500ms)",
|
|
471
|
+
type: "traces",
|
|
472
|
+
gridPos: { x: 0, y: 0, w: 24, h: 14 },
|
|
473
|
+
datasource: { type: "tempo", uid: "tempo" },
|
|
474
|
+
targets: [
|
|
475
|
+
{
|
|
476
|
+
queryType: "traceql",
|
|
477
|
+
query: '{resource.service.name = "$service" && duration > 500ms}',
|
|
478
|
+
refId: "A",
|
|
479
|
+
limit: 20,
|
|
480
|
+
},
|
|
481
|
+
],
|
|
482
|
+
},
|
|
483
|
+
{
|
|
484
|
+
title: "Recent errors",
|
|
485
|
+
type: "traces",
|
|
486
|
+
gridPos: { x: 0, y: 14, w: 24, h: 14 },
|
|
487
|
+
datasource: { type: "tempo", uid: "tempo" },
|
|
488
|
+
targets: [
|
|
489
|
+
{
|
|
490
|
+
queryType: "traceql",
|
|
491
|
+
query: '{resource.service.name = "$service" && status = error}',
|
|
492
|
+
refId: "A",
|
|
493
|
+
limit: 20,
|
|
494
|
+
},
|
|
495
|
+
],
|
|
496
|
+
},
|
|
497
|
+
],
|
|
498
|
+
};
|
|
499
|
+
return JSON.stringify(dashboard, null, 2);
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
/** Service Map / Topology dashboard — leverages Tempo's metrics_generator
|
|
503
|
+
* service-graph output (`traces_service_graph_request_*`). Shows the call
|
|
504
|
+
* graph between services with rate + latency on each edge. */
|
|
505
|
+
export function generateArcServiceMapDashboard(): string {
|
|
506
|
+
const dashboard = {
|
|
507
|
+
title: "Arc Service Map",
|
|
508
|
+
uid: "arc-service-map",
|
|
509
|
+
schemaVersion: 39,
|
|
510
|
+
version: 1,
|
|
511
|
+
refresh: "30s",
|
|
512
|
+
time: { from: "now-1h", to: "now" },
|
|
513
|
+
tags: ["arc", "auto-provisioned"],
|
|
514
|
+
panels: [
|
|
515
|
+
// Service-to-service call rate
|
|
516
|
+
panelTimeseries(
|
|
517
|
+
"Service-to-service request rate",
|
|
518
|
+
{ x: 0, y: 0, w: 24, h: 9 },
|
|
519
|
+
'sum by (client, server) (rate(traces_service_graph_request_total[1m]))',
|
|
520
|
+
"{{client}} → {{server}}",
|
|
521
|
+
"reqps",
|
|
522
|
+
),
|
|
523
|
+
// Edge p95 latency
|
|
524
|
+
panelTimeseries(
|
|
525
|
+
"Inter-service p95 latency",
|
|
526
|
+
{ x: 0, y: 9, w: 24, h: 9 },
|
|
527
|
+
'histogram_quantile(0.95, sum by (client, server, le) (rate(traces_service_graph_request_server_seconds_bucket[5m]))) * 1000',
|
|
528
|
+
"{{client}} → {{server}}",
|
|
529
|
+
"ms",
|
|
530
|
+
),
|
|
531
|
+
// Service graph node-degree table (which services talk to which)
|
|
532
|
+
{
|
|
533
|
+
title: "Service-graph edges (last 5m)",
|
|
534
|
+
type: "table",
|
|
535
|
+
gridPos: { x: 0, y: 18, w: 24, h: 8 },
|
|
536
|
+
datasource: { type: "prometheus", uid: "prometheus" },
|
|
537
|
+
targets: [
|
|
538
|
+
{
|
|
539
|
+
expr: "sum by (client, server) (increase(traces_service_graph_request_total[5m]))",
|
|
540
|
+
refId: "A",
|
|
541
|
+
instant: true,
|
|
542
|
+
format: "table",
|
|
543
|
+
},
|
|
544
|
+
],
|
|
545
|
+
transformations: [
|
|
546
|
+
{ id: "organize", options: { excludeByName: { Time: true } } },
|
|
547
|
+
{ id: "sortBy", options: { sort: [{ field: "Value", desc: true }] } },
|
|
548
|
+
],
|
|
549
|
+
},
|
|
550
|
+
],
|
|
551
|
+
};
|
|
552
|
+
return JSON.stringify(dashboard, null, 2);
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
/** Logs Explorer — Loki-focused dashboard. Ingested/error counts, log
|
|
556
|
+
* volume by severity, and a live tail with ad-hoc $search filtering.
|
|
557
|
+
* Trace links come from derivedFields (configured in datasources.yaml). */
|
|
558
|
+
export function generateArcLogsDashboard(): string {
|
|
559
|
+
const dashboard = {
|
|
560
|
+
title: "Arc Logs Explorer",
|
|
561
|
+
uid: "arc-logs",
|
|
562
|
+
schemaVersion: 39,
|
|
563
|
+
version: 1,
|
|
564
|
+
refresh: "30s",
|
|
565
|
+
time: { from: "now-1h", to: "now" },
|
|
566
|
+
tags: ["arc", "auto-provisioned"],
|
|
567
|
+
templating: {
|
|
568
|
+
list: [
|
|
569
|
+
{
|
|
570
|
+
name: "service",
|
|
571
|
+
label: "Service",
|
|
572
|
+
type: "query",
|
|
573
|
+
datasource: { type: "loki", uid: "loki" },
|
|
574
|
+
query: "label_values(service_name)",
|
|
575
|
+
refresh: 2,
|
|
576
|
+
current: { text: "arc-prod", value: "arc-prod" },
|
|
577
|
+
},
|
|
578
|
+
{
|
|
579
|
+
name: "search",
|
|
580
|
+
label: "Filter",
|
|
581
|
+
type: "textbox",
|
|
582
|
+
query: "",
|
|
583
|
+
current: { text: "", value: "" },
|
|
584
|
+
},
|
|
585
|
+
],
|
|
586
|
+
},
|
|
587
|
+
panels: [
|
|
588
|
+
// Stat: total log lines, last hour
|
|
589
|
+
panelStat(
|
|
590
|
+
"Logs ingested (1h)",
|
|
591
|
+
{ x: 0, y: 0, w: 6, h: 4 },
|
|
592
|
+
'sum(increase({service_name="$service"}[1h]))',
|
|
593
|
+
"short",
|
|
594
|
+
),
|
|
595
|
+
// Stat: errors (last hour)
|
|
596
|
+
panelStat(
|
|
597
|
+
"Errors (1h)",
|
|
598
|
+
{ x: 6, y: 0, w: 6, h: 4 },
|
|
599
|
+
'sum(increase({service_name="$service", severity_text=~"ERROR|FATAL"}[1h]))',
|
|
600
|
+
"short",
|
|
601
|
+
{ orange: 1, red: 50 },
|
|
602
|
+
),
|
|
603
|
+
// Time series: log volume by severity
|
|
604
|
+
{
|
|
605
|
+
title: "Log volume by severity",
|
|
606
|
+
type: "timeseries",
|
|
607
|
+
gridPos: { x: 12, y: 0, w: 12, h: 8 },
|
|
608
|
+
datasource: { type: "loki", uid: "loki" },
|
|
609
|
+
targets: [
|
|
610
|
+
{
|
|
611
|
+
expr: 'sum by (severity_text) (count_over_time({service_name="$service"} |~ "$search" [$__interval]))',
|
|
612
|
+
refId: "A",
|
|
613
|
+
legendFormat: "{{severity_text}}",
|
|
614
|
+
},
|
|
615
|
+
],
|
|
616
|
+
fieldConfig: {
|
|
617
|
+
defaults: {
|
|
618
|
+
unit: "short",
|
|
619
|
+
custom: {
|
|
620
|
+
drawStyle: "bars",
|
|
621
|
+
fillOpacity: 50,
|
|
622
|
+
lineWidth: 0,
|
|
623
|
+
stacking: { mode: "normal", group: "A" },
|
|
624
|
+
},
|
|
625
|
+
},
|
|
626
|
+
overrides: [],
|
|
627
|
+
},
|
|
628
|
+
options: {
|
|
629
|
+
legend: { displayMode: "list", placement: "bottom", showLegend: true },
|
|
630
|
+
tooltip: { mode: "multi", sort: "desc" },
|
|
631
|
+
},
|
|
632
|
+
},
|
|
633
|
+
// Tail: live logs (filtered)
|
|
634
|
+
{
|
|
635
|
+
title: "Live tail (filtered by $search)",
|
|
636
|
+
type: "logs",
|
|
637
|
+
gridPos: { x: 0, y: 8, w: 24, h: 18 },
|
|
638
|
+
datasource: { type: "loki", uid: "loki" },
|
|
639
|
+
targets: [
|
|
640
|
+
{
|
|
641
|
+
expr: '{service_name="$service"} |~ "$search"',
|
|
642
|
+
refId: "A",
|
|
643
|
+
},
|
|
644
|
+
],
|
|
645
|
+
options: {
|
|
646
|
+
showTime: true,
|
|
647
|
+
showLabels: false,
|
|
648
|
+
showCommonLabels: false,
|
|
649
|
+
wrapLogMessage: true,
|
|
650
|
+
enableLogDetails: true,
|
|
651
|
+
dedupStrategy: "none",
|
|
652
|
+
sortOrder: "Descending",
|
|
653
|
+
},
|
|
654
|
+
},
|
|
655
|
+
],
|
|
656
|
+
};
|
|
657
|
+
return JSON.stringify(dashboard, null, 2);
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
/** Tail Sampling Insights — otel-collector self-metrics that tell us
|
|
661
|
+
* how aggressively we're sampling, which policies fire most, and
|
|
662
|
+
* whether the exporter is keeping up. */
|
|
663
|
+
export function generateArcSamplingDashboard(): string {
|
|
664
|
+
const dashboard = {
|
|
665
|
+
title: "Arc Tail Sampling & Collector Health",
|
|
666
|
+
uid: "arc-sampling",
|
|
667
|
+
schemaVersion: 39,
|
|
668
|
+
version: 1,
|
|
669
|
+
refresh: "30s",
|
|
670
|
+
time: { from: "now-3h", to: "now" },
|
|
671
|
+
tags: ["arc", "auto-provisioned"],
|
|
672
|
+
panels: [
|
|
673
|
+
panelStat(
|
|
674
|
+
"Spans received/sec",
|
|
675
|
+
{ x: 0, y: 0, w: 6, h: 4 },
|
|
676
|
+
"sum(rate(otelcol_receiver_accepted_spans[5m]))",
|
|
677
|
+
"ops",
|
|
678
|
+
),
|
|
679
|
+
panelStat(
|
|
680
|
+
"Spans exported/sec (sampled)",
|
|
681
|
+
{ x: 6, y: 0, w: 6, h: 4 },
|
|
682
|
+
"sum(rate(otelcol_exporter_sent_spans[5m]))",
|
|
683
|
+
"ops",
|
|
684
|
+
),
|
|
685
|
+
panelStat(
|
|
686
|
+
"Spans dropped (refused) / 5m",
|
|
687
|
+
{ x: 12, y: 0, w: 6, h: 4 },
|
|
688
|
+
"sum(increase(otelcol_receiver_refused_spans[5m]))",
|
|
689
|
+
"short",
|
|
690
|
+
{ orange: 1, red: 100 },
|
|
691
|
+
),
|
|
692
|
+
panelStat(
|
|
693
|
+
"Export failures / 5m",
|
|
694
|
+
{ x: 18, y: 0, w: 6, h: 4 },
|
|
695
|
+
"sum(increase(otelcol_exporter_send_failed_spans[5m]))",
|
|
696
|
+
"short",
|
|
697
|
+
{ orange: 1, red: 50 },
|
|
698
|
+
),
|
|
699
|
+
|
|
700
|
+
panelTimeseries(
|
|
701
|
+
"Tail-sampling policy decisions",
|
|
702
|
+
{ x: 0, y: 4, w: 12, h: 8 },
|
|
703
|
+
'sum by (policy) (rate(otelcol_processor_tail_sampling_count_traces_sampled{sampled="true"}[1m]))',
|
|
704
|
+
"{{policy}} sampled",
|
|
705
|
+
"ops",
|
|
706
|
+
),
|
|
707
|
+
panelTimeseries(
|
|
708
|
+
"Receiver vs Exporter (effective sampling rate)",
|
|
709
|
+
{ x: 12, y: 4, w: 12, h: 8 },
|
|
710
|
+
[
|
|
711
|
+
{
|
|
712
|
+
expr: "sum(rate(otelcol_receiver_accepted_spans[1m]))",
|
|
713
|
+
legend: "received",
|
|
714
|
+
},
|
|
715
|
+
{
|
|
716
|
+
expr: "sum(rate(otelcol_exporter_sent_spans[1m]))",
|
|
717
|
+
legend: "exported",
|
|
718
|
+
},
|
|
719
|
+
],
|
|
720
|
+
"ops",
|
|
721
|
+
),
|
|
722
|
+
|
|
723
|
+
panelTimeseries(
|
|
724
|
+
"Collector queue size (BatchSpanProcessor)",
|
|
725
|
+
{ x: 0, y: 12, w: 12, h: 8 },
|
|
726
|
+
"otelcol_processor_batch_batch_send_size_sum / clamp_min(otelcol_processor_batch_batch_send_size_count, 1)",
|
|
727
|
+
"avg batch size",
|
|
728
|
+
"short",
|
|
729
|
+
),
|
|
730
|
+
panelTimeseries(
|
|
731
|
+
"Collector process memory",
|
|
732
|
+
{ x: 12, y: 12, w: 12, h: 8 },
|
|
733
|
+
"process_resident_memory_bytes{job=\"otel-collector\"}",
|
|
734
|
+
"RSS",
|
|
735
|
+
"bytes",
|
|
736
|
+
),
|
|
737
|
+
],
|
|
738
|
+
};
|
|
739
|
+
return JSON.stringify(dashboard, null, 2);
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
/** Per-Command drill-down — service + command selectors feeding call rate,
|
|
743
|
+
* p50/p95/p99 latency, and recent Tempo traces for the chosen command.
|
|
744
|
+
* Meant as the drill-down target for "Arc Service Overview" (no panel link is defined there yet). */
|
|
745
|
+
export function generateArcCommandDashboard(): string {
|
|
746
|
+
const dashboard = {
|
|
747
|
+
title: "Arc Command Drill-Down",
|
|
748
|
+
uid: "arc-command",
|
|
749
|
+
schemaVersion: 39,
|
|
750
|
+
version: 1,
|
|
751
|
+
refresh: "30s",
|
|
752
|
+
time: { from: "now-3h", to: "now" },
|
|
753
|
+
tags: ["arc", "auto-provisioned"],
|
|
754
|
+
templating: {
|
|
755
|
+
list: [
|
|
756
|
+
{
|
|
757
|
+
name: "service",
|
|
758
|
+
label: "Service",
|
|
759
|
+
type: "query",
|
|
760
|
+
datasource: { type: "prometheus", uid: "prometheus" },
|
|
761
|
+
query: "label_values(arc_commands_total, service_name)",
|
|
762
|
+
refresh: 2,
|
|
763
|
+
current: { text: "arc-prod", value: "arc-prod" },
|
|
764
|
+
},
|
|
765
|
+
{
|
|
766
|
+
name: "command",
|
|
767
|
+
label: "Command",
|
|
768
|
+
type: "query",
|
|
769
|
+
datasource: { type: "prometheus", uid: "prometheus" },
|
|
770
|
+
query:
|
|
771
|
+
'label_values(arc_commands_total{service_name="$service"}, arc_command_name)',
|
|
772
|
+
refresh: 2,
|
|
773
|
+
includeAll: false,
|
|
774
|
+
multi: false,
|
|
775
|
+
},
|
|
776
|
+
],
|
|
777
|
+
},
|
|
778
|
+
panels: [
|
|
779
|
+
panelStat(
|
|
780
|
+
"Call rate",
|
|
781
|
+
{ x: 0, y: 0, w: 6, h: 4 },
|
|
782
|
+
'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[5m]))',
|
|
783
|
+
"ops",
|
|
784
|
+
),
|
|
785
|
+
panelStat(
|
|
786
|
+
"P50 latency",
|
|
787
|
+
{ x: 6, y: 0, w: 6, h: 4 },
|
|
788
|
+
'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
789
|
+
"ms",
|
|
790
|
+
),
|
|
791
|
+
panelStat(
|
|
792
|
+
"P95 latency",
|
|
793
|
+
{ x: 12, y: 0, w: 6, h: 4 },
|
|
794
|
+
'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
795
|
+
"ms",
|
|
796
|
+
{ orange: 200, red: 1000 },
|
|
797
|
+
),
|
|
798
|
+
panelStat(
|
|
799
|
+
"P99 latency",
|
|
800
|
+
{ x: 18, y: 0, w: 6, h: 4 },
|
|
801
|
+
'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
802
|
+
"ms",
|
|
803
|
+
{ orange: 500, red: 2000 },
|
|
804
|
+
),
|
|
805
|
+
|
|
806
|
+
panelTimeseries(
|
|
807
|
+
"Call rate over time",
|
|
808
|
+
{ x: 0, y: 4, w: 12, h: 8 },
|
|
809
|
+
'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[1m]))',
|
|
810
|
+
"calls/s",
|
|
811
|
+
"ops",
|
|
812
|
+
),
|
|
813
|
+
panelTimeseries(
|
|
814
|
+
"Latency percentiles",
|
|
815
|
+
{ x: 12, y: 4, w: 12, h: 8 },
|
|
816
|
+
[
|
|
817
|
+
{
|
|
818
|
+
expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
819
|
+
legend: "p50",
|
|
820
|
+
},
|
|
821
|
+
{
|
|
822
|
+
expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
823
|
+
legend: "p95",
|
|
824
|
+
},
|
|
825
|
+
{
|
|
826
|
+
expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
827
|
+
legend: "p99",
|
|
828
|
+
},
|
|
829
|
+
],
|
|
830
|
+
"ms",
|
|
831
|
+
),
|
|
832
|
+
|
|
833
|
+
// Tempo traces for this specific command
|
|
834
|
+
{
|
|
835
|
+
title: "Recent traces (sampled)",
|
|
836
|
+
type: "traces",
|
|
837
|
+
gridPos: { x: 0, y: 12, w: 24, h: 14 },
|
|
838
|
+
datasource: { type: "tempo", uid: "tempo" },
|
|
839
|
+
targets: [
|
|
840
|
+
{
|
|
841
|
+
queryType: "traceql",
|
|
842
|
+
query: '{resource.service.name = "$service" && name = "command.$command"}',
|
|
843
|
+
refId: "A",
|
|
844
|
+
limit: 20,
|
|
845
|
+
},
|
|
846
|
+
],
|
|
847
|
+
},
|
|
848
|
+
],
|
|
849
|
+
};
|
|
850
|
+
return JSON.stringify(dashboard, null, 2);
|
|
851
|
+
}
|
|
852
|
+
|
|
281
853
|
/** All config files needed on the host. Returns map of relative-path → contents
|
|
282
854
|
* so bootstrap can write+upload them in one pass. */
|
|
283
855
|
export function generateObservabilityConfigs(
|
|
@@ -289,5 +861,98 @@ export function generateObservabilityConfigs(
|
|
|
289
861
|
"observability/loki-config.yaml": generateLokiConfig(cfg),
|
|
290
862
|
"observability/prometheus.yml": generatePrometheusConfig(cfg),
|
|
291
863
|
"observability/grafana-datasources.yaml": generateGrafanaDatasources(),
|
|
864
|
+
"observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
|
|
865
|
+
"observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
|
|
866
|
+
"observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
|
|
867
|
+
"observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
|
|
868
|
+
"observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
|
|
869
|
+
"observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
|
|
870
|
+
"observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
|
|
871
|
+
};
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
// ---------------------------------------------------------------------------
|
|
875
|
+
// Panel builders — keep dashboards readable. Returns Grafana panel JSON.
|
|
876
|
+
// ---------------------------------------------------------------------------
|
|
877
|
+
|
|
878
|
+
interface GridPos { x: number; y: number; w: number; h: number; }
|
|
879
|
+
interface Threshold { orange?: number; red?: number; }
|
|
880
|
+
|
|
881
|
+
function panelStat(
|
|
882
|
+
title: string,
|
|
883
|
+
gridPos: GridPos,
|
|
884
|
+
expr: string,
|
|
885
|
+
unit: string,
|
|
886
|
+
thresholds?: Threshold,
|
|
887
|
+
) {
|
|
888
|
+
const steps: { color: string; value: number | null }[] = [
|
|
889
|
+
{ color: "green", value: null },
|
|
890
|
+
];
|
|
891
|
+
if (thresholds?.orange !== undefined) {
|
|
892
|
+
steps.push({ color: "orange", value: thresholds.orange });
|
|
893
|
+
}
|
|
894
|
+
if (thresholds?.red !== undefined) {
|
|
895
|
+
steps.push({ color: "red", value: thresholds.red });
|
|
896
|
+
}
|
|
897
|
+
return {
|
|
898
|
+
title,
|
|
899
|
+
type: "stat",
|
|
900
|
+
gridPos,
|
|
901
|
+
datasource: { type: "prometheus", uid: "prometheus" },
|
|
902
|
+
targets: [{ expr, refId: "A", legendFormat: title }],
|
|
903
|
+
fieldConfig: {
|
|
904
|
+
defaults: {
|
|
905
|
+
unit,
|
|
906
|
+
thresholds: { mode: "absolute", steps },
|
|
907
|
+
},
|
|
908
|
+
overrides: [],
|
|
909
|
+
},
|
|
910
|
+
options: {
|
|
911
|
+
colorMode: "value",
|
|
912
|
+
graphMode: "area",
|
|
913
|
+
justifyMode: "auto",
|
|
914
|
+
reduceOptions: { calcs: ["lastNotNull"], fields: "", values: false },
|
|
915
|
+
textMode: "auto",
|
|
916
|
+
},
|
|
917
|
+
};
|
|
918
|
+
}
|
|
919
|
+
|
|
920
|
+
function panelTimeseries(
|
|
921
|
+
title: string,
|
|
922
|
+
gridPos: GridPos,
|
|
923
|
+
query: string | { expr: string; legend: string }[],
|
|
924
|
+
legend: string,
|
|
925
|
+
unit: string,
|
|
926
|
+
) {
|
|
927
|
+
const targets = Array.isArray(query)
|
|
928
|
+
? query.map((q, i) => ({
|
|
929
|
+
expr: q.expr,
|
|
930
|
+
refId: String.fromCharCode(65 + i),
|
|
931
|
+
legendFormat: q.legend,
|
|
932
|
+
}))
|
|
933
|
+
: [{ expr: query, refId: "A", legendFormat: legend }];
|
|
934
|
+
return {
|
|
935
|
+
title,
|
|
936
|
+
type: "timeseries",
|
|
937
|
+
gridPos,
|
|
938
|
+
datasource: { type: "prometheus", uid: "prometheus" },
|
|
939
|
+
targets,
|
|
940
|
+
fieldConfig: {
|
|
941
|
+
defaults: {
|
|
942
|
+
unit,
|
|
943
|
+
custom: {
|
|
944
|
+
drawStyle: "line",
|
|
945
|
+
lineInterpolation: "smooth",
|
|
946
|
+
lineWidth: 1.5,
|
|
947
|
+
fillOpacity: 10,
|
|
948
|
+
showPoints: "never",
|
|
949
|
+
},
|
|
950
|
+
},
|
|
951
|
+
overrides: [],
|
|
952
|
+
},
|
|
953
|
+
options: {
|
|
954
|
+
legend: { displayMode: "list", placement: "bottom", showLegend: true },
|
|
955
|
+
tooltip: { mode: "multi", sort: "desc" },
|
|
956
|
+
},
|
|
292
957
|
};
|
|
293
958
|
}
|