@arcote.tech/arc-cli 0.7.18 → 0.7.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -25870,6 +25870,7 @@ import {
25870
25870
  ATTR_SERVICE_NAME,
25871
25871
  ATTR_SERVICE_VERSION
25872
25872
  } from "@opentelemetry/semantic-conventions/incubating";
25873
+ import { format as format2 } from "util";
25873
25874
  import {
25874
25875
  context,
25875
25876
  propagation,
@@ -25880,6 +25881,41 @@ import {
25880
25881
  logs,
25881
25882
  SeverityNumber
25882
25883
  } from "@opentelemetry/api-logs";
25884
/**
 * Mirrors console output into telemetry logs.
 *
 * Each console method listed in METHODS is replaced by a wrapper that first
 * calls the original method and then forwards the formatted message to
 * `telemetry.log(level, body, attrs)`. Messages whose formatted text starts
 * with "[arc-otel]" are not forwarded, and the module-level `emitting` flag
 * prevents re-entrant forwarding while a message is being emitted. When one of
 * the arguments is an Error, its name, message and stack are attached as
 * `exception.*` attributes.
 *
 * Patching is a no-op (returning a no-op restore function) when the console is
 * already patched or `telemetry.active` is falsy.
 *
 * @param {{ active: boolean, log: Function }} telemetry - Telemetry sink.
 * @returns {() => void} Restore function. It puts back the exact functions that
 *   were installed on `console` before patching and is safe to call repeatedly.
 */
function patchConsole(telemetry) {
  if (patched || !telemetry.active) {
    return () => {};
  }
  patched = true;
  const originals = [];
  for (const [method, level] of METHODS) {
    // Keep the untouched function (not a bound copy) so that restoring puts
    // back the very same reference that was on `console` before patching.
    const original = console[method];
    if (typeof original !== "function") {
      continue;
    }
    originals.push([method, original]);
    console[method] = (...args) => {
      original.apply(console, args);
      if (emitting) {
        return;
      }
      emitting = true;
      try {
        const body = format2(...args);
        if (body.startsWith("[arc-otel]")) {
          return;
        }
        const error = args.find((arg) => arg instanceof Error);
        telemetry.log(level, body, error ? {
          "exception.type": error.name,
          "exception.message": error.message,
          "exception.stacktrace": error.stack ?? ""
        } : {});
      } catch {
        // Best effort: a telemetry failure must never break console output.
      } finally {
        emitting = false;
      }
    };
  }
  let restored = false;
  return () => {
    // A stale second call must not undo a patch installed after this one
    // was already restored.
    if (restored) {
      return;
    }
    restored = true;
    for (const [method, original] of originals) {
      console[method] = original;
    }
    patched = false;
  };
}
25883
25919
  function sanitizeAttrs(input, opts = {}) {
25884
25920
  if (!input)
25885
25921
  return {};
@@ -25939,6 +25975,7 @@ class ArcTelemetry {
25939
25975
  meter = null;
25940
25976
  histograms = new Map;
25941
25977
  counters = new Map;
25978
+ upDownCounters = new Map;
25942
25979
  constructor(config) {
25943
25980
  const mode = config.mode ?? "development";
25944
25981
  const enabled = config.enabled ?? mode !== "disabled";
@@ -26050,6 +26087,18 @@ class ArcTelemetry {
26050
26087
  counter.add(value, attrs);
26051
26088
  } catch {}
26052
26089
  }
26090
+ addUpDown(name, delta, attrs = {}) {
26091
+ if (!this.active || !this.meter)
26092
+ return;
26093
+ let counter = this.upDownCounters.get(name);
26094
+ if (!counter) {
26095
+ counter = this.meter.createUpDownCounter(name);
26096
+ this.upDownCounters.set(name, counter);
26097
+ }
26098
+ try {
26099
+ counter.add(delta, attrs);
26100
+ } catch {}
26101
+ }
26053
26102
  recordHistogram(name, value, attrs = {}) {
26054
26103
  if (!this.active || !this.meter)
26055
26104
  return;
@@ -26097,53 +26146,31 @@ function noopSpan() {
26097
26146
  function wrapDbAdapter(adapter, telemetry, dbSystem) {
26098
26147
  if (!telemetry || !telemetry.active)
26099
26148
  return adapter;
26149
+ const dbAttrs = (operation, store) => ({
26150
+ "db.system": dbSystem,
26151
+ "db.operation.name": operation,
26152
+ ...store ? { "db.collection.name": store } : {}
26153
+ });
26154
+ const measureOp = async (operation, store, fn) => {
26155
+ const start = Date.now();
26156
+ try {
26157
+ return await fn();
26158
+ } finally {
26159
+ telemetry.measureSince("arc.db.operation.duration", start, dbAttrs(operation, store));
26160
+ }
26161
+ };
26100
26162
  const wrapRead = (tx) => ({
26101
- find: async (store, options) => telemetry.startSpan(`db.find ${store}`, async (span) => {
26102
- const start = Date.now();
26103
- try {
26104
- const rows = await tx.find(store, options);
26105
- span.setAttribute("db.response.row_count", rows.length);
26106
- return rows;
26107
- } finally {
26108
- telemetry.measureSince("arc.db.find_ms", start, {
26109
- "db.system": dbSystem,
26110
- "db.collection.name": store
26111
- });
26112
- }
26113
- }, {
26114
- kind: 3,
26115
- attributes: {
26116
- "db.system": dbSystem,
26117
- "db.operation.name": "find",
26118
- "db.collection.name": store
26119
- }
26120
- })
26163
+ find: async (store, options) => telemetry.startSpan(`db.find ${store}`, async (span) => measureOp("find", store, async () => {
26164
+ const rows = await tx.find(store, options);
26165
+ span.setAttribute("db.response.row_count", rows.length);
26166
+ return rows;
26167
+ }), { kind: 3, attributes: dbAttrs("find", store) })
26121
26168
  });
26122
26169
  const wrapReadWrite = (tx) => ({
26123
26170
  ...wrapRead(tx),
26124
- set: async (store, data) => telemetry.startSpan(`db.set ${store}`, () => tx.set(store, data), {
26125
- kind: 3,
26126
- attributes: {
26127
- "db.system": dbSystem,
26128
- "db.operation.name": "set",
26129
- "db.collection.name": store
26130
- }
26131
- }),
26132
- remove: async (store, id3) => telemetry.startSpan(`db.remove ${store}`, () => tx.remove(store, id3), {
26133
- kind: 3,
26134
- attributes: {
26135
- "db.system": dbSystem,
26136
- "db.operation.name": "remove",
26137
- "db.collection.name": store
26138
- }
26139
- }),
26140
- commit: async () => telemetry.startSpan("db.commit", () => tx.commit(), {
26141
- kind: 3,
26142
- attributes: {
26143
- "db.system": dbSystem,
26144
- "db.operation.name": "commit"
26145
- }
26146
- })
26171
+ set: async (store, data) => telemetry.startSpan(`db.set ${store}`, () => measureOp("set", store, () => tx.set(store, data)), { kind: 3, attributes: dbAttrs("set", store) }),
26172
+ remove: async (store, id3) => telemetry.startSpan(`db.remove ${store}`, () => measureOp("remove", store, () => tx.remove(store, id3)), { kind: 3, attributes: dbAttrs("remove", store) }),
26173
+ commit: async () => telemetry.startSpan("db.commit", () => measureOp("commit", undefined, () => tx.commit()), { kind: 3, attributes: dbAttrs("commit") })
26147
26174
  });
26148
26175
  return new Proxy(adapter, {
26149
26176
  get(target, prop) {
@@ -26206,6 +26233,7 @@ function initServerTelemetry(config) {
26206
26233
  logger: loggerProvider.getLogger(config.serviceName),
26207
26234
  meter: meterProvider.getMeter(config.serviceName)
26208
26235
  });
26236
+ const restoreConsole = config.patchConsole !== false ? patchConsole(telemetry) : () => {};
26209
26237
  if (telemetry.config.debug) {
26210
26238
  console.log("[arc-otel] server init", {
26211
26239
  serviceName: config.serviceName,
@@ -26216,6 +26244,7 @@ function initServerTelemetry(config) {
26216
26244
  });
26217
26245
  }
26218
26246
  const shutdown = async () => {
26247
+ restoreConsole();
26219
26248
  try {
26220
26249
  await Promise.all([
26221
26250
  tracerProvider.shutdown(),
@@ -26228,8 +26257,15 @@ function initServerTelemetry(config) {
26228
26257
  };
26229
26258
  return { telemetry, shutdown };
26230
26259
  }
26231
- var DEFAULT_REDACT_KEY_PATTERN, DEFAULT_MAX_STRING_LEN = 2048, DEFAULT_MAX_JSON_LEN = 4096;
26260
+ var METHODS, patched = false, emitting = false, DEFAULT_REDACT_KEY_PATTERN, DEFAULT_MAX_STRING_LEN = 2048, DEFAULT_MAX_JSON_LEN = 4096;
26232
26261
  var init_init_server = __esm(() => {
26262
+ METHODS = [
26263
+ ["debug", "debug"],
26264
+ ["log", "info"],
26265
+ ["info", "info"],
26266
+ ["warn", "warn"],
26267
+ ["error", "error"]
26268
+ ];
26233
26269
  DEFAULT_REDACT_KEY_PATTERN = /(password|passwd|token|secret|authorization|jwt|api[_-]?key|cookie|email|credit[_-]?card|ssn)/i;
26234
26270
  });
26235
26271
 
@@ -36054,18 +36090,32 @@ function generateCaddyfile(cfg) {
36054
36090
  email ${cfg.caddy.email}`;
36055
36091
  const tlsDirective = cfg.caddy.email === "internal" ? `
36056
36092
  tls internal` : "";
36093
+ const observability = cfg.observability?.enabled === true;
36094
+ const logDirective = observability ? [" log {", " output stdout", " format json", " }"] : [];
36057
36095
  const lines = [];
36058
36096
  lines.push("# Generated by `arc platform deploy` \u2014 do not edit by hand.");
36059
36097
  lines.push("");
36060
36098
  lines.push("{");
36061
36099
  lines.push(" admin off");
36100
+ if (observability) {
36101
+ lines.push(" metrics {");
36102
+ lines.push(" per_host");
36103
+ lines.push(" }");
36104
+ }
36062
36105
  if (email)
36063
36106
  lines.push(` ${email.trim()}`);
36064
36107
  lines.push("}");
36065
36108
  lines.push("");
36109
+ if (observability) {
36110
+ lines.push(":2020 {");
36111
+ lines.push(" metrics");
36112
+ lines.push("}");
36113
+ lines.push("");
36114
+ }
36066
36115
  for (const [name, env2] of Object.entries(cfg.envs)) {
36067
36116
  lines.push(`${env2.domain} {${tlsDirective}`);
36068
- if (cfg.observability?.enabled) {
36117
+ lines.push(...logDirective);
36118
+ if (observability) {
36069
36119
  lines.push(" handle_path /otel/* {");
36070
36120
  lines.push(" reverse_proxy otel-collector:4318");
36071
36121
  lines.push(" }");
@@ -36078,13 +36128,11 @@ function generateCaddyfile(cfg) {
36078
36128
  lines.push("}");
36079
36129
  lines.push("");
36080
36130
  }
36081
- if (cfg.observability?.enabled) {
36082
- const firstEnv = Object.values(cfg.envs)[0];
36083
- if (firstEnv) {
36084
- const subdomain = cfg.observability.subdomain ?? "observability";
36085
- const apex = apexOf(firstEnv.domain);
36086
- const observabilityDomain = `${subdomain}.${apex}`;
36087
- lines.push(`${observabilityDomain} {${tlsDirective}`);
36131
+ if (observability) {
36132
+ const domain = observabilityDomain(cfg);
36133
+ if (domain) {
36134
+ lines.push(`${domain} {${tlsDirective}`);
36135
+ lines.push(...logDirective);
36088
36136
  lines.push(" basic_auth {");
36089
36137
  lines.push(" import /etc/caddy/observability-htpasswd");
36090
36138
  lines.push(" }");
@@ -36094,6 +36142,7 @@ function generateCaddyfile(cfg) {
36094
36142
  }
36095
36143
  }
36096
36144
  lines.push(`${cfg.registry.domain} {${tlsDirective}`);
36145
+ lines.push(...logDirective);
36097
36146
  lines.push(" reverse_proxy registry:5000 {");
36098
36147
  lines.push(" header_up Host {host}");
36099
36148
  lines.push(" }");
@@ -36105,6 +36154,15 @@ function generateCaddyfile(cfg) {
36105
36154
  `) + `
36106
36155
  `;
36107
36156
  }
36157
/**
 * Resolves the hostname of the observability site.
 *
 * The hostname is `<subdomain>.<apex>`, where the subdomain defaults to
 * "observability" and the apex is derived (via `apexOf`) from the domain of
 * the first entry in `cfg.envs`.
 *
 * @param {object} cfg - Platform config (`observability`, `envs`).
 * @returns {string|null} The hostname, or null when observability is not
 *   enabled or no environment is configured.
 */
function observabilityDomain(cfg) {
  const settings = cfg.observability;
  if (!settings?.enabled) {
    return null;
  }
  const [primaryEnv] = Object.values(cfg.envs);
  if (!primaryEnv) {
    return null;
  }
  const label = settings.subdomain ?? "observability";
  const apex = apexOf(primaryEnv.domain);
  return `${label}.${apex}`;
}
36108
36166
  function apexOf(host) {
36109
36167
  const parts = host.split(".");
36110
36168
  if (parts.length <= 2)
@@ -36113,6 +36171,13 @@ function apexOf(host) {
36113
36171
  }
36114
36172
 
36115
36173
  // src/deploy/compose.ts
36174
/**
 * Appends a Compose `logging:` section (json-file driver with `max-size` and
 * `max-file` options) to `lines`.
 *
 * @param {string[]} lines - Output buffer; mutated in place.
 * @param {{ maxSize?: string, maxFile?: string|number }} [options] - Rotation
 *   limits. Defaults ("10m", "3") reproduce the previously hard-coded values,
 *   so existing single-argument callers are unaffected.
 * @returns {void}
 */
function pushLogging(lines, { maxSize = "10m", maxFile = "3" } = {}) {
  lines.push(
    " logging:",
    " driver: json-file",
    " options:",
    ` max-size: "${maxSize}"`,
    ` max-file: "${maxFile}"`
  );
}
36116
36181
  function generateCompose({ cfg }) {
36117
36182
  const lines = [];
36118
36183
  lines.push("# Generated by `arc platform deploy` \u2014 do not edit by hand.");
@@ -36121,6 +36186,7 @@ function generateCompose({ cfg }) {
36121
36186
  lines.push(" caddy:");
36122
36187
  lines.push(" image: caddy:2-alpine");
36123
36188
  lines.push(" restart: unless-stopped");
36189
+ pushLogging(lines);
36124
36190
  lines.push(" ports:");
36125
36191
  lines.push(' - "80:80"');
36126
36192
  lines.push(' - "443:443"');
@@ -36133,10 +36199,15 @@ function generateCompose({ cfg }) {
36133
36199
  lines.push(" - caddy_config:/config");
36134
36200
  lines.push(" networks:");
36135
36201
  lines.push(" - arc-net");
36202
+ if (cfg.observability?.enabled) {
36203
+ lines.push(" expose:");
36204
+ lines.push(' - "2020" # Prometheus metrics endpoint (Caddyfile :2020 site)');
36205
+ }
36136
36206
  lines.push("");
36137
36207
  lines.push(" registry:");
36138
36208
  lines.push(" image: registry:2");
36139
36209
  lines.push(" restart: unless-stopped");
36210
+ pushLogging(lines);
36140
36211
  lines.push(" volumes:");
36141
36212
  lines.push(" - registry_data:/var/lib/registry");
36142
36213
  lines.push(" - ./registry-auth/htpasswd:/auth/htpasswd:ro");
@@ -36157,6 +36228,13 @@ function generateCompose({ cfg }) {
36157
36228
  lines.push(` image: \${ARC_IMAGE_${upperName}:-arc-${name}:not-deployed}`);
36158
36229
  lines.push(` container_name: arc-${name}`);
36159
36230
  lines.push(" restart: unless-stopped");
36231
+ pushLogging(lines);
36232
+ lines.push(" healthcheck:");
36233
+ lines.push(' test: ["CMD", "wget", "-qO-", "http://127.0.0.1:5005/health"]');
36234
+ lines.push(" interval: 30s");
36235
+ lines.push(" timeout: 5s");
36236
+ lines.push(" retries: 3");
36237
+ lines.push(" start_period: 20s");
36160
36238
  if (usePostgres) {
36161
36239
  lines.push(" depends_on:");
36162
36240
  lines.push(` arc-db-${name}:`);
@@ -36209,6 +36287,7 @@ function generateCompose({ cfg }) {
36209
36287
  lines.push(` image: ${image2}`);
36210
36288
  lines.push(` container_name: arc-db-${name}`);
36211
36289
  lines.push(" restart: unless-stopped");
36290
+ pushLogging(lines);
36212
36291
  lines.push(" environment:");
36213
36292
  lines.push(" POSTGRES_USER: arc");
36214
36293
  lines.push(" POSTGRES_DB: arc");
@@ -36233,9 +36312,13 @@ function generateCompose({ cfg }) {
36233
36312
  lines.push(" image: otel/opentelemetry-collector-contrib:0.114.0");
36234
36313
  lines.push(" container_name: arc-otel-collector");
36235
36314
  lines.push(" restart: unless-stopped");
36315
+ pushLogging(lines);
36316
+ lines.push(' user: "0:0"');
36236
36317
  lines.push(' command: ["--config=/etc/otelcol-contrib/config.yaml"]');
36237
36318
  lines.push(" volumes:");
36238
36319
  lines.push(" - ./observability/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro");
36320
+ lines.push(" - /:/hostfs:ro # hostmetrics root_path");
36321
+ lines.push(" - /var/run/docker.sock:/var/run/docker.sock:ro # docker_stats");
36239
36322
  lines.push(" networks: [arc-net]");
36240
36323
  lines.push(" expose:");
36241
36324
  lines.push(' - "4317" # OTLP gRPC');
@@ -36250,6 +36333,7 @@ function generateCompose({ cfg }) {
36250
36333
  lines.push(" image: grafana/tempo:2.6.1");
36251
36334
  lines.push(" container_name: arc-tempo");
36252
36335
  lines.push(" restart: unless-stopped");
36336
+ pushLogging(lines);
36253
36337
  lines.push(' command: ["-config.file=/etc/tempo.yaml"]');
36254
36338
  lines.push(' user: "0" # tempo writes to /var/tempo, owned by root in the image');
36255
36339
  lines.push(" volumes:");
@@ -36264,6 +36348,7 @@ function generateCompose({ cfg }) {
36264
36348
  lines.push(" image: grafana/loki:3.3.2");
36265
36349
  lines.push(" container_name: arc-loki");
36266
36350
  lines.push(" restart: unless-stopped");
36351
+ pushLogging(lines);
36267
36352
  lines.push(' command: ["-config.file=/etc/loki/local-config.yaml"]');
36268
36353
  lines.push(' user: "0"');
36269
36354
  lines.push(" volumes:");
@@ -36278,6 +36363,7 @@ function generateCompose({ cfg }) {
36278
36363
  lines.push(" image: prom/prometheus:v2.55.1");
36279
36364
  lines.push(" container_name: arc-prometheus");
36280
36365
  lines.push(" restart: unless-stopped");
36366
+ pushLogging(lines);
36281
36367
  lines.push(" command:");
36282
36368
  lines.push(' - "--config.file=/etc/prometheus/prometheus.yml"');
36283
36369
  lines.push(' - "--storage.tsdb.path=/prometheus"');
@@ -36291,20 +36377,47 @@ function generateCompose({ cfg }) {
36291
36377
  lines.push(" expose:");
36292
36378
  lines.push(' - "9090" # HTTP API + remote_write receiver');
36293
36379
  lines.push("");
36380
+ lines.push(" alloy:");
36381
+ lines.push(" image: grafana/alloy:v1.16.1");
36382
+ lines.push(" container_name: arc-alloy");
36383
+ lines.push(" restart: unless-stopped");
36384
+ pushLogging(lines);
36385
+ lines.push(' user: "0" # docker.sock access');
36386
+ lines.push(" command:");
36387
+ lines.push(" - run");
36388
+ lines.push(" - --server.http.listen-addr=0.0.0.0:12345");
36389
+ lines.push(" - --storage.path=/var/lib/alloy/data");
36390
+ lines.push(" - /etc/alloy/config.alloy");
36391
+ lines.push(" volumes:");
36392
+ lines.push(" - ./observability/alloy-config.alloy:/etc/alloy/config.alloy:ro");
36393
+ lines.push(" - /var/run/docker.sock:/var/run/docker.sock:ro");
36394
+ lines.push(" - alloy_data:/var/lib/alloy/data");
36395
+ lines.push(" networks: [arc-net]");
36396
+ lines.push(" expose:");
36397
+ lines.push(' - "12345" # Alloy self-metrics (Prom scrape)');
36398
+ lines.push(" depends_on:");
36399
+ lines.push(" - loki");
36400
+ lines.push("");
36294
36401
  const adminPasswordEnv = cfg.observability.adminPasswordEnv ?? "ARC_OBSERVABILITY_PASSWORD";
36402
+ const grafanaDomain = observabilityDomain(cfg);
36295
36403
  lines.push(" grafana:");
36296
36404
  lines.push(" image: grafana/grafana:11.4.0");
36297
36405
  lines.push(" container_name: arc-grafana");
36298
36406
  lines.push(" restart: unless-stopped");
36407
+ pushLogging(lines);
36299
36408
  lines.push(" environment:");
36300
36409
  lines.push(" GF_SECURITY_ADMIN_USER: admin");
36301
36410
  lines.push(` GF_SECURITY_ADMIN_PASSWORD: \${${adminPasswordEnv}:?missing ${adminPasswordEnv}}`);
36302
36411
  lines.push(' GF_USERS_ALLOW_SIGN_UP: "false"');
36303
36412
  lines.push(' GF_AUTH_ANONYMOUS_ENABLED: "false"');
36413
+ if (grafanaDomain) {
36414
+ lines.push(` GF_SERVER_ROOT_URL: "https://${grafanaDomain}"`);
36415
+ }
36304
36416
  lines.push(" volumes:");
36305
36417
  lines.push(" - ./observability/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:ro");
36306
36418
  lines.push(" - ./observability/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro");
36307
36419
  lines.push(" - ./observability/grafana-dashboards:/etc/grafana/provisioning/dashboards/arc:ro");
36420
+ lines.push(" - ./observability/grafana-alerting:/etc/grafana/provisioning/alerting:ro");
36308
36421
  lines.push(" - grafana_data:/var/lib/grafana");
36309
36422
  lines.push(" networks: [arc-net]");
36310
36423
  lines.push(" expose:");
@@ -36334,6 +36447,7 @@ function generateCompose({ cfg }) {
36334
36447
  lines.push(" loki_data:");
36335
36448
  lines.push(" prometheus_data:");
36336
36449
  lines.push(" grafana_data:");
36450
+ lines.push(" alloy_data:");
36337
36451
  }
36338
36452
  return lines.join(`
36339
36453
  `) + `
@@ -36408,6 +36522,64 @@ ${envNames.map((name) => ` - "https://${cfg.envs[name].domain}"`).joi
36408
36522
  - tracestate
36409
36523
  - content-type
36410
36524
 
36525
+ # Host-level CPU / memory / load / disk / filesystem / network metrics.
36526
+ # The host root is bind-mounted read-only at /hostfs (see compose).
36527
+ hostmetrics:
36528
+ collection_interval: 30s
36529
+ root_path: /hostfs
36530
+ scrapers:
36531
+ cpu:
36532
+ metrics:
36533
+ system.cpu.utilization:
36534
+ enabled: true
36535
+ memory:
36536
+ metrics:
36537
+ system.memory.utilization:
36538
+ enabled: true
36539
+ load: {}
36540
+ disk: {}
36541
+ filesystem:
36542
+ metrics:
36543
+ system.filesystem.utilization:
36544
+ enabled: true
36545
+ exclude_fs_types:
36546
+ fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
36547
+ match_type: strict
36548
+ exclude_mount_points:
36549
+ mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
36550
+ match_type: regexp
36551
+ network: {}
36552
+ paging: {}
36553
+
36554
+ # Per-container CPU / memory / network / block-IO + restarts straight from
36555
+ # the Docker daemon (socket bind-mounted read-only, see compose).
36556
+ docker_stats:
36557
+ endpoint: unix:///var/run/docker.sock
36558
+ collection_interval: 30s
36559
+ metrics:
36560
+ container.restarts:
36561
+ enabled: true
36562
+ container.uptime:
36563
+ enabled: true
36564
+
36565
+ connectors:
36566
+ # Span\u2192metrics computed from 100% of spans (pipeline runs BEFORE tail
36567
+ # sampling) \u2014 lowering the sampling policy later never skews dashboards.
36568
+ spanmetrics:
36569
+ histogram:
36570
+ unit: ms
36571
+ explicit:
36572
+ buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
36573
+ metrics_flush_interval: 15s
36574
+ # Emits traces_service_graph_* (same metric names Tempo's generator would).
36575
+ servicegraph:
36576
+ metrics_flush_interval: 15s
36577
+ store:
36578
+ ttl: 5s
36579
+ max_items: 5000
36580
+ # Joins the raw-trace pipeline to the sampled-storage pipeline.
36581
+ forward: {}
36582
+
36411
36583
  processors:
36412
36584
  batch:
36413
36585
  timeout: 5s
@@ -36418,7 +36590,8 @@ processors:
36418
36590
  # Errors + slow traces zachowywane w 100%, normalne traces r\xF3wnie\u017C 100%
36419
36591
  # przy obecnej skali (boostrap produkcji). Tail sampling matchuje OR po
36420
36592
  # policies \u2014 bez "always" policy WSZYSTKIE OK traces by\u0142yby droppowane.
36421
- # Obni\u017C 'random_100pct' do np. 10% gdy ruch eksploduje.
36593
+ # Obni\u017C 'random_100pct' do np. 10% gdy ruch eksploduje \u2014 span-metrics s\u0105
36594
+ # liczone przed samplingiem, wi\u0119c dashboardy pozostan\u0105 dok\u0142adne.
36422
36595
  tail_sampling:
36423
36596
  decision_wait: 10s
36424
36597
  num_traces: 50000
@@ -36443,6 +36616,34 @@ processors:
36443
36616
  - key: http.request.header.cookie
36444
36617
  action: delete
36445
36618
 
36619
+ # Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
36620
+ # so raw span names (one per bot-scanned URL) would explode Prometheus
36621
+ # series. Static assets collapse to "<METHOD> static", /route/* to
36622
+ # "<METHOD> /route", anything else outside the known API surface to
36623
+ # "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
36624
+ # literal "$" (collector env expansion), RE2 has no lookahead \u2192 IsMatch+not.
36625
+ transform/span_names:
36626
+ error_mode: ignore
36627
+ trace_statements:
36628
+ - context: span
36629
+ statements:
36630
+ - set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
36631
+ - replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
36632
+ - set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
36633
+
36634
+ # Stable service.name for infra metric streams (becomes the service_name
36635
+ # label after resource_to_telemetry_conversion).
36636
+ resource/host:
36637
+ attributes:
36638
+ - key: service.name
36639
+ value: arc-host
36640
+ action: upsert
36641
+ resource/docker:
36642
+ attributes:
36643
+ - key: service.name
36644
+ value: arc-docker
36645
+ action: upsert
36646
+
36446
36647
  exporters:
36447
36648
  otlp/tempo:
36448
36649
  endpoint: tempo:4317
@@ -36458,6 +36659,10 @@ exporters:
36458
36659
  endpoint: http://prometheus:9090/api/v1/write
36459
36660
  tls:
36460
36661
  insecure: true
36662
+ # Copy resource attributes (service.name, deployment.environment, \u2026)
36663
+ # onto every series \u2014 dashboards filter by service_name.
36664
+ resource_to_telemetry_conversion:
36665
+ enabled: true
36461
36666
 
36462
36667
  extensions:
36463
36668
  health_check: {}
@@ -36465,19 +36670,42 @@ extensions:
36465
36670
 
36466
36671
  service:
36467
36672
  extensions: [health_check, zpages]
36673
+ # Collector self-metrics. Since 0.111 the default bind is localhost only \u2014
36674
+ # Prometheus scrapes otel-collector:8888, so listen on all interfaces.
36675
+ telemetry:
36676
+ metrics:
36677
+ level: detailed
36678
+ readers:
36679
+ - pull:
36680
+ exporter:
36681
+ prometheus:
36682
+ host: "0.0.0.0"
36683
+ port: 8888
36468
36684
  pipelines:
36469
- traces:
36685
+ traces/in:
36470
36686
  receivers: [otlp]
36471
- processors: [tail_sampling, attributes, batch]
36687
+ processors: [attributes, transform/span_names]
36688
+ exporters: [spanmetrics, servicegraph, forward]
36689
+ traces/sampled:
36690
+ receivers: [forward]
36691
+ processors: [tail_sampling, batch]
36472
36692
  exporters: [otlp/tempo]
36473
36693
  logs:
36474
36694
  receivers: [otlp]
36475
36695
  processors: [attributes, batch]
36476
36696
  exporters: [otlphttp/loki]
36477
36697
  metrics:
36478
- receivers: [otlp]
36698
+ receivers: [otlp, spanmetrics, servicegraph]
36479
36699
  processors: [batch]
36480
36700
  exporters: [prometheusremotewrite]
36701
+ metrics/host:
36702
+ receivers: [hostmetrics]
36703
+ processors: [resource/host, batch]
36704
+ exporters: [prometheusremotewrite]
36705
+ metrics/docker:
36706
+ receivers: [docker_stats]
36707
+ processors: [resource/docker, batch]
36708
+ exporters: [prometheusremotewrite]
36481
36709
  `;
36482
36710
  }
36483
36711
  function generateTempoConfig(cfg) {
@@ -36513,20 +36741,9 @@ storage:
36513
36741
  wal:
36514
36742
  path: /var/tempo/wal
36515
36743
 
36516
- metrics_generator:
36517
- registry:
36518
- external_labels:
36519
- source: tempo
36520
- storage:
36521
- path: /var/tempo/generator/wal
36522
- remote_write:
36523
- - url: http://prometheus:9090/api/v1/write
36524
- send_exemplars: true
36525
-
36526
- overrides:
36527
- defaults:
36528
- metrics_generator:
36529
- processors: [service-graphs, span-metrics]
36744
+ # NOTE: no metrics_generator \u2014 span-metrics + service-graph are produced by
36745
+ # the otel-collector connectors BEFORE tail sampling (accurate rates even
36746
+ # when sampling is later tightened) and remote-written to Prometheus there.
36530
36747
  `;
36531
36748
  }
36532
36749
  function generateLokiConfig(cfg) {
@@ -36582,11 +36799,205 @@ scrape_configs:
36582
36799
  - job_name: otel-collector
36583
36800
  static_configs:
36584
36801
  - targets: [otel-collector:8888]
36802
+ - job_name: caddy
36803
+ static_configs:
36804
+ - targets: [caddy:2020]
36805
+ - job_name: loki
36806
+ static_configs:
36807
+ - targets: [loki:3100]
36808
+ - job_name: tempo
36809
+ static_configs:
36810
+ - targets: [tempo:3200]
36811
+ - job_name: grafana
36812
+ static_configs:
36813
+ - targets: [grafana:3000]
36814
+ - job_name: alloy
36815
+ static_configs:
36816
+ - targets: [alloy:12345]
36585
36817
 
36586
36818
  # remote-write inbound is enabled via the --web.enable-remote-write-receiver
36587
36819
  # command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
36588
36820
  `;
36589
36821
  }
36822
/**
 * Returns the Grafana Alloy configuration as a string.
 *
 * The generated config:
 *  - discovers containers through the Docker socket (refreshed every 15s),
 *  - relabels targets so the Docker container name (matched by the regex
 *    `/(.*)`) becomes a `container` label and the Compose service label
 *    becomes `compose_service`,
 *  - reads container logs from the same socket, tags them with
 *    `source = "docker"`, and forwards them to the `loki.write` component,
 *    which pushes to http://loki:3100/loki/api/v1/push.
 *
 * Takes no arguments; the output is a fixed template.
 *
 * @returns {string} Alloy configuration text.
 */
+ function generateAlloyConfig() {
36823
+ return `// Generated by \`arc platform deploy\` \u2014 do not edit by hand.
36824
+ discovery.docker "containers" {
36825
+ host = "unix:///var/run/docker.sock"
36826
+ refresh_interval = "15s"
36827
+ }
36828
+
36829
+ discovery.relabel "containers" {
36830
+ targets = discovery.docker.containers.targets
36831
+
36832
+ rule {
36833
+ source_labels = ["__meta_docker_container_name"]
36834
+ regex = "/(.*)"
36835
+ target_label = "container"
36836
+ }
36837
+ rule {
36838
+ source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
36839
+ target_label = "compose_service"
36840
+ }
36841
+ }
36842
+
36843
+ loki.source.docker "containers" {
36844
+ host = "unix:///var/run/docker.sock"
36845
+ targets = discovery.docker.containers.targets
36846
+ relabel_rules = discovery.relabel.containers.rules
36847
+ labels = { source = "docker" }
36848
+ forward_to = [loki.write.loki.receiver]
36849
+ }
36850
+
36851
+ loki.write "loki" {
36852
+ endpoint {
36853
+ url = "http://loki:3100/loki/api/v1/push"
36854
+ }
36855
+ }
36856
+ `;
36857
+ }
36858
/**
 * Builds the Grafana alerting provisioning file as a YAML string.
 *
 * The fixed rule list below is rendered into a single rule group
 * ("arc-alerts", folder "Arc", interval 1m). Every rule is a three-step
 * pipeline: A runs the PromQL `expr` as an instant query against the
 * `prometheus` datasource, B reduces A to its last value, and C compares B
 * with `threshold` using the evaluator `op` ("gt" when a rule does not set
 * one). All rules are rendered with noDataState and execErrState set to OK.
 *
 * A contact point and a notification policy named "arc-webhook" are appended
 * only when `cfg.observability.alertWebhookUrl` is set; otherwise that section
 * is empty.
 *
 * @param {object} cfg - Platform config; only `observability.alertWebhookUrl`
 *   is read here.
 * @returns {string} YAML text for Grafana's alerting provisioning.
 */
+ function generateGrafanaAlerting(cfg) {
36859
+ const webhookUrl = cfg.observability?.alertWebhookUrl;
36860
+ const rules = [
36861
+ {
36862
+ uid: "arc-high-error-rate",
36863
+ title: "High server error rate (>5%)",
36864
+ expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
36865
+ threshold: 0.05,
36866
+ pendingFor: "5m",
36867
+ summary: "More than 5% of server spans are errors over the last 5 minutes."
36868
+ },
36869
+ {
36870
+ uid: "arc-high-latency-p95",
36871
+ title: "High p95 latency (>1s)",
36872
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
36873
+ threshold: 1000,
36874
+ pendingFor: "10m",
36875
+ summary: "Server p95 latency above 1s for 10 minutes."
36876
+ },
36877
+ {
36878
+ uid: "arc-host-disk-high",
36879
+ title: "Host disk usage >85%",
36880
+ expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
36881
+ threshold: 0.85,
36882
+ pendingFor: "15m",
36883
+ summary: "A host filesystem is more than 85% full."
36884
+ },
36885
+ {
36886
+ uid: "arc-host-memory-high",
36887
+ title: "Host memory usage >90%",
36888
+ expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
36889
+ threshold: 0.9,
36890
+ pendingFor: "10m",
36891
+ summary: "Host memory usage above 90% for 10 minutes."
36892
+ },
36893
+ {
36894
+ uid: "arc-container-restarts",
36895
+ title: "Container restarted",
36896
+ expr: "sum by (container_name) (increase(container_restarts_total[15m]))",
36897
+ threshold: 0,
36898
+ pendingFor: "0s",
36899
+ summary: "A container restarted within the last 15 minutes."
36900
+ },
36901
+ {
36902
+ uid: "arc-app-silent",
36903
+ title: "App stopped reporting metrics",
36904
+ expr: "absent(arc_commands_total)",
36905
+ threshold: 0,
36906
+ pendingFor: "10m",
36907
+ summary: "No arc_commands_total series for 10 minutes \u2014 app down or telemetry broken."
36908
+ },
36909
+ {
36910
+ uid: "arc-collector-export-failures",
36911
+ title: "Telemetry export failures",
36912
+ expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)"}[15m]))',
36913
+ threshold: 0,
36914
+ pendingFor: "0s",
36915
+ summary: "The otel-collector failed to export telemetry within the last 15 minutes."
36916
+ },
36917
+ {
36918
+ uid: "arc-target-down",
36919
+ title: "Scrape target down",
36920
+ expr: "min(up)",
36921
+ threshold: 1,
36922
+ op: "lt",
36923
+ pendingFor: "5m",
36924
+ summary: "A Prometheus scrape target has been down for 5 minutes."
36925
+ }
36926
+ ];
36927
+ const ruleYaml = rules.map((rule) => {
36928
+ const op = rule.op ?? "gt";
36929
+ return ` - uid: ${rule.uid}
36930
+ title: ${JSON.stringify(rule.title)}
36931
+ condition: C
36932
+ for: ${rule.pendingFor}
36933
+ noDataState: OK
36934
+ execErrState: OK
36935
+ annotations:
36936
+ summary: ${JSON.stringify(rule.summary)}
36937
+ labels:
36938
+ source: arc
36939
+ data:
36940
+ - refId: A
36941
+ relativeTimeRange: { from: 600, to: 0 }
36942
+ datasourceUid: prometheus
36943
+ model:
36944
+ expr: ${JSON.stringify(rule.expr)}
36945
+ instant: true
36946
+ intervalMs: 1000
36947
+ maxDataPoints: 43200
36948
+ refId: A
36949
+ - refId: B
36950
+ relativeTimeRange: { from: 0, to: 0 }
36951
+ datasourceUid: __expr__
36952
+ model:
36953
+ type: reduce
36954
+ expression: A
36955
+ reducer: last
36956
+ refId: B
36957
+ - refId: C
36958
+ relativeTimeRange: { from: 0, to: 0 }
36959
+ datasourceUid: __expr__
36960
+ model:
36961
+ type: threshold
36962
+ expression: B
36963
+ refId: C
36964
+ conditions:
36965
+ - evaluator:
36966
+ type: ${op}
36967
+ params: [${rule.threshold}]`;
36968
+ }).join(`
36969
+ `);
36970
+ const contactSection = webhookUrl ? `
36971
+ contactPoints:
36972
+ - orgId: 1
36973
+ name: arc-webhook
36974
+ receivers:
36975
+ - uid: arc-webhook
36976
+ type: webhook
36977
+ settings:
36978
+ url: ${JSON.stringify(webhookUrl)}
36979
+ httpMethod: POST
36980
+
36981
+ policies:
36982
+ - orgId: 1
36983
+ receiver: arc-webhook
36984
+ group_by: ["grafana_folder", "alertname"]
36985
+ group_wait: 30s
36986
+ group_interval: 5m
36987
+ repeat_interval: 4h
36988
+ ` : "";
36989
+ return `# Generated by \`arc platform deploy\` \u2014 do not edit by hand.
36990
+ apiVersion: 1
36991
+
36992
+ groups:
36993
+ - orgId: 1
36994
+ name: arc-alerts
36995
+ folder: Arc
36996
+ interval: 1m
36997
+ rules:
36998
+ ${ruleYaml}
36999
+ ${contactSection}`;
37000
+ }
36590
37001
  function generateGrafanaDatasources() {
36591
37002
  return `# Generated by \`arc platform deploy\` \u2014 do not edit by hand.
36592
37003
  apiVersion: 1
@@ -36610,10 +37021,17 @@ datasources:
36610
37021
  uid: loki
36611
37022
  jsonData:
36612
37023
  derivedFields:
37024
+ # Plain-text logs that happen to contain "trace_id=<id>".
36613
37025
  - datasourceUid: tempo
36614
37026
  matcherRegex: "trace_id=(\\\\w+)"
36615
37027
  name: TraceID
36616
37028
  url: $\${__value.raw}
37029
+ # OTLP-ingested logs \u2014 trace_id arrives as structured metadata.
37030
+ - datasourceUid: tempo
37031
+ matcherType: label
37032
+ matcherRegex: trace_id
37033
+ name: TraceID (OTLP)
37034
+ url: $\${__value.raw}
36617
37035
  - name: Prometheus
36618
37036
  type: prometheus
36619
37037
  access: proxy
@@ -36656,7 +37074,7 @@ function generateArcOverviewDashboard() {
36656
37074
  label: "Service",
36657
37075
  type: "query",
36658
37076
  datasource: { type: "prometheus", uid: "prometheus" },
36659
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
37077
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
36660
37078
  refresh: 2,
36661
37079
  includeAll: false,
36662
37080
  multi: false,
@@ -36665,29 +37083,29 @@ function generateArcOverviewDashboard() {
36665
37083
  ]
36666
37084
  },
36667
37085
  panels: [
36668
- panelStat("Request rate (req/s)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))', "reqps"),
36669
- panelStat("Error rate (%)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100', "percent", { red: 1, orange: 0.1 }),
36670
- panelStat("P99 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))', "ms", { red: 1000, orange: 300 }),
37086
+ panelStat("Request rate (req/s)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))', "reqps"),
37087
+ panelStat("Error rate (%)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100', "percent", { red: 1, orange: 0.1 }),
37088
+ panelStat("P99 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))', "ms", { red: 1000, orange: 300 }),
36671
37089
  panelStat("Active commands/sec", { x: 18, y: 0, w: 6, h: 4 }, 'sum(rate(arc_commands_total{service_name="$service"}[5m]))', "ops"),
36672
- panelTimeseries("Request rate by route", { x: 0, y: 4, w: 12, h: 8 }, 'sum by (span_name) (rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))', "{{span_name}}", "reqps"),
37090
+ panelTimeseries("Request rate by route", { x: 0, y: 4, w: 12, h: 8 }, 'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))', "{{span_name}}", "reqps"),
36673
37091
  panelTimeseries("Latency percentiles", { x: 12, y: 4, w: 12, h: 8 }, [
36674
37092
  {
36675
- expr: 'histogram_quantile(0.5, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
37093
+ expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
36676
37094
  legend: "p50"
36677
37095
  },
36678
37096
  {
36679
- expr: 'histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
37097
+ expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
36680
37098
  legend: "p95"
36681
37099
  },
36682
37100
  {
36683
- expr: 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
37101
+ expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
36684
37102
  legend: "p99"
36685
37103
  }
36686
- ], "ms"),
37104
+ ], "", "ms"),
36687
37105
  panelTimeseries("Commands per second", { x: 0, y: 12, w: 12, h: 8 }, 'sum by (arc_command_name) (rate(arc_commands_total{service_name="$service"}[1m]))', "{{arc_command_name}}", "ops"),
36688
- panelTimeseries("Command p95 latency", { x: 12, y: 12, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))', "{{arc_command_name}}", "ms"),
36689
- panelTimeseries("DB find ops/sec by collection", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))', "{{db_collection_name}}", "ops"),
36690
- panelTimeseries("DB find p95 latency", { x: 12, y: 20, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))', "{{db_collection_name}}", "ms"),
37106
+ panelTimeseries("Command p95 latency", { x: 12, y: 12, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))', "{{arc_command_name}}", "ms"),
37107
+ panelTimeseries("DB ops/sec by collection", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))', "{{db_collection_name}} {{db_operation_name}}", "ops"),
37108
+ panelTimeseries("DB p95 latency by operation", { x: 12, y: 20, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))', "{{db_operation_name}}", "ms"),
36691
37109
  {
36692
37110
  title: "Recent error logs",
36693
37111
  type: "logs",
@@ -36695,7 +37113,7 @@ function generateArcOverviewDashboard() {
36695
37113
  datasource: { type: "loki", uid: "loki" },
36696
37114
  targets: [
36697
37115
  {
36698
- expr: '{service_name="$service"} |= `ERROR`',
37116
+ expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
36699
37117
  refId: "A"
36700
37118
  }
36701
37119
  ],
@@ -36729,7 +37147,7 @@ function generateArcTracesDashboard() {
36729
37147
  label: "Service",
36730
37148
  type: "query",
36731
37149
  datasource: { type: "prometheus", uid: "prometheus" },
36732
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
37150
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
36733
37151
  refresh: 2,
36734
37152
  current: { text: "arc-prod", value: "arc-prod" }
36735
37153
  }
@@ -36828,12 +37246,23 @@ function generateArcLogsDashboard() {
36828
37246
  type: "textbox",
36829
37247
  query: "",
36830
37248
  current: { text: "", value: "" }
37249
+ },
37250
+ {
37251
+ name: "container",
37252
+ label: "Container",
37253
+ type: "query",
37254
+ datasource: { type: "loki", uid: "loki" },
37255
+ query: "label_values(container)",
37256
+ refresh: 2,
37257
+ includeAll: true,
37258
+ multi: false,
37259
+ current: { text: "All", value: "$__all" }
36831
37260
  }
36832
37261
  ]
36833
37262
  },
36834
37263
  panels: [
36835
- panelStat("Logs ingested (1h)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(increase({service_name="$service"}[1h]))', "short"),
36836
- panelStat("Errors (1h)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(increase({service_name="$service", severity_text=~"ERROR|FATAL"}[1h]))', "short", { orange: 1, red: 50 }),
37264
+ panelStat("Logs ingested (1h)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(count_over_time({service_name="$service"}[1h]))', "short", undefined, LOKI_DS),
37265
+ panelStat("Errors (1h)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))', "short", { orange: 1, red: 50 }, LOKI_DS),
36837
37266
  {
36838
37267
  title: "Log volume by severity",
36839
37268
  type: "timeseries",
@@ -36883,6 +37312,27 @@ function generateArcLogsDashboard() {
36883
37312
  dedupStrategy: "none",
36884
37313
  sortOrder: "Descending"
36885
37314
  }
37315
+ },
37316
+ {
37317
+ title: "Container logs ($container)",
37318
+ type: "logs",
37319
+ gridPos: { x: 0, y: 26, w: 24, h: 14 },
37320
+ datasource: { type: "loki", uid: "loki" },
37321
+ targets: [
37322
+ {
37323
+ expr: '{container=~"$container"} |~ "$search"',
37324
+ refId: "A"
37325
+ }
37326
+ ],
37327
+ options: {
37328
+ showTime: true,
37329
+ showLabels: true,
37330
+ showCommonLabels: false,
37331
+ wrapLogMessage: true,
37332
+ enableLogDetails: true,
37333
+ dedupStrategy: "none",
37334
+ sortOrder: "Descending"
37335
+ }
36886
37336
  }
36887
37337
  ]
36888
37338
  };
@@ -36912,7 +37362,7 @@ function generateArcSamplingDashboard() {
36912
37362
  expr: "sum(rate(otelcol_exporter_sent_spans[1m]))",
36913
37363
  legend: "exported"
36914
37364
  }
36915
- ], "ops"),
37365
+ ], "", "ops"),
36916
37366
  panelTimeseries("Collector queue size (BatchSpanProcessor)", { x: 0, y: 12, w: 12, h: 8 }, "otelcol_processor_batch_batch_send_size_sum / clamp_min(otelcol_processor_batch_batch_send_size_count, 1)", "avg batch size", "short"),
36917
37367
  panelTimeseries("Collector process memory", { x: 12, y: 12, w: 12, h: 8 }, 'process_resident_memory_bytes{job="otel-collector"}', "RSS", "bytes")
36918
37368
  ]
@@ -36953,24 +37403,24 @@ function generateArcCommandDashboard() {
36953
37403
  },
36954
37404
  panels: [
36955
37405
  panelStat("Call rate", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[5m]))', "ops"),
36956
- panelStat("P50 latency", { x: 6, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms"),
36957
- panelStat("P95 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 200, red: 1000 }),
36958
- panelStat("P99 latency", { x: 18, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 500, red: 2000 }),
37406
+ panelStat("P50 latency", { x: 6, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms"),
37407
+ panelStat("P95 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 200, red: 1000 }),
37408
+ panelStat("P99 latency", { x: 18, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 500, red: 2000 }),
36959
37409
  panelTimeseries("Call rate over time", { x: 0, y: 4, w: 12, h: 8 }, 'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[1m]))', "calls/s", "ops"),
36960
37410
  panelTimeseries("Latency percentiles", { x: 12, y: 4, w: 12, h: 8 }, [
36961
37411
  {
36962
- expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
37412
+ expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
36963
37413
  legend: "p50"
36964
37414
  },
36965
37415
  {
36966
- expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
37416
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
36967
37417
  legend: "p95"
36968
37418
  },
36969
37419
  {
36970
- expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
37420
+ expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
36971
37421
  legend: "p99"
36972
37422
  }
36973
- ], "ms"),
37423
+ ], "", "ms"),
36974
37424
  {
36975
37425
  title: "Recent traces (sampled)",
36976
37426
  type: "traces",
@@ -36989,23 +37439,108 @@ function generateArcCommandDashboard() {
36989
37439
  };
36990
37440
  return JSON.stringify(dashboard, null, 2);
36991
37441
  }
37442
+ function generateArcInfraDashboard() {
37443
+ const dashboard = {
37444
+ title: "Arc Infrastructure",
37445
+ uid: "arc-infra",
37446
+ schemaVersion: 39,
37447
+ version: 1,
37448
+ refresh: "30s",
37449
+ time: { from: "now-3h", to: "now" },
37450
+ tags: ["arc", "auto-provisioned"],
37451
+ panels: [
37452
+ panelStat("Host CPU used", { x: 0, y: 0, w: 6, h: 4 }, '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))', "percent", { orange: 70, red: 90 }),
37453
+ panelStat("Host memory used", { x: 6, y: 0, w: 6, h: 4 }, '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)', "percent", { orange: 80, red: 90 }),
37454
+ panelStat("Disk used (worst mount)", { x: 12, y: 0, w: 6, h: 4 }, '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))', "percent", { orange: 75, red: 85 }),
37455
+ panelStat("Load (1m)", { x: 18, y: 0, w: 6, h: 4 }, "system_cpu_load_average_1m", "short"),
37456
+ panelTimeseries("Host CPU utilization", { x: 0, y: 4, w: 12, h: 8 }, [
37457
+ {
37458
+ expr: '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))',
37459
+ legend: "used %"
37460
+ },
37461
+ { expr: "system_cpu_load_average_1m", legend: "load 1m" },
37462
+ { expr: "system_cpu_load_average_5m", legend: "load 5m" },
37463
+ { expr: "system_cpu_load_average_15m", legend: "load 15m" }
37464
+ ], "", "short"),
37465
+ panelTimeseries("Host memory by state", { x: 12, y: 4, w: 12, h: 8 }, "sum by (state) (system_memory_usage_bytes)", "{{state}}", "bytes"),
37466
+ panelTimeseries("Filesystem usage by mount", { x: 0, y: 12, w: 12, h: 8 }, '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)', "{{mountpoint}}", "percent"),
37467
+ panelTimeseries("Disk I/O", { x: 12, y: 12, w: 12, h: 8 }, "sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))", "{{device}} {{direction}}", "Bps"),
37468
+ panelTimeseries("Network I/O", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))', "{{device}} {{direction}}", "Bps"),
37469
+ panelTimeseries("Container restarts (24h)", { x: 12, y: 20, w: 12, h: 8 }, "sum by (container_name) (increase(container_restarts_total[24h]))", "{{container_name}}", "short"),
37470
+ panelTimeseries("Container CPU", { x: 0, y: 28, w: 12, h: 8 }, "container_cpu_utilization_ratio", "{{container_name}}", "percent"),
37471
+ panelTimeseries("Container memory", { x: 12, y: 28, w: 12, h: 8 }, "container_memory_usage_total_bytes", "{{container_name}}", "bytes"),
37472
+ panelTimeseries("Container network RX", { x: 0, y: 36, w: 12, h: 8 }, "sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))", "{{container_name}}", "Bps"),
37473
+ panelTimeseries("Container network TX", { x: 12, y: 36, w: 12, h: 8 }, "sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))", "{{container_name}}", "Bps")
37474
+ ]
37475
+ };
37476
+ return JSON.stringify(dashboard, null, 2);
37477
+ }
37478
+ function generateArcEdgeDashboard() {
37479
+ const dashboard = {
37480
+ title: "Arc Caddy / Edge",
37481
+ uid: "arc-edge",
37482
+ schemaVersion: 39,
37483
+ version: 1,
37484
+ refresh: "30s",
37485
+ time: { from: "now-1h", to: "now" },
37486
+ tags: ["arc", "auto-provisioned"],
37487
+ panels: [
37488
+ panelStat("Requests/s", { x: 0, y: 0, w: 6, h: 4 }, "sum(rate(caddy_http_request_duration_seconds_count[1m]))", "reqps"),
37489
+ panelStat("In-flight requests", { x: 6, y: 0, w: 6, h: 4 }, "sum(caddy_http_requests_in_flight)", "short"),
37490
+ panelStat("Handler errors/s", { x: 12, y: 0, w: 6, h: 4 }, "sum(rate(caddy_http_request_errors_total[5m]))", "ops", { orange: 0.1, red: 1 }),
37491
+ panelStat("P95 latency", { x: 18, y: 0, w: 6, h: 4 }, "histogram_quantile(0.95, sum by (le) (rate(caddy_http_request_duration_seconds_bucket[5m])))", "s", { orange: 0.3, red: 1 }),
37492
+ panelTimeseries("Request rate by host", { x: 0, y: 4, w: 12, h: 8 }, "sum by (host) (rate(caddy_http_request_duration_seconds_count[1m]))", "{{host}}", "reqps"),
37493
+ panelTimeseries("Responses by status code", { x: 12, y: 4, w: 12, h: 8 }, "sum by (code) (rate(caddy_http_request_duration_seconds_count[1m]))", "{{code}}", "reqps"),
37494
+ panelTimeseries("P95 latency by host", { x: 0, y: 12, w: 12, h: 8 }, "histogram_quantile(0.95, sum by (host, le) (rate(caddy_http_request_duration_seconds_bucket[5m])))", "{{host}}", "s"),
37495
+ panelTimeseries("4xx/5xx responses (access log)", { x: 12, y: 12, w: 12, h: 8 }, 'sum by (status) (count_over_time({compose_service="caddy"} | json | status >= 400 [$__interval]))', "{{status}}", "short", LOKI_DS),
37496
+ {
37497
+ title: "Access log (live)",
37498
+ type: "logs",
37499
+ gridPos: { x: 0, y: 20, w: 24, h: 12 },
37500
+ datasource: { type: "loki", uid: "loki" },
37501
+ targets: [
37502
+ {
37503
+ expr: '{compose_service="caddy"}',
37504
+ refId: "A"
37505
+ }
37506
+ ],
37507
+ options: {
37508
+ showTime: true,
37509
+ showLabels: false,
37510
+ showCommonLabels: false,
37511
+ wrapLogMessage: true,
37512
+ enableLogDetails: true,
37513
+ dedupStrategy: "none",
37514
+ sortOrder: "Descending"
37515
+ }
37516
+ }
37517
+ ]
37518
+ };
37519
+ return JSON.stringify(dashboard, null, 2);
37520
+ }
36992
37521
  function generateObservabilityConfigs(cfg) {
36993
37522
  return {
36994
37523
  "observability/otel-collector-config.yaml": generateOtelCollectorConfig(cfg),
36995
37524
  "observability/tempo.yaml": generateTempoConfig(cfg),
36996
37525
  "observability/loki-config.yaml": generateLokiConfig(cfg),
36997
37526
  "observability/prometheus.yml": generatePrometheusConfig(cfg),
37527
+ "observability/alloy-config.alloy": generateAlloyConfig(),
36998
37528
  "observability/grafana-datasources.yaml": generateGrafanaDatasources(),
36999
37529
  "observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
37530
+ "observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
37000
37531
  "observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
37001
37532
  "observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
37002
37533
  "observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
37003
37534
  "observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
37004
37535
  "observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
37005
- "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard()
37536
+ "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
37537
+ "observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
37538
+ "observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard()
37006
37539
  };
37007
37540
  }
37008
- function panelStat(title, gridPos, expr, unit, thresholds) {
37541
+ var PROMETHEUS_DS = { type: "prometheus", uid: "prometheus" };
37542
+ var LOKI_DS = { type: "loki", uid: "loki" };
37543
+ function panelStat(title, gridPos, expr, unit, thresholds, datasource = PROMETHEUS_DS) {
37009
37544
  const steps = [
37010
37545
  { color: "green", value: null }
37011
37546
  ];
@@ -37019,7 +37554,7 @@ function panelStat(title, gridPos, expr, unit, thresholds) {
37019
37554
  title,
37020
37555
  type: "stat",
37021
37556
  gridPos,
37022
- datasource: { type: "prometheus", uid: "prometheus" },
37557
+ datasource,
37023
37558
  targets: [{ expr, refId: "A", legendFormat: title }],
37024
37559
  fieldConfig: {
37025
37560
  defaults: {
@@ -37037,7 +37572,7 @@ function panelStat(title, gridPos, expr, unit, thresholds) {
37037
37572
  }
37038
37573
  };
37039
37574
  }
37040
- function panelTimeseries(title, gridPos, query, legend, unit) {
37575
+ function panelTimeseries(title, gridPos, query, legend, unit, datasource = PROMETHEUS_DS) {
37041
37576
  const targets = Array.isArray(query) ? query.map((q, i) => ({
37042
37577
  expr: q.expr,
37043
37578
  refId: String.fromCharCode(65 + i),
@@ -37047,7 +37582,7 @@ function panelTimeseries(title, gridPos, query, legend, unit) {
37047
37582
  title,
37048
37583
  type: "timeseries",
37049
37584
  gridPos,
37050
- datasource: { type: "prometheus", uid: "prometheus" },
37585
+ datasource,
37051
37586
  targets,
37052
37587
  fieldConfig: {
37053
37588
  defaults: {
@@ -37378,11 +37913,16 @@ function validateDeployConfig(input) {
37378
37913
  metrics: optionalString(retentionRaw, "observability.retention.metrics")
37379
37914
  };
37380
37915
  }
37916
+ const alertWebhookUrl = optionalString(observabilityRaw, "observability.alertWebhookUrl");
37917
+ if (alertWebhookUrl !== undefined && !/^https?:\/\/.+/.test(alertWebhookUrl)) {
37918
+ throw new Error(`deploy.arc.json: observability.alertWebhookUrl must be an http(s) URL (got "${alertWebhookUrl}")`);
37919
+ }
37381
37920
  validated.observability = {
37382
37921
  enabled: enabledRaw,
37383
37922
  subdomain: optionalString(observabilityRaw, "observability.subdomain") ?? "observability",
37384
37923
  adminPasswordEnv: optionalString(observabilityRaw, "observability.adminPasswordEnv") ?? "ARC_OBSERVABILITY_PASSWORD",
37385
- retention
37924
+ retention,
37925
+ alertWebhookUrl
37386
37926
  };
37387
37927
  }
37388
37928
  const provision = input.provision;
@@ -37678,14 +38218,14 @@ async function bootstrap(inputs) {
37678
38218
  });
37679
38219
  ok("Host bootstrapped");
37680
38220
  }
37681
- const needUpStack = state.kind !== "ready" || state.marker === null || state.marker.configHash !== inputs.configHash || !await isRegistryRunning(cfg);
38221
+ const needUpStack = state.kind !== "ready" || state.marker === null || state.marker.configHash !== inputs.configHash || state.marker.cliVersion !== inputs.cliVersion || !await isRegistryRunning(cfg);
37682
38222
  if (needUpStack) {
37683
38223
  await upStack(inputs);
37684
38224
  ok("Docker stack up");
37685
38225
  }
37686
38226
  if (cfg.observability?.enabled) {
37687
38227
  log2("Ensuring observability sidecars are running...");
37688
- const obsServices = ["otel-collector", "tempo", "loki", "prometheus", "grafana"];
38228
+ const obsServices = ["otel-collector", "tempo", "loki", "prometheus", "alloy", "grafana"];
37689
38229
  await assertExec(cfg.target, `cd ${cfg.target.remoteDir} && docker compose pull --ignore-pull-failures ${obsServices.join(" ")} && docker compose up -d ${obsServices.join(" ")}`);
37690
38230
  ok("Observability stack up");
37691
38231
  }
@@ -37752,7 +38292,7 @@ async function upStack(inputs) {
37752
38292
  await scpUpload(cfg.target, join18(workDir, "docker-compose.yml"), `${cfg.target.remoteDir}/docker-compose.yml`);
37753
38293
  await scpUpload(cfg.target, join18(workDir, "htpasswd"), `${cfg.target.remoteDir}/registry-auth/htpasswd`);
37754
38294
  if (observabilityFiles && observabilityHtpasswd) {
37755
- await assertExec(cfg.target, `mkdir -p ${cfg.target.remoteDir}/observability/grafana-dashboards`);
38295
+ await assertExec(cfg.target, `mkdir -p ${cfg.target.remoteDir}/observability/grafana-dashboards ${cfg.target.remoteDir}/observability/grafana-alerting`);
37756
38296
  for (const relPath of Object.keys(observabilityFiles)) {
37757
38297
  const localDir = dirname9(join18(workDir, relPath));
37758
38298
  mkdirSync12(localDir, { recursive: true });
@@ -39274,7 +39814,7 @@ class ContextHandler {
39274
39814
  try {
39275
39815
  return await this.telemetry.startSpan(`command.${commandName}`, runCommand, { attributes: baseAttrs });
39276
39816
  } finally {
39277
- this.telemetry.measureSince("arc.command.duration_ms", start, {
39817
+ this.telemetry.measureSince("arc.command.duration", start, {
39278
39818
  "arc.command.name": commandName
39279
39819
  });
39280
39820
  this.telemetry.incrementCounter("arc.commands.total", 1, {
@@ -40359,6 +40899,7 @@ async function createArcServer(config) {
40359
40899
  websocket: {
40360
40900
  open(ws) {
40361
40901
  connectionManager.addClient(ws);
40902
+ config.telemetry?.addUpDown("arc.ws.active_connections", 1);
40362
40903
  },
40363
40904
  async message(ws, messageStr) {
40364
40905
  const client = connectionManager.getClientByWs(ws);
@@ -40371,6 +40912,9 @@ async function createArcServer(config) {
40371
40912
  console.error("Failed to parse WS message:", error);
40372
40913
  return;
40373
40914
  }
40915
+ config.telemetry?.incrementCounter("arc.ws.messages", 1, {
40916
+ "messaging.message.type": String(message?.type ?? "unknown")
40917
+ });
40374
40918
  const dispatch = async () => {
40375
40919
  try {
40376
40920
  for (const handler of wsHandlers) {
@@ -40408,6 +40952,7 @@ async function createArcServer(config) {
40408
40952
  cleanupClientSubs(client.id);
40409
40953
  config.onWsClose?.(client.id);
40410
40954
  connectionManager.removeClient(client.id);
40955
+ config.telemetry?.addUpDown("arc.ws.active_connections", -1);
40411
40956
  }
40412
40957
  }
40413
40958
  }
@@ -40726,7 +41271,8 @@ async function startPlatformServer(opts) {
40726
41271
  endpoint: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
40727
41272
  mode: devMode ? "development" : "production",
40728
41273
  sampleRate: devMode ? 1 : 1,
40729
- debug: process.env.ARC_OTEL_DEBUG === "true"
41274
+ debug: process.env.ARC_OTEL_DEBUG === "true",
41275
+ patchConsole: process.env.ARC_OTEL_PATCH_CONSOLE !== "false"
40730
41276
  });
40731
41277
  telemetry = init2.telemetry;
40732
41278
  telemetryShutdown = init2.shutdown;