@arcote.tech/arc-cli 0.7.19 → 0.7.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +651 -105
- package/package.json +9 -9
- package/src/deploy/bootstrap.ts +8 -3
- package/src/deploy/caddyfile.ts +43 -8
- package/src/deploy/compose.ts +73 -0
- package/src/deploy/config.ts +15 -0
- package/src/deploy/observability-configs.ts +674 -48
- package/src/platform/server.ts +3 -0
package/dist/index.js
CHANGED
|
@@ -25870,6 +25870,7 @@ import {
|
|
|
25870
25870
|
ATTR_SERVICE_NAME,
|
|
25871
25871
|
ATTR_SERVICE_VERSION
|
|
25872
25872
|
} from "@opentelemetry/semantic-conventions/incubating";
|
|
25873
|
+
import { format as format2 } from "util";
|
|
25873
25874
|
import {
|
|
25874
25875
|
context,
|
|
25875
25876
|
propagation,
|
|
@@ -25880,6 +25881,41 @@ import {
|
|
|
25880
25881
|
logs,
|
|
25881
25882
|
SeverityNumber
|
|
25882
25883
|
} from "@opentelemetry/api-logs";
|
|
25884
|
+
function patchConsole(telemetry) {
|
|
25885
|
+
if (patched || !telemetry.active)
|
|
25886
|
+
return () => {};
|
|
25887
|
+
patched = true;
|
|
25888
|
+
const originals = [];
|
|
25889
|
+
for (const [method, level] of METHODS) {
|
|
25890
|
+
const original = console[method].bind(console);
|
|
25891
|
+
originals.push([method, original]);
|
|
25892
|
+
console[method] = (...args) => {
|
|
25893
|
+
original(...args);
|
|
25894
|
+
if (emitting)
|
|
25895
|
+
return;
|
|
25896
|
+
emitting = true;
|
|
25897
|
+
try {
|
|
25898
|
+
const body = format2(...args);
|
|
25899
|
+
if (body.startsWith("[arc-otel]"))
|
|
25900
|
+
return;
|
|
25901
|
+
const error = args.find((arg) => arg instanceof Error);
|
|
25902
|
+
telemetry.log(level, body, error ? {
|
|
25903
|
+
"exception.type": error.name,
|
|
25904
|
+
"exception.message": error.message,
|
|
25905
|
+
"exception.stacktrace": error.stack ?? ""
|
|
25906
|
+
} : {});
|
|
25907
|
+
} catch {} finally {
|
|
25908
|
+
emitting = false;
|
|
25909
|
+
}
|
|
25910
|
+
};
|
|
25911
|
+
}
|
|
25912
|
+
return () => {
|
|
25913
|
+
for (const [method, original] of originals) {
|
|
25914
|
+
console[method] = original;
|
|
25915
|
+
}
|
|
25916
|
+
patched = false;
|
|
25917
|
+
};
|
|
25918
|
+
}
|
|
25883
25919
|
function sanitizeAttrs(input, opts = {}) {
|
|
25884
25920
|
if (!input)
|
|
25885
25921
|
return {};
|
|
@@ -25939,6 +25975,7 @@ class ArcTelemetry {
|
|
|
25939
25975
|
meter = null;
|
|
25940
25976
|
histograms = new Map;
|
|
25941
25977
|
counters = new Map;
|
|
25978
|
+
upDownCounters = new Map;
|
|
25942
25979
|
constructor(config) {
|
|
25943
25980
|
const mode = config.mode ?? "development";
|
|
25944
25981
|
const enabled = config.enabled ?? mode !== "disabled";
|
|
@@ -26050,6 +26087,18 @@ class ArcTelemetry {
|
|
|
26050
26087
|
counter.add(value, attrs);
|
|
26051
26088
|
} catch {}
|
|
26052
26089
|
}
|
|
26090
|
+
addUpDown(name, delta, attrs = {}) {
|
|
26091
|
+
if (!this.active || !this.meter)
|
|
26092
|
+
return;
|
|
26093
|
+
let counter = this.upDownCounters.get(name);
|
|
26094
|
+
if (!counter) {
|
|
26095
|
+
counter = this.meter.createUpDownCounter(name);
|
|
26096
|
+
this.upDownCounters.set(name, counter);
|
|
26097
|
+
}
|
|
26098
|
+
try {
|
|
26099
|
+
counter.add(delta, attrs);
|
|
26100
|
+
} catch {}
|
|
26101
|
+
}
|
|
26053
26102
|
recordHistogram(name, value, attrs = {}) {
|
|
26054
26103
|
if (!this.active || !this.meter)
|
|
26055
26104
|
return;
|
|
@@ -26097,53 +26146,31 @@ function noopSpan() {
|
|
|
26097
26146
|
function wrapDbAdapter(adapter, telemetry, dbSystem) {
|
|
26098
26147
|
if (!telemetry || !telemetry.active)
|
|
26099
26148
|
return adapter;
|
|
26149
|
+
const dbAttrs = (operation, store) => ({
|
|
26150
|
+
"db.system": dbSystem,
|
|
26151
|
+
"db.operation.name": operation,
|
|
26152
|
+
...store ? { "db.collection.name": store } : {}
|
|
26153
|
+
});
|
|
26154
|
+
const measureOp = async (operation, store, fn) => {
|
|
26155
|
+
const start = Date.now();
|
|
26156
|
+
try {
|
|
26157
|
+
return await fn();
|
|
26158
|
+
} finally {
|
|
26159
|
+
telemetry.measureSince("arc.db.operation.duration", start, dbAttrs(operation, store));
|
|
26160
|
+
}
|
|
26161
|
+
};
|
|
26100
26162
|
const wrapRead = (tx) => ({
|
|
26101
|
-
find: async (store, options) => telemetry.startSpan(`db.find ${store}`, async (span) => {
|
|
26102
|
-
const
|
|
26103
|
-
|
|
26104
|
-
|
|
26105
|
-
|
|
26106
|
-
return rows;
|
|
26107
|
-
} finally {
|
|
26108
|
-
telemetry.measureSince("arc.db.find_ms", start, {
|
|
26109
|
-
"db.system": dbSystem,
|
|
26110
|
-
"db.collection.name": store
|
|
26111
|
-
});
|
|
26112
|
-
}
|
|
26113
|
-
}, {
|
|
26114
|
-
kind: 3,
|
|
26115
|
-
attributes: {
|
|
26116
|
-
"db.system": dbSystem,
|
|
26117
|
-
"db.operation.name": "find",
|
|
26118
|
-
"db.collection.name": store
|
|
26119
|
-
}
|
|
26120
|
-
})
|
|
26163
|
+
find: async (store, options) => telemetry.startSpan(`db.find ${store}`, async (span) => measureOp("find", store, async () => {
|
|
26164
|
+
const rows = await tx.find(store, options);
|
|
26165
|
+
span.setAttribute("db.response.row_count", rows.length);
|
|
26166
|
+
return rows;
|
|
26167
|
+
}), { kind: 3, attributes: dbAttrs("find", store) })
|
|
26121
26168
|
});
|
|
26122
26169
|
const wrapReadWrite = (tx) => ({
|
|
26123
26170
|
...wrapRead(tx),
|
|
26124
|
-
set: async (store, data) => telemetry.startSpan(`db.set ${store}`, () => tx.set(store, data), {
|
|
26125
|
-
|
|
26126
|
-
|
|
26127
|
-
"db.system": dbSystem,
|
|
26128
|
-
"db.operation.name": "set",
|
|
26129
|
-
"db.collection.name": store
|
|
26130
|
-
}
|
|
26131
|
-
}),
|
|
26132
|
-
remove: async (store, id3) => telemetry.startSpan(`db.remove ${store}`, () => tx.remove(store, id3), {
|
|
26133
|
-
kind: 3,
|
|
26134
|
-
attributes: {
|
|
26135
|
-
"db.system": dbSystem,
|
|
26136
|
-
"db.operation.name": "remove",
|
|
26137
|
-
"db.collection.name": store
|
|
26138
|
-
}
|
|
26139
|
-
}),
|
|
26140
|
-
commit: async () => telemetry.startSpan("db.commit", () => tx.commit(), {
|
|
26141
|
-
kind: 3,
|
|
26142
|
-
attributes: {
|
|
26143
|
-
"db.system": dbSystem,
|
|
26144
|
-
"db.operation.name": "commit"
|
|
26145
|
-
}
|
|
26146
|
-
})
|
|
26171
|
+
set: async (store, data) => telemetry.startSpan(`db.set ${store}`, () => measureOp("set", store, () => tx.set(store, data)), { kind: 3, attributes: dbAttrs("set", store) }),
|
|
26172
|
+
remove: async (store, id3) => telemetry.startSpan(`db.remove ${store}`, () => measureOp("remove", store, () => tx.remove(store, id3)), { kind: 3, attributes: dbAttrs("remove", store) }),
|
|
26173
|
+
commit: async () => telemetry.startSpan("db.commit", () => measureOp("commit", undefined, () => tx.commit()), { kind: 3, attributes: dbAttrs("commit") })
|
|
26147
26174
|
});
|
|
26148
26175
|
return new Proxy(adapter, {
|
|
26149
26176
|
get(target, prop) {
|
|
@@ -26206,6 +26233,7 @@ function initServerTelemetry(config) {
|
|
|
26206
26233
|
logger: loggerProvider.getLogger(config.serviceName),
|
|
26207
26234
|
meter: meterProvider.getMeter(config.serviceName)
|
|
26208
26235
|
});
|
|
26236
|
+
const restoreConsole = config.patchConsole !== false ? patchConsole(telemetry) : () => {};
|
|
26209
26237
|
if (telemetry.config.debug) {
|
|
26210
26238
|
console.log("[arc-otel] server init", {
|
|
26211
26239
|
serviceName: config.serviceName,
|
|
@@ -26216,6 +26244,7 @@ function initServerTelemetry(config) {
|
|
|
26216
26244
|
});
|
|
26217
26245
|
}
|
|
26218
26246
|
const shutdown = async () => {
|
|
26247
|
+
restoreConsole();
|
|
26219
26248
|
try {
|
|
26220
26249
|
await Promise.all([
|
|
26221
26250
|
tracerProvider.shutdown(),
|
|
@@ -26228,8 +26257,15 @@ function initServerTelemetry(config) {
|
|
|
26228
26257
|
};
|
|
26229
26258
|
return { telemetry, shutdown };
|
|
26230
26259
|
}
|
|
26231
|
-
var DEFAULT_REDACT_KEY_PATTERN, DEFAULT_MAX_STRING_LEN = 2048, DEFAULT_MAX_JSON_LEN = 4096;
|
|
26260
|
+
var METHODS, patched = false, emitting = false, DEFAULT_REDACT_KEY_PATTERN, DEFAULT_MAX_STRING_LEN = 2048, DEFAULT_MAX_JSON_LEN = 4096;
|
|
26232
26261
|
var init_init_server = __esm(() => {
|
|
26262
|
+
METHODS = [
|
|
26263
|
+
["debug", "debug"],
|
|
26264
|
+
["log", "info"],
|
|
26265
|
+
["info", "info"],
|
|
26266
|
+
["warn", "warn"],
|
|
26267
|
+
["error", "error"]
|
|
26268
|
+
];
|
|
26233
26269
|
DEFAULT_REDACT_KEY_PATTERN = /(password|passwd|token|secret|authorization|jwt|api[_-]?key|cookie|email|credit[_-]?card|ssn)/i;
|
|
26234
26270
|
});
|
|
26235
26271
|
|
|
@@ -36054,18 +36090,32 @@ function generateCaddyfile(cfg) {
|
|
|
36054
36090
|
email ${cfg.caddy.email}`;
|
|
36055
36091
|
const tlsDirective = cfg.caddy.email === "internal" ? `
|
|
36056
36092
|
tls internal` : "";
|
|
36093
|
+
const observability = cfg.observability?.enabled === true;
|
|
36094
|
+
const logDirective = observability ? [" log {", " output stdout", " format json", " }"] : [];
|
|
36057
36095
|
const lines = [];
|
|
36058
36096
|
lines.push("# Generated by `arc platform deploy` \u2014 do not edit by hand.");
|
|
36059
36097
|
lines.push("");
|
|
36060
36098
|
lines.push("{");
|
|
36061
36099
|
lines.push(" admin off");
|
|
36100
|
+
if (observability) {
|
|
36101
|
+
lines.push(" metrics {");
|
|
36102
|
+
lines.push(" per_host");
|
|
36103
|
+
lines.push(" }");
|
|
36104
|
+
}
|
|
36062
36105
|
if (email)
|
|
36063
36106
|
lines.push(` ${email.trim()}`);
|
|
36064
36107
|
lines.push("}");
|
|
36065
36108
|
lines.push("");
|
|
36109
|
+
if (observability) {
|
|
36110
|
+
lines.push(":2020 {");
|
|
36111
|
+
lines.push(" metrics");
|
|
36112
|
+
lines.push("}");
|
|
36113
|
+
lines.push("");
|
|
36114
|
+
}
|
|
36066
36115
|
for (const [name, env2] of Object.entries(cfg.envs)) {
|
|
36067
36116
|
lines.push(`${env2.domain} {${tlsDirective}`);
|
|
36068
|
-
|
|
36117
|
+
lines.push(...logDirective);
|
|
36118
|
+
if (observability) {
|
|
36069
36119
|
lines.push(" handle_path /otel/* {");
|
|
36070
36120
|
lines.push(" reverse_proxy otel-collector:4318");
|
|
36071
36121
|
lines.push(" }");
|
|
@@ -36078,13 +36128,11 @@ function generateCaddyfile(cfg) {
|
|
|
36078
36128
|
lines.push("}");
|
|
36079
36129
|
lines.push("");
|
|
36080
36130
|
}
|
|
36081
|
-
if (
|
|
36082
|
-
const
|
|
36083
|
-
if (
|
|
36084
|
-
|
|
36085
|
-
|
|
36086
|
-
const observabilityDomain = `${subdomain}.${apex}`;
|
|
36087
|
-
lines.push(`${observabilityDomain} {${tlsDirective}`);
|
|
36131
|
+
if (observability) {
|
|
36132
|
+
const domain = observabilityDomain(cfg);
|
|
36133
|
+
if (domain) {
|
|
36134
|
+
lines.push(`${domain} {${tlsDirective}`);
|
|
36135
|
+
lines.push(...logDirective);
|
|
36088
36136
|
lines.push(" basic_auth {");
|
|
36089
36137
|
lines.push(" import /etc/caddy/observability-htpasswd");
|
|
36090
36138
|
lines.push(" }");
|
|
@@ -36094,6 +36142,7 @@ function generateCaddyfile(cfg) {
|
|
|
36094
36142
|
}
|
|
36095
36143
|
}
|
|
36096
36144
|
lines.push(`${cfg.registry.domain} {${tlsDirective}`);
|
|
36145
|
+
lines.push(...logDirective);
|
|
36097
36146
|
lines.push(" reverse_proxy registry:5000 {");
|
|
36098
36147
|
lines.push(" header_up Host {host}");
|
|
36099
36148
|
lines.push(" }");
|
|
@@ -36105,6 +36154,15 @@ function generateCaddyfile(cfg) {
|
|
|
36105
36154
|
`) + `
|
|
36106
36155
|
`;
|
|
36107
36156
|
}
|
|
36157
|
+
function observabilityDomain(cfg) {
|
|
36158
|
+
if (!cfg.observability?.enabled)
|
|
36159
|
+
return null;
|
|
36160
|
+
const firstEnv = Object.values(cfg.envs)[0];
|
|
36161
|
+
if (!firstEnv)
|
|
36162
|
+
return null;
|
|
36163
|
+
const subdomain = cfg.observability.subdomain ?? "observability";
|
|
36164
|
+
return `${subdomain}.${apexOf(firstEnv.domain)}`;
|
|
36165
|
+
}
|
|
36108
36166
|
function apexOf(host) {
|
|
36109
36167
|
const parts = host.split(".");
|
|
36110
36168
|
if (parts.length <= 2)
|
|
@@ -36113,6 +36171,13 @@ function apexOf(host) {
|
|
|
36113
36171
|
}
|
|
36114
36172
|
|
|
36115
36173
|
// src/deploy/compose.ts
|
|
36174
|
+
function pushLogging(lines) {
|
|
36175
|
+
lines.push(" logging:");
|
|
36176
|
+
lines.push(" driver: json-file");
|
|
36177
|
+
lines.push(" options:");
|
|
36178
|
+
lines.push(' max-size: "10m"');
|
|
36179
|
+
lines.push(' max-file: "3"');
|
|
36180
|
+
}
|
|
36116
36181
|
function generateCompose({ cfg }) {
|
|
36117
36182
|
const lines = [];
|
|
36118
36183
|
lines.push("# Generated by `arc platform deploy` \u2014 do not edit by hand.");
|
|
@@ -36121,6 +36186,7 @@ function generateCompose({ cfg }) {
|
|
|
36121
36186
|
lines.push(" caddy:");
|
|
36122
36187
|
lines.push(" image: caddy:2-alpine");
|
|
36123
36188
|
lines.push(" restart: unless-stopped");
|
|
36189
|
+
pushLogging(lines);
|
|
36124
36190
|
lines.push(" ports:");
|
|
36125
36191
|
lines.push(' - "80:80"');
|
|
36126
36192
|
lines.push(' - "443:443"');
|
|
@@ -36133,10 +36199,15 @@ function generateCompose({ cfg }) {
|
|
|
36133
36199
|
lines.push(" - caddy_config:/config");
|
|
36134
36200
|
lines.push(" networks:");
|
|
36135
36201
|
lines.push(" - arc-net");
|
|
36202
|
+
if (cfg.observability?.enabled) {
|
|
36203
|
+
lines.push(" expose:");
|
|
36204
|
+
lines.push(' - "2020" # Prometheus metrics endpoint (Caddyfile :2020 site)');
|
|
36205
|
+
}
|
|
36136
36206
|
lines.push("");
|
|
36137
36207
|
lines.push(" registry:");
|
|
36138
36208
|
lines.push(" image: registry:2");
|
|
36139
36209
|
lines.push(" restart: unless-stopped");
|
|
36210
|
+
pushLogging(lines);
|
|
36140
36211
|
lines.push(" volumes:");
|
|
36141
36212
|
lines.push(" - registry_data:/var/lib/registry");
|
|
36142
36213
|
lines.push(" - ./registry-auth/htpasswd:/auth/htpasswd:ro");
|
|
@@ -36157,6 +36228,13 @@ function generateCompose({ cfg }) {
|
|
|
36157
36228
|
lines.push(` image: \${ARC_IMAGE_${upperName}:-arc-${name}:not-deployed}`);
|
|
36158
36229
|
lines.push(` container_name: arc-${name}`);
|
|
36159
36230
|
lines.push(" restart: unless-stopped");
|
|
36231
|
+
pushLogging(lines);
|
|
36232
|
+
lines.push(" healthcheck:");
|
|
36233
|
+
lines.push(' test: ["CMD", "wget", "-qO-", "http://127.0.0.1:5005/health"]');
|
|
36234
|
+
lines.push(" interval: 30s");
|
|
36235
|
+
lines.push(" timeout: 5s");
|
|
36236
|
+
lines.push(" retries: 3");
|
|
36237
|
+
lines.push(" start_period: 20s");
|
|
36160
36238
|
if (usePostgres) {
|
|
36161
36239
|
lines.push(" depends_on:");
|
|
36162
36240
|
lines.push(` arc-db-${name}:`);
|
|
@@ -36209,6 +36287,7 @@ function generateCompose({ cfg }) {
|
|
|
36209
36287
|
lines.push(` image: ${image2}`);
|
|
36210
36288
|
lines.push(` container_name: arc-db-${name}`);
|
|
36211
36289
|
lines.push(" restart: unless-stopped");
|
|
36290
|
+
pushLogging(lines);
|
|
36212
36291
|
lines.push(" environment:");
|
|
36213
36292
|
lines.push(" POSTGRES_USER: arc");
|
|
36214
36293
|
lines.push(" POSTGRES_DB: arc");
|
|
@@ -36233,9 +36312,13 @@ function generateCompose({ cfg }) {
|
|
|
36233
36312
|
lines.push(" image: otel/opentelemetry-collector-contrib:0.114.0");
|
|
36234
36313
|
lines.push(" container_name: arc-otel-collector");
|
|
36235
36314
|
lines.push(" restart: unless-stopped");
|
|
36315
|
+
pushLogging(lines);
|
|
36316
|
+
lines.push(' user: "0:0"');
|
|
36236
36317
|
lines.push(' command: ["--config=/etc/otelcol-contrib/config.yaml"]');
|
|
36237
36318
|
lines.push(" volumes:");
|
|
36238
36319
|
lines.push(" - ./observability/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro");
|
|
36320
|
+
lines.push(" - /:/hostfs:ro # hostmetrics root_path");
|
|
36321
|
+
lines.push(" - /var/run/docker.sock:/var/run/docker.sock:ro # docker_stats");
|
|
36239
36322
|
lines.push(" networks: [arc-net]");
|
|
36240
36323
|
lines.push(" expose:");
|
|
36241
36324
|
lines.push(' - "4317" # OTLP gRPC');
|
|
@@ -36250,6 +36333,7 @@ function generateCompose({ cfg }) {
|
|
|
36250
36333
|
lines.push(" image: grafana/tempo:2.6.1");
|
|
36251
36334
|
lines.push(" container_name: arc-tempo");
|
|
36252
36335
|
lines.push(" restart: unless-stopped");
|
|
36336
|
+
pushLogging(lines);
|
|
36253
36337
|
lines.push(' command: ["-config.file=/etc/tempo.yaml"]');
|
|
36254
36338
|
lines.push(' user: "0" # tempo writes to /var/tempo, owned by root in the image');
|
|
36255
36339
|
lines.push(" volumes:");
|
|
@@ -36264,6 +36348,7 @@ function generateCompose({ cfg }) {
|
|
|
36264
36348
|
lines.push(" image: grafana/loki:3.3.2");
|
|
36265
36349
|
lines.push(" container_name: arc-loki");
|
|
36266
36350
|
lines.push(" restart: unless-stopped");
|
|
36351
|
+
pushLogging(lines);
|
|
36267
36352
|
lines.push(' command: ["-config.file=/etc/loki/local-config.yaml"]');
|
|
36268
36353
|
lines.push(' user: "0"');
|
|
36269
36354
|
lines.push(" volumes:");
|
|
@@ -36278,6 +36363,7 @@ function generateCompose({ cfg }) {
|
|
|
36278
36363
|
lines.push(" image: prom/prometheus:v2.55.1");
|
|
36279
36364
|
lines.push(" container_name: arc-prometheus");
|
|
36280
36365
|
lines.push(" restart: unless-stopped");
|
|
36366
|
+
pushLogging(lines);
|
|
36281
36367
|
lines.push(" command:");
|
|
36282
36368
|
lines.push(' - "--config.file=/etc/prometheus/prometheus.yml"');
|
|
36283
36369
|
lines.push(' - "--storage.tsdb.path=/prometheus"');
|
|
@@ -36291,20 +36377,47 @@ function generateCompose({ cfg }) {
|
|
|
36291
36377
|
lines.push(" expose:");
|
|
36292
36378
|
lines.push(' - "9090" # HTTP API + remote_write receiver');
|
|
36293
36379
|
lines.push("");
|
|
36380
|
+
lines.push(" alloy:");
|
|
36381
|
+
lines.push(" image: grafana/alloy:v1.16.1");
|
|
36382
|
+
lines.push(" container_name: arc-alloy");
|
|
36383
|
+
lines.push(" restart: unless-stopped");
|
|
36384
|
+
pushLogging(lines);
|
|
36385
|
+
lines.push(' user: "0" # docker.sock access');
|
|
36386
|
+
lines.push(" command:");
|
|
36387
|
+
lines.push(" - run");
|
|
36388
|
+
lines.push(" - --server.http.listen-addr=0.0.0.0:12345");
|
|
36389
|
+
lines.push(" - --storage.path=/var/lib/alloy/data");
|
|
36390
|
+
lines.push(" - /etc/alloy/config.alloy");
|
|
36391
|
+
lines.push(" volumes:");
|
|
36392
|
+
lines.push(" - ./observability/alloy-config.alloy:/etc/alloy/config.alloy:ro");
|
|
36393
|
+
lines.push(" - /var/run/docker.sock:/var/run/docker.sock:ro");
|
|
36394
|
+
lines.push(" - alloy_data:/var/lib/alloy/data");
|
|
36395
|
+
lines.push(" networks: [arc-net]");
|
|
36396
|
+
lines.push(" expose:");
|
|
36397
|
+
lines.push(' - "12345" # Alloy self-metrics (Prom scrape)');
|
|
36398
|
+
lines.push(" depends_on:");
|
|
36399
|
+
lines.push(" - loki");
|
|
36400
|
+
lines.push("");
|
|
36294
36401
|
const adminPasswordEnv = cfg.observability.adminPasswordEnv ?? "ARC_OBSERVABILITY_PASSWORD";
|
|
36402
|
+
const grafanaDomain = observabilityDomain(cfg);
|
|
36295
36403
|
lines.push(" grafana:");
|
|
36296
36404
|
lines.push(" image: grafana/grafana:11.4.0");
|
|
36297
36405
|
lines.push(" container_name: arc-grafana");
|
|
36298
36406
|
lines.push(" restart: unless-stopped");
|
|
36407
|
+
pushLogging(lines);
|
|
36299
36408
|
lines.push(" environment:");
|
|
36300
36409
|
lines.push(" GF_SECURITY_ADMIN_USER: admin");
|
|
36301
36410
|
lines.push(` GF_SECURITY_ADMIN_PASSWORD: \${${adminPasswordEnv}:?missing ${adminPasswordEnv}}`);
|
|
36302
36411
|
lines.push(' GF_USERS_ALLOW_SIGN_UP: "false"');
|
|
36303
36412
|
lines.push(' GF_AUTH_ANONYMOUS_ENABLED: "false"');
|
|
36413
|
+
if (grafanaDomain) {
|
|
36414
|
+
lines.push(` GF_SERVER_ROOT_URL: "https://${grafanaDomain}"`);
|
|
36415
|
+
}
|
|
36304
36416
|
lines.push(" volumes:");
|
|
36305
36417
|
lines.push(" - ./observability/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:ro");
|
|
36306
36418
|
lines.push(" - ./observability/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro");
|
|
36307
36419
|
lines.push(" - ./observability/grafana-dashboards:/etc/grafana/provisioning/dashboards/arc:ro");
|
|
36420
|
+
lines.push(" - ./observability/grafana-alerting:/etc/grafana/provisioning/alerting:ro");
|
|
36308
36421
|
lines.push(" - grafana_data:/var/lib/grafana");
|
|
36309
36422
|
lines.push(" networks: [arc-net]");
|
|
36310
36423
|
lines.push(" expose:");
|
|
@@ -36334,6 +36447,7 @@ function generateCompose({ cfg }) {
|
|
|
36334
36447
|
lines.push(" loki_data:");
|
|
36335
36448
|
lines.push(" prometheus_data:");
|
|
36336
36449
|
lines.push(" grafana_data:");
|
|
36450
|
+
lines.push(" alloy_data:");
|
|
36337
36451
|
}
|
|
36338
36452
|
return lines.join(`
|
|
36339
36453
|
`) + `
|
|
@@ -36408,6 +36522,64 @@ ${envNames.map((name) => ` - "https://${cfg.envs[name].domain}"`).joi
|
|
|
36408
36522
|
- tracestate
|
|
36409
36523
|
- content-type
|
|
36410
36524
|
|
|
36525
|
+
# Host-level CPU / memory / load / disk / filesystem / network metrics.
|
|
36526
|
+
# The host root is bind-mounted read-only at /hostfs (see compose).
|
|
36527
|
+
hostmetrics:
|
|
36528
|
+
collection_interval: 30s
|
|
36529
|
+
root_path: /hostfs
|
|
36530
|
+
scrapers:
|
|
36531
|
+
cpu:
|
|
36532
|
+
metrics:
|
|
36533
|
+
system.cpu.utilization:
|
|
36534
|
+
enabled: true
|
|
36535
|
+
memory:
|
|
36536
|
+
metrics:
|
|
36537
|
+
system.memory.utilization:
|
|
36538
|
+
enabled: true
|
|
36539
|
+
load: {}
|
|
36540
|
+
disk: {}
|
|
36541
|
+
filesystem:
|
|
36542
|
+
metrics:
|
|
36543
|
+
system.filesystem.utilization:
|
|
36544
|
+
enabled: true
|
|
36545
|
+
exclude_fs_types:
|
|
36546
|
+
fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
|
|
36547
|
+
match_type: strict
|
|
36548
|
+
exclude_mount_points:
|
|
36549
|
+
mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
|
|
36550
|
+
match_type: regexp
|
|
36551
|
+
network: {}
|
|
36552
|
+
paging: {}
|
|
36553
|
+
|
|
36554
|
+
# Per-container CPU / memory / network / block-IO + restarts straight from
|
|
36555
|
+
# the Docker daemon (socket bind-mounted read-only, see compose).
|
|
36556
|
+
docker_stats:
|
|
36557
|
+
endpoint: unix:///var/run/docker.sock
|
|
36558
|
+
collection_interval: 30s
|
|
36559
|
+
metrics:
|
|
36560
|
+
container.restarts:
|
|
36561
|
+
enabled: true
|
|
36562
|
+
container.uptime:
|
|
36563
|
+
enabled: true
|
|
36564
|
+
|
|
36565
|
+
connectors:
|
|
36566
|
+
# Span\u2192metrics computed from 100% of spans (pipeline runs BEFORE tail
|
|
36567
|
+
# sampling) \u2014 lowering the sampling policy later never skews dashboards.
|
|
36568
|
+
spanmetrics:
|
|
36569
|
+
histogram:
|
|
36570
|
+
unit: ms
|
|
36571
|
+
explicit:
|
|
36572
|
+
buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
|
|
36573
|
+
metrics_flush_interval: 15s
|
|
36574
|
+
# Emits traces_service_graph_* (same metric names Tempo's generator would).
|
|
36575
|
+
servicegraph:
|
|
36576
|
+
metrics_flush_interval: 15s
|
|
36577
|
+
store:
|
|
36578
|
+
ttl: 5s
|
|
36579
|
+
max_items: 5000
|
|
36580
|
+
# Joins the raw-trace pipeline to the sampled-storage pipeline.
|
|
36581
|
+
forward: {}
|
|
36582
|
+
|
|
36411
36583
|
processors:
|
|
36412
36584
|
batch:
|
|
36413
36585
|
timeout: 5s
|
|
@@ -36418,7 +36590,8 @@ processors:
|
|
|
36418
36590
|
# Errors + slow traces zachowywane w 100%, normalne traces r\xF3wnie\u017C 100%
|
|
36419
36591
|
# przy obecnej skali (boostrap produkcji). Tail sampling matchuje OR po
|
|
36420
36592
|
# policies \u2014 bez "always" policy WSZYSTKIE OK traces by\u0142yby droppowane.
|
|
36421
|
-
# Obni\u017C 'random_100pct' do np. 10% gdy ruch eksploduje
|
|
36593
|
+
# Obni\u017C 'random_100pct' do np. 10% gdy ruch eksploduje \u2014 span-metrics s\u0105
|
|
36594
|
+
# liczone przed samplingiem, wi\u0119c dashboardy pozostan\u0105 dok\u0142adne.
|
|
36422
36595
|
tail_sampling:
|
|
36423
36596
|
decision_wait: 10s
|
|
36424
36597
|
num_traces: 50000
|
|
@@ -36443,6 +36616,34 @@ processors:
|
|
|
36443
36616
|
- key: http.request.header.cookie
|
|
36444
36617
|
action: delete
|
|
36445
36618
|
|
|
36619
|
+
# Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
|
|
36620
|
+
# so raw span names (one per bot-scanned URL) would explode Prometheus
|
|
36621
|
+
# series. Static assets collapse to "<METHOD> static", /route/* to
|
|
36622
|
+
# "<METHOD> /route", anything else outside the known API surface to
|
|
36623
|
+
# "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
|
|
36624
|
+
# literal "$" (collector env expansion), RE2 has no lookahead \u2192 IsMatch+not.
|
|
36625
|
+
transform/span_names:
|
|
36626
|
+
error_mode: ignore
|
|
36627
|
+
trace_statements:
|
|
36628
|
+
- context: span
|
|
36629
|
+
statements:
|
|
36630
|
+
- set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
|
|
36631
|
+
- replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
|
|
36632
|
+
- set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
|
|
36633
|
+
|
|
36634
|
+
# Stable service.name for infra metric streams (becomes the service_name
|
|
36635
|
+
# label after resource_to_telemetry_conversion).
|
|
36636
|
+
resource/host:
|
|
36637
|
+
attributes:
|
|
36638
|
+
- key: service.name
|
|
36639
|
+
value: arc-host
|
|
36640
|
+
action: upsert
|
|
36641
|
+
resource/docker:
|
|
36642
|
+
attributes:
|
|
36643
|
+
- key: service.name
|
|
36644
|
+
value: arc-docker
|
|
36645
|
+
action: upsert
|
|
36646
|
+
|
|
36446
36647
|
exporters:
|
|
36447
36648
|
otlp/tempo:
|
|
36448
36649
|
endpoint: tempo:4317
|
|
@@ -36458,6 +36659,10 @@ exporters:
|
|
|
36458
36659
|
endpoint: http://prometheus:9090/api/v1/write
|
|
36459
36660
|
tls:
|
|
36460
36661
|
insecure: true
|
|
36662
|
+
# Copy resource attributes (service.name, deployment.environment, \u2026)
|
|
36663
|
+
# onto every series \u2014 dashboards filter by service_name.
|
|
36664
|
+
resource_to_telemetry_conversion:
|
|
36665
|
+
enabled: true
|
|
36461
36666
|
|
|
36462
36667
|
extensions:
|
|
36463
36668
|
health_check: {}
|
|
@@ -36465,19 +36670,42 @@ extensions:
|
|
|
36465
36670
|
|
|
36466
36671
|
service:
|
|
36467
36672
|
extensions: [health_check, zpages]
|
|
36673
|
+
# Collector self-metrics. Since 0.111 the default bind is localhost only \u2014
|
|
36674
|
+
# Prometheus scrapes otel-collector:8888, so listen on all interfaces.
|
|
36675
|
+
telemetry:
|
|
36676
|
+
metrics:
|
|
36677
|
+
level: detailed
|
|
36678
|
+
readers:
|
|
36679
|
+
- pull:
|
|
36680
|
+
exporter:
|
|
36681
|
+
prometheus:
|
|
36682
|
+
host: "0.0.0.0"
|
|
36683
|
+
port: 8888
|
|
36468
36684
|
pipelines:
|
|
36469
|
-
traces:
|
|
36685
|
+
traces/in:
|
|
36470
36686
|
receivers: [otlp]
|
|
36471
|
-
processors: [
|
|
36687
|
+
processors: [attributes, transform/span_names]
|
|
36688
|
+
exporters: [spanmetrics, servicegraph, forward]
|
|
36689
|
+
traces/sampled:
|
|
36690
|
+
receivers: [forward]
|
|
36691
|
+
processors: [tail_sampling, batch]
|
|
36472
36692
|
exporters: [otlp/tempo]
|
|
36473
36693
|
logs:
|
|
36474
36694
|
receivers: [otlp]
|
|
36475
36695
|
processors: [attributes, batch]
|
|
36476
36696
|
exporters: [otlphttp/loki]
|
|
36477
36697
|
metrics:
|
|
36478
|
-
receivers: [otlp]
|
|
36698
|
+
receivers: [otlp, spanmetrics, servicegraph]
|
|
36479
36699
|
processors: [batch]
|
|
36480
36700
|
exporters: [prometheusremotewrite]
|
|
36701
|
+
metrics/host:
|
|
36702
|
+
receivers: [hostmetrics]
|
|
36703
|
+
processors: [resource/host, batch]
|
|
36704
|
+
exporters: [prometheusremotewrite]
|
|
36705
|
+
metrics/docker:
|
|
36706
|
+
receivers: [docker_stats]
|
|
36707
|
+
processors: [resource/docker, batch]
|
|
36708
|
+
exporters: [prometheusremotewrite]
|
|
36481
36709
|
`;
|
|
36482
36710
|
}
|
|
36483
36711
|
function generateTempoConfig(cfg) {
|
|
@@ -36513,20 +36741,9 @@ storage:
|
|
|
36513
36741
|
wal:
|
|
36514
36742
|
path: /var/tempo/wal
|
|
36515
36743
|
|
|
36516
|
-
metrics_generator
|
|
36517
|
-
|
|
36518
|
-
|
|
36519
|
-
source: tempo
|
|
36520
|
-
storage:
|
|
36521
|
-
path: /var/tempo/generator/wal
|
|
36522
|
-
remote_write:
|
|
36523
|
-
- url: http://prometheus:9090/api/v1/write
|
|
36524
|
-
send_exemplars: true
|
|
36525
|
-
|
|
36526
|
-
overrides:
|
|
36527
|
-
defaults:
|
|
36528
|
-
metrics_generator:
|
|
36529
|
-
processors: [service-graphs, span-metrics]
|
|
36744
|
+
# NOTE: no metrics_generator \u2014 span-metrics + service-graph are produced by
|
|
36745
|
+
# the otel-collector connectors BEFORE tail sampling (accurate rates even
|
|
36746
|
+
# when sampling is later tightened) and remote-written to Prometheus there.
|
|
36530
36747
|
`;
|
|
36531
36748
|
}
|
|
36532
36749
|
function generateLokiConfig(cfg) {
|
|
@@ -36582,11 +36799,205 @@ scrape_configs:
|
|
|
36582
36799
|
- job_name: otel-collector
|
|
36583
36800
|
static_configs:
|
|
36584
36801
|
- targets: [otel-collector:8888]
|
|
36802
|
+
- job_name: caddy
|
|
36803
|
+
static_configs:
|
|
36804
|
+
- targets: [caddy:2020]
|
|
36805
|
+
- job_name: loki
|
|
36806
|
+
static_configs:
|
|
36807
|
+
- targets: [loki:3100]
|
|
36808
|
+
- job_name: tempo
|
|
36809
|
+
static_configs:
|
|
36810
|
+
- targets: [tempo:3200]
|
|
36811
|
+
- job_name: grafana
|
|
36812
|
+
static_configs:
|
|
36813
|
+
- targets: [grafana:3000]
|
|
36814
|
+
- job_name: alloy
|
|
36815
|
+
static_configs:
|
|
36816
|
+
- targets: [alloy:12345]
|
|
36585
36817
|
|
|
36586
36818
|
# remote-write inbound is enabled via the --web.enable-remote-write-receiver
|
|
36587
36819
|
# command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
|
|
36588
36820
|
`;
|
|
36589
36821
|
}
|
|
36822
|
+
function generateAlloyConfig() {
|
|
36823
|
+
return `// Generated by \`arc platform deploy\` \u2014 do not edit by hand.
|
|
36824
|
+
discovery.docker "containers" {
|
|
36825
|
+
host = "unix:///var/run/docker.sock"
|
|
36826
|
+
refresh_interval = "15s"
|
|
36827
|
+
}
|
|
36828
|
+
|
|
36829
|
+
discovery.relabel "containers" {
|
|
36830
|
+
targets = discovery.docker.containers.targets
|
|
36831
|
+
|
|
36832
|
+
rule {
|
|
36833
|
+
source_labels = ["__meta_docker_container_name"]
|
|
36834
|
+
regex = "/(.*)"
|
|
36835
|
+
target_label = "container"
|
|
36836
|
+
}
|
|
36837
|
+
rule {
|
|
36838
|
+
source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
|
|
36839
|
+
target_label = "compose_service"
|
|
36840
|
+
}
|
|
36841
|
+
}
|
|
36842
|
+
|
|
36843
|
+
loki.source.docker "containers" {
|
|
36844
|
+
host = "unix:///var/run/docker.sock"
|
|
36845
|
+
targets = discovery.docker.containers.targets
|
|
36846
|
+
relabel_rules = discovery.relabel.containers.rules
|
|
36847
|
+
labels = { source = "docker" }
|
|
36848
|
+
forward_to = [loki.write.loki.receiver]
|
|
36849
|
+
}
|
|
36850
|
+
|
|
36851
|
+
loki.write "loki" {
|
|
36852
|
+
endpoint {
|
|
36853
|
+
url = "http://loki:3100/loki/api/v1/push"
|
|
36854
|
+
}
|
|
36855
|
+
}
|
|
36856
|
+
`;
|
|
36857
|
+
}
|
|
36858
|
+
function generateGrafanaAlerting(cfg) {
|
|
36859
|
+
const webhookUrl = cfg.observability?.alertWebhookUrl;
|
|
36860
|
+
const rules = [
|
|
36861
|
+
{
|
|
36862
|
+
uid: "arc-high-error-rate",
|
|
36863
|
+
title: "High server error rate (>5%)",
|
|
36864
|
+
expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
|
|
36865
|
+
threshold: 0.05,
|
|
36866
|
+
pendingFor: "5m",
|
|
36867
|
+
summary: "More than 5% of server spans are errors over the last 5 minutes."
|
|
36868
|
+
},
|
|
36869
|
+
{
|
|
36870
|
+
uid: "arc-high-latency-p95",
|
|
36871
|
+
title: "High p95 latency (>1s)",
|
|
36872
|
+
expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
|
|
36873
|
+
threshold: 1000,
|
|
36874
|
+
pendingFor: "10m",
|
|
36875
|
+
summary: "Server p95 latency above 1s for 10 minutes."
|
|
36876
|
+
},
|
|
36877
|
+
{
|
|
36878
|
+
uid: "arc-host-disk-high",
|
|
36879
|
+
title: "Host disk usage >85%",
|
|
36880
|
+
expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
|
|
36881
|
+
threshold: 0.85,
|
|
36882
|
+
pendingFor: "15m",
|
|
36883
|
+
summary: "A host filesystem is more than 85% full."
|
|
36884
|
+
},
|
|
36885
|
+
{
|
|
36886
|
+
uid: "arc-host-memory-high",
|
|
36887
|
+
title: "Host memory usage >90%",
|
|
36888
|
+
expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
|
|
36889
|
+
threshold: 0.9,
|
|
36890
|
+
pendingFor: "10m",
|
|
36891
|
+
summary: "Host memory usage above 90% for 10 minutes."
|
|
36892
|
+
},
|
|
36893
|
+
{
|
|
36894
|
+
uid: "arc-container-restarts",
|
|
36895
|
+
title: "Container restarted",
|
|
36896
|
+
expr: "sum by (container_name) (increase(container_restarts_total[15m]))",
|
|
36897
|
+
threshold: 0,
|
|
36898
|
+
pendingFor: "0s",
|
|
36899
|
+
summary: "A container restarted within the last 15 minutes."
|
|
36900
|
+
},
|
|
36901
|
+
{
|
|
36902
|
+
uid: "arc-app-silent",
|
|
36903
|
+
title: "App stopped reporting metrics",
|
|
36904
|
+
expr: "absent(arc_commands_total)",
|
|
36905
|
+
threshold: 0,
|
|
36906
|
+
pendingFor: "10m",
|
|
36907
|
+
summary: "No arc_commands_total series for 10 minutes \u2014 app down or telemetry broken."
|
|
36908
|
+
},
|
|
36909
|
+
{
|
|
36910
|
+
uid: "arc-collector-export-failures",
|
|
36911
|
+
title: "Telemetry export failures",
|
|
36912
|
+
expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)"}[15m]))',
|
|
36913
|
+
threshold: 0,
|
|
36914
|
+
pendingFor: "0s",
|
|
36915
|
+
summary: "The otel-collector failed to export telemetry within the last 15 minutes."
|
|
36916
|
+
},
|
|
36917
|
+
{
|
|
36918
|
+
uid: "arc-target-down",
|
|
36919
|
+
title: "Scrape target down",
|
|
36920
|
+
expr: "min(up)",
|
|
36921
|
+
threshold: 1,
|
|
36922
|
+
op: "lt",
|
|
36923
|
+
pendingFor: "5m",
|
|
36924
|
+
summary: "A Prometheus scrape target has been down for 5 minutes."
|
|
36925
|
+
}
|
|
36926
|
+
];
|
|
36927
|
+
const ruleYaml = rules.map((rule) => {
|
|
36928
|
+
const op = rule.op ?? "gt";
|
|
36929
|
+
return ` - uid: ${rule.uid}
|
|
36930
|
+
title: ${JSON.stringify(rule.title)}
|
|
36931
|
+
condition: C
|
|
36932
|
+
for: ${rule.pendingFor}
|
|
36933
|
+
noDataState: OK
|
|
36934
|
+
execErrState: OK
|
|
36935
|
+
annotations:
|
|
36936
|
+
summary: ${JSON.stringify(rule.summary)}
|
|
36937
|
+
labels:
|
|
36938
|
+
source: arc
|
|
36939
|
+
data:
|
|
36940
|
+
- refId: A
|
|
36941
|
+
relativeTimeRange: { from: 600, to: 0 }
|
|
36942
|
+
datasourceUid: prometheus
|
|
36943
|
+
model:
|
|
36944
|
+
expr: ${JSON.stringify(rule.expr)}
|
|
36945
|
+
instant: true
|
|
36946
|
+
intervalMs: 1000
|
|
36947
|
+
maxDataPoints: 43200
|
|
36948
|
+
refId: A
|
|
36949
|
+
- refId: B
|
|
36950
|
+
relativeTimeRange: { from: 0, to: 0 }
|
|
36951
|
+
datasourceUid: __expr__
|
|
36952
|
+
model:
|
|
36953
|
+
type: reduce
|
|
36954
|
+
expression: A
|
|
36955
|
+
reducer: last
|
|
36956
|
+
refId: B
|
|
36957
|
+
- refId: C
|
|
36958
|
+
relativeTimeRange: { from: 0, to: 0 }
|
|
36959
|
+
datasourceUid: __expr__
|
|
36960
|
+
model:
|
|
36961
|
+
type: threshold
|
|
36962
|
+
expression: B
|
|
36963
|
+
refId: C
|
|
36964
|
+
conditions:
|
|
36965
|
+
- evaluator:
|
|
36966
|
+
type: ${op}
|
|
36967
|
+
params: [${rule.threshold}]`;
|
|
36968
|
+
}).join(`
|
|
36969
|
+
`);
|
|
36970
|
+
const contactSection = webhookUrl ? `
|
|
36971
|
+
contactPoints:
|
|
36972
|
+
- orgId: 1
|
|
36973
|
+
name: arc-webhook
|
|
36974
|
+
receivers:
|
|
36975
|
+
- uid: arc-webhook
|
|
36976
|
+
type: webhook
|
|
36977
|
+
settings:
|
|
36978
|
+
url: ${JSON.stringify(webhookUrl)}
|
|
36979
|
+
httpMethod: POST
|
|
36980
|
+
|
|
36981
|
+
policies:
|
|
36982
|
+
- orgId: 1
|
|
36983
|
+
receiver: arc-webhook
|
|
36984
|
+
group_by: ["grafana_folder", "alertname"]
|
|
36985
|
+
group_wait: 30s
|
|
36986
|
+
group_interval: 5m
|
|
36987
|
+
repeat_interval: 4h
|
|
36988
|
+
` : "";
|
|
36989
|
+
return `# Generated by \`arc platform deploy\` \u2014 do not edit by hand.
|
|
36990
|
+
apiVersion: 1
|
|
36991
|
+
|
|
36992
|
+
groups:
|
|
36993
|
+
- orgId: 1
|
|
36994
|
+
name: arc-alerts
|
|
36995
|
+
folder: Arc
|
|
36996
|
+
interval: 1m
|
|
36997
|
+
rules:
|
|
36998
|
+
${ruleYaml}
|
|
36999
|
+
${contactSection}`;
|
|
37000
|
+
}
|
|
36590
37001
|
function generateGrafanaDatasources() {
|
|
36591
37002
|
return `# Generated by \`arc platform deploy\` \u2014 do not edit by hand.
|
|
36592
37003
|
apiVersion: 1
|
|
@@ -36610,10 +37021,17 @@ datasources:
|
|
|
36610
37021
|
uid: loki
|
|
36611
37022
|
jsonData:
|
|
36612
37023
|
derivedFields:
|
|
37024
|
+
# Plain-text logs that happen to contain "trace_id=<id>".
|
|
36613
37025
|
- datasourceUid: tempo
|
|
36614
37026
|
matcherRegex: "trace_id=(\\\\w+)"
|
|
36615
37027
|
name: TraceID
|
|
36616
37028
|
url: $\${__value.raw}
|
|
37029
|
+
# OTLP-ingested logs \u2014 trace_id arrives as structured metadata.
|
|
37030
|
+
- datasourceUid: tempo
|
|
37031
|
+
matcherType: label
|
|
37032
|
+
matcherRegex: trace_id
|
|
37033
|
+
name: TraceID (OTLP)
|
|
37034
|
+
url: $\${__value.raw}
|
|
36617
37035
|
- name: Prometheus
|
|
36618
37036
|
type: prometheus
|
|
36619
37037
|
access: proxy
|
|
@@ -36656,7 +37074,7 @@ function generateArcOverviewDashboard() {
|
|
|
36656
37074
|
label: "Service",
|
|
36657
37075
|
type: "query",
|
|
36658
37076
|
datasource: { type: "prometheus", uid: "prometheus" },
|
|
36659
|
-
query: "label_values(
|
|
37077
|
+
query: "label_values(traces_span_metrics_calls_total, service_name)",
|
|
36660
37078
|
refresh: 2,
|
|
36661
37079
|
includeAll: false,
|
|
36662
37080
|
multi: false,
|
|
@@ -36665,29 +37083,29 @@ function generateArcOverviewDashboard() {
|
|
|
36665
37083
|
]
|
|
36666
37084
|
},
|
|
36667
37085
|
panels: [
|
|
36668
|
-
panelStat("Request rate (req/s)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(
|
|
36669
|
-
panelStat("Error rate (%)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(rate(
|
|
36670
|
-
panelStat("P99 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum(rate(
|
|
37086
|
+
panelStat("Request rate (req/s)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))', "reqps"),
|
|
37087
|
+
panelStat("Error rate (%)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100', "percent", { red: 1, orange: 0.1 }),
|
|
37088
|
+
panelStat("P99 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))', "ms", { red: 1000, orange: 300 }),
|
|
36671
37089
|
panelStat("Active commands/sec", { x: 18, y: 0, w: 6, h: 4 }, 'sum(rate(arc_commands_total{service_name="$service"}[5m]))', "ops"),
|
|
36672
|
-
panelTimeseries("Request rate by route", { x: 0, y: 4, w: 12, h: 8 }, 'sum by (span_name) (rate(
|
|
37090
|
+
panelTimeseries("Request rate by route", { x: 0, y: 4, w: 12, h: 8 }, 'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))', "{{span_name}}", "reqps"),
|
|
36673
37091
|
panelTimeseries("Latency percentiles", { x: 12, y: 4, w: 12, h: 8 }, [
|
|
36674
37092
|
{
|
|
36675
|
-
expr: 'histogram_quantile(0.5, sum(rate(
|
|
37093
|
+
expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
36676
37094
|
legend: "p50"
|
|
36677
37095
|
},
|
|
36678
37096
|
{
|
|
36679
|
-
expr: 'histogram_quantile(0.95, sum(rate(
|
|
37097
|
+
expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
36680
37098
|
legend: "p95"
|
|
36681
37099
|
},
|
|
36682
37100
|
{
|
|
36683
|
-
expr: 'histogram_quantile(0.99, sum(rate(
|
|
37101
|
+
expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
|
|
36684
37102
|
legend: "p99"
|
|
36685
37103
|
}
|
|
36686
|
-
], "ms"),
|
|
37104
|
+
], "", "ms"),
|
|
36687
37105
|
panelTimeseries("Commands per second", { x: 0, y: 12, w: 12, h: 8 }, 'sum by (arc_command_name) (rate(arc_commands_total{service_name="$service"}[1m]))', "{{arc_command_name}}", "ops"),
|
|
36688
|
-
panelTimeseries("Command p95 latency", { x: 12, y: 12, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(
|
|
36689
|
-
panelTimeseries("DB
|
|
36690
|
-
panelTimeseries("DB
|
|
37106
|
+
panelTimeseries("Command p95 latency", { x: 12, y: 12, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))', "{{arc_command_name}}", "ms"),
|
|
37107
|
+
panelTimeseries("DB ops/sec by collection", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))', "{{db_collection_name}} {{db_operation_name}}", "ops"),
|
|
37108
|
+
panelTimeseries("DB p95 latency by operation", { x: 12, y: 20, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))', "{{db_operation_name}}", "ms"),
|
|
36691
37109
|
{
|
|
36692
37110
|
title: "Recent error logs",
|
|
36693
37111
|
type: "logs",
|
|
@@ -36695,7 +37113,7 @@ function generateArcOverviewDashboard() {
|
|
|
36695
37113
|
datasource: { type: "loki", uid: "loki" },
|
|
36696
37114
|
targets: [
|
|
36697
37115
|
{
|
|
36698
|
-
expr: '{service_name="$service"}
|
|
37116
|
+
expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
|
|
36699
37117
|
refId: "A"
|
|
36700
37118
|
}
|
|
36701
37119
|
],
|
|
@@ -36729,7 +37147,7 @@ function generateArcTracesDashboard() {
|
|
|
36729
37147
|
label: "Service",
|
|
36730
37148
|
type: "query",
|
|
36731
37149
|
datasource: { type: "prometheus", uid: "prometheus" },
|
|
36732
|
-
query: "label_values(
|
|
37150
|
+
query: "label_values(traces_span_metrics_calls_total, service_name)",
|
|
36733
37151
|
refresh: 2,
|
|
36734
37152
|
current: { text: "arc-prod", value: "arc-prod" }
|
|
36735
37153
|
}
|
|
@@ -36828,12 +37246,23 @@ function generateArcLogsDashboard() {
|
|
|
36828
37246
|
type: "textbox",
|
|
36829
37247
|
query: "",
|
|
36830
37248
|
current: { text: "", value: "" }
|
|
37249
|
+
},
|
|
37250
|
+
{
|
|
37251
|
+
name: "container",
|
|
37252
|
+
label: "Container",
|
|
37253
|
+
type: "query",
|
|
37254
|
+
datasource: { type: "loki", uid: "loki" },
|
|
37255
|
+
query: "label_values(container)",
|
|
37256
|
+
refresh: 2,
|
|
37257
|
+
includeAll: true,
|
|
37258
|
+
multi: false,
|
|
37259
|
+
current: { text: "All", value: "$__all" }
|
|
36831
37260
|
}
|
|
36832
37261
|
]
|
|
36833
37262
|
},
|
|
36834
37263
|
panels: [
|
|
36835
|
-
panelStat("Logs ingested (1h)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(
|
|
36836
|
-
panelStat("Errors (1h)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(
|
|
37264
|
+
panelStat("Logs ingested (1h)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(count_over_time({service_name="$service"}[1h]))', "short", undefined, LOKI_DS),
|
|
37265
|
+
panelStat("Errors (1h)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))', "short", { orange: 1, red: 50 }, LOKI_DS),
|
|
36837
37266
|
{
|
|
36838
37267
|
title: "Log volume by severity",
|
|
36839
37268
|
type: "timeseries",
|
|
@@ -36883,6 +37312,27 @@ function generateArcLogsDashboard() {
|
|
|
36883
37312
|
dedupStrategy: "none",
|
|
36884
37313
|
sortOrder: "Descending"
|
|
36885
37314
|
}
|
|
37315
|
+
},
|
|
37316
|
+
{
|
|
37317
|
+
title: "Container logs ($container)",
|
|
37318
|
+
type: "logs",
|
|
37319
|
+
gridPos: { x: 0, y: 26, w: 24, h: 14 },
|
|
37320
|
+
datasource: { type: "loki", uid: "loki" },
|
|
37321
|
+
targets: [
|
|
37322
|
+
{
|
|
37323
|
+
expr: '{container=~"$container"} |~ "$search"',
|
|
37324
|
+
refId: "A"
|
|
37325
|
+
}
|
|
37326
|
+
],
|
|
37327
|
+
options: {
|
|
37328
|
+
showTime: true,
|
|
37329
|
+
showLabels: true,
|
|
37330
|
+
showCommonLabels: false,
|
|
37331
|
+
wrapLogMessage: true,
|
|
37332
|
+
enableLogDetails: true,
|
|
37333
|
+
dedupStrategy: "none",
|
|
37334
|
+
sortOrder: "Descending"
|
|
37335
|
+
}
|
|
36886
37336
|
}
|
|
36887
37337
|
]
|
|
36888
37338
|
};
|
|
@@ -36912,7 +37362,7 @@ function generateArcSamplingDashboard() {
|
|
|
36912
37362
|
expr: "sum(rate(otelcol_exporter_sent_spans[1m]))",
|
|
36913
37363
|
legend: "exported"
|
|
36914
37364
|
}
|
|
36915
|
-
], "ops"),
|
|
37365
|
+
], "", "ops"),
|
|
36916
37366
|
panelTimeseries("Collector queue size (BatchSpanProcessor)", { x: 0, y: 12, w: 12, h: 8 }, "otelcol_processor_batch_batch_send_size_sum / clamp_min(otelcol_processor_batch_batch_send_size_count, 1)", "avg batch size", "short"),
|
|
36917
37367
|
panelTimeseries("Collector process memory", { x: 12, y: 12, w: 12, h: 8 }, 'process_resident_memory_bytes{job="otel-collector"}', "RSS", "bytes")
|
|
36918
37368
|
]
|
|
@@ -36953,24 +37403,24 @@ function generateArcCommandDashboard() {
|
|
|
36953
37403
|
},
|
|
36954
37404
|
panels: [
|
|
36955
37405
|
panelStat("Call rate", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[5m]))', "ops"),
|
|
36956
|
-
panelStat("P50 latency", { x: 6, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.5, sum by (le) (rate(
|
|
36957
|
-
panelStat("P95 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.95, sum by (le) (rate(
|
|
36958
|
-
panelStat("P99 latency", { x: 18, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum by (le) (rate(
|
|
37406
|
+
panelStat("P50 latency", { x: 6, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms"),
|
|
37407
|
+
panelStat("P95 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 200, red: 1000 }),
|
|
37408
|
+
panelStat("P99 latency", { x: 18, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 500, red: 2000 }),
|
|
36959
37409
|
panelTimeseries("Call rate over time", { x: 0, y: 4, w: 12, h: 8 }, 'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[1m]))', "calls/s", "ops"),
|
|
36960
37410
|
panelTimeseries("Latency percentiles", { x: 12, y: 4, w: 12, h: 8 }, [
|
|
36961
37411
|
{
|
|
36962
|
-
expr: 'histogram_quantile(0.5, sum by (le) (rate(
|
|
37412
|
+
expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
36963
37413
|
legend: "p50"
|
|
36964
37414
|
},
|
|
36965
37415
|
{
|
|
36966
|
-
expr: 'histogram_quantile(0.95, sum by (le) (rate(
|
|
37416
|
+
expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
36967
37417
|
legend: "p95"
|
|
36968
37418
|
},
|
|
36969
37419
|
{
|
|
36970
|
-
expr: 'histogram_quantile(0.99, sum by (le) (rate(
|
|
37420
|
+
expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
|
|
36971
37421
|
legend: "p99"
|
|
36972
37422
|
}
|
|
36973
|
-
], "ms"),
|
|
37423
|
+
], "", "ms"),
|
|
36974
37424
|
{
|
|
36975
37425
|
title: "Recent traces (sampled)",
|
|
36976
37426
|
type: "traces",
|
|
@@ -36989,23 +37439,108 @@ function generateArcCommandDashboard() {
|
|
|
36989
37439
|
};
|
|
36990
37440
|
return JSON.stringify(dashboard, null, 2);
|
|
36991
37441
|
}
|
|
37442
|
+
function generateArcInfraDashboard() {
|
|
37443
|
+
const dashboard = {
|
|
37444
|
+
title: "Arc Infrastructure",
|
|
37445
|
+
uid: "arc-infra",
|
|
37446
|
+
schemaVersion: 39,
|
|
37447
|
+
version: 1,
|
|
37448
|
+
refresh: "30s",
|
|
37449
|
+
time: { from: "now-3h", to: "now" },
|
|
37450
|
+
tags: ["arc", "auto-provisioned"],
|
|
37451
|
+
panels: [
|
|
37452
|
+
panelStat("Host CPU used", { x: 0, y: 0, w: 6, h: 4 }, '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))', "percent", { orange: 70, red: 90 }),
|
|
37453
|
+
panelStat("Host memory used", { x: 6, y: 0, w: 6, h: 4 }, '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)', "percent", { orange: 80, red: 90 }),
|
|
37454
|
+
panelStat("Disk used (worst mount)", { x: 12, y: 0, w: 6, h: 4 }, '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))', "percent", { orange: 75, red: 85 }),
|
|
37455
|
+
panelStat("Load (1m)", { x: 18, y: 0, w: 6, h: 4 }, "system_cpu_load_average_1m", "short"),
|
|
37456
|
+
panelTimeseries("Host CPU utilization", { x: 0, y: 4, w: 12, h: 8 }, [
|
|
37457
|
+
{
|
|
37458
|
+
expr: '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))',
|
|
37459
|
+
legend: "used %"
|
|
37460
|
+
},
|
|
37461
|
+
{ expr: "system_cpu_load_average_1m", legend: "load 1m" },
|
|
37462
|
+
{ expr: "system_cpu_load_average_5m", legend: "load 5m" },
|
|
37463
|
+
{ expr: "system_cpu_load_average_15m", legend: "load 15m" }
|
|
37464
|
+
], "", "short"),
|
|
37465
|
+
panelTimeseries("Host memory by state", { x: 12, y: 4, w: 12, h: 8 }, "sum by (state) (system_memory_usage_bytes)", "{{state}}", "bytes"),
|
|
37466
|
+
panelTimeseries("Filesystem usage by mount", { x: 0, y: 12, w: 12, h: 8 }, '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)', "{{mountpoint}}", "percent"),
|
|
37467
|
+
panelTimeseries("Disk I/O", { x: 12, y: 12, w: 12, h: 8 }, "sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))", "{{device}} {{direction}}", "Bps"),
|
|
37468
|
+
panelTimeseries("Network I/O", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))', "{{device}} {{direction}}", "Bps"),
|
|
37469
|
+
panelTimeseries("Container restarts (24h)", { x: 12, y: 20, w: 12, h: 8 }, "sum by (container_name) (increase(container_restarts_total[24h]))", "{{container_name}}", "short"),
|
|
37470
|
+
panelTimeseries("Container CPU", { x: 0, y: 28, w: 12, h: 8 }, "container_cpu_utilization_ratio", "{{container_name}}", "percent"),
|
|
37471
|
+
panelTimeseries("Container memory", { x: 12, y: 28, w: 12, h: 8 }, "container_memory_usage_total_bytes", "{{container_name}}", "bytes"),
|
|
37472
|
+
panelTimeseries("Container network RX", { x: 0, y: 36, w: 12, h: 8 }, "sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))", "{{container_name}}", "Bps"),
|
|
37473
|
+
panelTimeseries("Container network TX", { x: 12, y: 36, w: 12, h: 8 }, "sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))", "{{container_name}}", "Bps")
|
|
37474
|
+
]
|
|
37475
|
+
};
|
|
37476
|
+
return JSON.stringify(dashboard, null, 2);
|
|
37477
|
+
}
|
|
37478
|
+
function generateArcEdgeDashboard() {
|
|
37479
|
+
const dashboard = {
|
|
37480
|
+
title: "Arc Caddy / Edge",
|
|
37481
|
+
uid: "arc-edge",
|
|
37482
|
+
schemaVersion: 39,
|
|
37483
|
+
version: 1,
|
|
37484
|
+
refresh: "30s",
|
|
37485
|
+
time: { from: "now-1h", to: "now" },
|
|
37486
|
+
tags: ["arc", "auto-provisioned"],
|
|
37487
|
+
panels: [
|
|
37488
|
+
panelStat("Requests/s", { x: 0, y: 0, w: 6, h: 4 }, "sum(rate(caddy_http_request_duration_seconds_count[1m]))", "reqps"),
|
|
37489
|
+
panelStat("In-flight requests", { x: 6, y: 0, w: 6, h: 4 }, "sum(caddy_http_requests_in_flight)", "short"),
|
|
37490
|
+
panelStat("Handler errors/s", { x: 12, y: 0, w: 6, h: 4 }, "sum(rate(caddy_http_request_errors_total[5m]))", "ops", { orange: 0.1, red: 1 }),
|
|
37491
|
+
panelStat("P95 latency", { x: 18, y: 0, w: 6, h: 4 }, "histogram_quantile(0.95, sum by (le) (rate(caddy_http_request_duration_seconds_bucket[5m])))", "s", { orange: 0.3, red: 1 }),
|
|
37492
|
+
panelTimeseries("Request rate by host", { x: 0, y: 4, w: 12, h: 8 }, "sum by (host) (rate(caddy_http_request_duration_seconds_count[1m]))", "{{host}}", "reqps"),
|
|
37493
|
+
panelTimeseries("Responses by status code", { x: 12, y: 4, w: 12, h: 8 }, "sum by (code) (rate(caddy_http_request_duration_seconds_count[1m]))", "{{code}}", "reqps"),
|
|
37494
|
+
panelTimeseries("P95 latency by host", { x: 0, y: 12, w: 12, h: 8 }, "histogram_quantile(0.95, sum by (host, le) (rate(caddy_http_request_duration_seconds_bucket[5m])))", "{{host}}", "s"),
|
|
37495
|
+
panelTimeseries("4xx/5xx responses (access log)", { x: 12, y: 12, w: 12, h: 8 }, 'sum by (status) (count_over_time({compose_service="caddy"} | json | status >= 400 [$__interval]))', "{{status}}", "short", LOKI_DS),
|
|
37496
|
+
{
|
|
37497
|
+
title: "Access log (live)",
|
|
37498
|
+
type: "logs",
|
|
37499
|
+
gridPos: { x: 0, y: 20, w: 24, h: 12 },
|
|
37500
|
+
datasource: { type: "loki", uid: "loki" },
|
|
37501
|
+
targets: [
|
|
37502
|
+
{
|
|
37503
|
+
expr: '{compose_service="caddy"}',
|
|
37504
|
+
refId: "A"
|
|
37505
|
+
}
|
|
37506
|
+
],
|
|
37507
|
+
options: {
|
|
37508
|
+
showTime: true,
|
|
37509
|
+
showLabels: false,
|
|
37510
|
+
showCommonLabels: false,
|
|
37511
|
+
wrapLogMessage: true,
|
|
37512
|
+
enableLogDetails: true,
|
|
37513
|
+
dedupStrategy: "none",
|
|
37514
|
+
sortOrder: "Descending"
|
|
37515
|
+
}
|
|
37516
|
+
}
|
|
37517
|
+
]
|
|
37518
|
+
};
|
|
37519
|
+
return JSON.stringify(dashboard, null, 2);
|
|
37520
|
+
}
|
|
36992
37521
|
function generateObservabilityConfigs(cfg) {
|
|
36993
37522
|
return {
|
|
36994
37523
|
"observability/otel-collector-config.yaml": generateOtelCollectorConfig(cfg),
|
|
36995
37524
|
"observability/tempo.yaml": generateTempoConfig(cfg),
|
|
36996
37525
|
"observability/loki-config.yaml": generateLokiConfig(cfg),
|
|
36997
37526
|
"observability/prometheus.yml": generatePrometheusConfig(cfg),
|
|
37527
|
+
"observability/alloy-config.alloy": generateAlloyConfig(),
|
|
36998
37528
|
"observability/grafana-datasources.yaml": generateGrafanaDatasources(),
|
|
36999
37529
|
"observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
|
|
37530
|
+
"observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
|
|
37000
37531
|
"observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
|
|
37001
37532
|
"observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
|
|
37002
37533
|
"observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
|
|
37003
37534
|
"observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
|
|
37004
37535
|
"observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
|
|
37005
|
-
"observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard()
|
|
37536
|
+
"observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
|
|
37537
|
+
"observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
|
|
37538
|
+
"observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard()
|
|
37006
37539
|
};
|
|
37007
37540
|
}
|
|
37008
|
-
|
|
37541
|
+
var PROMETHEUS_DS = { type: "prometheus", uid: "prometheus" };
|
|
37542
|
+
var LOKI_DS = { type: "loki", uid: "loki" };
|
|
37543
|
+
function panelStat(title, gridPos, expr, unit, thresholds, datasource = PROMETHEUS_DS) {
|
|
37009
37544
|
const steps = [
|
|
37010
37545
|
{ color: "green", value: null }
|
|
37011
37546
|
];
|
|
@@ -37019,7 +37554,7 @@ function panelStat(title, gridPos, expr, unit, thresholds) {
|
|
|
37019
37554
|
title,
|
|
37020
37555
|
type: "stat",
|
|
37021
37556
|
gridPos,
|
|
37022
|
-
datasource
|
|
37557
|
+
datasource,
|
|
37023
37558
|
targets: [{ expr, refId: "A", legendFormat: title }],
|
|
37024
37559
|
fieldConfig: {
|
|
37025
37560
|
defaults: {
|
|
@@ -37037,7 +37572,7 @@ function panelStat(title, gridPos, expr, unit, thresholds) {
|
|
|
37037
37572
|
}
|
|
37038
37573
|
};
|
|
37039
37574
|
}
|
|
37040
|
-
function panelTimeseries(title, gridPos, query, legend, unit) {
|
|
37575
|
+
function panelTimeseries(title, gridPos, query, legend, unit, datasource = PROMETHEUS_DS) {
|
|
37041
37576
|
const targets = Array.isArray(query) ? query.map((q, i) => ({
|
|
37042
37577
|
expr: q.expr,
|
|
37043
37578
|
refId: String.fromCharCode(65 + i),
|
|
@@ -37047,7 +37582,7 @@ function panelTimeseries(title, gridPos, query, legend, unit) {
|
|
|
37047
37582
|
title,
|
|
37048
37583
|
type: "timeseries",
|
|
37049
37584
|
gridPos,
|
|
37050
|
-
datasource
|
|
37585
|
+
datasource,
|
|
37051
37586
|
targets,
|
|
37052
37587
|
fieldConfig: {
|
|
37053
37588
|
defaults: {
|
|
@@ -37378,11 +37913,16 @@ function validateDeployConfig(input) {
|
|
|
37378
37913
|
metrics: optionalString(retentionRaw, "observability.retention.metrics")
|
|
37379
37914
|
};
|
|
37380
37915
|
}
|
|
37916
|
+
const alertWebhookUrl = optionalString(observabilityRaw, "observability.alertWebhookUrl");
|
|
37917
|
+
if (alertWebhookUrl !== undefined && !/^https?:\/\/.+/.test(alertWebhookUrl)) {
|
|
37918
|
+
throw new Error(`deploy.arc.json: observability.alertWebhookUrl must be an http(s) URL (got "${alertWebhookUrl}")`);
|
|
37919
|
+
}
|
|
37381
37920
|
validated.observability = {
|
|
37382
37921
|
enabled: enabledRaw,
|
|
37383
37922
|
subdomain: optionalString(observabilityRaw, "observability.subdomain") ?? "observability",
|
|
37384
37923
|
adminPasswordEnv: optionalString(observabilityRaw, "observability.adminPasswordEnv") ?? "ARC_OBSERVABILITY_PASSWORD",
|
|
37385
|
-
retention
|
|
37924
|
+
retention,
|
|
37925
|
+
alertWebhookUrl
|
|
37386
37926
|
};
|
|
37387
37927
|
}
|
|
37388
37928
|
const provision = input.provision;
|
|
@@ -37678,14 +38218,14 @@ async function bootstrap(inputs) {
|
|
|
37678
38218
|
});
|
|
37679
38219
|
ok("Host bootstrapped");
|
|
37680
38220
|
}
|
|
37681
|
-
const needUpStack = state.kind !== "ready" || state.marker === null || state.marker.configHash !== inputs.configHash || !await isRegistryRunning(cfg);
|
|
38221
|
+
const needUpStack = state.kind !== "ready" || state.marker === null || state.marker.configHash !== inputs.configHash || state.marker.cliVersion !== inputs.cliVersion || !await isRegistryRunning(cfg);
|
|
37682
38222
|
if (needUpStack) {
|
|
37683
38223
|
await upStack(inputs);
|
|
37684
38224
|
ok("Docker stack up");
|
|
37685
38225
|
}
|
|
37686
38226
|
if (cfg.observability?.enabled) {
|
|
37687
38227
|
log2("Ensuring observability sidecars are running...");
|
|
37688
|
-
const obsServices = ["otel-collector", "tempo", "loki", "prometheus", "grafana"];
|
|
38228
|
+
const obsServices = ["otel-collector", "tempo", "loki", "prometheus", "alloy", "grafana"];
|
|
37689
38229
|
await assertExec(cfg.target, `cd ${cfg.target.remoteDir} && docker compose pull --ignore-pull-failures ${obsServices.join(" ")} && docker compose up -d ${obsServices.join(" ")}`);
|
|
37690
38230
|
ok("Observability stack up");
|
|
37691
38231
|
}
|
|
@@ -37752,7 +38292,7 @@ async function upStack(inputs) {
|
|
|
37752
38292
|
await scpUpload(cfg.target, join18(workDir, "docker-compose.yml"), `${cfg.target.remoteDir}/docker-compose.yml`);
|
|
37753
38293
|
await scpUpload(cfg.target, join18(workDir, "htpasswd"), `${cfg.target.remoteDir}/registry-auth/htpasswd`);
|
|
37754
38294
|
if (observabilityFiles && observabilityHtpasswd) {
|
|
37755
|
-
await assertExec(cfg.target, `mkdir -p ${cfg.target.remoteDir}/observability/grafana-dashboards`);
|
|
38295
|
+
await assertExec(cfg.target, `mkdir -p ${cfg.target.remoteDir}/observability/grafana-dashboards ${cfg.target.remoteDir}/observability/grafana-alerting`);
|
|
37756
38296
|
for (const relPath of Object.keys(observabilityFiles)) {
|
|
37757
38297
|
const localDir = dirname9(join18(workDir, relPath));
|
|
37758
38298
|
mkdirSync12(localDir, { recursive: true });
|
|
@@ -39274,7 +39814,7 @@ class ContextHandler {
|
|
|
39274
39814
|
try {
|
|
39275
39815
|
return await this.telemetry.startSpan(`command.${commandName}`, runCommand, { attributes: baseAttrs });
|
|
39276
39816
|
} finally {
|
|
39277
|
-
this.telemetry.measureSince("arc.command.
|
|
39817
|
+
this.telemetry.measureSince("arc.command.duration", start, {
|
|
39278
39818
|
"arc.command.name": commandName
|
|
39279
39819
|
});
|
|
39280
39820
|
this.telemetry.incrementCounter("arc.commands.total", 1, {
|
|
@@ -40359,6 +40899,7 @@ async function createArcServer(config) {
|
|
|
40359
40899
|
websocket: {
|
|
40360
40900
|
open(ws) {
|
|
40361
40901
|
connectionManager.addClient(ws);
|
|
40902
|
+
config.telemetry?.addUpDown("arc.ws.active_connections", 1);
|
|
40362
40903
|
},
|
|
40363
40904
|
async message(ws, messageStr) {
|
|
40364
40905
|
const client = connectionManager.getClientByWs(ws);
|
|
@@ -40371,6 +40912,9 @@ async function createArcServer(config) {
|
|
|
40371
40912
|
console.error("Failed to parse WS message:", error);
|
|
40372
40913
|
return;
|
|
40373
40914
|
}
|
|
40915
|
+
config.telemetry?.incrementCounter("arc.ws.messages", 1, {
|
|
40916
|
+
"messaging.message.type": String(message?.type ?? "unknown")
|
|
40917
|
+
});
|
|
40374
40918
|
const dispatch = async () => {
|
|
40375
40919
|
try {
|
|
40376
40920
|
for (const handler of wsHandlers) {
|
|
@@ -40408,6 +40952,7 @@ async function createArcServer(config) {
|
|
|
40408
40952
|
cleanupClientSubs(client.id);
|
|
40409
40953
|
config.onWsClose?.(client.id);
|
|
40410
40954
|
connectionManager.removeClient(client.id);
|
|
40955
|
+
config.telemetry?.addUpDown("arc.ws.active_connections", -1);
|
|
40411
40956
|
}
|
|
40412
40957
|
}
|
|
40413
40958
|
}
|
|
@@ -40726,7 +41271,8 @@ async function startPlatformServer(opts) {
|
|
|
40726
41271
|
endpoint: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
|
|
40727
41272
|
mode: devMode ? "development" : "production",
|
|
40728
41273
|
sampleRate: devMode ? 1 : 1,
|
|
40729
|
-
debug: process.env.ARC_OTEL_DEBUG === "true"
|
|
41274
|
+
debug: process.env.ARC_OTEL_DEBUG === "true",
|
|
41275
|
+
patchConsole: process.env.ARC_OTEL_PATCH_CONSOLE !== "false"
|
|
40730
41276
|
});
|
|
40731
41277
|
telemetry = init2.telemetry;
|
|
40732
41278
|
telemetryShutdown = init2.shutdown;
|