@arcote.tech/arc-cli 0.7.19 → 0.7.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -853,7 +853,7 @@ Expecting one of '${allowedValues.join("', '")}'`);
853
853
  this._exitCallback = (err) => {
854
854
  if (err.code !== "commander.executeSubCommandAsync") {
855
855
  throw err;
856
- } else {}
856
+ }
857
857
  };
858
858
  }
859
859
  return this;
@@ -25870,6 +25870,7 @@ import {
25870
25870
  ATTR_SERVICE_NAME,
25871
25871
  ATTR_SERVICE_VERSION
25872
25872
  } from "@opentelemetry/semantic-conventions/incubating";
25873
+ import { format as format2 } from "util";
25873
25874
  import {
25874
25875
  context,
25875
25876
  propagation,
@@ -25880,6 +25881,41 @@ import {
25880
25881
  logs,
25881
25882
  SeverityNumber
25882
25883
  } from "@opentelemetry/api-logs";
25884
/**
 * Mirrors console output into telemetry logs.
 *
 * Every method listed in METHODS is replaced by a wrapper that first calls the
 * method that was installed before patching (so terminal output is unchanged)
 * and then forwards the formatted message to `telemetry.log` at the mapped
 * level. The first `Error` among the arguments, if any, is attached as
 * `exception.*` attributes.
 *
 * Only one patch can be active at a time (module-level `patched` flag).
 *
 * @param {{ active: boolean, log: Function }} telemetry - telemetry sink.
 * @returns {() => void} Restore function. It reinstalls the exact functions
 *   that were on `console` before patching and is a no-op when nothing was
 *   patched or when it has already run.
 */
function patchConsole(telemetry) {
  if (patched || !telemetry.active)
    return () => {};
  patched = true;
  const originals = [];
  for (const [method, level] of METHODS) {
    // Keep the raw function (not a bound copy) so restore puts back the very
    // same reference callers may have captured or compare against.
    const original = console[method];
    originals.push([method, original]);
    console[method] = (...args) => {
      original.apply(console, args);
      // Re-entrancy guard: anything logged while we are emitting (e.g. by the
      // exporter itself) must not be fed back into telemetry.
      if (emitting)
        return;
      emitting = true;
      try {
        const body = format2(...args);
        // Skip the telemetry layer's own diagnostics.
        if (body.startsWith("[arc-otel]"))
          return;
        const error = args.find((arg) => arg instanceof Error);
        telemetry.log(level, body, error ? {
          "exception.type": error.name,
          "exception.message": error.message,
          "exception.stacktrace": error.stack ?? ""
        } : {});
      } catch {
        // Best effort: a telemetry failure must never break console output.
      } finally {
        emitting = false;
      }
    };
  }
  let restored = false;
  return () => {
    // One-shot: a stale handle must not undo a later, independent patch.
    if (restored)
      return;
    restored = true;
    for (const [method, original] of originals) {
      console[method] = original;
    }
    patched = false;
  };
}
25883
25919
  function sanitizeAttrs(input, opts = {}) {
25884
25920
  if (!input)
25885
25921
  return {};
@@ -25939,6 +25975,7 @@ class ArcTelemetry {
25939
25975
  meter = null;
25940
25976
  histograms = new Map;
25941
25977
  counters = new Map;
25978
+ upDownCounters = new Map;
25942
25979
  constructor(config) {
25943
25980
  const mode = config.mode ?? "development";
25944
25981
  const enabled = config.enabled ?? mode !== "disabled";
@@ -26050,6 +26087,18 @@ class ArcTelemetry {
26050
26087
  counter.add(value, attrs);
26051
26088
  } catch {}
26052
26089
  }
26090
+ addUpDown(name, delta, attrs = {}) {
26091
+ if (!this.active || !this.meter)
26092
+ return;
26093
+ let counter = this.upDownCounters.get(name);
26094
+ if (!counter) {
26095
+ counter = this.meter.createUpDownCounter(name);
26096
+ this.upDownCounters.set(name, counter);
26097
+ }
26098
+ try {
26099
+ counter.add(delta, attrs);
26100
+ } catch {}
26101
+ }
26053
26102
  recordHistogram(name, value, attrs = {}) {
26054
26103
  if (!this.active || !this.meter)
26055
26104
  return;
@@ -26097,53 +26146,31 @@ function noopSpan() {
26097
26146
  function wrapDbAdapter(adapter, telemetry, dbSystem) {
26098
26147
  if (!telemetry || !telemetry.active)
26099
26148
  return adapter;
26149
+ const dbAttrs = (operation, store) => ({
26150
+ "db.system": dbSystem,
26151
+ "db.operation.name": operation,
26152
+ ...store ? { "db.collection.name": store } : {}
26153
+ });
26154
+ const measureOp = async (operation, store, fn) => {
26155
+ const start = Date.now();
26156
+ try {
26157
+ return await fn();
26158
+ } finally {
26159
+ telemetry.measureSince("arc.db.operation.duration", start, dbAttrs(operation, store));
26160
+ }
26161
+ };
26100
26162
  const wrapRead = (tx) => ({
26101
- find: async (store, options) => telemetry.startSpan(`db.find ${store}`, async (span) => {
26102
- const start = Date.now();
26103
- try {
26104
- const rows = await tx.find(store, options);
26105
- span.setAttribute("db.response.row_count", rows.length);
26106
- return rows;
26107
- } finally {
26108
- telemetry.measureSince("arc.db.find_ms", start, {
26109
- "db.system": dbSystem,
26110
- "db.collection.name": store
26111
- });
26112
- }
26113
- }, {
26114
- kind: 3,
26115
- attributes: {
26116
- "db.system": dbSystem,
26117
- "db.operation.name": "find",
26118
- "db.collection.name": store
26119
- }
26120
- })
26163
+ find: async (store, options) => telemetry.startSpan(`db.find ${store}`, async (span) => measureOp("find", store, async () => {
26164
+ const rows = await tx.find(store, options);
26165
+ span.setAttribute("db.response.row_count", rows.length);
26166
+ return rows;
26167
+ }), { kind: 3, attributes: dbAttrs("find", store) })
26121
26168
  });
26122
26169
  const wrapReadWrite = (tx) => ({
26123
26170
  ...wrapRead(tx),
26124
- set: async (store, data) => telemetry.startSpan(`db.set ${store}`, () => tx.set(store, data), {
26125
- kind: 3,
26126
- attributes: {
26127
- "db.system": dbSystem,
26128
- "db.operation.name": "set",
26129
- "db.collection.name": store
26130
- }
26131
- }),
26132
- remove: async (store, id3) => telemetry.startSpan(`db.remove ${store}`, () => tx.remove(store, id3), {
26133
- kind: 3,
26134
- attributes: {
26135
- "db.system": dbSystem,
26136
- "db.operation.name": "remove",
26137
- "db.collection.name": store
26138
- }
26139
- }),
26140
- commit: async () => telemetry.startSpan("db.commit", () => tx.commit(), {
26141
- kind: 3,
26142
- attributes: {
26143
- "db.system": dbSystem,
26144
- "db.operation.name": "commit"
26145
- }
26146
- })
26171
+ set: async (store, data) => telemetry.startSpan(`db.set ${store}`, () => measureOp("set", store, () => tx.set(store, data)), { kind: 3, attributes: dbAttrs("set", store) }),
26172
+ remove: async (store, id3) => telemetry.startSpan(`db.remove ${store}`, () => measureOp("remove", store, () => tx.remove(store, id3)), { kind: 3, attributes: dbAttrs("remove", store) }),
26173
+ commit: async () => telemetry.startSpan("db.commit", () => measureOp("commit", undefined, () => tx.commit()), { kind: 3, attributes: dbAttrs("commit") })
26147
26174
  });
26148
26175
  return new Proxy(adapter, {
26149
26176
  get(target, prop) {
@@ -26206,6 +26233,7 @@ function initServerTelemetry(config) {
26206
26233
  logger: loggerProvider.getLogger(config.serviceName),
26207
26234
  meter: meterProvider.getMeter(config.serviceName)
26208
26235
  });
26236
+ const restoreConsole = config.patchConsole !== false ? patchConsole(telemetry) : () => {};
26209
26237
  if (telemetry.config.debug) {
26210
26238
  console.log("[arc-otel] server init", {
26211
26239
  serviceName: config.serviceName,
@@ -26216,6 +26244,7 @@ function initServerTelemetry(config) {
26216
26244
  });
26217
26245
  }
26218
26246
  const shutdown = async () => {
26247
+ restoreConsole();
26219
26248
  try {
26220
26249
  await Promise.all([
26221
26250
  tracerProvider.shutdown(),
@@ -26228,8 +26257,15 @@ function initServerTelemetry(config) {
26228
26257
  };
26229
26258
  return { telemetry, shutdown };
26230
26259
  }
26231
- var DEFAULT_REDACT_KEY_PATTERN, DEFAULT_MAX_STRING_LEN = 2048, DEFAULT_MAX_JSON_LEN = 4096;
26260
+ var METHODS, patched = false, emitting = false, DEFAULT_REDACT_KEY_PATTERN, DEFAULT_MAX_STRING_LEN = 2048, DEFAULT_MAX_JSON_LEN = 4096;
26232
26261
// Lazy module initialiser: assigns the module-level tables when first invoked.
var init_init_server = __esm(() => {
  // console method name -> telemetry log level, as [method, level] pairs.
  METHODS = Object.entries({
    debug: "debug",
    log: "info",
    info: "info",
    warn: "warn",
    error: "error"
  });
  // Case-insensitive match for key names that denote sensitive values.
  DEFAULT_REDACT_KEY_PATTERN = /(password|passwd|token|secret|authorization|jwt|api[_-]?key|cookie|email|credit[_-]?card|ssn)/i;
});
26235
26271
 
@@ -34687,6 +34723,17 @@ function serverExternalsPlugin() {
34687
34723
  }
34688
34724
  };
34689
34725
  }
34726
/**
 * Bundler plugin that redirects bare-specifier imports found in `srcByName`
 * to the mapped path.
 *
 * @param {Map<string, string>} srcByName - specifier -> path to resolve to.
 * @returns {{ name: string, setup: Function }} plugin object.
 */
function workspaceSourcePlugin(srcByName) {
  // Only specifiers that do not start with "." or "/" are considered.
  const bareSpecifier = /^[^./]/;
  const resolveFromWorkspace = (args) => {
    const sourceEntry = srcByName.get(args.path);
    if (!sourceEntry) {
      // Unknown name: return null so no override is supplied.
      return null;
    }
    return { path: sourceEntry, sideEffects: true };
  };
  return {
    name: "workspace-source",
    setup(build2) {
      build2.onResolve({ filter: bareSpecifier }, resolveFromWorkspace);
    }
  };
}
34690
34737
  function jsxDevShimPlugin() {
34691
34738
  return {
34692
34739
  name: "jsx-dev-runtime-shim",
@@ -34707,9 +34754,10 @@ export { Fragment };
34707
34754
  };
34708
34755
  }
34709
34756
  var CONTEXT_CLIENTS = [
34710
- { name: "server", target: "bun", defines: { ONLY_SERVER: "true", ONLY_BROWSER: "false", ONLY_CLIENT: "false" } },
34711
34757
  { name: "browser", target: "browser", defines: { ONLY_SERVER: "false", ONLY_BROWSER: "true", ONLY_CLIENT: "true" } }
34712
34758
  ];
34759
+ var SERVER_DEFINES = { ONLY_SERVER: "true", ONLY_BROWSER: "false", ONLY_CLIENT: "false" };
34760
+ var SERVER_ENTRY_FILE = "_server.js";
34713
34761
  function discoverPackages(rootDir) {
34714
34762
  const rootPkg = JSON.parse(readFileSync7(join8(rootDir, "package.json"), "utf-8"));
34715
34763
  const workspaceGlobs = rootPkg.workspaces ?? [];
@@ -34802,9 +34850,7 @@ async function buildContextClient(pkg, rootDir, client, cache, noCache) {
34802
34850
  console.log(` building: ${pkg.name} (${client.name})`);
34803
34851
  const peerDeps = Object.keys(pkg.packageJson.peerDependencies ?? {});
34804
34852
  const allDeps = pkg.packageJson.dependencies ?? {};
34805
- const isBrowser2 = client.name === "browser";
34806
- const workspaceDeps = isBrowser2 ? Object.keys(allDeps) : Object.entries(allDeps).filter(([, spec]) => !spec.startsWith("workspace:")).map(([name]) => name);
34807
- const externals = [...peerDeps, ...workspaceDeps];
34853
+ const externals = [...peerDeps, ...Object.keys(allDeps)];
34808
34854
  const result = await Bun.build({
34809
34855
  entrypoints: [pkg.entrypoint],
34810
34856
  outdir: join8(outDir, "main"),
@@ -34812,7 +34858,7 @@ async function buildContextClient(pkg, rootDir, client, cache, noCache) {
34812
34858
  format: "esm",
34813
34859
  naming: "index.[ext]",
34814
34860
  external: externals,
34815
- plugins: isBrowser2 ? [jsxDevShimPlugin()] : [jsxDevShimPlugin(), serverExternalsPlugin()],
34861
+ plugins: [jsxDevShimPlugin()],
34816
34862
  define: client.defines
34817
34863
  });
34818
34864
  if (!result.success) {
@@ -34875,6 +34921,80 @@ async function buildContextPackages(rootDir, packages, cache, noCache) {
34875
34921
  }
34876
34922
  return { declarationErrors };
34877
34923
  }
34924
/**
 * Bundles all context packages into a single server app inside `serverDir`.
 *
 * Steps: compute the external list and an input hash, return early on a cache
 * hit, otherwise delete stale `.js` outputs, write a temporary entry file that
 * imports every context package, run `Bun.build`, verify the entry output name
 * and record the output hash in the cache.
 *
 * @param rootDir    workspace root (not used by this function).
 * @param serverDir  output directory for the bundle.
 * @param packages   all workspace packages.
 * @param cache      build cache handle passed to isCacheHit / updateCache.
 * @param noCache    when truthy, the cache lookup is skipped.
 * @returns {Promise<{ entryFile: string, cached: boolean }>}
 * @throws {Error} when the build fails or the entry output is missing/misnamed.
 */
async function buildServerApp(rootDir, serverDir, packages, cache, noCache) {
  const contextPkgs = packages.filter((pkg) => isContextPackage(pkg.packageJson));
  mkdirSync6(serverDir, { recursive: true });

  // Package name -> source entrypoint, handed to workspaceSourcePlugin below.
  const srcByName = new Map();
  for (const pkg of packages) {
    srcByName.set(pkg.name, pkg.entrypoint);
  }

  // Externals: framework peers, every peer dependency, and every dependency
  // whose spec is not a "workspace:" reference.
  const externalNames = new Set(FRAMEWORK_PEERS);
  for (const { packageJson } of packages) {
    Object.keys(packageJson.peerDependencies ?? {}).forEach((dep) => externalNames.add(dep));
    Object.entries(packageJson.dependencies ?? {})
      .filter(([, spec]) => !spec.startsWith("workspace:"))
      .forEach(([dep]) => externalNames.add(dep));
  }
  const external = Array.from(externalNames);

  const unitId = "server-app";
  const memberHashes = packages.map((pkg) => ({ name: pkg.name, src: pkgSourceHash(pkg) }));
  memberHashes.sort((left, right) => left.name.localeCompare(right.name));
  const contextNames = contextPkgs.map((pkg) => pkg.name);
  contextNames.sort();
  const inputHash = sha256OfJson({
    members: memberHashes,
    contexts: contextNames,
    external: external.slice().sort(),
    defines: SERVER_DEFINES
  });

  const entryFileAbs = join8(serverDir, SERVER_ENTRY_FILE);
  const cacheUsable = !noCache && isCacheHit(cache, unitId, inputHash, [entryFileAbs]);
  if (cacheUsable) {
    console.log(` \u2713 cached: ${unitId}`);
    return { entryFile: SERVER_ENTRY_FILE, cached: true };
  }

  console.log(` building: ${unitId} (${contextPkgs.length} server modules)`);

  // Drop previous top-level .js outputs before rebuilding.
  readdirSync4(serverDir)
    .filter((file) => file.endsWith(".js"))
    .forEach((file) => rmSync(join8(serverDir, file), { force: true }));

  // Temporary entry: one side-effect import per context package.
  const tmpDir = join8(serverDir, "_entries");
  mkdirSync6(tmpDir, { recursive: true });
  const entrySrc = join8(tmpDir, SERVER_ENTRY_FILE.replace(/\.js$/, ".ts"));
  const importLines = contextPkgs.map((pkg) => `import "${pkg.name}";`);
  writeFileSync6(entrySrc, importLines.join("\n") + "\n");

  let result;
  try {
    result = await Bun.build({
      entrypoints: [entrySrc],
      outdir: serverDir,
      target: "bun",
      format: "esm",
      splitting: true,
      naming: "[name].[ext]",
      external,
      plugins: [
        jsxDevShimPlugin(),
        serverExternalsPlugin(),
        workspaceSourcePlugin(srcByName)
      ],
      define: SERVER_DEFINES
    });
  } finally {
    // The temporary entry directory is removed whether or not the build threw.
    rmSync(tmpDir, { recursive: true, force: true });
  }

  if (!result.success) {
    for (const buildLog of result.logs) {
      console.error(buildLog);
    }
    throw new Error("Server app build failed");
  }

  const entryOut = result.outputs.find((output) => output.kind === "entry-point");
  if (!entryOut) {
    throw new Error("Server app build: entry not found in outputs");
  }
  const producedName = basename2(entryOut.path);
  if (producedName !== SERVER_ENTRY_FILE) {
    throw new Error(`Server app build: unexpected entry name ${basename2(entryOut.path)} (wanted ${SERVER_ENTRY_FILE})`);
  }

  updateCache(cache, unitId, inputHash, { outputHash: sha256OfDir(serverDir) });
  return { entryFile: SERVER_ENTRY_FILE, cached: false };
}
34878
34998
  async function buildBrowserApp(rootDir, outDir, plan, cache, noCache, i18nCollector) {
34879
34999
  mkdirSync6(outDir, { recursive: true });
34880
35000
  const publicMembers = plan.groups.get("public") ?? [];
@@ -35180,11 +35300,8 @@ import {
35180
35300
  writeFileSync as writeFileSync7
35181
35301
  } from "fs";
35182
35302
  import { join as join9 } from "path";
35183
- async function extractAccessMap(rootDir, packages) {
35184
- const serverBundles = packages.filter((p) => isContextPackage(p.packageJson)).map((p) => ({
35185
- name: p.name,
35186
- path: join9(p.path, "dist", "server", "main", "index.js")
35187
- })).filter((b) => existsSync8(b.path));
35303
+ async function extractAccessMap(rootDir, serverBundlePath) {
35304
+ const serverBundles = existsSync8(serverBundlePath) ? [{ name: "server", path: serverBundlePath }] : [];
35188
35305
  const workerDir = join9(rootDir, ".arc", ".tmp");
35189
35306
  mkdirSync7(workerDir, { recursive: true });
35190
35307
  const workerPath = join9(workerDir, `access-extractor-${Date.now()}.mjs`);
@@ -35513,8 +35630,9 @@ async function buildAll(ws, opts = {}) {
35513
35630
  log2(`Building (concurrency parallel${noCache ? ", no-cache" : ""})...`);
35514
35631
  assertOneModulePerPackage(ws.packages);
35515
35632
  await buildContextPackages(ws.rootDir, ws.packages, cache, noCache);
35516
- copyContextServerBundles(ws);
35517
- const accessMap = await extractAccessMap(ws.rootDir, ws.packages);
35633
+ const serverDir = join12(ws.arcDir, "server");
35634
+ const { entryFile: serverEntry } = await buildServerApp(ws.rootDir, serverDir, ws.packages, cache, noCache);
35635
+ const accessMap = await extractAccessMap(ws.rootDir, join12(serverDir, serverEntry));
35518
35636
  mkdirSync9(ws.arcDir, { recursive: true });
35519
35637
  writeFileSync9(join12(ws.arcDir, "access.json"), JSON.stringify(accessMap, null, 2) + `
35520
35638
  `);
@@ -35545,22 +35663,6 @@ function assembleManifest(ws, browser, cache) {
35545
35663
  buildTime: new Date().toISOString()
35546
35664
  };
35547
35665
  }
35548
- function copyContextServerBundles(ws) {
35549
- const outDir = join12(ws.arcDir, "server");
35550
- mkdirSync9(outDir, { recursive: true });
35551
- for (const pkg of ws.packages) {
35552
- if (!isContextPackage(pkg.packageJson))
35553
- continue;
35554
- const src = join12(pkg.path, "dist", "server", "main", "index.js");
35555
- if (!existsSync10(src)) {
35556
- err(`Server bundle missing for ${pkg.name}: ${src}`);
35557
- continue;
35558
- }
35559
- const safeName = pkg.path.split("/").pop();
35560
- const dst = join12(outDir, `${safeName}.js`);
35561
- copyFileSync(src, dst);
35562
- }
35563
- }
35564
35666
  function resolveAssetSource(from, pkgDir, rootDir) {
35565
35667
  if (from.startsWith("./") || from.startsWith("../")) {
35566
35668
  const resolved = join12(pkgDir, from);
@@ -35656,34 +35758,15 @@ async function loadServerContext(ws) {
35656
35758
  const platformPkg = JSON.parse(readFileSync11(join12(platformDir, "package.json"), "utf-8"));
35657
35759
  const platformEntry = join12(platformDir, platformPkg.main ?? "src/index.ts");
35658
35760
  await import(platformEntry);
35659
- const serverDir = join12(ws.arcDir, "server");
35660
- const bundles = existsSync10(serverDir) ? readdirSync6(serverDir).filter((f) => f.endsWith(".js")) : [];
35661
- if (bundles.length > 0) {
35662
- for (const file of bundles) {
35663
- const bundlePath = join12(serverDir, file);
35664
- try {
35665
- await import(bundlePath);
35666
- } catch (e) {
35667
- err(`Failed to load server bundle ${file}: ${e}`);
35668
- }
35669
- }
35670
- } else if (ws.packages.length > 0) {
35671
- const ctxPackages = ws.packages.filter((p) => isContextPackage(p.packageJson));
35672
- for (const ctx of ctxPackages) {
35673
- const serverDist = join12(ctx.path, "dist", "server", "main", "index.js");
35674
- if (!existsSync10(serverDist)) {
35675
- err(`Context server dist not found: ${serverDist}`);
35676
- continue;
35677
- }
35678
- try {
35679
- await import(serverDist);
35680
- } catch (e) {
35681
- err(`Failed to load server context from ${ctx.name}: ${e}`);
35682
- }
35683
- }
35684
- } else {
35761
+ const serverEntry = join12(ws.arcDir, "server", SERVER_ENTRY_FILE);
35762
+ if (!existsSync10(serverEntry)) {
35685
35763
  return { context: null, moduleAccess: new Map };
35686
35764
  }
35765
+ try {
35766
+ await import(serverEntry);
35767
+ } catch (e) {
35768
+ err(`Failed to load server bundle ${SERVER_ENTRY_FILE}: ${e}`);
35769
+ }
35687
35770
  const { getContext, getAllModuleAccess } = await import(platformEntry);
35688
35771
  return {
35689
35772
  context: getContext() ?? null,
@@ -36054,18 +36137,32 @@ function generateCaddyfile(cfg) {
36054
36137
  email ${cfg.caddy.email}`;
36055
36138
  const tlsDirective = cfg.caddy.email === "internal" ? `
36056
36139
  tls internal` : "";
36140
+ const observability = cfg.observability?.enabled === true;
36141
+ const logDirective = observability ? [" log {", " output stdout", " format json", " }"] : [];
36057
36142
  const lines = [];
36058
36143
  lines.push("# Generated by `arc platform deploy` \u2014 do not edit by hand.");
36059
36144
  lines.push("");
36060
36145
  lines.push("{");
36061
36146
  lines.push(" admin off");
36147
+ if (observability) {
36148
+ lines.push(" metrics {");
36149
+ lines.push(" per_host");
36150
+ lines.push(" }");
36151
+ }
36062
36152
  if (email)
36063
36153
  lines.push(` ${email.trim()}`);
36064
36154
  lines.push("}");
36065
36155
  lines.push("");
36156
+ if (observability) {
36157
+ lines.push(":2020 {");
36158
+ lines.push(" metrics");
36159
+ lines.push("}");
36160
+ lines.push("");
36161
+ }
36066
36162
  for (const [name, env2] of Object.entries(cfg.envs)) {
36067
36163
  lines.push(`${env2.domain} {${tlsDirective}`);
36068
- if (cfg.observability?.enabled) {
36164
+ lines.push(...logDirective);
36165
+ if (observability) {
36069
36166
  lines.push(" handle_path /otel/* {");
36070
36167
  lines.push(" reverse_proxy otel-collector:4318");
36071
36168
  lines.push(" }");
@@ -36078,13 +36175,11 @@ function generateCaddyfile(cfg) {
36078
36175
  lines.push("}");
36079
36176
  lines.push("");
36080
36177
  }
36081
- if (cfg.observability?.enabled) {
36082
- const firstEnv = Object.values(cfg.envs)[0];
36083
- if (firstEnv) {
36084
- const subdomain = cfg.observability.subdomain ?? "observability";
36085
- const apex = apexOf(firstEnv.domain);
36086
- const observabilityDomain = `${subdomain}.${apex}`;
36087
- lines.push(`${observabilityDomain} {${tlsDirective}`);
36178
+ if (observability) {
36179
+ const domain = observabilityDomain(cfg);
36180
+ if (domain) {
36181
+ lines.push(`${domain} {${tlsDirective}`);
36182
+ lines.push(...logDirective);
36088
36183
  lines.push(" basic_auth {");
36089
36184
  lines.push(" import /etc/caddy/observability-htpasswd");
36090
36185
  lines.push(" }");
@@ -36094,6 +36189,7 @@ function generateCaddyfile(cfg) {
36094
36189
  }
36095
36190
  }
36096
36191
  lines.push(`${cfg.registry.domain} {${tlsDirective}`);
36192
+ lines.push(...logDirective);
36097
36193
  lines.push(" reverse_proxy registry:5000 {");
36098
36194
  lines.push(" header_up Host {host}");
36099
36195
  lines.push(" }");
@@ -36105,6 +36201,15 @@ function generateCaddyfile(cfg) {
36105
36201
  `) + `
36106
36202
  `;
36107
36203
  }
36204
/**
 * Derives the hostname of the observability site.
 *
 * @param cfg deploy configuration (`observability`, `envs`).
 * @returns {string|null} `<subdomain>.<apex of the first env's domain>`, or
 *   null when observability is not enabled or no environment is configured.
 *   The subdomain defaults to "observability".
 */
function observabilityDomain(cfg) {
  const settings = cfg.observability;
  if (!settings?.enabled) {
    return null;
  }
  const [primaryEnv] = Object.values(cfg.envs);
  if (!primaryEnv) {
    return null;
  }
  const label = settings.subdomain ?? "observability";
  const apex = apexOf(primaryEnv.domain);
  return `${label}.${apex}`;
}
36108
36213
  function apexOf(host) {
36109
36214
  const parts = host.split(".");
36110
36215
  if (parts.length <= 2)
@@ -36113,6 +36218,13 @@ function apexOf(host) {
36113
36218
  }
36114
36219
 
36115
36220
  // src/deploy/compose.ts
36221
/**
 * Appends the compose `logging:` stanza (json-file driver with max-size "10m"
 * and max-file "3") to `lines`.
 *
 * @param {string[]} lines - compose output being assembled; mutated in place.
 */
function pushLogging(lines) {
  const stanza = [
    " logging:",
    " driver: json-file",
    " options:",
    ' max-size: "10m"',
    ' max-file: "3"'
  ];
  for (const entry of stanza) {
    lines.push(entry);
  }
}
36116
36228
  function generateCompose({ cfg }) {
36117
36229
  const lines = [];
36118
36230
  lines.push("# Generated by `arc platform deploy` \u2014 do not edit by hand.");
@@ -36121,6 +36233,7 @@ function generateCompose({ cfg }) {
36121
36233
  lines.push(" caddy:");
36122
36234
  lines.push(" image: caddy:2-alpine");
36123
36235
  lines.push(" restart: unless-stopped");
36236
+ pushLogging(lines);
36124
36237
  lines.push(" ports:");
36125
36238
  lines.push(' - "80:80"');
36126
36239
  lines.push(' - "443:443"');
@@ -36133,10 +36246,15 @@ function generateCompose({ cfg }) {
36133
36246
  lines.push(" - caddy_config:/config");
36134
36247
  lines.push(" networks:");
36135
36248
  lines.push(" - arc-net");
36249
+ if (cfg.observability?.enabled) {
36250
+ lines.push(" expose:");
36251
+ lines.push(' - "2020" # Prometheus metrics endpoint (Caddyfile :2020 site)');
36252
+ }
36136
36253
  lines.push("");
36137
36254
  lines.push(" registry:");
36138
36255
  lines.push(" image: registry:2");
36139
36256
  lines.push(" restart: unless-stopped");
36257
+ pushLogging(lines);
36140
36258
  lines.push(" volumes:");
36141
36259
  lines.push(" - registry_data:/var/lib/registry");
36142
36260
  lines.push(" - ./registry-auth/htpasswd:/auth/htpasswd:ro");
@@ -36157,6 +36275,13 @@ function generateCompose({ cfg }) {
36157
36275
  lines.push(` image: \${ARC_IMAGE_${upperName}:-arc-${name}:not-deployed}`);
36158
36276
  lines.push(` container_name: arc-${name}`);
36159
36277
  lines.push(" restart: unless-stopped");
36278
+ pushLogging(lines);
36279
+ lines.push(" healthcheck:");
36280
+ lines.push(' test: ["CMD", "wget", "-qO-", "http://127.0.0.1:5005/health"]');
36281
+ lines.push(" interval: 30s");
36282
+ lines.push(" timeout: 5s");
36283
+ lines.push(" retries: 3");
36284
+ lines.push(" start_period: 20s");
36160
36285
  if (usePostgres) {
36161
36286
  lines.push(" depends_on:");
36162
36287
  lines.push(` arc-db-${name}:`);
@@ -36209,6 +36334,7 @@ function generateCompose({ cfg }) {
36209
36334
  lines.push(` image: ${image2}`);
36210
36335
  lines.push(` container_name: arc-db-${name}`);
36211
36336
  lines.push(" restart: unless-stopped");
36337
+ pushLogging(lines);
36212
36338
  lines.push(" environment:");
36213
36339
  lines.push(" POSTGRES_USER: arc");
36214
36340
  lines.push(" POSTGRES_DB: arc");
@@ -36233,9 +36359,13 @@ function generateCompose({ cfg }) {
36233
36359
  lines.push(" image: otel/opentelemetry-collector-contrib:0.114.0");
36234
36360
  lines.push(" container_name: arc-otel-collector");
36235
36361
  lines.push(" restart: unless-stopped");
36362
+ pushLogging(lines);
36363
+ lines.push(' user: "0:0"');
36236
36364
  lines.push(' command: ["--config=/etc/otelcol-contrib/config.yaml"]');
36237
36365
  lines.push(" volumes:");
36238
36366
  lines.push(" - ./observability/otel-collector-config.yaml:/etc/otelcol-contrib/config.yaml:ro");
36367
+ lines.push(" - /:/hostfs:ro # hostmetrics root_path");
36368
+ lines.push(" - /var/run/docker.sock:/var/run/docker.sock:ro # docker_stats");
36239
36369
  lines.push(" networks: [arc-net]");
36240
36370
  lines.push(" expose:");
36241
36371
  lines.push(' - "4317" # OTLP gRPC');
@@ -36250,6 +36380,7 @@ function generateCompose({ cfg }) {
36250
36380
  lines.push(" image: grafana/tempo:2.6.1");
36251
36381
  lines.push(" container_name: arc-tempo");
36252
36382
  lines.push(" restart: unless-stopped");
36383
+ pushLogging(lines);
36253
36384
  lines.push(' command: ["-config.file=/etc/tempo.yaml"]');
36254
36385
  lines.push(' user: "0" # tempo writes to /var/tempo, owned by root in the image');
36255
36386
  lines.push(" volumes:");
@@ -36264,6 +36395,7 @@ function generateCompose({ cfg }) {
36264
36395
  lines.push(" image: grafana/loki:3.3.2");
36265
36396
  lines.push(" container_name: arc-loki");
36266
36397
  lines.push(" restart: unless-stopped");
36398
+ pushLogging(lines);
36267
36399
  lines.push(' command: ["-config.file=/etc/loki/local-config.yaml"]');
36268
36400
  lines.push(' user: "0"');
36269
36401
  lines.push(" volumes:");
@@ -36278,6 +36410,7 @@ function generateCompose({ cfg }) {
36278
36410
  lines.push(" image: prom/prometheus:v2.55.1");
36279
36411
  lines.push(" container_name: arc-prometheus");
36280
36412
  lines.push(" restart: unless-stopped");
36413
+ pushLogging(lines);
36281
36414
  lines.push(" command:");
36282
36415
  lines.push(' - "--config.file=/etc/prometheus/prometheus.yml"');
36283
36416
  lines.push(' - "--storage.tsdb.path=/prometheus"');
@@ -36291,20 +36424,47 @@ function generateCompose({ cfg }) {
36291
36424
  lines.push(" expose:");
36292
36425
  lines.push(' - "9090" # HTTP API + remote_write receiver');
36293
36426
  lines.push("");
36427
+ lines.push(" alloy:");
36428
+ lines.push(" image: grafana/alloy:v1.16.1");
36429
+ lines.push(" container_name: arc-alloy");
36430
+ lines.push(" restart: unless-stopped");
36431
+ pushLogging(lines);
36432
+ lines.push(' user: "0" # docker.sock access');
36433
+ lines.push(" command:");
36434
+ lines.push(" - run");
36435
+ lines.push(" - --server.http.listen-addr=0.0.0.0:12345");
36436
+ lines.push(" - --storage.path=/var/lib/alloy/data");
36437
+ lines.push(" - /etc/alloy/config.alloy");
36438
+ lines.push(" volumes:");
36439
+ lines.push(" - ./observability/alloy-config.alloy:/etc/alloy/config.alloy:ro");
36440
+ lines.push(" - /var/run/docker.sock:/var/run/docker.sock:ro");
36441
+ lines.push(" - alloy_data:/var/lib/alloy/data");
36442
+ lines.push(" networks: [arc-net]");
36443
+ lines.push(" expose:");
36444
+ lines.push(' - "12345" # Alloy self-metrics (Prom scrape)');
36445
+ lines.push(" depends_on:");
36446
+ lines.push(" - loki");
36447
+ lines.push("");
36294
36448
  const adminPasswordEnv = cfg.observability.adminPasswordEnv ?? "ARC_OBSERVABILITY_PASSWORD";
36449
+ const grafanaDomain = observabilityDomain(cfg);
36295
36450
  lines.push(" grafana:");
36296
36451
  lines.push(" image: grafana/grafana:11.4.0");
36297
36452
  lines.push(" container_name: arc-grafana");
36298
36453
  lines.push(" restart: unless-stopped");
36454
+ pushLogging(lines);
36299
36455
  lines.push(" environment:");
36300
36456
  lines.push(" GF_SECURITY_ADMIN_USER: admin");
36301
36457
  lines.push(` GF_SECURITY_ADMIN_PASSWORD: \${${adminPasswordEnv}:?missing ${adminPasswordEnv}}`);
36302
36458
  lines.push(' GF_USERS_ALLOW_SIGN_UP: "false"');
36303
36459
  lines.push(' GF_AUTH_ANONYMOUS_ENABLED: "false"');
36460
+ if (grafanaDomain) {
36461
+ lines.push(` GF_SERVER_ROOT_URL: "https://${grafanaDomain}"`);
36462
+ }
36304
36463
  lines.push(" volumes:");
36305
36464
  lines.push(" - ./observability/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:ro");
36306
36465
  lines.push(" - ./observability/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro");
36307
36466
  lines.push(" - ./observability/grafana-dashboards:/etc/grafana/provisioning/dashboards/arc:ro");
36467
+ lines.push(" - ./observability/grafana-alerting:/etc/grafana/provisioning/alerting:ro");
36308
36468
  lines.push(" - grafana_data:/var/lib/grafana");
36309
36469
  lines.push(" networks: [arc-net]");
36310
36470
  lines.push(" expose:");
@@ -36334,6 +36494,7 @@ function generateCompose({ cfg }) {
36334
36494
  lines.push(" loki_data:");
36335
36495
  lines.push(" prometheus_data:");
36336
36496
  lines.push(" grafana_data:");
36497
+ lines.push(" alloy_data:");
36337
36498
  }
36338
36499
  return lines.join(`
36339
36500
  `) + `
@@ -36408,6 +36569,69 @@ ${envNames.map((name) => ` - "https://${cfg.envs[name].domain}"`).joi
36408
36569
  - tracestate
36409
36570
  - content-type
36410
36571
 
36572
+ # Host-level CPU / memory / load / disk / filesystem / network metrics.
36573
+ # The host root is bind-mounted read-only at /hostfs (see compose).
36574
+ hostmetrics:
36575
+ collection_interval: 30s
36576
+ root_path: /hostfs
36577
+ scrapers:
36578
+ cpu:
36579
+ metrics:
36580
+ system.cpu.utilization:
36581
+ enabled: true
36582
+ memory:
36583
+ metrics:
36584
+ system.memory.utilization:
36585
+ enabled: true
36586
+ load: {}
36587
+ disk: {}
36588
+ filesystem:
36589
+ metrics:
36590
+ system.filesystem.utilization:
36591
+ enabled: true
36592
+ exclude_fs_types:
36593
+ fs_types: [autofs, binfmt_misc, bpf, cgroup2, configfs, debugfs, devpts, devtmpfs, fusectl, hugetlbfs, iso9660, mqueue, nsfs, overlay, proc, procfs, pstore, rpc_pipefs, securityfs, selinuxfs, squashfs, sysfs, tracefs, tmpfs]
36594
+ match_type: strict
36595
+ exclude_mount_points:
36596
+ mount_points: ["/var/lib/docker/.*", "/run/.*", "/snap/.*", "/boot/.*"]
36597
+ match_type: regexp
36598
+ network: {}
36599
+ paging: {}
36600
+
36601
+ # Per-container CPU / memory / network / block-IO + restarts straight from
36602
+ # the Docker daemon (socket bind-mounted read-only, see compose).
36603
+ # api_version pinned: the receiver defaults to Docker API 1.25, which modern
36604
+ # daemons (Engine 25+ require >= 1.40) reject \u2014 without this the receiver
36605
+ # fails to start and takes the whole collector down. Quoted so YAML doesn't
36606
+ # parse 1.40 \u2192 1.4. Must be <= the daemon's max; 1.40 is the safe floor.
36607
+ docker_stats:
36608
+ endpoint: unix:///var/run/docker.sock
36609
+ api_version: "1.40"
36610
+ collection_interval: 30s
36611
+ metrics:
36612
+ container.restarts:
36613
+ enabled: true
36614
+ container.uptime:
36615
+ enabled: true
36616
+
36617
+ connectors:
36618
+ # Span\u2192metrics computed from 100% of spans (pipeline runs BEFORE tail
36619
+ # sampling) \u2014 lowering the sampling policy later never skews dashboards.
36620
+ spanmetrics:
36621
+ histogram:
36622
+ unit: ms
36623
+ explicit:
36624
+ buckets: [2ms, 6ms, 10ms, 50ms, 100ms, 250ms, 500ms, 1s, 2s, 5s, 10s]
36625
+ metrics_flush_interval: 15s
36626
+ # Emits traces_service_graph_* (same metric names Tempo's generator would).
36627
+ servicegraph:
36628
+ metrics_flush_interval: 15s
36629
+ store:
36630
+ ttl: 5s
36631
+ max_items: 5000
36632
+ # Joins the raw-trace pipeline to the sampled-storage pipeline.
36633
+ forward: {}
36634
+
36411
36635
  processors:
36412
36636
  batch:
36413
36637
  timeout: 5s
@@ -36418,7 +36642,8 @@ processors:
36418
36642
  # Errors + slow traces zachowywane w 100%, normalne traces r\xF3wnie\u017C 100%
36419
36643
  # przy obecnej skali (boostrap produkcji). Tail sampling matchuje OR po
36420
36644
  # policies \u2014 bez "always" policy WSZYSTKIE OK traces by\u0142yby droppowane.
36421
- # Obni\u017C 'random_100pct' do np. 10% gdy ruch eksploduje.
36645
+ # Obni\u017C 'random_100pct' do np. 10% gdy ruch eksploduje \u2014 span-metrics s\u0105
36646
+ # liczone przed samplingiem, wi\u0119c dashboardy pozostan\u0105 dok\u0142adne.
36422
36647
  tail_sampling:
36423
36648
  decision_wait: 10s
36424
36649
  num_traces: 50000
@@ -36443,6 +36668,34 @@ processors:
36443
36668
  - key: http.request.header.cookie
36444
36669
  action: delete
36445
36670
 
36671
+ # Cardinality guard for span-metrics: the SPA fallback answers EVERY path,
36672
+ # so raw span names (one per bot-scanned URL) would explode Prometheus
36673
+ # series. Static assets collapse to "<METHOD> static", /route/* to
36674
+ # "<METHOD> /route", anything else outside the known API surface to
36675
+ # "<METHOD> other". Runs BEFORE the spanmetrics connector. Note: "$$" is a
36676
+ # literal "$" (collector env expansion), RE2 has no lookahead \u2192 IsMatch+not.
36677
+ transform/span_names:
36678
+ error_mode: ignore
36679
+ trace_statements:
36680
+ - context: span
36681
+ statements:
36682
+ - set(name, Concat([attributes["http.request.method"], "static"], " ")) where IsMatch(name, "^[A-Z]+ /.*\\\\.(js|mjs|css|map|ico|png|jpe?g|svg|gif|webp|avif|woff2?|ttf|otf|txt|xml|json|webmanifest)$$")
36683
+ - replace_pattern(name, "^([A-Z]+) /route(/.*)?$$", "$$1 /route")
36684
+ - set(name, Concat([attributes["http.request.method"], "other"], " ")) where IsMatch(name, "^[A-Z]+ /") and not IsMatch(name, "^[A-Z]+ ((/api|/command|/query)(/.*)?|/route|/ws|/health|/otel(/.*)?|/)$$")
36685
+
36686
+ # Stable service.name for infra metric streams (becomes the service_name
36687
+ # label after resource_to_telemetry_conversion).
36688
+ resource/host:
36689
+ attributes:
36690
+ - key: service.name
36691
+ value: arc-host
36692
+ action: upsert
36693
+ resource/docker:
36694
+ attributes:
36695
+ - key: service.name
36696
+ value: arc-docker
36697
+ action: upsert
36698
+
36446
36699
  exporters:
36447
36700
  otlp/tempo:
36448
36701
  endpoint: tempo:4317
@@ -36458,6 +36711,10 @@ exporters:
36458
36711
  endpoint: http://prometheus:9090/api/v1/write
36459
36712
  tls:
36460
36713
  insecure: true
36714
+ # Copy resource attributes (service.name, deployment.environment, \u2026)
36715
+ # onto every series \u2014 dashboards filter by service_name.
36716
+ resource_to_telemetry_conversion:
36717
+ enabled: true
36461
36718
 
36462
36719
  extensions:
36463
36720
  health_check: {}
@@ -36465,19 +36722,42 @@ extensions:
36465
36722
 
36466
36723
  service:
36467
36724
  extensions: [health_check, zpages]
36725
+ # Collector self-metrics. Since 0.111 the default bind is localhost only \u2014
36726
+ # Prometheus scrapes otel-collector:8888, so listen on all interfaces.
36727
+ telemetry:
36728
+ metrics:
36729
+ level: detailed
36730
+ readers:
36731
+ - pull:
36732
+ exporter:
36733
+ prometheus:
36734
+ host: "0.0.0.0"
36735
+ port: 8888
36468
36736
  pipelines:
36469
- traces:
36737
+ traces/in:
36470
36738
  receivers: [otlp]
36471
- processors: [tail_sampling, attributes, batch]
36739
+ processors: [attributes, transform/span_names]
36740
+ exporters: [spanmetrics, servicegraph, forward]
36741
+ traces/sampled:
36742
+ receivers: [forward]
36743
+ processors: [tail_sampling, batch]
36472
36744
  exporters: [otlp/tempo]
36473
36745
  logs:
36474
36746
  receivers: [otlp]
36475
36747
  processors: [attributes, batch]
36476
36748
  exporters: [otlphttp/loki]
36477
36749
  metrics:
36478
- receivers: [otlp]
36750
+ receivers: [otlp, spanmetrics, servicegraph]
36479
36751
  processors: [batch]
36480
36752
  exporters: [prometheusremotewrite]
36753
+ metrics/host:
36754
+ receivers: [hostmetrics]
36755
+ processors: [resource/host, batch]
36756
+ exporters: [prometheusremotewrite]
36757
+ metrics/docker:
36758
+ receivers: [docker_stats]
36759
+ processors: [resource/docker, batch]
36760
+ exporters: [prometheusremotewrite]
36481
36761
  `;
36482
36762
  }
36483
36763
  function generateTempoConfig(cfg) {
@@ -36513,20 +36793,9 @@ storage:
36513
36793
  wal:
36514
36794
  path: /var/tempo/wal
36515
36795
 
36516
- metrics_generator:
36517
- registry:
36518
- external_labels:
36519
- source: tempo
36520
- storage:
36521
- path: /var/tempo/generator/wal
36522
- remote_write:
36523
- - url: http://prometheus:9090/api/v1/write
36524
- send_exemplars: true
36525
-
36526
- overrides:
36527
- defaults:
36528
- metrics_generator:
36529
- processors: [service-graphs, span-metrics]
36796
+ # NOTE: no metrics_generator \u2014 span-metrics + service-graph are produced by
36797
+ # the otel-collector connectors BEFORE tail sampling (accurate rates even
36798
+ # when sampling is later tightened) and remote-written to Prometheus there.
36530
36799
  `;
36531
36800
  }
36532
36801
  function generateLokiConfig(cfg) {
@@ -36582,11 +36851,214 @@ scrape_configs:
36582
36851
  - job_name: otel-collector
36583
36852
  static_configs:
36584
36853
  - targets: [otel-collector:8888]
36854
+ - job_name: caddy
36855
+ static_configs:
36856
+ - targets: [caddy:2020]
36857
+ - job_name: loki
36858
+ static_configs:
36859
+ - targets: [loki:3100]
36860
+ - job_name: tempo
36861
+ static_configs:
36862
+ - targets: [tempo:3200]
36863
+ - job_name: grafana
36864
+ static_configs:
36865
+ - targets: [grafana:3000]
36866
+ - job_name: alloy
36867
+ static_configs:
36868
+ - targets: [alloy:12345]
36585
36869
 
36586
36870
  # remote-write inbound is enabled via the --web.enable-remote-write-receiver
36587
36871
  # command-line flag (compose.ts). Retention via --storage.tsdb.retention.time.
36588
36872
  `;
36589
36873
  }
36874
+ function generateAlloyConfig() {
36875
+ return `// Generated by \`arc platform deploy\` \u2014 do not edit by hand.
36876
+ discovery.docker "containers" {
36877
+ host = "unix:///var/run/docker.sock"
36878
+ refresh_interval = "15s"
36879
+
36880
+ // Only containers managed by a compose project (our stack). Ad-hoc / rogue
36881
+ // containers (manual debug runs, other stacks) are excluded \u2014 one bad
36882
+ // stream (e.g. log entries older than Loki's reject window) otherwise 400s
36883
+ // the whole loki.write batch and drops good app logs with it.
36884
+ filter {
36885
+ name = "label"
36886
+ values = ["com.docker.compose.project"]
36887
+ }
36888
+ }
36889
+
36890
+ discovery.relabel "containers" {
36891
+ targets = discovery.docker.containers.targets
36892
+
36893
+ rule {
36894
+ source_labels = ["__meta_docker_container_name"]
36895
+ regex = "/(.*)"
36896
+ target_label = "container"
36897
+ }
36898
+ rule {
36899
+ source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
36900
+ target_label = "compose_service"
36901
+ }
36902
+ }
36903
+
36904
+ loki.source.docker "containers" {
36905
+ host = "unix:///var/run/docker.sock"
36906
+ targets = discovery.docker.containers.targets
36907
+ relabel_rules = discovery.relabel.containers.rules
36908
+ labels = { source = "docker" }
36909
+ forward_to = [loki.write.loki.receiver]
36910
+ }
36911
+
36912
+ loki.write "loki" {
36913
+ endpoint {
36914
+ url = "http://loki:3100/loki/api/v1/push"
36915
+ }
36916
+ }
36917
+ `;
36918
+ }
36919
+ function generateGrafanaAlerting(cfg) {
36920
+ const webhookUrl = cfg.observability?.alertWebhookUrl;
36921
+ const rules = [
36922
+ {
36923
+ uid: "arc-high-error-rate",
36924
+ title: "High server error rate (>5%)",
36925
+ expr: 'sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{span_kind="SPAN_KIND_SERVER"}[5m])), 0.001)',
36926
+ threshold: 0.05,
36927
+ pendingFor: "5m",
36928
+ summary: "More than 5% of server spans are errors over the last 5 minutes."
36929
+ },
36930
+ {
36931
+ uid: "arc-high-latency-p95",
36932
+ title: "High p95 latency (>1s)",
36933
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(traces_span_metrics_duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])))',
36934
+ threshold: 1000,
36935
+ pendingFor: "10m",
36936
+ summary: "Server p95 latency above 1s for 10 minutes."
36937
+ },
36938
+ {
36939
+ uid: "arc-host-disk-high",
36940
+ title: "Host disk usage >85%",
36941
+ expr: 'max by (mountpoint) (sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))',
36942
+ threshold: 0.85,
36943
+ pendingFor: "15m",
36944
+ summary: "A host filesystem is more than 85% full."
36945
+ },
36946
+ {
36947
+ uid: "arc-host-memory-high",
36948
+ title: "Host memory usage >90%",
36949
+ expr: 'sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)',
36950
+ threshold: 0.9,
36951
+ pendingFor: "10m",
36952
+ summary: "Host memory usage above 90% for 10 minutes."
36953
+ },
36954
+ {
36955
+ uid: "arc-container-restarts",
36956
+ title: "Container restarted",
36957
+ expr: "sum by (container_name) (increase(container_restarts_total[15m]))",
36958
+ threshold: 0,
36959
+ pendingFor: "0s",
36960
+ summary: "A container restarted within the last 15 minutes."
36961
+ },
36962
+ {
36963
+ uid: "arc-app-silent",
36964
+ title: "App stopped reporting metrics",
36965
+ expr: "absent(arc_commands_total)",
36966
+ threshold: 0,
36967
+ pendingFor: "10m",
36968
+ summary: "No arc_commands_total series for 10 minutes \u2014 app down or telemetry broken."
36969
+ },
36970
+ {
36971
+ uid: "arc-collector-export-failures",
36972
+ title: "Telemetry export failures",
36973
+ expr: 'sum(increase({__name__=~"otelcol_exporter_send_failed_(spans|metric_points|log_records)"}[15m]))',
36974
+ threshold: 0,
36975
+ pendingFor: "0s",
36976
+ summary: "The otel-collector failed to export telemetry within the last 15 minutes."
36977
+ },
36978
+ {
36979
+ uid: "arc-target-down",
36980
+ title: "Scrape target down",
36981
+ expr: "min(up)",
36982
+ threshold: 1,
36983
+ op: "lt",
36984
+ pendingFor: "5m",
36985
+ summary: "A Prometheus scrape target has been down for 5 minutes."
36986
+ }
36987
+ ];
36988
+ const ruleYaml = rules.map((rule) => {
36989
+ const op = rule.op ?? "gt";
36990
+ return ` - uid: ${rule.uid}
36991
+ title: ${JSON.stringify(rule.title)}
36992
+ condition: C
36993
+ for: ${rule.pendingFor}
36994
+ noDataState: OK
36995
+ execErrState: OK
36996
+ annotations:
36997
+ summary: ${JSON.stringify(rule.summary)}
36998
+ labels:
36999
+ source: arc
37000
+ data:
37001
+ - refId: A
37002
+ relativeTimeRange: { from: 600, to: 0 }
37003
+ datasourceUid: prometheus
37004
+ model:
37005
+ expr: ${JSON.stringify(rule.expr)}
37006
+ instant: true
37007
+ intervalMs: 1000
37008
+ maxDataPoints: 43200
37009
+ refId: A
37010
+ - refId: B
37011
+ relativeTimeRange: { from: 0, to: 0 }
37012
+ datasourceUid: __expr__
37013
+ model:
37014
+ type: reduce
37015
+ expression: A
37016
+ reducer: last
37017
+ refId: B
37018
+ - refId: C
37019
+ relativeTimeRange: { from: 0, to: 0 }
37020
+ datasourceUid: __expr__
37021
+ model:
37022
+ type: threshold
37023
+ expression: B
37024
+ refId: C
37025
+ conditions:
37026
+ - evaluator:
37027
+ type: ${op}
37028
+ params: [${rule.threshold}]`;
37029
+ }).join(`
37030
+ `);
37031
+ const contactSection = webhookUrl ? `
37032
+ contactPoints:
37033
+ - orgId: 1
37034
+ name: arc-webhook
37035
+ receivers:
37036
+ - uid: arc-webhook
37037
+ type: webhook
37038
+ settings:
37039
+ url: ${JSON.stringify(webhookUrl)}
37040
+ httpMethod: POST
37041
+
37042
+ policies:
37043
+ - orgId: 1
37044
+ receiver: arc-webhook
37045
+ group_by: ["grafana_folder", "alertname"]
37046
+ group_wait: 30s
37047
+ group_interval: 5m
37048
+ repeat_interval: 4h
37049
+ ` : "";
37050
+ return `# Generated by \`arc platform deploy\` \u2014 do not edit by hand.
37051
+ apiVersion: 1
37052
+
37053
+ groups:
37054
+ - orgId: 1
37055
+ name: arc-alerts
37056
+ folder: Arc
37057
+ interval: 1m
37058
+ rules:
37059
+ ${ruleYaml}
37060
+ ${contactSection}`;
37061
+ }
36590
37062
  function generateGrafanaDatasources() {
36591
37063
  return `# Generated by \`arc platform deploy\` \u2014 do not edit by hand.
36592
37064
  apiVersion: 1
@@ -36610,10 +37082,17 @@ datasources:
36610
37082
  uid: loki
36611
37083
  jsonData:
36612
37084
  derivedFields:
37085
+ # Plain-text logs that happen to contain "trace_id=<id>".
36613
37086
  - datasourceUid: tempo
36614
37087
  matcherRegex: "trace_id=(\\\\w+)"
36615
37088
  name: TraceID
36616
37089
  url: $\${__value.raw}
37090
+ # OTLP-ingested logs \u2014 trace_id arrives as structured metadata.
37091
+ - datasourceUid: tempo
37092
+ matcherType: label
37093
+ matcherRegex: trace_id
37094
+ name: TraceID (OTLP)
37095
+ url: $\${__value.raw}
36617
37096
  - name: Prometheus
36618
37097
  type: prometheus
36619
37098
  access: proxy
@@ -36656,7 +37135,7 @@ function generateArcOverviewDashboard() {
36656
37135
  label: "Service",
36657
37136
  type: "query",
36658
37137
  datasource: { type: "prometheus", uid: "prometheus" },
36659
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
37138
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
36660
37139
  refresh: 2,
36661
37140
  includeAll: false,
36662
37141
  multi: false,
@@ -36665,29 +37144,29 @@ function generateArcOverviewDashboard() {
36665
37144
  ]
36666
37145
  },
36667
37146
  panels: [
36668
- panelStat("Request rate (req/s)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))', "reqps"),
36669
- panelStat("Error rate (%)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100', "percent", { red: 1, orange: 0.1 }),
36670
- panelStat("P99 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))', "ms", { red: 1000, orange: 300 }),
37147
+ panelStat("Request rate (req/s)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m]))', "reqps"),
37148
+ panelStat("Error rate (%)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER", status_code="STATUS_CODE_ERROR"}[5m])) / clamp_min(sum(rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])), 0.001) * 100', "percent", { red: 1, orange: 0.1 }),
37149
+ panelStat("P99 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))', "ms", { red: 1000, orange: 300 }),
36671
37150
  panelStat("Active commands/sec", { x: 18, y: 0, w: 6, h: 4 }, 'sum(rate(arc_commands_total{service_name="$service"}[5m]))', "ops"),
36672
- panelTimeseries("Request rate by route", { x: 0, y: 4, w: 12, h: 8 }, 'sum by (span_name) (rate(traces_spanmetrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))', "{{span_name}}", "reqps"),
37151
+ panelTimeseries("Request rate by route", { x: 0, y: 4, w: 12, h: 8 }, 'sum by (span_name) (rate(traces_span_metrics_calls_total{service_name="$service", span_kind="SPAN_KIND_SERVER"}[1m]))', "{{span_name}}", "reqps"),
36673
37152
  panelTimeseries("Latency percentiles", { x: 12, y: 4, w: 12, h: 8 }, [
36674
37153
  {
36675
- expr: 'histogram_quantile(0.5, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
37154
+ expr: 'histogram_quantile(0.5, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
36676
37155
  legend: "p50"
36677
37156
  },
36678
37157
  {
36679
- expr: 'histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
37158
+ expr: 'histogram_quantile(0.95, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
36680
37159
  legend: "p95"
36681
37160
  },
36682
37161
  {
36683
- expr: 'histogram_quantile(0.99, sum(rate(traces_spanmetrics_latency_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
37162
+ expr: 'histogram_quantile(0.99, sum(rate(traces_span_metrics_duration_milliseconds_bucket{service_name="$service", span_kind="SPAN_KIND_SERVER"}[5m])) by (le))',
36684
37163
  legend: "p99"
36685
37164
  }
36686
- ], "ms"),
37165
+ ], "", "ms"),
36687
37166
  panelTimeseries("Commands per second", { x: 0, y: 12, w: 12, h: 8 }, 'sum by (arc_command_name) (rate(arc_commands_total{service_name="$service"}[1m]))', "{{arc_command_name}}", "ops"),
36688
- panelTimeseries("Command p95 latency", { x: 12, y: 12, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service"}[5m])))', "{{arc_command_name}}", "ms"),
36689
- panelTimeseries("DB find ops/sec by collection", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (db_collection_name) (rate(arc_db_find_ms_milliseconds_count{service_name="$service"}[1m]))', "{{db_collection_name}}", "ops"),
36690
- panelTimeseries("DB find p95 latency", { x: 12, y: 20, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (db_collection_name, le) (rate(arc_db_find_ms_milliseconds_bucket{service_name="$service"}[5m])))', "{{db_collection_name}}", "ms"),
37167
+ panelTimeseries("Command p95 latency", { x: 12, y: 12, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (arc_command_name, le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service"}[5m])))', "{{arc_command_name}}", "ms"),
37168
+ panelTimeseries("DB ops/sec by collection", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (db_collection_name, db_operation_name) (rate(arc_db_operation_duration_milliseconds_count{service_name="$service"}[1m]))', "{{db_collection_name}} {{db_operation_name}}", "ops"),
37169
+ panelTimeseries("DB p95 latency by operation", { x: 12, y: 20, w: 12, h: 8 }, 'histogram_quantile(0.95, sum by (db_operation_name, le) (rate(arc_db_operation_duration_milliseconds_bucket{service_name="$service"}[5m])))', "{{db_operation_name}}", "ms"),
36691
37170
  {
36692
37171
  title: "Recent error logs",
36693
37172
  type: "logs",
@@ -36695,7 +37174,7 @@ function generateArcOverviewDashboard() {
36695
37174
  datasource: { type: "loki", uid: "loki" },
36696
37175
  targets: [
36697
37176
  {
36698
- expr: '{service_name="$service"} |= `ERROR`',
37177
+ expr: '{service_name="$service"} | severity_text=~`ERROR|FATAL`',
36699
37178
  refId: "A"
36700
37179
  }
36701
37180
  ],
@@ -36729,7 +37208,7 @@ function generateArcTracesDashboard() {
36729
37208
  label: "Service",
36730
37209
  type: "query",
36731
37210
  datasource: { type: "prometheus", uid: "prometheus" },
36732
- query: "label_values(traces_spanmetrics_calls_total, service_name)",
37211
+ query: "label_values(traces_span_metrics_calls_total, service_name)",
36733
37212
  refresh: 2,
36734
37213
  current: { text: "arc-prod", value: "arc-prod" }
36735
37214
  }
@@ -36828,12 +37307,23 @@ function generateArcLogsDashboard() {
36828
37307
  type: "textbox",
36829
37308
  query: "",
36830
37309
  current: { text: "", value: "" }
37310
+ },
37311
+ {
37312
+ name: "container",
37313
+ label: "Container",
37314
+ type: "query",
37315
+ datasource: { type: "loki", uid: "loki" },
37316
+ query: "label_values(container)",
37317
+ refresh: 2,
37318
+ includeAll: true,
37319
+ multi: false,
37320
+ current: { text: "All", value: "$__all" }
36831
37321
  }
36832
37322
  ]
36833
37323
  },
36834
37324
  panels: [
36835
- panelStat("Logs ingested (1h)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(increase({service_name="$service"}[1h]))', "short"),
36836
- panelStat("Errors (1h)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(increase({service_name="$service", severity_text=~"ERROR|FATAL"}[1h]))', "short", { orange: 1, red: 50 }),
37325
+ panelStat("Logs ingested (1h)", { x: 0, y: 0, w: 6, h: 4 }, 'sum(count_over_time({service_name="$service"}[1h]))', "short", undefined, LOKI_DS),
37326
+ panelStat("Errors (1h)", { x: 6, y: 0, w: 6, h: 4 }, 'sum(count_over_time({service_name="$service"} | severity_text=~`ERROR|FATAL` [1h]))', "short", { orange: 1, red: 50 }, LOKI_DS),
36837
37327
  {
36838
37328
  title: "Log volume by severity",
36839
37329
  type: "timeseries",
@@ -36883,6 +37373,27 @@ function generateArcLogsDashboard() {
36883
37373
  dedupStrategy: "none",
36884
37374
  sortOrder: "Descending"
36885
37375
  }
37376
+ },
37377
+ {
37378
+ title: "Container logs ($container)",
37379
+ type: "logs",
37380
+ gridPos: { x: 0, y: 26, w: 24, h: 14 },
37381
+ datasource: { type: "loki", uid: "loki" },
37382
+ targets: [
37383
+ {
37384
+ expr: '{container=~"$container"} |~ "$search"',
37385
+ refId: "A"
37386
+ }
37387
+ ],
37388
+ options: {
37389
+ showTime: true,
37390
+ showLabels: true,
37391
+ showCommonLabels: false,
37392
+ wrapLogMessage: true,
37393
+ enableLogDetails: true,
37394
+ dedupStrategy: "none",
37395
+ sortOrder: "Descending"
37396
+ }
36886
37397
  }
36887
37398
  ]
36888
37399
  };
@@ -36912,7 +37423,7 @@ function generateArcSamplingDashboard() {
36912
37423
  expr: "sum(rate(otelcol_exporter_sent_spans[1m]))",
36913
37424
  legend: "exported"
36914
37425
  }
36915
- ], "ops"),
37426
+ ], "", "ops"),
36916
37427
  panelTimeseries("Collector queue size (BatchSpanProcessor)", { x: 0, y: 12, w: 12, h: 8 }, "otelcol_processor_batch_batch_send_size_sum / clamp_min(otelcol_processor_batch_batch_send_size_count, 1)", "avg batch size", "short"),
36917
37428
  panelTimeseries("Collector process memory", { x: 12, y: 12, w: 12, h: 8 }, 'process_resident_memory_bytes{job="otel-collector"}', "RSS", "bytes")
36918
37429
  ]
@@ -36953,24 +37464,24 @@ function generateArcCommandDashboard() {
36953
37464
  },
36954
37465
  panels: [
36955
37466
  panelStat("Call rate", { x: 0, y: 0, w: 6, h: 4 }, 'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[5m]))', "ops"),
36956
- panelStat("P50 latency", { x: 6, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms"),
36957
- panelStat("P95 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 200, red: 1000 }),
36958
- panelStat("P99 latency", { x: 18, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 500, red: 2000 }),
37467
+ panelStat("P50 latency", { x: 6, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms"),
37468
+ panelStat("P95 latency", { x: 12, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 200, red: 1000 }),
37469
+ panelStat("P99 latency", { x: 18, y: 0, w: 6, h: 4 }, 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))', "ms", { orange: 500, red: 2000 }),
36959
37470
  panelTimeseries("Call rate over time", { x: 0, y: 4, w: 12, h: 8 }, 'sum(rate(arc_commands_total{service_name="$service", arc_command_name="$command"}[1m]))', "calls/s", "ops"),
36960
37471
  panelTimeseries("Latency percentiles", { x: 12, y: 4, w: 12, h: 8 }, [
36961
37472
  {
36962
- expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
37473
+ expr: 'histogram_quantile(0.5, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
36963
37474
  legend: "p50"
36964
37475
  },
36965
37476
  {
36966
- expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
37477
+ expr: 'histogram_quantile(0.95, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
36967
37478
  legend: "p95"
36968
37479
  },
36969
37480
  {
36970
- expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_ms_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
37481
+ expr: 'histogram_quantile(0.99, sum by (le) (rate(arc_command_duration_milliseconds_bucket{service_name="$service", arc_command_name="$command"}[5m])))',
36971
37482
  legend: "p99"
36972
37483
  }
36973
- ], "ms"),
37484
+ ], "", "ms"),
36974
37485
  {
36975
37486
  title: "Recent traces (sampled)",
36976
37487
  type: "traces",
@@ -36989,23 +37500,108 @@ function generateArcCommandDashboard() {
36989
37500
  };
36990
37501
  return JSON.stringify(dashboard, null, 2);
36991
37502
  }
37503
+ function generateArcInfraDashboard() {
37504
+ const dashboard = {
37505
+ title: "Arc Infrastructure",
37506
+ uid: "arc-infra",
37507
+ schemaVersion: 39,
37508
+ version: 1,
37509
+ refresh: "30s",
37510
+ time: { from: "now-3h", to: "now" },
37511
+ tags: ["arc", "auto-provisioned"],
37512
+ panels: [
37513
+ panelStat("Host CPU used", { x: 0, y: 0, w: 6, h: 4 }, '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))', "percent", { orange: 70, red: 90 }),
37514
+ panelStat("Host memory used", { x: 6, y: 0, w: 6, h: 4 }, '100 * sum(system_memory_usage_bytes{state="used"}) / sum(system_memory_usage_bytes)', "percent", { orange: 80, red: 90 }),
37515
+ panelStat("Disk used (worst mount)", { x: 12, y: 0, w: 6, h: 4 }, '100 * max(sum by (device, mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (device, mountpoint) (system_filesystem_usage_bytes))', "percent", { orange: 75, red: 85 }),
37516
+ panelStat("Load (1m)", { x: 18, y: 0, w: 6, h: 4 }, "system_cpu_load_average_1m", "short"),
37517
+ panelTimeseries("Host CPU utilization", { x: 0, y: 4, w: 12, h: 8 }, [
37518
+ {
37519
+ expr: '100 * (1 - avg(rate(system_cpu_time_seconds_total{state="idle"}[5m])))',
37520
+ legend: "used %"
37521
+ },
37522
+ { expr: "system_cpu_load_average_1m", legend: "load 1m" },
37523
+ { expr: "system_cpu_load_average_5m", legend: "load 5m" },
37524
+ { expr: "system_cpu_load_average_15m", legend: "load 15m" }
37525
+ ], "", "short"),
37526
+ panelTimeseries("Host memory by state", { x: 12, y: 4, w: 12, h: 8 }, "sum by (state) (system_memory_usage_bytes)", "{{state}}", "bytes"),
37527
+ panelTimeseries("Filesystem usage by mount", { x: 0, y: 12, w: 12, h: 8 }, '100 * sum by (mountpoint) (system_filesystem_usage_bytes{state="used"}) / sum by (mountpoint) (system_filesystem_usage_bytes)', "{{mountpoint}}", "percent"),
37528
+ panelTimeseries("Disk I/O", { x: 12, y: 12, w: 12, h: 8 }, "sum by (device, direction) (rate(system_disk_io_bytes_total[5m]))", "{{device}} {{direction}}", "Bps"),
37529
+ panelTimeseries("Network I/O", { x: 0, y: 20, w: 12, h: 8 }, 'sum by (device, direction) (rate(system_network_io_bytes_total{device!="lo"}[5m]))', "{{device}} {{direction}}", "Bps"),
37530
+ panelTimeseries("Container restarts (24h)", { x: 12, y: 20, w: 12, h: 8 }, "sum by (container_name) (increase(container_restarts_total[24h]))", "{{container_name}}", "short"),
37531
+ panelTimeseries("Container CPU", { x: 0, y: 28, w: 12, h: 8 }, "container_cpu_utilization_ratio", "{{container_name}}", "percent"),
37532
+ panelTimeseries("Container memory", { x: 12, y: 28, w: 12, h: 8 }, "container_memory_usage_total_bytes", "{{container_name}}", "bytes"),
37533
+ panelTimeseries("Container network RX", { x: 0, y: 36, w: 12, h: 8 }, "sum by (container_name) (rate(container_network_io_usage_rx_bytes_total[5m]))", "{{container_name}}", "Bps"),
37534
+ panelTimeseries("Container network TX", { x: 12, y: 36, w: 12, h: 8 }, "sum by (container_name) (rate(container_network_io_usage_tx_bytes_total[5m]))", "{{container_name}}", "Bps")
37535
+ ]
37536
+ };
37537
+ return JSON.stringify(dashboard, null, 2);
37538
+ }
37539
+ function generateArcEdgeDashboard() {
37540
+ const dashboard = {
37541
+ title: "Arc Caddy / Edge",
37542
+ uid: "arc-edge",
37543
+ schemaVersion: 39,
37544
+ version: 1,
37545
+ refresh: "30s",
37546
+ time: { from: "now-1h", to: "now" },
37547
+ tags: ["arc", "auto-provisioned"],
37548
+ panels: [
37549
+ panelStat("Requests/s", { x: 0, y: 0, w: 6, h: 4 }, "sum(rate(caddy_http_request_duration_seconds_count[1m]))", "reqps"),
37550
+ panelStat("In-flight requests", { x: 6, y: 0, w: 6, h: 4 }, "sum(caddy_http_requests_in_flight)", "short"),
37551
+ panelStat("Handler errors/s", { x: 12, y: 0, w: 6, h: 4 }, "sum(rate(caddy_http_request_errors_total[5m]))", "ops", { orange: 0.1, red: 1 }),
37552
+ panelStat("P95 latency", { x: 18, y: 0, w: 6, h: 4 }, "histogram_quantile(0.95, sum by (le) (rate(caddy_http_request_duration_seconds_bucket[5m])))", "s", { orange: 0.3, red: 1 }),
37553
+ panelTimeseries("Request rate by host", { x: 0, y: 4, w: 12, h: 8 }, "sum by (host) (rate(caddy_http_request_duration_seconds_count[1m]))", "{{host}}", "reqps"),
37554
+ panelTimeseries("Responses by status code", { x: 12, y: 4, w: 12, h: 8 }, "sum by (code) (rate(caddy_http_request_duration_seconds_count[1m]))", "{{code}}", "reqps"),
37555
+ panelTimeseries("P95 latency by host", { x: 0, y: 12, w: 12, h: 8 }, "histogram_quantile(0.95, sum by (host, le) (rate(caddy_http_request_duration_seconds_bucket[5m])))", "{{host}}", "s"),
37556
+ panelTimeseries("4xx/5xx responses (access log)", { x: 12, y: 12, w: 12, h: 8 }, 'sum by (status) (count_over_time({compose_service="caddy"} | json | status >= 400 [$__interval]))', "{{status}}", "short", LOKI_DS),
37557
+ {
37558
+ title: "Access log (live)",
37559
+ type: "logs",
37560
+ gridPos: { x: 0, y: 20, w: 24, h: 12 },
37561
+ datasource: { type: "loki", uid: "loki" },
37562
+ targets: [
37563
+ {
37564
+ expr: '{compose_service="caddy"}',
37565
+ refId: "A"
37566
+ }
37567
+ ],
37568
+ options: {
37569
+ showTime: true,
37570
+ showLabels: false,
37571
+ showCommonLabels: false,
37572
+ wrapLogMessage: true,
37573
+ enableLogDetails: true,
37574
+ dedupStrategy: "none",
37575
+ sortOrder: "Descending"
37576
+ }
37577
+ }
37578
+ ]
37579
+ };
37580
+ return JSON.stringify(dashboard, null, 2);
37581
+ }
36992
37582
  function generateObservabilityConfigs(cfg) {
36993
37583
  return {
36994
37584
  "observability/otel-collector-config.yaml": generateOtelCollectorConfig(cfg),
36995
37585
  "observability/tempo.yaml": generateTempoConfig(cfg),
36996
37586
  "observability/loki-config.yaml": generateLokiConfig(cfg),
36997
37587
  "observability/prometheus.yml": generatePrometheusConfig(cfg),
37588
+ "observability/alloy-config.alloy": generateAlloyConfig(),
36998
37589
  "observability/grafana-datasources.yaml": generateGrafanaDatasources(),
36999
37590
  "observability/grafana-dashboards.yaml": generateGrafanaDashboardsProvider(),
37591
+ "observability/grafana-alerting/alerting.yaml": generateGrafanaAlerting(cfg),
37000
37592
  "observability/grafana-dashboards/arc-overview.json": generateArcOverviewDashboard(),
37001
37593
  "observability/grafana-dashboards/arc-traces.json": generateArcTracesDashboard(),
37002
37594
  "observability/grafana-dashboards/arc-service-map.json": generateArcServiceMapDashboard(),
37003
37595
  "observability/grafana-dashboards/arc-logs.json": generateArcLogsDashboard(),
37004
37596
  "observability/grafana-dashboards/arc-sampling.json": generateArcSamplingDashboard(),
37005
- "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard()
37597
+ "observability/grafana-dashboards/arc-command.json": generateArcCommandDashboard(),
37598
+ "observability/grafana-dashboards/arc-infra.json": generateArcInfraDashboard(),
37599
+ "observability/grafana-dashboards/arc-edge.json": generateArcEdgeDashboard()
37006
37600
  };
37007
37601
  }
37008
- function panelStat(title, gridPos, expr, unit, thresholds) {
37602
+ var PROMETHEUS_DS = { type: "prometheus", uid: "prometheus" };
37603
+ var LOKI_DS = { type: "loki", uid: "loki" };
37604
+ function panelStat(title, gridPos, expr, unit, thresholds, datasource = PROMETHEUS_DS) {
37009
37605
  const steps = [
37010
37606
  { color: "green", value: null }
37011
37607
  ];
@@ -37019,7 +37615,7 @@ function panelStat(title, gridPos, expr, unit, thresholds) {
37019
37615
  title,
37020
37616
  type: "stat",
37021
37617
  gridPos,
37022
- datasource: { type: "prometheus", uid: "prometheus" },
37618
+ datasource,
37023
37619
  targets: [{ expr, refId: "A", legendFormat: title }],
37024
37620
  fieldConfig: {
37025
37621
  defaults: {
@@ -37037,7 +37633,7 @@ function panelStat(title, gridPos, expr, unit, thresholds) {
37037
37633
  }
37038
37634
  };
37039
37635
  }
37040
- function panelTimeseries(title, gridPos, query, legend, unit) {
37636
+ function panelTimeseries(title, gridPos, query, legend, unit, datasource = PROMETHEUS_DS) {
37041
37637
  const targets = Array.isArray(query) ? query.map((q, i) => ({
37042
37638
  expr: q.expr,
37043
37639
  refId: String.fromCharCode(65 + i),
@@ -37047,7 +37643,7 @@ function panelTimeseries(title, gridPos, query, legend, unit) {
37047
37643
  title,
37048
37644
  type: "timeseries",
37049
37645
  gridPos,
37050
- datasource: { type: "prometheus", uid: "prometheus" },
37646
+ datasource,
37051
37647
  targets,
37052
37648
  fieldConfig: {
37053
37649
  defaults: {
@@ -37378,11 +37974,16 @@ function validateDeployConfig(input) {
37378
37974
  metrics: optionalString(retentionRaw, "observability.retention.metrics")
37379
37975
  };
37380
37976
  }
37977
+ const alertWebhookUrl = optionalString(observabilityRaw, "observability.alertWebhookUrl");
37978
+ if (alertWebhookUrl !== undefined && !/^https?:\/\/.+/.test(alertWebhookUrl)) {
37979
+ throw new Error(`deploy.arc.json: observability.alertWebhookUrl must be an http(s) URL (got "${alertWebhookUrl}")`);
37980
+ }
37381
37981
  validated.observability = {
37382
37982
  enabled: enabledRaw,
37383
37983
  subdomain: optionalString(observabilityRaw, "observability.subdomain") ?? "observability",
37384
37984
  adminPasswordEnv: optionalString(observabilityRaw, "observability.adminPasswordEnv") ?? "ARC_OBSERVABILITY_PASSWORD",
37385
- retention
37985
+ retention,
37986
+ alertWebhookUrl
37386
37987
  };
37387
37988
  }
37388
37989
  const provision = input.provision;
@@ -37678,14 +38279,14 @@ async function bootstrap(inputs) {
37678
38279
  });
37679
38280
  ok("Host bootstrapped");
37680
38281
  }
37681
- const needUpStack = state.kind !== "ready" || state.marker === null || state.marker.configHash !== inputs.configHash || !await isRegistryRunning(cfg);
38282
+ const needUpStack = state.kind !== "ready" || state.marker === null || state.marker.configHash !== inputs.configHash || state.marker.cliVersion !== inputs.cliVersion || !await isRegistryRunning(cfg);
37682
38283
  if (needUpStack) {
37683
38284
  await upStack(inputs);
37684
38285
  ok("Docker stack up");
37685
38286
  }
37686
38287
  if (cfg.observability?.enabled) {
37687
38288
  log2("Ensuring observability sidecars are running...");
37688
- const obsServices = ["otel-collector", "tempo", "loki", "prometheus", "grafana"];
38289
+ const obsServices = ["otel-collector", "tempo", "loki", "prometheus", "alloy", "grafana"];
37689
38290
  await assertExec(cfg.target, `cd ${cfg.target.remoteDir} && docker compose pull --ignore-pull-failures ${obsServices.join(" ")} && docker compose up -d ${obsServices.join(" ")}`);
37690
38291
  ok("Observability stack up");
37691
38292
  }
@@ -37752,7 +38353,7 @@ async function upStack(inputs) {
37752
38353
  await scpUpload(cfg.target, join18(workDir, "docker-compose.yml"), `${cfg.target.remoteDir}/docker-compose.yml`);
37753
38354
  await scpUpload(cfg.target, join18(workDir, "htpasswd"), `${cfg.target.remoteDir}/registry-auth/htpasswd`);
37754
38355
  if (observabilityFiles && observabilityHtpasswd) {
37755
- await assertExec(cfg.target, `mkdir -p ${cfg.target.remoteDir}/observability/grafana-dashboards`);
38356
+ await assertExec(cfg.target, `mkdir -p ${cfg.target.remoteDir}/observability/grafana-dashboards ${cfg.target.remoteDir}/observability/grafana-alerting`);
37756
38357
  for (const relPath of Object.keys(observabilityFiles)) {
37757
38358
  const localDir = dirname9(join18(workDir, relPath));
37758
38359
  mkdirSync12(localDir, { recursive: true });
@@ -39274,7 +39875,7 @@ class ContextHandler {
39274
39875
  try {
39275
39876
  return await this.telemetry.startSpan(`command.${commandName}`, runCommand, { attributes: baseAttrs });
39276
39877
  } finally {
39277
- this.telemetry.measureSince("arc.command.duration_ms", start, {
39878
+ this.telemetry.measureSince("arc.command.duration", start, {
39278
39879
  "arc.command.name": commandName
39279
39880
  });
39280
39881
  this.telemetry.incrementCounter("arc.commands.total", 1, {
@@ -40359,6 +40960,7 @@ async function createArcServer(config) {
40359
40960
  websocket: {
40360
40961
  open(ws) {
40361
40962
  connectionManager.addClient(ws);
40963
+ config.telemetry?.addUpDown("arc.ws.active_connections", 1);
40362
40964
  },
40363
40965
  async message(ws, messageStr) {
40364
40966
  const client = connectionManager.getClientByWs(ws);
@@ -40371,6 +40973,9 @@ async function createArcServer(config) {
40371
40973
  console.error("Failed to parse WS message:", error);
40372
40974
  return;
40373
40975
  }
40976
+ config.telemetry?.incrementCounter("arc.ws.messages", 1, {
40977
+ "messaging.message.type": String(message?.type ?? "unknown")
40978
+ });
40374
40979
  const dispatch = async () => {
40375
40980
  try {
40376
40981
  for (const handler of wsHandlers) {
@@ -40408,6 +41013,7 @@ async function createArcServer(config) {
40408
41013
  cleanupClientSubs(client.id);
40409
41014
  config.onWsClose?.(client.id);
40410
41015
  connectionManager.removeClient(client.id);
41016
+ config.telemetry?.addUpDown("arc.ws.active_connections", -1);
40411
41017
  }
40412
41018
  }
40413
41019
  }
@@ -40726,7 +41332,8 @@ async function startPlatformServer(opts) {
40726
41332
  endpoint: process.env.OTEL_EXPORTER_OTLP_ENDPOINT,
40727
41333
  mode: devMode ? "development" : "production",
40728
41334
  sampleRate: devMode ? 1 : 1,
40729
- debug: process.env.ARC_OTEL_DEBUG === "true"
41335
+ debug: process.env.ARC_OTEL_DEBUG === "true",
41336
+ patchConsole: process.env.ARC_OTEL_PATCH_CONSOLE !== "false"
40730
41337
  });
40731
41338
  telemetry = init2.telemetry;
40732
41339
  telemetryShutdown = init2.shutdown;