@decocms/start 6.0.1 → 6.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
1
+ # Runbook: `http-latency-spike`
2
+
3
+ > A site's p95 latency exceeded its own 24h rolling baseline by 3σ for ≥ 10 minutes.
4
+
5
+ ## What this alert means
6
+
7
+ User-perceived latency on this site is statistically abnormal vs the
8
+ last 24 hours. Latency rarely degrades in isolation — almost always
9
+ something else is bottlenecked underneath. Use this alert as the
10
+ "something is wrong, look around" signal, then triangulate.
11
+
12
+ ## First check (60 seconds)
13
+
14
+ Open the dashboard's **commerce p95 by provider/operation** panel. The
15
+ most common cause of p95 spikes is an upstream commerce API (VTEX,
16
+ Shopify) slowing down — and our SSR is synchronous on the upstream
17
+ call.
18
+
19
+ If commerce p95 spiked at the same moment, jump to
20
+ [`commerce-upstream-slow.md`](./commerce-upstream-slow.md).
21
+
22
+ ## Diagnostic queries
23
+
24
+ ```sql
25
+ -- Approximate latency p95 by route_pattern, last hour (p95 of each data point's mean duration, Sum / Count — not a true per-request p95)
26
+ SELECT
27
+ toStartOfInterval(TimeUnix, INTERVAL 5 MINUTE) AS t,
28
+ Attributes['route_pattern'] AS route,
29
+ quantileBFloat16(0.95)(toFloat64(Sum / nullIf(Count, 0))) AS p95
30
+ FROM otel_metrics_histogram
31
+ WHERE MetricName = 'http_request_duration_ms'
32
+ AND ServiceName = '{site}'
33
+ AND TimeUnix > now() - INTERVAL 1 HOUR
34
+ GROUP BY t, route
35
+ ORDER BY t, p95 DESC;
36
+ ```
37
+
38
+ ```sql
39
+ -- Cache decision distribution — did hit rate drop while latency rose?
40
+ SELECT
41
+ Attributes['cache_decision'] AS decision,
42
+ count() AS n,
43
+ avg(toFloat64(Sum / nullIf(Count, 0))) AS avg_ms
44
+ FROM otel_metrics_histogram
45
+ WHERE MetricName = 'http_request_duration_ms'
46
+ AND ServiceName = '{site}'
47
+ AND TimeUnix > now() - INTERVAL 30 MINUTE
48
+ GROUP BY decision
49
+ ORDER BY n DESC;
50
+ ```
51
+
52
+ ```sql
53
+ -- Slow `deco.http.request` spans (traces are sampled ~1%, so re-run if empty); look up a returned TraceId to get that trace's full span breakdown
54
+ SELECT TraceId, SpanName, Duration / 1e6 AS ms, SpanAttributes['url.path'] AS path
55
+ FROM otel_traces
56
+ WHERE ServiceName = '{site}'
57
+ AND Timestamp > now() - INTERVAL 30 MINUTE
58
+ AND SpanName = 'deco.http.request'
59
+ AND (Duration / 1e6) > 2000
60
+ ORDER BY Duration DESC
61
+ LIMIT 50;
62
+ ```
63
+
64
+ ## Common causes & fixes
65
+
66
+ | Rank | Cause | How to confirm | Fix |
67
+ |------|------------------------------------------------------|-----------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------|
68
+ | 1 | Upstream commerce API slow | Commerce p95 panel spikes with the same shape | See [`commerce-upstream-slow.md`](./commerce-upstream-slow.md). |
69
+ | 2 | Cache hit rate dropped (cold cache after deploy/purge) | Cache panel shows MISS share rose at spike start; usually self-heals within 5-10m | Wait it out unless sustained; if sustained check the route-level cache profile. |
70
+ | 3 | One specific route is slow (heavy loader added) | Per-route p95 query shows one `route_pattern` dominating | Inspect recent commits to that route's loader. Consider deferring sections via `Lazy`. |
71
+ | 4 | Cloudflare edge / colo issue | `region` label distribution skewed to one or two colos | Check CF status page; usually clears on its own. |
72
+
73
+ ## Escalation
74
+
75
+ - 30 minutes without resolution → page the site team owner.
76
+ - All sites in a region affected → suspect CF infra; check status.cloudflare.com.
77
+
78
+ ## Post-mortem hook
79
+
80
+ - A representative slow `TraceId` from the third query above.
81
+ - The cache hit rate before/during the spike.
82
+ - Deploy version at the start of the window.
@@ -0,0 +1,100 @@
1
+ # Runbook: `tail-exception-spike`
2
+
3
+ > A site's tail-worker `_outcome=exception` count exceeded its own 24h rolling baseline by 3σ for ≥ 10 minutes.
4
+
5
+ ## What this alert means
6
+
7
+ Real, uncaught exceptions are happening in the Worker — captured by
8
+ the tail consumer with 100% fidelity (`deco-otel-tail`). After Phase 1
9
+ severity reclassification, this alert specifically excludes `canceled`
10
+ and `responseStreamDisconnected` outcomes (those are client-disconnect
11
+ noise, not bugs). What's left is a true bug, OOM, or CPU-limit kill.
12
+
13
+ ## First check (60 seconds)
14
+
15
+ ```sql
16
+ -- What's blowing up, last 15 minutes
17
+ SELECT Body, LogAttributes['url.path'] AS path, count() AS n
18
+ FROM otel_logs
19
+ WHERE ServiceName = '{site}'
20
+ AND SeverityText = 'ERROR'
21
+ AND LogAttributes['_source'] = 'tail-worker'
22
+ AND LogAttributes['_outcome'] = 'exception'
23
+ AND Timestamp > now() - INTERVAL 15 MINUTE
24
+ GROUP BY Body, path
25
+ ORDER BY n DESC
26
+ LIMIT 30;
27
+ ```
28
+
29
+ If one `Body` (same exception class / message) accounts for ~90% of
30
+ the total `n`, that's the bug — proceed to "Common causes" #1.
31
+
32
+ If the exceptions are scattered across many distinct messages, you
33
+ likely have a resource problem — proceed to #2 (OOM) or #3 (CPU limit).
34
+
35
+ ## Diagnostic queries
36
+
37
+ ```sql
38
+ -- Outcome distribution — separate exception from exceededMemory / exceededCpu
39
+ SELECT
40
+ LogAttributes['_outcome'] AS outcome,
41
+ count() AS n
42
+ FROM otel_logs
43
+ WHERE ServiceName = '{site}'
44
+ AND LogAttributes['_source'] = 'tail-worker'
45
+ AND Timestamp > now() - INTERVAL 30 MINUTE
46
+ GROUP BY outcome
47
+ ORDER BY n DESC;
48
+ ```
49
+
50
+ ```sql
51
+ -- Did a specific deploy cause it?
52
+ SELECT
53
+ LogAttributes['service.version'] AS version,
54
+ LogAttributes['_outcome'] AS outcome,
55
+ count() AS n
56
+ FROM otel_logs
57
+ WHERE ServiceName = '{site}'
58
+ AND LogAttributes['_source'] = 'tail-worker'
59
+ AND Timestamp > now() - INTERVAL 1 HOUR
60
+ GROUP BY version, outcome
61
+ ORDER BY n DESC;
62
+ ```
63
+
64
+ ```sql
65
+ -- Pull the full record for one offending request to get request.id
66
+ -- and trace.id for join queries
67
+ SELECT *
68
+ FROM otel_logs
69
+ WHERE ServiceName = '{site}'
70
+ AND SeverityText = 'ERROR'
71
+ AND LogAttributes['_source'] = 'tail-worker'
72
+ AND LogAttributes['_outcome'] = 'exception'
73
+ AND Timestamp > now() - INTERVAL 15 MINUTE
74
+ ORDER BY Timestamp DESC
75
+ LIMIT 1;
76
+ ```
77
+
78
+ ## Common causes & fixes
79
+
80
+ | Rank | Cause | How to confirm | Fix |
81
+ |------|----------------------------------------------------|-------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|
82
+ | 1 | A single uncaught throw, recent deploy | Same `Body` dominates; one `service.version` correlates | Roll back the deploy. File a bug with the offending stack + `request.id` for repro. Add a try/catch + structured `logger.error`. |
83
+ | 2 | `exceededMemory` (OOM) | Outcome query shows non-trivial `exceededMemory` count | Look for large in-memory buffers — a `Response.text()` on a multi-MB upstream, a runaway `JSON.parse`. See [`deco-site-memory-debugging`](https://github.com/decocms/deco-start/blob/main/.cursor/skills/deco-site-memory-debugging/SKILL.md) skill. |
84
+ | 3 | `exceededCpu` (CPU-limit kill) | Outcome query shows `exceededCpu` | Investigate a section with a heavy synchronous loop. Move work to a server function or shed load via cache. |
85
+ | 4 | A new upstream returning malformed responses | `Body` references a third-party hostname; matches a known endpoint | Add defensive parsing + a structured `logger.error` so the throw becomes a typed error, not a crash. |
86
+
87
+ ## Escalation
88
+
89
+ - `exceededMemory` / `exceededCpu` sustained → page site team + platform on-call. May indicate a leak that will recur until isolate restart.
90
+ - A throw we can't decode in 15 minutes → page site team owner.
91
+
92
+ ## Post-mortem hook
93
+
94
+ - One full record from diagnostic query #3 above — preserves the
95
+ `request.id` / `trace.id` for cross-channel correlation.
96
+ - The dominant `Body` (the exception message).
97
+ - The `service.version` window.
98
+ - Whether the alert fired on `exception` or `exceededMemory` /
99
+ `exceededCpu` — drives whether the post-mortem investigates code or
100
+ resource bounds.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@decocms/start",
3
- "version": "6.0.1",
3
+ "version": "6.2.0",
4
4
  "type": "module",
5
5
  "description": "Deco framework for TanStack Start - CMS bridge, admin protocol, hooks, schema generation",
6
6
  "main": "./src/index.ts",
@@ -2,7 +2,11 @@ import * as fs from "node:fs";
2
2
  import * as os from "node:os";
3
3
  import * as path from "node:path";
4
4
  import { afterEach, beforeEach, describe, expect, it } from "vitest";
5
- import { auditObservabilityBlock } from "./audit-observability-config";
5
+ import {
6
+ auditFleetBindings,
7
+ auditObservabilityBlock,
8
+ auditWranglerConfig,
9
+ } from "./audit-observability-config";
6
10
  import { parseJsonc, stripJsoncTrailingCommas } from "./lib/jsonc";
7
11
 
8
12
  describe("auditObservabilityBlock", () => {
@@ -138,6 +142,125 @@ describe("auditObservabilityBlock", () => {
138
142
  });
139
143
  });
140
144
 
145
+ describe("auditFleetBindings (D-14)", () => {
146
+ const canonicalBindings = {
147
+ version_metadata: { binding: "CF_VERSION_METADATA" },
148
+ analytics_engine_datasets: [{ binding: "DECO_METRICS", dataset: "deco_metrics_site" }],
149
+ tail_consumers: [{ service: "deco-otel-tail" }],
150
+ vars: {
151
+ DECO_OTEL_METRICS_ENDPOINT: "https://deco-otel-ingest.example/v1/metrics",
152
+ DECO_OTEL_TRACES_ENDPOINT: "https://deco-otel-ingest.example/v1/traces",
153
+ DECO_OTEL_LOGS_ENDPOINT: "https://deco-otel-ingest.example/v1/logs",
154
+ },
155
+ };
156
+
157
+ it("returns no findings for canonical bindings", () => {
158
+ expect(auditFleetBindings(canonicalBindings)).toEqual([]);
159
+ });
160
+
161
+ it("flags version_metadata_binding_missing as error", () => {
162
+ const { version_metadata: _, ...rest } = canonicalBindings;
163
+ const findings = auditFleetBindings(rest);
164
+ const f = findings.find((x) => x.id === "version_metadata_binding_missing");
165
+ expect(f?.severity).toBe("error");
166
+ });
167
+
168
+ it("flags version_metadata_binding_missing when binding is empty", () => {
169
+ const findings = auditFleetBindings({
170
+ ...canonicalBindings,
171
+ version_metadata: { binding: "" },
172
+ });
173
+ const f = findings.find((x) => x.id === "version_metadata_binding_missing");
174
+ expect(f).toBeDefined();
175
+ });
176
+
177
+ it("flags analytics_engine_binding_missing as warn", () => {
178
+ const findings = auditFleetBindings({
179
+ ...canonicalBindings,
180
+ analytics_engine_datasets: [],
181
+ });
182
+ const f = findings.find((x) => x.id === "analytics_engine_binding_missing");
183
+ expect(f?.severity).toBe("warn");
184
+ });
185
+
186
+ it("flags analytics_engine_binding_missing when binding name doesn't match DECO_METRICS", () => {
187
+ const findings = auditFleetBindings({
188
+ ...canonicalBindings,
189
+ analytics_engine_datasets: [{ binding: "OTHER_NAME" }],
190
+ });
191
+ expect(findings.some((f) => f.id === "analytics_engine_binding_missing")).toBe(true);
192
+ });
193
+
194
+ it("flags tail_consumer_missing as error", () => {
195
+ const findings = auditFleetBindings({
196
+ ...canonicalBindings,
197
+ tail_consumers: [],
198
+ });
199
+ const f = findings.find((x) => x.id === "tail_consumer_missing");
200
+ expect(f?.severity).toBe("error");
201
+ });
202
+
203
+ it("flags tail_consumer_missing when an unrelated tail consumer is configured", () => {
204
+ const findings = auditFleetBindings({
205
+ ...canonicalBindings,
206
+ tail_consumers: [{ service: "another-tail" }],
207
+ });
208
+ expect(findings.some((f) => f.id === "tail_consumer_missing")).toBe(true);
209
+ });
210
+
211
+ it("flags otel_metrics_endpoint_missing when DECO_OTEL_METRICS_ENDPOINT is unset", () => {
212
+ const findings = auditFleetBindings({
213
+ ...canonicalBindings,
214
+ vars: {
215
+ ...canonicalBindings.vars,
216
+ DECO_OTEL_METRICS_ENDPOINT: "",
217
+ },
218
+ });
219
+ expect(findings.some((f) => f.id === "otel_metrics_endpoint_missing")).toBe(true);
220
+ });
221
+
222
+ it("flags otel_traces_endpoint_missing when DECO_OTEL_TRACES_ENDPOINT is missing", () => {
223
+ const { vars: _vars, ...rest } = canonicalBindings;
224
+ const findings = auditFleetBindings(rest);
225
+ expect(findings.some((f) => f.id === "otel_traces_endpoint_missing")).toBe(true);
226
+ expect(findings.some((f) => f.id === "otel_logs_endpoint_missing")).toBe(true);
227
+ expect(findings.some((f) => f.id === "otel_metrics_endpoint_missing")).toBe(true);
228
+ });
229
+
230
+ it("handles missing vars object gracefully", () => {
231
+ expect(() => auditFleetBindings({ vars: undefined })).not.toThrow();
232
+ });
233
+ });
234
+
235
+ describe("auditWranglerConfig — composition", () => {
236
+ it("composes observability + fleet rules", () => {
237
+ const findings = auditWranglerConfig({});
238
+ const ids = findings.map((f) => f.id);
239
+ expect(ids).toContain("observability_missing");
240
+ expect(ids).toContain("version_metadata_binding_missing");
241
+ expect(ids).toContain("tail_consumer_missing");
242
+ });
243
+
244
+ it("returns no findings on a fully canonical wrangler", () => {
245
+ const findings = auditWranglerConfig({
246
+ observability: {
247
+ enabled: true,
248
+ logs: { enabled: true, head_sampling_rate: 1, persist: true },
249
+ traces: { enabled: true, head_sampling_rate: 0.01, persist: true },
250
+ },
251
+ version_metadata: { binding: "CF_VERSION_METADATA" },
252
+ analytics_engine_datasets: [{ binding: "DECO_METRICS", dataset: "deco_metrics_x" }],
253
+ tail_consumers: [{ service: "deco-otel-tail" }],
254
+ vars: {
255
+ DECO_OTEL_METRICS_ENDPOINT: "https://ingest.example/v1/metrics",
256
+ DECO_OTEL_TRACES_ENDPOINT: "https://ingest.example/v1/traces",
257
+ DECO_OTEL_LOGS_ENDPOINT: "https://ingest.example/v1/logs",
258
+ },
259
+ });
260
+ expect(findings).toEqual([]);
261
+ });
262
+ });
263
+
141
264
  describe("JSONC handling — trailing commas + comments", () => {
142
265
  it("stripJsoncTrailingCommas removes commas before `}` and `]`", () => {
143
266
  expect(stripJsoncTrailingCommas(`{ "a": 1, "b": 2, }`)).toBe(`{ "a": 1, "b": 2 }`);
@@ -165,6 +288,133 @@ describe("JSONC handling — trailing commas + comments", () => {
165
288
  });
166
289
  });
167
290
 
291
+ describe("CLI gate hardness (D-16) — --mode warn|block + --github", () => {
292
+ let tmpdir: string;
293
+ const cliPath = path.resolve(__dirname, "audit-observability-config.ts");
294
+
295
+ beforeEach(() => {
296
+ tmpdir = fs.mkdtempSync(path.join(os.tmpdir(), "audit-mode-"));
297
+ });
298
+ afterEach(() => {
299
+ fs.rmSync(tmpdir, { recursive: true, force: true });
300
+ });
301
+
302
+ // Spawn the script via tsx in a child process so we exercise the real
303
+ // `process.exit()` paths instead of monkey-patching them. This is the
304
+ // contract storefront CI consumes, so it's the contract under test.
305
+ function runCli(args: string[]): {
306
+ status: number | null;
307
+ stdout: string;
308
+ stderr: string;
309
+ } {
310
+ const { spawnSync } = require("node:child_process") as typeof import(
311
+ "node:child_process"
312
+ );
313
+ const result = spawnSync(
314
+ process.execPath,
315
+ [
316
+ require.resolve("tsx/cli"),
317
+ cliPath,
318
+ "--source",
319
+ tmpdir,
320
+ ...args,
321
+ ],
322
+ { encoding: "utf8" },
323
+ );
324
+ return {
325
+ status: result.status,
326
+ stdout: result.stdout,
327
+ stderr: result.stderr,
328
+ };
329
+ }
330
+
331
+ it("default mode is warn — exits 0 even with error findings", () => {
332
+ // Empty wrangler triggers `observability_missing` (error) +
333
+ // `tail_consumer_missing` (error) + `version_metadata_*` (error). Warn
334
+ // mode must annotate but exit 0.
335
+ fs.writeFileSync(path.join(tmpdir, "wrangler.jsonc"), "{}");
336
+ const { status, stdout } = runCli([]);
337
+ expect(status).toBe(0);
338
+ expect(stdout).toMatch(/observability_missing/);
339
+ });
340
+
341
+ it("--mode block exits 1 when an error-severity finding is present", () => {
342
+ fs.writeFileSync(path.join(tmpdir, "wrangler.jsonc"), "{}");
343
+ const { status, stdout } = runCli(["--mode", "block"]);
344
+ expect(status).toBe(1);
345
+ expect(stdout).toMatch(/observability_missing/);
346
+ });
347
+
348
+ it("--mode block exits 0 when only warn-severity findings are present", () => {
349
+ // Canonical observability block + the rest of the fleet bindings → only
350
+ // the DECO_OTEL_*_ENDPOINT warns survive. Block mode must exit 0 because
351
+ // those are `warn`, not `error`.
352
+ fs.writeFileSync(
353
+ path.join(tmpdir, "wrangler.jsonc"),
354
+ JSON.stringify({
355
+ name: "my-store",
356
+ observability: {
357
+ enabled: true,
358
+ traces: { enabled: true, head_sampling_rate: 0.01, persist: true },
359
+ logs: { enabled: true, head_sampling_rate: 1, persist: true },
360
+ },
361
+ version_metadata: { binding: "CF_VERSION_METADATA" },
362
+ analytics_engine_datasets: [{ binding: "DECO_METRICS" }],
363
+ tail_consumers: [{ service: "deco-otel-tail" }],
364
+ }),
365
+ );
366
+ const { status } = runCli(["--mode", "block"]);
367
+ expect(status).toBe(0);
368
+ });
369
+
370
+ it("--mode block exits 0 on a fully clean wrangler.jsonc", () => {
371
+ fs.writeFileSync(
372
+ path.join(tmpdir, "wrangler.jsonc"),
373
+ JSON.stringify({
374
+ name: "my-store",
375
+ observability: {
376
+ enabled: true,
377
+ traces: { enabled: true, head_sampling_rate: 0.01, persist: true },
378
+ logs: { enabled: true, head_sampling_rate: 1, persist: true },
379
+ },
380
+ version_metadata: { binding: "CF_VERSION_METADATA" },
381
+ analytics_engine_datasets: [{ binding: "DECO_METRICS" }],
382
+ tail_consumers: [{ service: "deco-otel-tail" }],
383
+ vars: {
384
+ DECO_OTEL_METRICS_ENDPOINT: "https://ingest.example.com",
385
+ DECO_OTEL_TRACES_ENDPOINT: "https://ingest.example.com",
386
+ DECO_OTEL_LOGS_ENDPOINT: "https://ingest.example.com",
387
+ },
388
+ }),
389
+ );
390
+ const { status } = runCli(["--mode", "block"]);
391
+ expect(status).toBe(0);
392
+ });
393
+
394
+ it("--github emits ::warning::/::error:: annotations matched to mode", () => {
395
+ fs.writeFileSync(path.join(tmpdir, "wrangler.jsonc"), "{}");
396
+ // In warn mode, even error-severity findings annotate as `warning` (we
397
+ // never escalate to GitHub `error` annotations when we won't fail the
398
+ // check — keeps the PR annotation channel quiet at v1).
399
+ const warnRun = runCli(["--github"]);
400
+ expect(warnRun.status).toBe(0);
401
+ expect(warnRun.stdout).toMatch(/::warning title=observability_missing::/);
402
+ expect(warnRun.stdout).not.toMatch(/::error title=/);
403
+
404
+ // In block mode, error-severity findings escalate to `::error::`.
405
+ const blockRun = runCli(["--mode", "block", "--github"]);
406
+ expect(blockRun.status).toBe(1);
407
+ expect(blockRun.stdout).toMatch(/::error title=observability_missing::/);
408
+ });
409
+
410
+ it("--mode rejects values other than warn|block with exit 2", () => {
411
+ fs.writeFileSync(path.join(tmpdir, "wrangler.jsonc"), "{}");
412
+ const { status, stderr } = runCli(["--mode", "advisory"]);
413
+ expect(status).toBe(2);
414
+ expect(stderr).toMatch(/--mode must be "warn" or "block"/);
415
+ });
416
+ });
417
+
168
418
  describe("CLI smoke — wrangler.jsonc with trailing commas", () => {
169
419
  let tmpdir: string;
170
420
  beforeEach(() => {