@decocms/start 5.3.0-rc.2 → 5.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/release.yml +5 -2
- package/MIGRATION_TOOLING_PLAN.md +1 -0
- package/docs/observability.md +42 -5
- package/docs/tail-worker-recipe.md +161 -0
- package/package.json +1 -1
- package/scripts/audit-observability-config.ts +0 -0
- package/scripts/generate-blocks.ts +148 -71
- package/scripts/migrate-post-cleanup.ts +0 -0
- package/src/admin/decofile.ts +17 -8
- package/src/sdk/env.ts +7 -3
- package/src/sdk/workerEntry.ts +11 -7
- package/src/vite/plugin.js +129 -10
- package/docs/o11y.md +0 -602
- package/scripts/sync-decofile.ts +0 -221
|
@@ -68,9 +68,12 @@ jobs:
|
|
|
68
68
|
- name: Advance moveable major tag
|
|
69
69
|
run: |
|
|
70
70
|
git fetch --tags --force
|
|
71
|
-
|
|
71
|
+
# Filter out prerelease tags (-rc.N, -next.N, -beta.N, -alpha.N) — the
|
|
72
|
+
# `v*.*.*` glob matches both `v5.3.0` and `v5.3.0-rc.2`, and we never
|
|
73
|
+
# want the moveable major to land on a prerelease.
|
|
74
|
+
LATEST=$(git tag -l 'v*.*.*' --sort=-v:refname | grep -vE '\-(rc|next|beta|alpha)\.' | head -n 1)
|
|
72
75
|
if [ -z "$LATEST" ]; then
|
|
73
|
-
echo "::notice::no v*.*.* tags yet; nothing to advance"
|
|
76
|
+
echo "::notice::no stable v*.*.* tags yet; nothing to advance"
|
|
74
77
|
exit 0
|
|
75
78
|
fi
|
|
76
79
|
MAJOR=$(echo "$LATEST" | sed -E 's/^(v[0-9]+).*/\1/')
|
|
@@ -121,6 +121,7 @@ this plan.
|
|
|
121
121
|
| 2026-05-07 | **D6.1 — Cloudflare credentials never leave `deco-start`** | Same-day refinement of D6 after the first central deploy on `baggagio-tanstack` failed with `Secret CLOUDFLARE_API_TOKEN is required, but not provided while calling`. The original D6 design used `secrets: inherit` from the storefront stub and required `CLOUDFLARE_*` to live in the `deco-sites` org, which broke the principle that *the only secrets a storefront repo holds are the secrets that go into wrangler secrets, not the ones used to deploy*. First-pass refinement: the central `deploy.yml` / `preview.yml` / `sync-secrets.yml` jobs declared `environment: production` to try to make `${{ secrets.CLOUDFLARE_* }}` resolve from `decocms/deco-start`'s `production` Environment. **Found broken empirically on 2026-05-07** — the deployment registers in the *caller* repo, not the called workflow's repo, so the environment lookup uses the caller's `production` env (auto-created with no secrets). Superseded by D6.2 the same evening. |
|
|
122
122
|
| 2026-05-07 | **D6.2 — App-mediated dispatch + no per-site registry (supersedes D6 + D6.1)** | After D6.1's `environment:` mechanism was empirically shown not to work cross-repo, the architecture pivoted: a `decocms-deployer` GitHub App is installed on `decocms/deco-start` (`actions:write`) and on each storefront repo (`contents:read`, optionally `pull-requests:write`). The storefront caller stub mints a short-lived App-installation token and calls `gh workflow run deploy.yml --repo decocms/deco-start --ref v3 -f site_owner=… -f site_name=…`. The central workflow runs in `decocms/deco-start`'s context, so `CLOUDFLARE_API_TOKEN` / `CLOUDFLARE_ACCOUNT_ID` are ordinary repo secrets. For runtime `SECRET_*` values, each storefront has a `<site_name>-secrets` GitHub Environment in `decocms/deco-start` (S1 design); `sync-secrets.yml` binds to that environment and pushes to `wrangler secret put`. The per-site registry under `deploy/sites/<repo>.jsonc` was dropped entirely (Pure C): worker name = repo basename by convention; the App being installed on the storefront repo is the deploy authorization gate; rare per-worker derived fields (like AE dataset name) use `$WORKER_*` substitution tokens in the template. Force-rollback is impossible for production deploys because the central workflow ignores caller-supplied `site_sha` and resolves the storefront's current default-branch HEAD itself. See [`deploy/README.md`](./deploy/README.md) for the full trust model. **Operational migrations required by Pure C:** `miess-01-tanstack` repo's worker shifts from `miess-tanstack` to `miess-01-tanstack` (CF-side cutover); `lebiscuit-tanstack` AE dataset shifts from `deco_metrics_lebiscuit` to `deco_metrics_lebiscuit_tanstack` (orphans old data). |
|
|
123
123
|
| 2026-05-07 | **D6.3 — Revert D6/D6.1/D6.2; deploys move to Cloudflare Workers Builds** | The whole D6 family (centralized GitHub Actions reusable workflows + `decocms-deployer` GitHub App + per-storefront GitHub Environments + central `deploy/wrangler-template.jsonc` + `deco-wrangler` CLI + per-site caller stubs) is being **reverted**. Trigger: GitHub Free orgs do not propagate org-level secrets to private repos, which forced the App private key to live as a per-storefront repo secret in every storefront — that key gives the holder the ability to mint installation tokens that can trigger workflows on `decocms/deco-start`, which in turn have the only Cloudflare credentials in the system. Per-repo distribution + rotation of that key across N customer storefronts didn't scale and concentrated blast radius on one credential. **Replacement (chosen, to be detailed in a follow-up D-record once shipped):** [Cloudflare Workers Builds](https://developers.cloudflare.com/workers/ci-cd/builds/) owns the deploy/preview pipelines per-worker. Verified empirically on `baggagio-tanstack` 2026-05-07: a malicious `wrangler.jsonc` `name` field pointing at a different worker (`americanas-tanstack`) is **ignored** by CF Builds — the deploy lands on the connected worker (`baggagio-tanstack`), CF surfaces a warning banner in the dashboard, and CF auto-opens a PR to fix the config (deco-sites/baggagio-tanstack#34). The dashboard repo<->worker connection is the source of truth; the in-repo config is treated as a secondary input. Per-storefront wiring (one CF dashboard click per worker) is acceptable at our scale; revisit when CF's [git-integration enable API](https://github.com/cloudflare/workers-sdk/issues/12058) lands. The `deco-build` CLI (regenerates `wrangler.jsonc` bindings from a central template) and runtime-secrets management remain to be designed in a separate PR. |
|
|
124
|
+
| 2026-05-19 | **D-8 — Cloudflare Tail Worker (Strategy B) is the canonical 100% error capture mechanism** | At fleet scale (100 sites, 2.5B req/month) head sampling forces a tradeoff: 1% sampling makes the `head_sampling_rate * 5B-event-cap` math work, but 99% of error traces and 99% of error-correlated logs get dropped at the CF Destinations head. The framework already covers framework-emitted errors via the in-Worker direct-POST channel (`DECO_OTEL_LOGS_ENDPOINT`) — that's 100% of `logger.error(...)` regardless of `head_sampling_rate`. But three structural gaps remain that *no* in-Worker code can close from inside its own request handler: (a) uncaught throws (the worker isolate is already unwinding when the throw bubbles out of `instrumentWorker`), (b) `exceededCpu` / `exceededMemory` outcomes (the runtime kills the producer before any in-Worker code can run), (c) raw `console.error(...)` from third-party SDKs that bypass the framework logger. **Decision:** introduce [`deco-otel-tail`](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail) — a Cloudflare Tail Worker in `stats-lake/ingestion/otel-tail/`. CF invokes it on every execution of any producer worker that lists it under `tail_consumers` (`wrangler.jsonc`). The handler filters TraceItems down to the interesting subset (`outcome !== "ok" \|\| exceptions.length > 0 \|\| logs.some(l => l.level === "error")`), translates each to OTLP LogRecords (one per exception, one per `error`-level log line, plus a synthetic LogRecord for non-ok outcomes that didn't surface either), and forwards them to `deco-otel-ingest` via an in-account service binding (no public hop). Rows land in `otel_logs` with `Attributes['_source'] = 'tail-worker'` so dashboards can split tail-captured errors from direct-POST + CF-Destinations errors. 
**Rejected alternatives:** (1) **Codemod + lint to enforce `logger.error` calls** — structural coverage gap; can't catch uncaught throws or 1101s by definition, and a lint can't enforce calls inside third-party code. (2) **Logpush + ingest pipeline** — bypassed because Logpush isn't OTLP-shaped and the pricing curve loses to tail-worker at our scale. (3) **CF dashboard log retention only** — no fan-out to ClickHouse, no fleet-wide query surface. (4) **DO-buffered tail-on-error** — ~$8K/mo at fleet scale per the cost model in `docs/observability.md`. **Coverage matrix lives in [`docs/observability.md`](./docs/observability.md) → "Error capture — three-channel model".** Producer-side wiring is one line per `wrangler.jsonc`: `tail_consumers: [{ service: "deco-otel-tail" }]`. **Operational dependency:** the tail worker MUST be deployed to the same Cloudflare account as `deco-otel-ingest` (currently `c95fc4cec7fc52453228d9db170c372c`) so the `[[services]]` binding resolves. If `deco-otel-ingest` ever moves accounts, the service binding collapses to a public HTTPS POST and the model needs revisiting. **Agent behaviour:** when designing error capture for new Worker-deployed code, default to Strategy B for the long tail; don't reach for codemod/lint enforcement unless there's a specific code-quality concern beyond capture. |
|
|
124
125
|
|
|
125
126
|
The full text of the constitutional rule (loaded into every agent
|
|
126
127
|
session for this repo) lives at
|
package/docs/observability.md
CHANGED
|
@@ -254,14 +254,39 @@ await vtexFetch("https://account.vtexcommercestable.com.br/api/io/_v/intelligent
|
|
|
254
254
|
|
|
255
255
|
Resolution precedence is `init.operation` → `defaultOperation` → `resolveOperation(url, method)` → the literal `"fetch"`. The resolved value lands on the span as `fetch.operation` (so dashboards can `GROUP BY SpanAttributes['fetch.operation']` independent of span name) and is included in the `onComplete` callback payload (so per-app duration histograms can label by operation). `operation` is stripped from `init` before reaching the underlying `fetch` — it never surfaces to the network.
|
|
256
256
|
|
|
257
|
+
## Error capture — three-channel model
|
|
258
|
+
|
|
259
|
+
100% capture of errors is achieved across three complementary channels, each owning a different slice of "what failed":
|
|
260
|
+
|
|
261
|
+
| Error source | Channel | Coverage | Why it's needed |
|
|
262
|
+
| --- | --- | --- | --- |
|
|
263
|
+
| Framework `logger.error(...)` | Direct POST | 100% (rate-limited)| Framework owns the call site, can attach structured context (traceId, route, attrs), and can fire before the request finishes. Latency-sensitive. |
|
|
264
|
+
| Framework span errors (`setError`) | CF Destinations + Tail Worker | 1% sampled + 100% of non-ok outcomes via tail | Spans ride the CF Destinations pipe; the tail worker picks them up again if the request finished with `outcome != "ok"`. Together they give per-span detail at scale plus 100% capture on regressions. |
|
|
265
|
+
| **Uncaught throws** escaping the handler | **Tail Worker** | **100%** | Direct-POST can't fire — by the time the throw bubbles past `instrumentWorker`, the worker isolate is unwinding. The tail worker runs AFTER the worker terminates and receives the captured exception. |
|
|
266
|
+
| **`exceededCpu` / `exceededMemory`** | **Tail Worker** | **100%** | The producer is killed before any in-Worker code can run. Only the CF runtime can surface these outcomes, and it does so through the tail handler. |
|
|
267
|
+
| **Raw `console.error(...)`** outside framework | **Tail Worker** | **100%** | Third-party SDKs (analytics, payment, observability libs that aren't ours) call `console.error` directly, bypassing the framework logger. CF captures every `console.*` line into the TraceItem. |
|
|
268
|
+
| Info / warn logs | CF Destinations | 1% sampled | Bulk volume. Sampled to keep CF Destinations cost in check at fleet scale. |
|
|
269
|
+
| OTel spans | CF Destinations | 1% sampled | Same as above — spans are 95% of the event volume. |
|
|
270
|
+
| OTel metrics | Direct POST | 100% (buffered) | CF Destinations doesn't support OTLP metrics. Direct-POST is the only path. |
|
|
271
|
+
|
|
272
|
+
The tail-worker channel is implemented by [`deco-otel-tail`](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail) (in the stats-lake repo). The producer wrangler opts in with:
|
|
273
|
+
|
|
274
|
+
```jsonc
|
|
275
|
+
"tail_consumers": [
|
|
276
|
+
{ "service": "deco-otel-tail" }
|
|
277
|
+
]
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
Rows from the tail worker land in `otel_logs` with `Attributes['_source'] = 'tail-worker'`, so dashboards can split out tail-captured errors from direct-POST and CF-Destinations errors as needed.
|
|
281
|
+
|
|
257
282
|
## Sampling
|
|
258
283
|
|
|
259
|
-
`head_sampling_rate` on `observability.traces` and `observability.logs` decides at the very start of a trace/log whether Cloudflare Destinations forwards it to the deco-otel-ingest endpoint. CF Destinations does NOT support tail sampling
|
|
284
|
+
`head_sampling_rate` on `observability.traces` and `observability.logs` decides at the very start of a trace/log whether Cloudflare Destinations forwards it to the deco-otel-ingest endpoint. CF Destinations does NOT support tail sampling — the framework instead uses the three-channel error-capture model documented above to achieve 100% error capture independent of `head_sampling_rate`.
|
|
260
285
|
|
|
261
286
|
**Recommended defaults:**
|
|
262
287
|
|
|
263
288
|
- `traces.head_sampling_rate: 0.01` — 1% of traces forward via CF Destinations.
|
|
264
|
-
- `logs.head_sampling_rate:
|
|
289
|
+
- `logs.head_sampling_rate: 0.01` — 1% of info/warn logs forward via CF Destinations. **Errors are not subject to this rate** — they are fully covered by (a) the direct-POST channel for framework `logger.error(...)` (100%, rate-limited), and (b) the tail worker for everything else (uncaught throws, exhaustion outcomes, raw `console.error`). The earlier `logs.head_sampling_rate: 1.0` default was retired when the tail worker landed.
|
|
265
290
|
|
|
266
291
|
**Per-site override tier (heavy traffic only):**
|
|
267
292
|
|
|
@@ -332,8 +357,9 @@ Different signals have different durability guarantees. Knowing where data can b
|
|
|
332
357
|
| Signal | Path | Sampling | Buffer location | Loss conditions |
|
|
333
358
|
| ---------------------- | ----------------------------- | -------------------------------- | ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
334
359
|
| **Traces (spans)** | CF Destinations | head 1% (`0.01`) | Cloudflare-managed | 99% intentionally dropped at head. Of the 1% that survives, only loss is a CF Destinations outage or an ingestor 5xx (no retries from CF). |
|
|
335
|
-
| **Info / warn logs** | CF Destinations | head 1
|
|
336
|
-
| **
|
|
360
|
+
| **Info / warn logs** | CF Destinations | head 1% (`0.01`) | Cloudflare-managed | 99% intentionally dropped at head. Of the 1% that survives, only loss is a CF Destinations outage. |
|
|
361
|
+
| **Framework error logs** | Direct POST (`/v1/logs`) | none (100%, then rate-limited) | In-Worker buffer | (a) Token-bucket rate limiter trips on a log storm — default `100/min` steady, `20` burst — surplus is **counted-and-dropped** via `onError`. (b) Buffer overflow (default `500` records) before the next flush — same `onError` signal. (c) A failed POST to the ingestor (non-2xx or network error) does **not** drop records — the in-flight snapshot is restored to the front of the buffer. When `snapshot + buffer > cap`, restoration drops the **newest** records first (newest tail of the live buffer, then if still over cap, newest tail of the snapshot) — the oldest, most-likely-causal records are preserved. All drops surface via `onError("overflow", …)` with counts. (d) Worker isolate forcibly evicted before `ctx.waitUntil` completes — covered by the tail worker (next row). |
|
|
362
|
+
| **Uncaught throws, `exceededCpu` / `exceededMemory`, raw `console.error`** | Tail Worker (`deco-otel-tail` → `/v1/logs`) | none (100%) | Out-of-process (separate worker) | (a) Tail worker invocation failure on the CF runtime side (extremely rare; CF retries internally). (b) `deco-otel-ingest` 5xx — the tail worker logs the failure but does NOT retry the OTLP forward, so the affected batch is lost. (c) The producer dies so abruptly that CF can't materialize a TraceItem — undocumented edge case, treat as bounded by CF's own SLA. |
|
|
337
363
|
| **Metrics** | Direct POST (`/v1/metrics`) | none (100%) | In-Worker buffer | Counters and gauges are last-write-wins per datapoint — a forced eviction drops at most one flush window's worth of partial sums. Histograms with un-flushed bucket counts are lost on eviction. Buffer overflow (default `5000` datapoints) drops the oldest datapoint via `onError`. |
|
|
338
364
|
| **AE metrics** | Workers Analytics Engine | none (sampled per-AE-policy) | Cloudflare-managed | AE applies its own sampling once an account crosses the 5B-events/day cap. Below the cap, AE writes are durable on the platform side. |
|
|
339
365
|
|
|
@@ -346,6 +372,17 @@ What this means operationally:
|
|
|
346
372
|
## Out of scope
|
|
347
373
|
|
|
348
374
|
- **In-Worker OTLP exporter for spans / info-logs.** Removed in 5.0.0; CF Destinations is the spans + info/warn-logs path. (Direct-POST does still exist for **metrics** and **error logs**, by deliberate choice — both are signals CF Destinations cannot or should not carry.)
|
|
349
|
-
- **Tail-on-error sampling
|
|
375
|
+
- **Tail-on-error sampling via a Durable Object buffer.** The DO-backed
|
|
376
|
+
approach was rejected on cost grounds (~$8K/mo at fleet scale, see
|
|
377
|
+
[Cost model](#cost-model-fleet-of-100-sites-25b-reqmonth)). The functional
|
|
378
|
+
goal — 100% capture of errors regardless of head sampling — is met via
|
|
379
|
+
two complementary mechanisms: (a) the in-Worker direct-POST channel for
|
|
380
|
+
framework `logger.error(...)` calls, and (b) the **Cloudflare Tail Worker
|
|
381
|
+
(`deco-otel-tail`, Strategy B)** which CF invokes on every execution of
|
|
382
|
+
a producer worker that lists it under `tail_consumers`. The tail worker
|
|
383
|
+
filters TraceItems down to the "interesting" subset (outcome != ok,
|
|
384
|
+
exceptions, or `level: error` logs) and forwards them as OTLP/JSON logs
|
|
385
|
+
to `deco-otel-ingest` via an intra-account service binding. See
|
|
386
|
+
[decocms/stats-lake/ingestion/otel-tail/](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail) and D-8.
|
|
350
387
|
- **Commerce-specific spans.** Per-app (VTEX, Shopify) HTTP spans live in `@decocms/apps`, which calls `createInstrumentedFetch` (with `defaultOperation` / `resolveOperation` configured per provider) and authors `init.operation` at hot call sites. PR #3 in the apps-start repo migrates the per-app fetch sites onto that pattern. The framework owns the span shape (`${name}.${operation}`); the apps repo owns the operation strings + provider-labelled duration histogram.
|
|
351
388
|
- **PII redaction at the framework layer.** URLs are redacted by `redactUrl()` on outbound `fetch` spans; the rest (cookie, authorization, x-vtex-* headers) is redacted in the ingest Worker. No per-site code required for either side.
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Per-site recipe — adopt the `deco-otel-tail` tail worker
|
|
2
|
+
|
|
3
|
+
> **Status:** gated. Do NOT roll out to the fleet until the canary site
|
|
4
|
+
> (`casaevideo-tanstack`) has completed a 7-day soak with no false negatives
|
|
5
|
+
> in tail capture and no infrastructure cost shock from the tail-handler
|
|
6
|
+
> invocations. See [D-8 in `MIGRATION_TOOLING_PLAN.md`](../MIGRATION_TOOLING_PLAN.md)
|
|
7
|
+
> for the decision record and [`docs/observability.md`](./observability.md)
|
|
8
|
+
> for the architecture.
|
|
9
|
+
|
|
10
|
+
This is the mechanical recipe for opting any deco storefront worker into
|
|
11
|
+
the [`deco-otel-tail`](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail)
|
|
12
|
+
Cloudflare Tail Worker (Strategy B). One PR per storefront repo, two
|
|
13
|
+
changes to `wrangler.jsonc` per PR.
|
|
14
|
+
|
|
15
|
+
## Preconditions
|
|
16
|
+
|
|
17
|
+
Before opening any per-site PR:
|
|
18
|
+
|
|
19
|
+
1. **The tail worker is deployed.** Verify via the Cloudflare dashboard
|
|
20
|
+
(`Workers & Pages → deco-otel-tail`) or:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
cd decocms/stats-lake/ingestion/otel-tail
|
|
24
|
+
wrangler deployments list
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
You want to see at least one deployment with the `INGEST: deco-otel-ingest`
|
|
28
|
+
service binding.
|
|
29
|
+
|
|
30
|
+
2. **The producer worker is in the same Cloudflare account.** The
|
|
31
|
+
`tail_consumers` mechanism cannot reference workers in a different
|
|
32
|
+
account. Every deco fleet worker is in `c95fc4cec7fc52453228d9db170c372c`
|
|
33
|
+
(the `decocms - production` enterprise account), so this is normally
|
|
34
|
+
true — but a future split would invalidate the recipe.
|
|
35
|
+
|
|
36
|
+
3. **The producer worker is on `@decocms/start >= 5.3.0`.** Older versions
|
|
37
|
+
don't ship the three-channel error capture model and the sampling flip
|
|
38
|
+
in step 2 below will silently drop info/warn diagnostics that some
|
|
39
|
+
dashboards still depend on. If the site is below 5.3.0, bump it first.
|
|
40
|
+
|
|
41
|
+
## The PR (~5 lines of producer-side wrangler change)
|
|
42
|
+
|
|
43
|
+
In the storefront repo, edit `wrangler.jsonc`:
|
|
44
|
+
|
|
45
|
+
### Step 1 — Wire the tail consumer
|
|
46
|
+
|
|
47
|
+
Add at the top level (anywhere is fine; we conventionally place it near the
|
|
48
|
+
other producer-level keys like `kv_namespaces`):
|
|
49
|
+
|
|
50
|
+
```jsonc
|
|
51
|
+
"tail_consumers": [
|
|
52
|
+
{ "service": "deco-otel-tail" }
|
|
53
|
+
],
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Cloudflare will now invoke `deco-otel-tail` on every execution of this
|
|
57
|
+
worker, regardless of outcome. The tail worker itself filters down to
|
|
58
|
+
the "interesting" subset before forwarding anything — see
|
|
59
|
+
[`stats-lake/ingestion/otel-tail/src/index.ts`](https://github.com/decocms/stats-lake/blob/main/ingestion/otel-tail/src/index.ts).
|
|
60
|
+
|
|
61
|
+
### Step 2 — Flip `observability.logs.head_sampling_rate`
|
|
62
|
+
|
|
63
|
+
Inside the existing `observability.logs` block:
|
|
64
|
+
|
|
65
|
+
```jsonc
|
|
66
|
+
"observability": {
|
|
67
|
+
"logs": {
|
|
68
|
+
// ...other settings...
|
|
69
|
+
"head_sampling_rate": 0.01, // was 1 — see docs/observability.md
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
This drops info/warn CF Destinations log volume by 100x. Errors are NOT
|
|
75
|
+
affected — they're now covered by:
|
|
76
|
+
|
|
77
|
+
- the in-Worker direct-POST channel for framework `logger.error(...)`
|
|
78
|
+
(100%, rate-limited), and
|
|
79
|
+
- the tail worker for everything else (uncaught throws, exhaustion
|
|
80
|
+
outcomes, raw `console.error` from third-party SDKs — all 100%).
|
|
81
|
+
|
|
82
|
+
If the site has dashboards or alerts that depend on info-level log
|
|
83
|
+
volume, audit them before flipping. Don't be heroic — keep the previous
|
|
84
|
+
rate temporarily, document the dashboard dependency, fix the dashboard,
|
|
85
|
+
then flip in a follow-up.
|
|
86
|
+
|
|
87
|
+
### Conventional commit + PR body
|
|
88
|
+
|
|
89
|
+
The change is mechanically trivial. Use:
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
feat(observability): adopt deco-otel-tail + drop logs sampling to 1%
|
|
93
|
+
|
|
94
|
+
- Wire `tail_consumers` to deco-otel-tail (100% capture of uncaught
|
|
95
|
+
throws / exceededCpu / raw console.error).
|
|
96
|
+
- Drop `observability.logs.head_sampling_rate` from 1 to 0.01 now that
|
|
97
|
+
errors are covered by the direct-POST + tail channels.
|
|
98
|
+
|
|
99
|
+
Implements Strategy B (D-8) for this site. See
|
|
100
|
+
decocms/deco-start/docs/observability.md, "Error capture — three-channel
|
|
101
|
+
model" for the coverage matrix.
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Per-site post-deploy validation (60s ceremony)
|
|
105
|
+
|
|
106
|
+
After the wrangler change ships, hit any route on the deployed worker
|
|
107
|
+
that produces an error log, then verify the row lands in ClickHouse with
|
|
108
|
+
`Attributes['_source'] = 'tail-worker'`:
|
|
109
|
+
|
|
110
|
+
```sql
|
|
111
|
+
SELECT Timestamp, ServiceName, SeverityText, Body,
|
|
112
|
+
Attributes['_outcome'] AS outcome,
|
|
113
|
+
Attributes['_source'] AS source
|
|
114
|
+
FROM otel_logs
|
|
115
|
+
WHERE ServiceName = '<your-worker-name>' -- e.g. 'casaevideo-tanstack'
|
|
116
|
+
AND Attributes['_source'] = 'tail-worker'
|
|
117
|
+
AND Timestamp > now() - INTERVAL 5 MINUTE
|
|
118
|
+
ORDER BY Timestamp DESC
|
|
119
|
+
LIMIT 20;
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
If you get zero rows within 5 minutes of triggering an error, the most likely
|
|
123
|
+
causes are:
|
|
124
|
+
|
|
125
|
+
1. The producer's `tail_consumers` block didn't land (rare — wrangler
|
|
126
|
+
would have errored out). Double-check the deployed `wrangler.jsonc`
|
|
127
|
+
via `wrangler deployments view`.
|
|
128
|
+
2. The site is in a Cloudflare account different from `decocms - production`.
|
|
129
|
+
See "Preconditions" above.
|
|
130
|
+
3. `deco-otel-tail` is unhealthy. Tail it directly:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
cd decocms/stats-lake/ingestion/otel-tail
|
|
134
|
+
wrangler tail
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Rollout batching
|
|
138
|
+
|
|
139
|
+
Open per-site PRs in batches of 10–20 to keep CI load manageable and to
|
|
140
|
+
make it easy to roll back a batch if a regression sneaks in. The change
|
|
141
|
+
is mechanically identical across all sites, so a single shared template
|
|
142
|
+
can be used. Track adoption state in a fleet-rollout issue on
|
|
143
|
+
[`decocms/deco-start`](https://github.com/decocms/deco-start/issues) so
|
|
144
|
+
it's a single source of truth for who's onboarded.
|
|
145
|
+
|
|
146
|
+
## When NOT to use this recipe
|
|
147
|
+
|
|
148
|
+
- **Site does NOT use `@decocms/start`.** The recipe only covers fleet
|
|
149
|
+
workers that already participate in the three-channel observability
|
|
150
|
+
model. Sites on other frameworks should adopt the recipe after their
|
|
151
|
+
own observability story stabilises.
|
|
152
|
+
- **Site is in a different Cloudflare account.** The `tail_consumers`
|
|
153
|
+
service reference would fail to resolve. Either consolidate the site
|
|
154
|
+
into the `decocms - production` account or stand up an account-local
|
|
155
|
+
copy of `deco-otel-tail` first (and update the service name in
|
|
156
|
+
`tail_consumers` accordingly).
|
|
157
|
+
- **Site has a high-info-log diagnostic culture.** If the team genuinely
|
|
158
|
+
uses info-level CF Destinations logs for daily debugging — and is
|
|
159
|
+
willing to pay for the volume — keep `logs.head_sampling_rate` at 1
|
|
160
|
+
for that site and skip step 2. Only adopt step 1 (`tail_consumers`).
|
|
161
|
+
Document the deviation in the PR description.
|
package/package.json
CHANGED
|
File without changes
|
|
@@ -15,6 +15,14 @@
|
|
|
15
15
|
* Env / CLI:
|
|
16
16
|
* --blocks-dir override input (default: .deco/blocks)
|
|
17
17
|
* --out-file override output (default: src/server/cms/blocks.gen.ts)
|
|
18
|
+
*
|
|
19
|
+
* Programmatic:
|
|
20
|
+
* import { generateBlocks } from "@decocms/start/scripts/generate-blocks";
|
|
21
|
+
* await generateBlocks({ blocksDir, outFile });
|
|
22
|
+
*
|
|
23
|
+
* The Vite plugin's dev-mode watcher uses the programmatic entry to keep the
|
|
24
|
+
* generated artifact in sync with `.deco/blocks/` without spawning a child
|
|
25
|
+
* process per change.
|
|
18
26
|
*/
|
|
19
27
|
import fs from "node:fs";
|
|
20
28
|
import path from "node:path";
|
|
@@ -25,16 +33,6 @@ import {
|
|
|
25
33
|
mergeCandidates,
|
|
26
34
|
} from "./lib/blocks-dedupe";
|
|
27
35
|
|
|
28
|
-
const args = process.argv.slice(2);
|
|
29
|
-
function arg(name: string, fallback: string): string {
|
|
30
|
-
const idx = args.indexOf(`--${name}`);
|
|
31
|
-
return idx !== -1 && args[idx + 1] ? args[idx + 1] : fallback;
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
const blocksDir = path.resolve(process.cwd(), arg("blocks-dir", ".deco/blocks"));
|
|
35
|
-
const outFile = path.resolve(process.cwd(), arg("out-file", "src/server/cms/blocks.gen.ts"));
|
|
36
|
-
const jsonFile = outFile.replace(/\.ts$/, ".json");
|
|
37
|
-
|
|
38
36
|
const TS_STUB = [
|
|
39
37
|
"// Auto-generated — thin wrapper around blocks.gen.json.",
|
|
40
38
|
"// The Vite plugin replaces this at load time with JSON.parse(...).",
|
|
@@ -44,75 +42,154 @@ const TS_STUB = [
|
|
|
44
42
|
"",
|
|
45
43
|
].join("\n");
|
|
46
44
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
process.exit(0);
|
|
45
|
+
export interface GenerateBlocksOptions {
|
|
46
|
+
blocksDir: string;
|
|
47
|
+
outFile: string;
|
|
48
|
+
/** Suppress the per-run summary log. Defaults to false. */
|
|
49
|
+
silent?: boolean;
|
|
53
50
|
}
|
|
54
51
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
for (const file of files) {
|
|
63
|
-
const { name, passes } = decodeBlockNameWithPasses(file);
|
|
64
|
-
const fp = path.join(blocksDir, file);
|
|
65
|
-
let parsed: unknown;
|
|
66
|
-
try {
|
|
67
|
-
parsed = JSON.parse(fs.readFileSync(fp, "utf-8"));
|
|
68
|
-
} catch (e) {
|
|
69
|
-
console.warn(`Failed to parse ${file}:`, e);
|
|
70
|
-
continue;
|
|
71
|
-
}
|
|
72
|
-
candidatesWithKeys.push({
|
|
73
|
-
key: name,
|
|
74
|
-
candidate: {
|
|
75
|
-
file,
|
|
76
|
-
passes,
|
|
77
|
-
mtimeMs: fs.statSync(fp).mtimeMs,
|
|
78
|
-
hasPath: blockHasPath(parsed),
|
|
79
|
-
parsed,
|
|
80
|
-
},
|
|
81
|
-
});
|
|
52
|
+
export interface GenerateBlocksResult {
|
|
53
|
+
count: number;
|
|
54
|
+
collisions: number;
|
|
55
|
+
jsonFile: string;
|
|
56
|
+
outFile: string;
|
|
57
|
+
/** True when the blocks dir was missing and an empty barrel was emitted. */
|
|
58
|
+
empty: boolean;
|
|
82
59
|
}
|
|
83
60
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
);
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
61
|
+
export async function generateBlocks(
|
|
62
|
+
options: GenerateBlocksOptions,
|
|
63
|
+
): Promise<GenerateBlocksResult> {
|
|
64
|
+
const blocksDir = path.resolve(options.blocksDir);
|
|
65
|
+
const outFile = path.resolve(options.outFile);
|
|
66
|
+
const jsonFile = outFile.replace(/\.ts$/, ".json");
|
|
67
|
+
const silent = options.silent ?? false;
|
|
68
|
+
|
|
69
|
+
if (!fs.existsSync(blocksDir)) {
|
|
70
|
+
if (!silent) {
|
|
71
|
+
console.warn(`Blocks directory not found: ${blocksDir} — generating empty barrel.`);
|
|
72
|
+
}
|
|
73
|
+
fs.mkdirSync(path.dirname(outFile), { recursive: true });
|
|
74
|
+
fs.writeFileSync(jsonFile, "{}");
|
|
75
|
+
fs.writeFileSync(outFile, TS_STUB);
|
|
76
|
+
return { count: 0, collisions: 0, jsonFile, outFile, empty: true };
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
const files = fs.readdirSync(blocksDir).filter((f) => f.endsWith(".json"));
|
|
80
|
+
|
|
81
|
+
// Read each file into a Candidate, then let the dedupe lib pick the winner
|
|
82
|
+
// per decoded key and report any collisions. See `lib/blocks-dedupe.ts` for
|
|
83
|
+
// the priority order and the rationale behind it (TL;DR: never use file size,
|
|
84
|
+
// don't trust mtime alone in CI clones).
|
|
85
|
+
const candidatesWithKeys: Array<{ candidate: Candidate; key: string }> = [];
|
|
86
|
+
for (const file of files) {
|
|
87
|
+
const { name, passes } = decodeBlockNameWithPasses(file);
|
|
88
|
+
const fp = path.join(blocksDir, file);
|
|
89
|
+
let parsed: unknown;
|
|
90
|
+
try {
|
|
91
|
+
parsed = JSON.parse(fs.readFileSync(fp, "utf-8"));
|
|
92
|
+
} catch (e) {
|
|
93
|
+
if (!silent) console.warn(`Failed to parse ${file}:`, e);
|
|
94
|
+
continue;
|
|
95
|
+
}
|
|
96
|
+
candidatesWithKeys.push({
|
|
97
|
+
key: name,
|
|
98
|
+
candidate: {
|
|
99
|
+
file,
|
|
100
|
+
passes,
|
|
101
|
+
mtimeMs: fs.statSync(fp).mtimeMs,
|
|
102
|
+
hasPath: blockHasPath(parsed),
|
|
103
|
+
parsed,
|
|
104
|
+
},
|
|
105
|
+
});
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const { winners, collisions } = mergeCandidates(candidatesWithKeys);
|
|
109
|
+
|
|
110
|
+
if (!silent && collisions.length > 0) {
|
|
111
|
+
console.warn(
|
|
112
|
+
`Detected ${collisions.length} filename collision(s) in ${path.relative(process.cwd(), blocksDir)}:`,
|
|
113
|
+
);
|
|
114
|
+
for (const c of collisions) {
|
|
115
|
+
const losers = c.files.filter((f) => f !== c.winner);
|
|
116
|
+
console.warn(` - ${c.key}`);
|
|
117
|
+
console.warn(` winner: ${c.winner}`);
|
|
118
|
+
for (const l of losers) console.warn(` ignore: ${l}`);
|
|
119
|
+
}
|
|
120
|
+
console.warn(" Cause: multiple writers (manual sync vs deco-sync-bot) producing");
|
|
121
|
+
console.warn(" different filename encodings for the same logical key. Delete the");
|
|
122
|
+
console.warn(" stale file(s) listed under 'ignore' to silence this warning.");
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
const blocks: Record<string, unknown> = {};
|
|
126
|
+
for (const [name, c] of Object.entries(winners)) {
|
|
127
|
+
blocks[name] = c.parsed;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
fs.mkdirSync(path.dirname(outFile), { recursive: true });
|
|
131
|
+
|
|
132
|
+
// 1. Compact JSON — the real data (no pretty-printing to save ~40% size)
|
|
133
|
+
const jsonStr = JSON.stringify(blocks);
|
|
134
|
+
fs.writeFileSync(jsonFile, jsonStr);
|
|
135
|
+
|
|
136
|
+
// 2. Thin TS wrapper — just for TypeScript tooling and as a Vite load target.
|
|
137
|
+
// Only write if content differs to avoid triggering Vite's file watcher,
|
|
138
|
+
// which would cascade module invalidation to the route tree and crash
|
|
139
|
+
// TanStack Router during dev hot-reload.
|
|
140
|
+
let existingTs: string | undefined;
|
|
141
|
+
try { existingTs = fs.readFileSync(outFile, "utf-8"); } catch {}
|
|
142
|
+
if (existingTs !== TS_STUB) {
|
|
143
|
+
fs.writeFileSync(outFile, TS_STUB);
|
|
95
144
|
}
|
|
96
|
-
console.warn(" Cause: multiple writers (manual sync vs deco-sync-bot) producing");
|
|
97
|
-
console.warn(" different filename encodings for the same logical key. Delete the");
|
|
98
|
-
console.warn(" stale file(s) listed under 'ignore' to silence this warning.");
|
|
99
|
-
}
|
|
100
145
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
146
|
+
if (!silent) {
|
|
147
|
+
const jsonSizeMB = (Buffer.byteLength(jsonStr) / 1_048_576).toFixed(1);
|
|
148
|
+
console.log(
|
|
149
|
+
`Generated ${Object.keys(blocks).length} blocks → ${path.relative(process.cwd(), jsonFile)} (${jsonSizeMB} MB)`,
|
|
150
|
+
);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
return {
|
|
154
|
+
count: Object.keys(blocks).length,
|
|
155
|
+
collisions: collisions.length,
|
|
156
|
+
jsonFile,
|
|
157
|
+
outFile,
|
|
158
|
+
empty: false,
|
|
159
|
+
};
|
|
104
160
|
}
|
|
105
161
|
|
|
106
|
-
|
|
162
|
+
// ---------------------------------------------------------------------------
|
|
163
|
+
// CLI shim — preserved so `npm run generate:blocks` and migration scripts
|
|
164
|
+
// keep working unchanged.
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
function isMainModule(): boolean {
|
|
168
|
+
// tsx/node ESM: import.meta.url matches process.argv[1] when invoked directly.
|
|
169
|
+
// Use a forgiving comparison so it works under both `tsx script.ts` and
|
|
170
|
+
// `node --import tsx script.ts`.
|
|
171
|
+
const entry = process.argv[1];
|
|
172
|
+
if (!entry) return false;
|
|
173
|
+
try {
|
|
174
|
+
const entryUrl = new URL(`file://${path.resolve(entry)}`).href;
|
|
175
|
+
return import.meta.url === entryUrl;
|
|
176
|
+
} catch {
|
|
177
|
+
return false;
|
|
178
|
+
}
|
|
179
|
+
}
|
|
107
180
|
|
|
108
|
-
|
|
109
|
-
const
|
|
110
|
-
|
|
181
|
+
if (isMainModule()) {
|
|
182
|
+
const args = process.argv.slice(2);
|
|
183
|
+
const arg = (name: string, fallback: string): string => {
|
|
184
|
+
const idx = args.indexOf(`--${name}`);
|
|
185
|
+
return idx !== -1 && args[idx + 1] ? args[idx + 1] : fallback;
|
|
186
|
+
};
|
|
111
187
|
|
|
112
|
-
|
|
113
|
-
|
|
188
|
+
const blocksDir = path.resolve(process.cwd(), arg("blocks-dir", ".deco/blocks"));
|
|
189
|
+
const outFile = path.resolve(process.cwd(), arg("out-file", "src/server/cms/blocks.gen.ts"));
|
|
114
190
|
|
|
115
|
-
|
|
116
|
-
console.
|
|
117
|
-
|
|
118
|
-
);
|
|
191
|
+
generateBlocks({ blocksDir, outFile }).catch((err) => {
|
|
192
|
+
console.error(err);
|
|
193
|
+
process.exit(1);
|
|
194
|
+
});
|
|
195
|
+
}
|
|
File without changes
|
package/src/admin/decofile.ts
CHANGED
|
@@ -20,15 +20,24 @@ export async function handleDecofileReload(
|
|
|
20
20
|
request: Request,
|
|
21
21
|
env?: Record<string, unknown>,
|
|
22
22
|
): Promise<Response> {
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
23
|
+
// In dev mode the Vite plugin POSTs new blocks here to hot-reload without
|
|
24
|
+
// module invalidation (which breaks TanStack Start/Router state). Skip auth
|
|
25
|
+
// so the plugin can POST from localhost.
|
|
26
|
+
// Uses import.meta.env.DEV directly (not isDevMode()) because isDevMode()
|
|
27
|
+
// is also true under DECO_PREVIEW=true (or NODE_ENV=development), and those must not bypass auth. Vite statically replaces import.meta.env.DEV with `false`
|
|
28
|
+
// in production builds, so this branch is dead-code-eliminated.
|
|
29
|
+
const isViteDev = !!(import.meta as unknown as { env?: { DEV?: boolean } }).env?.DEV;
|
|
30
|
+
if (!isViteDev) {
|
|
31
|
+
const authHeader = request.headers.get("Authorization") || "";
|
|
32
|
+
const expectedToken =
|
|
33
|
+
(env?.DECO_RELEASE_RELOAD_TOKEN as string | undefined) ??
|
|
34
|
+
(typeof globalThis.process !== "undefined"
|
|
35
|
+
? globalThis.process.env?.DECO_RELEASE_RELOAD_TOKEN
|
|
36
|
+
: undefined);
|
|
29
37
|
|
|
30
|
-
|
|
31
|
-
|
|
38
|
+
if (!expectedToken || authHeader !== expectedToken) {
|
|
39
|
+
return new Response("Unauthorized", { status: 401 });
|
|
40
|
+
}
|
|
32
41
|
}
|
|
33
42
|
|
|
34
43
|
let newBlocks: Record<string, unknown>;
|
package/src/sdk/env.ts
CHANGED
|
@@ -11,8 +11,8 @@ let _isDev: boolean | null = null;
|
|
|
11
11
|
* Returns `true` when running in a development environment.
|
|
12
12
|
*
|
|
13
13
|
* Detection order:
|
|
14
|
-
* 1. `
|
|
15
|
-
* 2. `NODE_ENV=development`
|
|
14
|
+
* 1. `import.meta.env.DEV` — Vite build-time constant (reliable in Workers/Miniflare)
|
|
15
|
+
* 2. `NODE_ENV=development` — standard Node/Vite convention (`DECO_PREVIEW=true` in process.env is treated the same way)
|
|
16
16
|
*
|
|
17
17
|
* The result is memoised after the first evaluation.
|
|
18
18
|
*/
|
|
@@ -21,7 +21,11 @@ export function isDevMode(): boolean {
|
|
|
21
21
|
|
|
22
22
|
const env = typeof globalThis.process !== "undefined" ? globalThis.process.env : undefined;
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
// Vite statically replaces import.meta.env.DEV at build time (true in dev, false in prod).
|
|
25
|
+
// In Miniflare/Workers, process.env is unavailable, so this is the reliable signal.
|
|
26
|
+
const vitaDev = !!(import.meta as unknown as { env?: { DEV?: boolean } }).env?.DEV;
|
|
27
|
+
|
|
28
|
+
_isDev = vitaDev || env?.NODE_ENV === "development" || env?.DECO_PREVIEW === "true";
|
|
25
29
|
|
|
26
30
|
return _isDev;
|
|
27
31
|
}
|