npm - @decocms/start - Versions diffs - 5.3.0-rc.2 → 5.4.0 - Mend

@decocms/start 5.3.0-rc.2 → 5.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/.github/workflows/release.yml +5 -2
package/MIGRATION_TOOLING_PLAN.md +1 -0
package/docs/observability.md +42 -5
package/docs/tail-worker-recipe.md +161 -0
package/package.json +1 -1
package/scripts/audit-observability-config.ts +0 -0
package/scripts/generate-blocks.ts +148 -71
package/scripts/migrate-post-cleanup.ts +0 -0
package/src/admin/decofile.ts +17 -8
package/src/sdk/env.ts +7 -3
package/src/sdk/workerEntry.ts +11 -7
package/src/vite/plugin.js +129 -10
package/docs/o11y.md +0 -602
package/scripts/sync-decofile.ts +0 -221

package/.github/workflows/release.yml CHANGED Viewed

@@ -68,9 +68,12 @@ jobs:
       - name: Advance moveable major tag
         run: |
           git fetch --tags --force
-          LATEST=$(git tag -l 'v*.*.*' --sort=-v:refname | head -n 1)
+          # Filter out prerelease tags (-rc.N, -next.N, -beta.N, -alpha.N) — the
+          # `v*.*.*` glob matches both `v5.3.0` and `v5.3.0-rc.2`, and we never
+          # want the moveable major to land on a prerelease.
+          LATEST=$(git tag -l 'v*.*.*' --sort=-v:refname | grep -vE '\-(rc|next|beta|alpha)\.' | head -n 1)
           if [ -z "$LATEST" ]; then
-            echo "::notice::no v*.*.* tags yet; nothing to advance"
+            echo "::notice::no stable v*.*.* tags yet; nothing to advance"
             exit 0
           fi
           MAJOR=$(echo "$LATEST" | sed -E 's/^(v[0-9]+).*/\1/')

package/MIGRATION_TOOLING_PLAN.md CHANGED Viewed

@@ -121,6 +121,7 @@ this plan.
 | 2026-05-07 | **D6.1 — Cloudflare credentials never leave `deco-start`** | Same-day refinement of D6 after the first central deploy on `baggagio-tanstack` failed with `Secret CLOUDFLARE_API_TOKEN is required, but not provided while calling`. The original D6 design used `secrets: inherit` from the storefront stub and required `CLOUDFLARE_*` to live in the `deco-sites` org, which broke the principle that *the only secrets a storefront repo holds are the secrets that go into wrangler secrets, not the ones used to deploy*. First-pass refinement: the central `deploy.yml` / `preview.yml` / `sync-secrets.yml` jobs declared `environment: production` to try to make `${{ secrets.CLOUDFLARE_* }}` resolve from `decocms/deco-start`'s `production` Environment. **Found broken empirically on 2026-05-07** — the deployment registers in the *caller* repo, not the called workflow's repo, so the environment lookup uses the caller's `production` env (auto-created with no secrets). Superseded by D6.2 the same evening. |
 | 2026-05-07 | **D6.2 — App-mediated dispatch + no per-site registry (supersedes D6 + D6.1)** | After D6.1's `environment:` mechanism was empirically shown not to work cross-repo, the architecture pivoted: a `decocms-deployer` GitHub App is installed on `decocms/deco-start` (`actions:write`) and on each storefront repo (`contents:read`, optionally `pull-requests:write`). The storefront caller stub mints a short-lived App-installation token and calls `gh workflow run deploy.yml --repo decocms/deco-start --ref v3 -f site_owner=… -f site_name=…`. The central workflow runs in `decocms/deco-start`'s context, so `CLOUDFLARE_API_TOKEN` / `CLOUDFLARE_ACCOUNT_ID` are ordinary repo secrets. For runtime `SECRET_*` values, each storefront has a `<site_name>-secrets` GitHub Environment in `decocms/deco-start` (S1 design); `sync-secrets.yml` binds to that environment and pushes to `wrangler secret put`. The per-site registry under `deploy/sites/<repo>.jsonc` was dropped entirely (Pure C): worker name = repo basename by convention; the App being installed on the storefront repo is the deploy authorization gate; rare per-worker derived fields (like AE dataset name) use `$WORKER_*` substitution tokens in the template. Force-rollback is impossible for production deploys because the central workflow ignores caller-supplied `site_sha` and resolves the storefront's current default-branch HEAD itself. See [`deploy/README.md`](./deploy/README.md) for the full trust model. **Operational migrations required by Pure C:** `miess-01-tanstack` repo's worker shifts from `miess-tanstack` to `miess-01-tanstack` (CF-side cutover); `lebiscuit-tanstack` AE dataset shifts from `deco_metrics_lebiscuit` to `deco_metrics_lebiscuit_tanstack` (orphans old data). |
 | 2026-05-07 | **D6.3 — Revert D6/D6.1/D6.2; deploys move to Cloudflare Workers Builds** | The whole D6 family (centralized GitHub Actions reusable workflows + `decocms-deployer` GitHub App + per-storefront GitHub Environments + central `deploy/wrangler-template.jsonc` + `deco-wrangler` CLI + per-site caller stubs) is being **reverted**. Trigger: GitHub Free orgs do not propagate org-level secrets to private repos, which forced the App private key to live as a per-storefront repo secret in every storefront — that key gives the holder the ability to mint installation tokens that can trigger workflows on `decocms/deco-start`, which in turn have the only Cloudflare credentials in the system. Per-repo distribution + rotation of that key across N customer storefronts didn't scale and concentrated blast radius on one credential. **Replacement (chosen, to be detailed in a follow-up D-record once shipped):** [Cloudflare Workers Builds](https://developers.cloudflare.com/workers/ci-cd/builds/) owns the deploy/preview pipelines per-worker. Verified empirically on `baggagio-tanstack` 2026-05-07: a malicious `wrangler.jsonc` `name` field pointing at a different worker (`americanas-tanstack`) is **ignored** by CF Builds — the deploy lands on the connected worker (`baggagio-tanstack`), CF surfaces a warning banner in the dashboard, and CF auto-opens a PR to fix the config (deco-sites/baggagio-tanstack#34). The dashboard repo<->worker connection is the source of truth; the in-repo config is treated as a secondary input. Per-storefront wiring (one CF dashboard click per worker) is acceptable at our scale; revisit when CF's [git-integration enable API](https://github.com/cloudflare/workers-sdk/issues/12058) lands. The `deco-build` CLI (regenerates `wrangler.jsonc` bindings from a central template) and runtime-secrets management remain to be designed in a separate PR. |
+| 2026-05-19 | **D-8 — Cloudflare Tail Worker (Strategy B) is the canonical 100% error capture mechanism** | At fleet scale (100 sites, 2.5B req/month) head sampling forces a tradeoff: 1% sampling makes the `head_sampling_rate * 5B-event-cap` math work, but 99% of error traces and 99% of error-correlated logs get dropped at the CF Destinations head. The framework already covers framework-emitted errors via the in-Worker direct-POST channel (`DECO_OTEL_LOGS_ENDPOINT`) — that's 100% of `logger.error(...)` regardless of `head_sampling_rate`. But three structural gaps remain that *no* in-Worker code can close from inside its own request handler: (a) uncaught throws (the worker isolate is already unwinding when the throw bubbles out of `instrumentWorker`), (b) `exceededCpu` / `exceededMemory` outcomes (the runtime kills the producer before any in-Worker code can run), (c) raw `console.error(...)` from third-party SDKs that bypass the framework logger. **Decision:** introduce [`deco-otel-tail`](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail) — a Cloudflare Tail Worker in `stats-lake/ingestion/otel-tail/`. CF invokes it on every execution of any producer worker that lists it under `tail_consumers` (`wrangler.jsonc`). The handler filters TraceItems down to the interesting subset (`outcome !== "ok" \|\| exceptions.length > 0 \|\| logs.some(l => l.level === "error")`), translates each to OTLP LogRecords (one per exception, one per `error`-level log line, plus a synthetic LogRecord for non-ok outcomes that didn't surface either), and forwards them to `deco-otel-ingest` via an in-account service binding (no public hop). Rows land in `otel_logs` with `Attributes['_source'] = 'tail-worker'` so dashboards can split tail-captured errors from direct-POST + CF-Destinations errors. 
**Rejected alternatives:** (1) **Codemod + lint to enforce `logger.error` calls** — structural coverage gap; can't catch uncaught throws or 1101s by definition, and a lint can't enforce calls inside third-party code. (2) **Logpush + ingest pipeline** — bypassed because Logpush isn't OTLP-shaped and the pricing curve loses to tail-worker at our scale. (3) **CF dashboard log retention only** — no fan-out to ClickHouse, no fleet-wide query surface. (4) **DO-buffered tail-on-error** — ~$8K/mo at fleet scale per the cost model in `docs/observability.md`. **Coverage matrix lives in [`docs/observability.md`](./docs/observability.md) → "Error capture — three-channel model".** Producer-side wiring is one line per `wrangler.jsonc`: `tail_consumers: [{ service: "deco-otel-tail" }]`. **Operational dependency:** the tail worker MUST be deployed to the same Cloudflare account as `deco-otel-ingest` (currently `c95fc4cec7fc52453228d9db170c372c`) so the `[[services]]` binding resolves. If `deco-otel-ingest` ever moves accounts, the service binding collapses to a public HTTPS POST and the model needs revisiting. **Agent behaviour:** when designing error capture for new Worker-deployed code, default to Strategy B for the long tail; don't reach for codemod/lint enforcement unless there's a specific code-quality concern beyond capture. |
 The full text of the constitutional rule (loaded into every agent
 session for this repo) lives at

package/docs/observability.md CHANGED Viewed

@@ -254,14 +254,39 @@ await vtexFetch("https://account.vtexcommercestable.com.br/api/io/_v/intelligent
 Resolution precedence is `init.operation` → `defaultOperation` → `resolveOperation(url, method)` → the literal `"fetch"`. The resolved value lands on the span as `fetch.operation` (so dashboards can `GROUP BY SpanAttributes['fetch.operation']` independent of span name) and is included in the `onComplete` callback payload (so per-app duration histograms can label by operation). `operation` is stripped from `init` before reaching the underlying `fetch` — it never surfaces to the network.
+## Error capture — three-channel model
+100% capture of errors is achieved across three complementary channels, each owning a different slice of "what failed":
+| Error source                              | Channel               | Coverage           | Why it's needed                                                                                                                                                                                  |
+| ---                                       | ---                   | ---                | ---                                                                                                                                                                                              |
+| Framework `logger.error(...)`             | Direct POST           | 100% (rate-limited)| Framework owns the call site, can attach structured context (traceId, route, attrs), and can fire before the request finishes. Latency-sensitive.                                                  |
+| Framework span errors (`setError`)        | CF Destinations + tail| 1% sampled + 100% tail | Spans ride the CF Destinations pipe; tail worker picks them up again if the request finished `outcome != "ok"`. Together they give per-span detail at scale + 100% capture on regressions.       |
+| **Uncaught throws** escaping the handler  | **Tail Worker**       | **100%**           | Direct-POST can't fire — by the time the throw bubbles past `instrumentWorker`, the worker isolate is unwinding. The tail worker runs AFTER the worker terminates and receives the captured exception. |
+| **`exceededCpu` / `exceededMemory`**      | **Tail Worker**       | **100%**           | The producer is killed before any in-Worker code can run. Only the CF runtime can surface these outcomes, and it does so through the tail handler.                                                |
+| **Raw `console.error(...)`** outside framework | **Tail Worker** | **100%**           | Third-party SDKs (analytics, payment, observability libs that aren't ours) call `console.error` directly, bypassing the framework logger. CF captures every `console.*` line into the TraceItem. |
+| Info / warn logs                          | CF Destinations       | 1% sampled         | Bulk volume. Sampled to keep CF Destinations cost in check at fleet scale.                                                                                                                       |
+| OTel spans                                | CF Destinations       | 1% sampled         | Same as above — spans are 95% of the event volume.                                                                                                                                                |
+| OTel metrics                              | Direct POST           | 100% (buffered)    | CF Destinations doesn't support OTLP metrics. Direct-POST is the only path.                                                                                                                       |
+The tail-worker channel is implemented by [`deco-otel-tail`](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail) (in the stats-lake repo). The producer wrangler opts in with:
+```jsonc
+"tail_consumers": [
+  { "service": "deco-otel-tail" }
+]
+```
+Rows from the tail worker land in `otel_logs` with `Attributes['_source'] = 'tail-worker'`, so dashboards can split out tail-captured errors from direct-POST and CF-Destinations errors as needed.
 ## Sampling
-`head_sampling_rate` on `observability.traces` and `observability.logs` decides at the very start of a trace/log whether Cloudflare Destinations forwards it to the deco-otel-ingest endpoint. CF Destinations does NOT support tail sampling (status-aware filtering after the trace completes), so the framework leans on head sampling for cost control plus a separate direct-POST channel for error logs and metrics (100% of errors and metrics are captured regardless of the head sampling rate).
+`head_sampling_rate` on `observability.traces` and `observability.logs` decides at the very start of a trace/log whether Cloudflare Destinations forwards it to the deco-otel-ingest endpoint. CF Destinations does NOT support tail sampling — the framework instead uses the three-channel error-capture model documented above to achieve 100% error capture independent of `head_sampling_rate`.
 **Recommended defaults:**
 - `traces.head_sampling_rate: 0.01` — 1% of traces forward via CF Destinations.
-- `logs.head_sampling_rate: 1.0` — 100% of info/warn logs forward via CF Destinations. **Errors are not subject to this rate** — when `DECO_OTEL_LOGS_ENDPOINT` is set, the `instrumentWorker` direct-POST error channel captures 100% of `logger.error(...)` records regardless of CF head sampling. It's safe to drop `logs.head_sampling_rate` to `0.01` for the noisier info/warn tier once you've confirmed the direct-POST channel is healthy in the CF dashboard (look for the boot log `otel: enabled service=… otlpErrorLogs=true`).
+- `logs.head_sampling_rate: 0.01` — 1% of info/warn logs forward via CF Destinations. **Errors are not subject to this rate** — they are fully covered by (a) the direct-POST channel for framework `logger.error(...)` (100%, rate-limited), and (b) the tail worker for everything else (uncaught throws, exhaustion outcomes, raw `console.error`). The earlier `logs.head_sampling_rate: 1.0` default was retired when the tail worker landed.
 **Per-site override tier (heavy traffic only):**
@@ -332,8 +357,9 @@ Different signals have different durability guarantees. Knowing where data can b
 | Signal                 | Path                          | Sampling                         | Buffer location          | Loss conditions                                                                                                                                |
 | ---------------------- | ----------------------------- | -------------------------------- | ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------- |
 | **Traces (spans)**     | CF Destinations               | head 1% (`0.01`)                 | Cloudflare-managed       | 99% intentionally dropped at head. Of the 1% that survives, only loss is a CF Destinations outage or an ingestor 5xx (no retries from CF).      |
-| **Info / warn logs**   | CF Destinations               | head 1.0 (currently 100%)        | Cloudflare-managed       | Same as traces — only platform-side outage. Once dropped to `0.01`, 99% intentionally dropped at head.                                          |
-| **Error logs**         | Direct POST (`/v1/logs`)      | none (100%, then rate-limited)   | In-Worker buffer         | (a) Token-bucket rate limiter trips on a log storm — default `100/min` steady, `20` burst — surplus is **counted-and-dropped** via `onError`. (b) Buffer overflow (default `500` records) before the next flush — same `onError` signal. (c) A failed POST to the ingestor (non-2xx or network error) does **not** drop records — the in-flight snapshot is restored to the front of the buffer. When `snapshot + buffer > cap`, restoration drops the **newest** records first (newest tail of the live buffer, then if still over cap, newest tail of the snapshot) — the oldest, most-likely-causal records are preserved. All drops surface via `onError("overflow", …)` with counts. (d) Worker isolate forcibly evicted before `ctx.waitUntil` completes — should be rare; the flush is triggered on every request edge. |
+| **Info / warn logs**   | CF Destinations               | head 1% (`0.01`)                 | Cloudflare-managed       | 99% intentionally dropped at head. Of the 1% that survives, only loss is a CF Destinations outage.                                              |
+| **Framework error logs** | Direct POST (`/v1/logs`)    | none (100%, then rate-limited)   | In-Worker buffer         | (a) Token-bucket rate limiter trips on a log storm — default `100/min` steady, `20` burst — surplus is **counted-and-dropped** via `onError`. (b) Buffer overflow (default `500` records) before the next flush — same `onError` signal. (c) A failed POST to the ingestor (non-2xx or network error) does **not** drop records — the in-flight snapshot is restored to the front of the buffer. When `snapshot + buffer > cap`, restoration drops the **newest** records first (newest tail of the live buffer, then if still over cap, newest tail of the snapshot) — the oldest, most-likely-causal records are preserved. All drops surface via `onError("overflow", …)` with counts. (d) Worker isolate forcibly evicted before `ctx.waitUntil` completes — covered by the tail worker (next row). |
+| **Uncaught throws, `exceededCpu` / `exceededMemory`, raw `console.error`** | Tail Worker (`deco-otel-tail` → `/v1/logs`) | none (100%) | Out-of-process (separate worker) | (a) Tail worker invocation failure on the CF runtime side (extremely rare; CF retries internally). (b) `deco-otel-ingest` 5xx — the tail worker logs the failure but does NOT retry the OTLP forward, so the affected batch is lost. (c) The producer dies so abruptly that CF can't materialize a TraceItem — undocumented edge case, treat as bounded by CF's own SLA. |
 | **Metrics**            | Direct POST (`/v1/metrics`)   | none (100%)                      | In-Worker buffer         | Counters and gauges are last-write-wins per datapoint — a forced eviction drops at most one flush window's worth of partial sums. Histograms with un-flushed bucket counts are lost on eviction. Buffer overflow (default `5000` datapoints) drops the oldest datapoint via `onError`. |
 | **AE metrics**         | Workers Analytics Engine      | none (sampled per-AE-policy)     | Cloudflare-managed       | AE applies its own sampling once an account crosses the 5B-events/day cap. Below the cap, AE writes are durable on the platform side.           |
@@ -346,6 +372,17 @@ What this means operationally:
 ## Out of scope
 - **In-Worker OTLP exporter for spans / info-logs.** Removed in 5.0.0; CF Destinations is the spans + info/warn-logs path. (Direct-POST does still exist for **metrics** and **error logs**, by deliberate choice — both are signals CF Destinations cannot or should not carry.)
-- **Tail-on-error sampling.** Designed away — CF Destinations doesn't support tail sampling, and a DO-backed buffer would add ~$8K/mo at fleet scale (see [Cost model](#cost-model-fleet-of-100-sites-25b-reqmonth)). 100% capture of errors is achieved instead via the direct-POST error channel.
+- **Tail-on-error sampling via a Durable Object buffer.** The DO-backed
+  approach was rejected on cost grounds (~$8K/mo at fleet scale, see
+  [Cost model](#cost-model-fleet-of-100-sites-25b-reqmonth)). The functional
+  goal — 100% capture of errors regardless of head sampling — is met via
+  two complementary mechanisms: (a) the in-Worker direct-POST channel for
+  framework `logger.error(...)` calls, and (b) the **Cloudflare Tail Worker
+  (`deco-otel-tail`, Strategy B)** which CF invokes on every invocation of
+  a producer worker that lists it under `tail_consumers`. The tail worker
+  filters TraceItems down to the "interesting" subset (outcome != ok,
+  exceptions, or `level: error` logs) and forwards them as OTLP/JSON logs
+  to `deco-otel-ingest` via an intra-account service binding. See
+  [decocms/stats-lake/ingestion/otel-tail/](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail) and D-8.
 - **Commerce-specific spans.** Per-app (VTEX, Shopify) HTTP spans live in `@decocms/apps`, which calls `createInstrumentedFetch` (with `defaultOperation` / `resolveOperation` configured per provider) and authors `init.operation` at hot call sites. PR #3 in the apps-start repo migrates the per-app fetch sites onto that pattern. The framework owns the span shape (`${name}.${operation}`); the apps repo owns the operation strings + provider-labelled duration histogram.
 - **PII redaction at the framework layer.** URLs are redacted by `redactUrl()` on outbound `fetch` spans; the rest (cookie, authorization, x-vtex-* headers) is redacted in the ingest Worker. No per-site code required for either side.

package/docs/tail-worker-recipe.md ADDED Viewed

@@ -0,0 +1,161 @@
+# Per-site recipe — adopt the `deco-otel-tail` tail worker
+> **Status:** gated. Do NOT roll out to the fleet until the canary site
+> (`casaevideo-tanstack`) has completed a 7-day soak with no false negatives
+> in tail capture and no infrastructure cost shock from the tail-handler
+> invocations. See [D-8 in `MIGRATION_TOOLING_PLAN.md`](../MIGRATION_TOOLING_PLAN.md)
+> for the decision record and [`docs/observability.md`](./observability.md)
+> for the architecture.
+This is the mechanical recipe for opting any deco storefront worker into
+the [`deco-otel-tail`](https://github.com/decocms/stats-lake/tree/main/ingestion/otel-tail)
+Cloudflare Tail Worker (Strategy B). One PR per storefront repo, two
+changes to `wrangler.jsonc` per PR.
+## Preconditions
+Before opening any per-site PR:
+1. **The tail worker is deployed.** Verify via the Cloudflare dashboard
+   (`Workers & Pages → deco-otel-tail`) or:
+   ```bash
+   cd decocms/stats-lake/ingestion/otel-tail
+   wrangler deployments list
+   ```
+   You want to see at least one deployment with the `INGEST: deco-otel-ingest`
+   service binding.
+2. **The producer worker is in the same Cloudflare account.** The
+   `tail_consumers` mechanism cannot reference workers in a different
+   account. Every deco fleet worker is in `c95fc4cec7fc52453228d9db170c372c`
+   (the `decocms - production` enterprise account), so this is normally
+   true — but a future split would invalidate the recipe.
+3. **The producer worker is on `@decocms/start >= 5.3.0`.** Older versions
+   don't ship the three-channel error capture model and the sampling flip
+   in step 2 below will silently drop info/warn diagnostics that some
+   dashboards still depend on. If the site is below 5.3.0, bump it first.
+## The PR (~5 lines of producer-side wrangler change)
+In the storefront repo, edit `wrangler.jsonc`:
+### Step 1 — Wire the tail consumer
+Add at the top level (anywhere is fine; we conventionally place it near the
+other producer-level keys like `kv_namespaces`):
+```jsonc
+"tail_consumers": [
+  { "service": "deco-otel-tail" }
+],
+```
+Cloudflare will now invoke `deco-otel-tail` on every execution of this
+worker, regardless of outcome. The tail worker itself filters down to
+the "interesting" subset before forwarding anything — see
+[`stats-lake/ingestion/otel-tail/src/index.ts`](https://github.com/decocms/stats-lake/blob/main/ingestion/otel-tail/src/index.ts).
+### Step 2 — Flip `observability.logs.head_sampling_rate`
+Inside the existing `observability.logs` block:
+```jsonc
+"observability": {
+  "logs": {
+    // ...other settings...
+    "head_sampling_rate": 0.01,   // was 1 — see docs/observability.md
+  }
+}
+```
+This drops info/warn CF Destinations log volume by 100x. Errors are NOT
+affected — they're now covered by:
+- the in-Worker direct-POST channel for framework `logger.error(...)`
+  (100%, rate-limited), and
+- the tail worker for everything else (uncaught throws, exhaustion
+  outcomes, raw `console.error` from third-party SDKs — all 100%).
+If the site has dashboards or alerts that depend on info-level log
+volume, audit them before flipping. Don't be heroic — keep the previous
+rate temporarily, document the dashboard dependency, fix the dashboard,
+then flip in a follow-up.
+### Conventional commit + PR body
+The change is mechanically trivial. Use:
+```
+feat(observability): adopt deco-otel-tail + drop logs sampling to 1%
+- Wire `tail_consumers` to deco-otel-tail (100% capture of uncaught
+  throws / exceededCpu / raw console.error).
+- Drop `observability.logs.head_sampling_rate` from 1 to 0.01 now that
+  errors are covered by the direct-POST + tail channels.
+Implements Strategy B (D-8) for this site. See
+decocms/deco-start/docs/observability.md, "Error capture — three-channel
+model" for the coverage matrix.
+```
+## Per-site post-deploy validation (60s ceremony)
+After the wrangler change ships, hit any route on the deployed worker
+that produces an error log, then verify the row lands in ClickHouse with
+`Attributes['_source'] = 'tail-worker'`:
+```sql
+SELECT Timestamp, ServiceName, SeverityText, Body,
+       Attributes['_outcome'] AS outcome,
+       Attributes['_source']  AS source
+FROM otel_logs
+WHERE ServiceName = '<your-worker-name>'   -- e.g. 'casaevideo-tanstack'
+  AND Attributes['_source'] = 'tail-worker'
+  AND Timestamp > now() - INTERVAL 5 MINUTE
+ORDER BY Timestamp DESC
+LIMIT 20;
+```
+If the query returns zero rows even though you triggered an error within
+the last 5 minutes (the query's time window), the most likely causes are:
+1. The producer's `tail_consumers` block didn't land (rare — wrangler
+   would have errored out). Double-check the deployed `wrangler.jsonc`
+   via `wrangler deployments view`.
+2. The site is in a Cloudflare account different from `decocms - production`.
+   See "Preconditions" above.
+3. `deco-otel-tail` is unhealthy. Tail it directly:
+   ```bash
+   cd decocms/stats-lake/ingestion/otel-tail
+   wrangler tail
+   ```
+## Rollout batching
+Open per-site PRs in batches of 10–20 to keep CI load manageable and to
+make it easy to roll back a batch if a regression sneaks in. The change
+is mechanically identical across all sites, so a single shared template
+can be used. Track adoption state in a fleet-rollout issue on
+[`decocms/deco-start`](https://github.com/decocms/deco-start/issues) so
+it's a single source of truth for who's onboarded.
+## When NOT to use this recipe
+- **Site does NOT use `@decocms/start`.** The recipe only covers fleet
+  workers that already participate in the three-channel observability
+  model. Sites on other frameworks should adopt the recipe after their
+  own observability story stabilises.
+- **Site is in a different Cloudflare account.** The `tail_consumers`
+  service reference would fail to resolve. Either consolidate the site
+  into the `decocms - production` account or stand up an account-local
+  copy of `deco-otel-tail` first (and update the service name in
+  `tail_consumers` accordingly).
+- **Site has a high-info-log diagnostic culture.** If the team genuinely
+  uses info-level CF Destinations logs for daily debugging — and is
+  willing to pay for the volume — keep `logs.head_sampling_rate` at 1
+  for that site and skip step 2. Only adopt step 1 (`tail_consumers`).
+  Document the deviation in the PR description.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@decocms/start",
-  "version": "5.3.0-rc.2",
+  "version": "5.4.0",
   "type": "module",
   "description": "Deco framework for TanStack Start - CMS bridge, admin protocol, hooks, schema generation",
   "main": "./src/index.ts",

package/scripts/audit-observability-config.ts CHANGED Viewed

File without changes

package/scripts/generate-blocks.ts CHANGED Viewed

@@ -15,6 +15,14 @@
  * Env / CLI:
  *   --blocks-dir  override input  (default: .deco/blocks)
  *   --out-file    override output (default: src/server/cms/blocks.gen.ts)
+ *
+ * Programmatic:
+ *   import { generateBlocks } from "@decocms/start/scripts/generate-blocks";
+ *   await generateBlocks({ blocksDir, outFile });
+ *
+ * The Vite plugin's dev-mode watcher uses the programmatic entry to keep the
+ * generated artifact in sync with `.deco/blocks/` without spawning a child
+ * process per change.
  */
 import fs from "node:fs";
 import path from "node:path";
@@ -25,16 +33,6 @@ import {
   mergeCandidates,
 } from "./lib/blocks-dedupe";
-const args = process.argv.slice(2);
-function arg(name: string, fallback: string): string {
-  const idx = args.indexOf(`--${name}`);
-  return idx !== -1 && args[idx + 1] ? args[idx + 1] : fallback;
-}
-const blocksDir = path.resolve(process.cwd(), arg("blocks-dir", ".deco/blocks"));
-const outFile = path.resolve(process.cwd(), arg("out-file", "src/server/cms/blocks.gen.ts"));
-const jsonFile = outFile.replace(/\.ts$/, ".json");
 const TS_STUB = [
   "// Auto-generated — thin wrapper around blocks.gen.json.",
   "// The Vite plugin replaces this at load time with JSON.parse(...).",
@@ -44,75 +42,154 @@ const TS_STUB = [
   "",
 ].join("\n");
-if (!fs.existsSync(blocksDir)) {
-  console.warn(`Blocks directory not found: ${blocksDir} — generating empty barrel.`);
-  fs.mkdirSync(path.dirname(outFile), { recursive: true });
-  fs.writeFileSync(jsonFile, "{}");
-  fs.writeFileSync(outFile, TS_STUB);
-  process.exit(0);
+export interface GenerateBlocksOptions {
+  blocksDir: string;
+  outFile: string;
+  /** Suppress all console output (warnings and the per-run summary log). Defaults to false. */
+  silent?: boolean;
 }
-const files = fs.readdirSync(blocksDir).filter((f) => f.endsWith(".json"));
-// Read each file into a Candidate, then let the dedupe lib pick the winner
-// per decoded key and report any collisions. See `lib/blocks-dedupe.ts` for
-// the priority order and the rationale behind it (TL;DR: never use file size,
-// don't trust mtime alone in CI clones).
-const candidatesWithKeys: Array<{ candidate: Candidate; key: string }> = [];
-for (const file of files) {
-  const { name, passes } = decodeBlockNameWithPasses(file);
-  const fp = path.join(blocksDir, file);
-  let parsed: unknown;
-  try {
-    parsed = JSON.parse(fs.readFileSync(fp, "utf-8"));
-  } catch (e) {
-    console.warn(`Failed to parse ${file}:`, e);
-    continue;
-  }
-  candidatesWithKeys.push({
-    key: name,
-    candidate: {
-      file,
-      passes,
-      mtimeMs: fs.statSync(fp).mtimeMs,
-      hasPath: blockHasPath(parsed),
-      parsed,
-    },
-  });
+export interface GenerateBlocksResult {
+  count: number;
+  collisions: number;
+  jsonFile: string;
+  outFile: string;
+  /** True when the blocks dir was missing and an empty barrel was emitted. */
+  empty: boolean;
 }
-const { winners, collisions } = mergeCandidates(candidatesWithKeys);
-if (collisions.length > 0) {
-  console.warn(
-    `Detected ${collisions.length} filename collision(s) in ${path.relative(process.cwd(), blocksDir)}:`,
-  );
-  for (const c of collisions) {
-    const losers = c.files.filter((f) => f !== c.winner);
-    console.warn(`  - ${c.key}`);
-    console.warn(`      winner: ${c.winner}`);
-    for (const l of losers) console.warn(`      ignore: ${l}`);
+export async function generateBlocks(
+  options: GenerateBlocksOptions,
+): Promise<GenerateBlocksResult> {
+  const blocksDir = path.resolve(options.blocksDir);
+  const outFile = path.resolve(options.outFile);
+  const jsonFile = outFile.replace(/\.ts$/, ".json");
+  const silent = options.silent ?? false;
+  if (!fs.existsSync(blocksDir)) {
+    if (!silent) {
+      console.warn(`Blocks directory not found: ${blocksDir} — generating empty barrel.`);
+    }
+    fs.mkdirSync(path.dirname(outFile), { recursive: true });
+    fs.writeFileSync(jsonFile, "{}");
+    fs.writeFileSync(outFile, TS_STUB);
+    return { count: 0, collisions: 0, jsonFile, outFile, empty: true };
+  }
+  const files = fs.readdirSync(blocksDir).filter((f) => f.endsWith(".json"));
+  // Read each file into a Candidate, then let the dedupe lib pick the winner
+  // per decoded key and report any collisions. See `lib/blocks-dedupe.ts` for
+  // the priority order and the rationale behind it (TL;DR: never use file size,
+  // don't trust mtime alone in CI clones).
+  const candidatesWithKeys: Array<{ candidate: Candidate; key: string }> = [];
+  for (const file of files) {
+    const { name, passes } = decodeBlockNameWithPasses(file);
+    const fp = path.join(blocksDir, file);
+    let parsed: unknown;
+    try {
+      parsed = JSON.parse(fs.readFileSync(fp, "utf-8"));
+    } catch (e) {
+      if (!silent) console.warn(`Failed to parse ${file}:`, e);
+      continue;
+    }
+    candidatesWithKeys.push({
+      key: name,
+      candidate: {
+        file,
+        passes,
+        mtimeMs: fs.statSync(fp).mtimeMs,
+        hasPath: blockHasPath(parsed),
+        parsed,
+      },
+    });
+  }
+  const { winners, collisions } = mergeCandidates(candidatesWithKeys);
+  if (!silent && collisions.length > 0) {
+    console.warn(
+      `Detected ${collisions.length} filename collision(s) in ${path.relative(process.cwd(), blocksDir)}:`,
+    );
+    for (const c of collisions) {
+      const losers = c.files.filter((f) => f !== c.winner);
+      console.warn(`  - ${c.key}`);
+      console.warn(`      winner: ${c.winner}`);
+      for (const l of losers) console.warn(`      ignore: ${l}`);
+    }
+    console.warn("    Cause: multiple writers (manual sync vs deco-sync-bot) producing");
+    console.warn("    different filename encodings for the same logical key. Delete the");
+    console.warn("    stale file(s) listed under 'ignore' to silence this warning.");
+  }
+  const blocks: Record<string, unknown> = {};
+  for (const [name, c] of Object.entries(winners)) {
+    blocks[name] = c.parsed;
+  }
+  fs.mkdirSync(path.dirname(outFile), { recursive: true });
+  // 1. Compact JSON — the real data (no pretty-printing to save ~40% size)
+  const jsonStr = JSON.stringify(blocks);
+  fs.writeFileSync(jsonFile, jsonStr);
+  // 2. Thin TS wrapper — just for TypeScript tooling and as a Vite load target.
+  // Only write if content differs to avoid triggering Vite's file watcher,
+  // which would cascade module invalidation to the route tree and crash
+  // TanStack Router during dev hot-reload.
+  let existingTs: string | undefined;
+  try { existingTs = fs.readFileSync(outFile, "utf-8"); } catch {}
+  if (existingTs !== TS_STUB) {
+    fs.writeFileSync(outFile, TS_STUB);
   }
-  console.warn("    Cause: multiple writers (manual sync vs deco-sync-bot) producing");
-  console.warn("    different filename encodings for the same logical key. Delete the");
-  console.warn("    stale file(s) listed under 'ignore' to silence this warning.");
-}
-const blocks: Record<string, unknown> = {};
-for (const [name, c] of Object.entries(winners)) {
-  blocks[name] = c.parsed;
+  if (!silent) {
+    const jsonSizeMB = (Buffer.byteLength(jsonStr) / 1_048_576).toFixed(1);
+    console.log(
+      `Generated ${Object.keys(blocks).length} blocks → ${path.relative(process.cwd(), jsonFile)} (${jsonSizeMB} MB)`,
+    );
+  }
+  return {
+    count: Object.keys(blocks).length,
+    collisions: collisions.length,
+    jsonFile,
+    outFile,
+    empty: false,
+  };
 }
-fs.mkdirSync(path.dirname(outFile), { recursive: true });
+// ---------------------------------------------------------------------------
+// CLI shim — preserved so `npm run generate:blocks` and migration scripts
+// keep working unchanged.
+// ---------------------------------------------------------------------------
+function isMainModule(): boolean {
+  // tsx/node ESM: import.meta.url matches process.argv[1] when invoked directly.
+  // Use a forgiving comparison so it works under both `tsx script.ts` and
+  // `node --import tsx script.ts`.
+  const entry = process.argv[1];
+  if (!entry) return false;
+  try {
+    const entryUrl = new URL(`file://${path.resolve(entry)}`).href;
+    return import.meta.url === entryUrl;
+  } catch {
+    return false;
+  }
+}
-// 1. Compact JSON — the real data (no pretty-printing to save ~40% size)
-const jsonStr = JSON.stringify(blocks);
-fs.writeFileSync(jsonFile, jsonStr);
+if (isMainModule()) {
+  const args = process.argv.slice(2);
+  const arg = (name: string, fallback: string): string => {
+    const idx = args.indexOf(`--${name}`);
+    return idx !== -1 && args[idx + 1] ? args[idx + 1] : fallback;
+  };
-// 2. Thin TS wrapper — just for TypeScript tooling and as a Vite load target
-fs.writeFileSync(outFile, TS_STUB);
+  const blocksDir = path.resolve(process.cwd(), arg("blocks-dir", ".deco/blocks"));
+  const outFile = path.resolve(process.cwd(), arg("out-file", "src/server/cms/blocks.gen.ts"));
-const jsonSizeMB = (Buffer.byteLength(jsonStr) / 1_048_576).toFixed(1);
-console.log(
-  `Generated ${Object.keys(blocks).length} blocks → ${path.relative(process.cwd(), jsonFile)} (${jsonSizeMB} MB)`,
-);
+  generateBlocks({ blocksDir, outFile }).catch((err) => {
+    console.error(err);
+    process.exit(1);
+  });
+}

package/scripts/migrate-post-cleanup.ts CHANGED Viewed

File without changes

package/src/admin/decofile.ts CHANGED Viewed

@@ -20,15 +20,24 @@ export async function handleDecofileReload(
   request: Request,
   env?: Record<string, unknown>,
 ): Promise<Response> {
-  const authHeader = request.headers.get("Authorization") || "";
-  const expectedToken =
-    (env?.DECO_RELEASE_RELOAD_TOKEN as string | undefined) ??
-    (typeof globalThis.process !== "undefined"
-      ? globalThis.process.env?.DECO_RELEASE_RELOAD_TOKEN
-      : undefined);
+  // In dev mode the Vite plugin POSTs new blocks here to hot-reload without
+  // module invalidation (which breaks TanStack Start/Router state). Skip auth
+  // so the plugin can POST from localhost.
+  // Uses import.meta.env.DEV directly (not isDevMode()) because isDevMode() is
+  // also true for runtime env flags (NODE_ENV=development, DECO_PREVIEW=true),
+  // which must not bypass auth. Vite statically replaces import.meta.env.DEV
+  // with `false` in production builds, so this branch is dead-code-eliminated.
+  const isViteDev = !!(import.meta as unknown as { env?: { DEV?: boolean } }).env?.DEV;
+  if (!isViteDev) {
+    const authHeader = request.headers.get("Authorization") || "";
+    const expectedToken =
+      (env?.DECO_RELEASE_RELOAD_TOKEN as string | undefined) ??
+      (typeof globalThis.process !== "undefined"
+        ? globalThis.process.env?.DECO_RELEASE_RELOAD_TOKEN
+        : undefined);
-  if (!expectedToken || authHeader !== expectedToken) {
-    return new Response("Unauthorized", { status: 401 });
+    if (!expectedToken || authHeader !== expectedToken) {
+      return new Response("Unauthorized", { status: 401 });
+    }
   }
   let newBlocks: Record<string, unknown>;

package/src/sdk/env.ts CHANGED Viewed

@@ -11,8 +11,8 @@ let _isDev: boolean | null = null;
  * Returns `true` when running in a development environment.
  *
  * Detection order:
- *  1. `DECO_CACHE_DISABLE=true` — explicit opt-in (always wins)
- *  2. `NODE_ENV=development`    — standard Node/Vite convention
+ *  1. `import.meta.env.DEV`  — Vite build-time constant (reliable in Workers/Miniflare)
+ *  2. `NODE_ENV=development` — standard Node/Vite convention
+ *  3. `DECO_PREVIEW=true`    — explicit environment flag
  *
  * The result is memoised after the first evaluation.
  */
@@ -21,7 +21,11 @@ export function isDevMode(): boolean {
   const env = typeof globalThis.process !== "undefined" ? globalThis.process.env : undefined;
-  _isDev = env?.DECO_CACHE_DISABLE === "true" || env?.NODE_ENV === "development";
+  // Vite statically replaces import.meta.env.DEV at build time (true in dev, false in prod).
+  // In Miniflare/Workers, process.env is unavailable, so this is the reliable signal.
+  const vitaDev = !!(import.meta as unknown as { env?: { DEV?: boolean } }).env?.DEV;
+  _isDev = vitaDev || env?.NODE_ENV === "development" || env?.DECO_PREVIEW === "true";
   return _isDev;
 }