autotel 3.0.0 → 3.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/README.md +21 -4
  2. package/dist/attribute-redacting-processor.cjs +8 -8
  3. package/dist/attribute-redacting-processor.d.cts +10 -1
  4. package/dist/attribute-redacting-processor.d.ts +10 -1
  5. package/dist/attribute-redacting-processor.js +1 -1
  6. package/dist/attributes.cjs +21 -21
  7. package/dist/attributes.js +2 -2
  8. package/dist/auto.cjs +3 -3
  9. package/dist/auto.js +2 -2
  10. package/dist/{chunk-7HNQYHK4.js → chunk-52PUSFC2.js} +3 -3
  11. package/dist/{chunk-7HNQYHK4.js.map → chunk-52PUSFC2.js.map} +1 -1
  12. package/dist/{chunk-L7JDUDJD.cjs → chunk-7SMNC4LS.cjs} +7 -7
  13. package/dist/{chunk-L7JDUDJD.cjs.map → chunk-7SMNC4LS.cjs.map} +1 -1
  14. package/dist/{chunk-563EL6O6.cjs → chunk-BPO2PQ3T.cjs} +12 -8
  15. package/dist/chunk-BPO2PQ3T.cjs.map +1 -0
  16. package/dist/{chunk-ZSABTI3C.cjs → chunk-DAZ7EGR4.cjs} +17 -17
  17. package/dist/{chunk-ZSABTI3C.cjs.map → chunk-DAZ7EGR4.cjs.map} +1 -1
  18. package/dist/{chunk-ER43K7ES.js → chunk-DDXIUZEG.js} +3 -3
  19. package/dist/{chunk-ER43K7ES.js.map → chunk-DDXIUZEG.js.map} +1 -1
  20. package/dist/{chunk-JKIMEPI2.cjs → chunk-DQ2SUROF.cjs} +4 -4
  21. package/dist/{chunk-JKIMEPI2.cjs.map → chunk-DQ2SUROF.cjs.map} +1 -1
  22. package/dist/{chunk-KHGA4OST.cjs → chunk-HKZHUGGN.cjs} +5 -5
  23. package/dist/{chunk-KHGA4OST.cjs.map → chunk-HKZHUGGN.cjs.map} +1 -1
  24. package/dist/{chunk-TDNKIHKT.js → chunk-JVWJDHDB.js} +13 -4
  25. package/dist/chunk-JVWJDHDB.js.map +1 -0
  26. package/dist/{chunk-3QMFLJHJ.js → chunk-K7HSRLP5.js} +3 -3
  27. package/dist/{chunk-3QMFLJHJ.js.map → chunk-K7HSRLP5.js.map} +1 -1
  28. package/dist/{chunk-CJ4PD2TZ.cjs → chunk-KKGM42RQ.cjs} +13 -13
  29. package/dist/{chunk-CJ4PD2TZ.cjs.map → chunk-KKGM42RQ.cjs.map} +1 -1
  30. package/dist/{chunk-DWOBIBLY.cjs → chunk-MOO75VE4.cjs} +5 -5
  31. package/dist/{chunk-DWOBIBLY.cjs.map → chunk-MOO75VE4.cjs.map} +1 -1
  32. package/dist/{chunk-CMNGGTQL.cjs → chunk-NXLRY2CE.cjs} +13 -4
  33. package/dist/chunk-NXLRY2CE.cjs.map +1 -0
  34. package/dist/{chunk-4DAG3RFS.js → chunk-OM4OSBOP.js} +4 -4
  35. package/dist/{chunk-4DAG3RFS.js.map → chunk-OM4OSBOP.js.map} +1 -1
  36. package/dist/{chunk-DAAJLUTO.js → chunk-PMRWMRXY.js} +4 -4
  37. package/dist/{chunk-DAAJLUTO.js.map → chunk-PMRWMRXY.js.map} +1 -1
  38. package/dist/{chunk-MOK3E54E.cjs → chunk-QPH5ZKP5.cjs} +32 -32
  39. package/dist/{chunk-MOK3E54E.cjs.map → chunk-QPH5ZKP5.cjs.map} +1 -1
  40. package/dist/{chunk-IUDXKLS4.js → chunk-TFRZOUTV.js} +3 -3
  41. package/dist/{chunk-IUDXKLS4.js.map → chunk-TFRZOUTV.js.map} +1 -1
  42. package/dist/{chunk-QG3U5ONP.js → chunk-Z7VAOK5X.js} +3 -3
  43. package/dist/{chunk-QG3U5ONP.js.map → chunk-Z7VAOK5X.js.map} +1 -1
  44. package/dist/{chunk-W35FVJBC.js → chunk-ZDPIWKWD.js} +9 -5
  45. package/dist/chunk-ZDPIWKWD.js.map +1 -0
  46. package/dist/correlation-id.cjs +11 -11
  47. package/dist/correlation-id.js +3 -3
  48. package/dist/decorators.cjs +5 -5
  49. package/dist/decorators.js +4 -4
  50. package/dist/event.cjs +7 -7
  51. package/dist/event.js +4 -4
  52. package/dist/functional.cjs +11 -11
  53. package/dist/functional.js +4 -4
  54. package/dist/http.cjs +4 -4
  55. package/dist/http.js +3 -3
  56. package/dist/index.cjs +226 -92
  57. package/dist/index.cjs.map +1 -1
  58. package/dist/index.d.cts +67 -3
  59. package/dist/index.d.ts +67 -3
  60. package/dist/index.js +138 -15
  61. package/dist/index.js.map +1 -1
  62. package/dist/instrumentation.cjs +9 -9
  63. package/dist/instrumentation.js +2 -2
  64. package/dist/messaging.cjs +8 -8
  65. package/dist/messaging.js +5 -5
  66. package/dist/semantic-helpers.cjs +9 -9
  67. package/dist/semantic-helpers.js +5 -5
  68. package/dist/webhook.cjs +6 -6
  69. package/dist/webhook.js +4 -4
  70. package/dist/workflow-distributed.cjs +6 -6
  71. package/dist/workflow-distributed.js +4 -4
  72. package/dist/workflow.cjs +9 -9
  73. package/dist/workflow.js +5 -5
  74. package/package.json +43 -45
  75. package/skills/analyze-traces/SKILL.md +178 -0
  76. package/skills/autotel-core/SKILL.md +0 -7
  77. package/skills/autotel-events/SKILL.md +0 -6
  78. package/skills/autotel-frameworks/SKILL.md +0 -9
  79. package/skills/autotel-instrumentation/SKILL.md +0 -7
  80. package/skills/autotel-request-logging/SKILL.md +0 -8
  81. package/skills/autotel-structured-errors/SKILL.md +0 -7
  82. package/skills/build-audit-trails/SKILL.md +302 -0
  83. package/skills/debug-missing-spans/SKILL.md +248 -0
  84. package/skills/migrate-to-autotel/SKILL.md +268 -0
  85. package/skills/review-otel-patterns/SKILL.md +488 -0
  86. package/skills/review-otel-patterns/references/code-review.md +75 -0
  87. package/skills/review-otel-patterns/references/processor-pipeline.md +205 -0
  88. package/skills/review-otel-patterns/references/structured-errors.md +102 -0
  89. package/skills/review-otel-patterns/references/wide-spans.md +85 -0
  90. package/skills/tune-sampling/SKILL.md +210 -0
  91. package/src/attribute-redacting-processor.test.ts +6 -4
  92. package/src/attribute-redacting-processor.ts +11 -2
  93. package/src/drain-toolkit.test.ts +113 -0
  94. package/src/drain-toolkit.ts +129 -0
  95. package/src/enricher-toolkit.test.ts +67 -0
  96. package/src/enricher-toolkit.ts +79 -0
  97. package/src/index.ts +19 -0
  98. package/src/redact-values.test.ts +24 -10
  99. package/src/redact-values.ts +9 -2
  100. package/src/request-logger.test.ts +91 -0
  101. package/src/request-logger.ts +36 -2
  102. package/src/structured-error.test.ts +4 -1
  103. package/bin/intent.js +0 -6
  104. package/dist/chunk-563EL6O6.cjs.map +0 -1
  105. package/dist/chunk-CMNGGTQL.cjs.map +0 -1
  106. package/dist/chunk-TDNKIHKT.js.map +0 -1
  107. package/dist/chunk-W35FVJBC.js.map +0 -1
  108. package/src/package-manifest.test.ts +0 -24
@@ -2,14 +2,6 @@
2
2
  name: autotel-request-logging
3
3
  description: >
4
4
  getRequestLogger(), set(), info/warn/error, emitNow(). One snapshot per request; requires active span. Use when adding request-scoped context or replacing scattered console.log.
5
- type: core
6
- library: autotel
7
- library_version: '2.23.0'
8
- requires:
9
- - autotel-instrumentation
10
- sources:
11
- - jagreehal/autotel:packages/autotel/src/request-logger.ts
12
- - jagreehal/autotel:docs/AGENT-GUIDE.md
13
5
  ---
14
6
 
15
7
  # Autotel — Request Logging
@@ -2,13 +2,6 @@
2
2
  name: autotel-structured-errors
3
3
  description: >
4
4
  createStructuredError, parseError, recordStructuredError. API errors with message, why, fix, link; client parsing for UI. Use in API routes and client catch blocks.
5
- type: core
6
- library: autotel
7
- library_version: '2.23.0'
8
- sources:
9
- - jagreehal/autotel:packages/autotel/src/structured-error.ts
10
- - jagreehal/autotel:packages/autotel/src/parse-error.ts
11
- - jagreehal/autotel:docs/AGENT-GUIDE.md
12
5
  ---
13
6
 
14
7
  # Autotel — Structured Errors
@@ -0,0 +1,302 @@
1
+ ---
2
+ name: build-audit-trails
3
+ description: >
4
+ Design tamper-aware audit trails on top of OpenTelemetry spans using
5
+ autotel. Covers what counts as auditable, the audit-only span discipline,
6
+ signing and tamper-detection, denial logging, redaction, retention,
7
+ separation of concerns from operational telemetry, and framework wiring
8
+ (Next.js, Nuxt, Hono, Express, Cloudflare Workers).
9
+ license: MIT
10
+ ---
11
+
12
+ # Build audit trails
13
+
14
+ An _audit trail_ is a record of who did what to which resource, when, and whether it was permitted — durable, tamper-evident, and admissible. Operational telemetry (latency, errors, span shapes) is for engineers; audit trails are for compliance, security, and forensics. They overlap technically but differ on every other axis.
15
+
16
+ autotel lets you express both with the same primitive — a span — but you should keep them on **separate processors** so an audit event never gets dropped by sampling, never gets redacted by a debug rule, and never goes to the same backend as your ops data.
17
+
18
+ ## When to use
19
+
20
+ - Implementing GDPR / HIPAA / SOC2 / PCI-DSS / ISO 27001 / GxP compliance
21
+ - Adding "who did what" trails for admin actions, access reviews, payments
22
+ - Recording authorization decisions (allow + deny)
23
+ - Building immutable evidence for incident response
24
+
25
+ ## The audit span discipline
26
+
27
+ An auditable event has six required parts:
28
+
29
+ | Field | OTel attribute | Example |
30
+ | ------------------- | --------------------------------------------------------------- | --------------------------------------------- |
31
+ | When | (span timestamp) | `2026-05-04T17:23:11.412Z` |
32
+ | Who | `enduser.id` + `enduser.role` | `usr_42`, `admin` |
33
+ | Where (acting from) | `client.address`, `network.peer.address`, `user_agent.original` | `203.0.113.5`, `Chrome 121` |
34
+ | What | `audit.action` | `secret.read`, `policy.update`, `user.delete` |
35
+ | Which resource | `audit.resource.type` + `audit.resource.id` | `secret`, `sec_abc` |
36
+ | Outcome | `audit.outcome` (`allow` / `deny`) + `audit.reason` | `deny`, `MFA required` |
37
+
38
+ Plus useful optional fields: `audit.policy.id` (which policy made the call), `audit.evidence` (linked artefact id), `audit.actor.session.id`.
39
+
40
+ ## Step 1: Define a typed `audit()` helper
41
+
42
+ Centralise the schema in one place so every site gets it right:
43
+
44
+ ```typescript
45
+ import { trace, SpanKind } from '@opentelemetry/api';
46
+
47
+ type AuditAction =
48
+ | 'secret.read'
49
+ | 'secret.write'
50
+ | 'secret.delete'
51
+ | 'policy.update'
52
+ | 'user.create'
53
+ | 'user.delete'
54
+ | 'data.export'
55
+ | 'session.assume';
56
+
57
+ interface AuditPayload {
58
+ action: AuditAction;
59
+ resource: { type: string; id: string };
60
+ outcome: 'allow' | 'deny';
61
+ reason?: string;
62
+ actor?: { id: string; role?: string; sessionId?: string };
63
+ policy?: { id: string };
64
+ evidence?: { id: string };
65
+ }
66
+
67
+ const tracer = trace.getTracer('autotel-audit', '1.0.0');
68
+
69
+ export function audit(payload: AuditPayload): void {
70
+ const span = tracer.startSpan(`audit.${payload.action}`, {
71
+ kind: SpanKind.INTERNAL,
72
+ attributes: {
73
+ audit: true,
74
+ 'audit.action': payload.action,
75
+ 'audit.outcome': payload.outcome,
76
+ 'audit.resource.type': payload.resource.type,
77
+ 'audit.resource.id': payload.resource.id,
78
+ ...(payload.reason && { 'audit.reason': payload.reason }),
79
+ ...(payload.actor?.id && { 'enduser.id': payload.actor.id }),
80
+ ...(payload.actor?.role && { 'enduser.role': payload.actor.role }),
81
+ ...(payload.actor?.sessionId && {
82
+ 'audit.actor.session.id': payload.actor.sessionId,
83
+ }),
84
+ ...(payload.policy?.id && { 'audit.policy.id': payload.policy.id }),
85
+ ...(payload.evidence?.id && { 'audit.evidence.id': payload.evidence.id }),
86
+ },
87
+ });
88
+ span.end();
89
+ }
90
+ ```
91
+
92
+ ## Step 2: Always log denials
93
+
94
+ A frequent compliance failure is "we logged what users did but not what we **stopped** them doing." Wrap the authorization decision so both branches go through `audit()`:
95
+
96
+ ```typescript
97
+ export async function withAuthz<T>(
98
+ payload: Omit<AuditPayload, 'outcome'>,
99
+ decide: () => Promise<{ allow: boolean; reason?: string }>,
100
+ body: () => Promise<T>,
101
+ ): Promise<T> {
102
+ const decision = await decide();
103
+ if (!decision.allow) {
104
+ audit({ ...payload, outcome: 'deny', reason: decision.reason });
105
+ throw createStructuredError({
106
+ status: 403,
107
+ code: 'FORBIDDEN',
108
+ message: 'Not allowed',
109
+ why: decision.reason ?? 'Insufficient permissions',
110
+ });
111
+ }
112
+ audit({ ...payload, outcome: 'allow' });
113
+ return body();
114
+ }
115
+ ```
116
+
117
+ ## Step 3: Separate the audit pipeline
118
+
119
+ Critical: route audit spans to a **different processor and backend** so:
120
+
121
+ - They are never dropped by head or tail sampling.
122
+ - They are not subject to development-mode debug exporters.
123
+ - They go to a write-once / append-only store (S3 Object Lock, immutable bucket, dedicated audit DB).
124
+
125
+ ```typescript
126
+ import {
127
+ composeSpanProcessors,
128
+ composeSubscribers,
129
+ defineConfig,
130
+ } from 'autotel-edge';
131
+ import { BatchSpanProcessor, FilteringSpanProcessor } from 'autotel/processors';
132
+
133
+ const auditExporter = new BatchSpanProcessor(
134
+ new OTLPHttpJsonExporter({
135
+ url: process.env.AUDIT_OTLP!,
136
+ headers: { authorization: `Bearer ${process.env.AUDIT_TOKEN!}` },
137
+ }),
138
+ );
139
+ const opsExporter = new BatchSpanProcessor(
140
+ new OTLPHttpJsonExporter({ url: process.env.OPS_OTLP! }),
141
+ );
142
+
143
+ // Only audit spans reach the audit pipeline.
144
+ const auditOnly = new FilteringSpanProcessor({
145
+ include: (span) => span.attributes['audit'] === true,
146
+ next: auditExporter,
147
+ });
148
+
149
+ // Conversely, ops never sees audit spans (avoid leaking PII to dashboards).
150
+ const opsOnly = new FilteringSpanProcessor({
151
+ exclude: (span) => span.attributes['audit'] === true,
152
+ next: opsExporter,
153
+ });
154
+
155
+ export const otelConfig = defineConfig({
156
+ service: { name: 'app' },
157
+ spanProcessors: composeSpanProcessors([auditOnly, opsOnly]),
158
+ });
159
+ ```
160
+
161
+ ## Step 4: Tamper detection
162
+
163
+ For environments where audit storage is shared with the producing service (no append-only bucket), sign each span:
164
+
165
+ ```typescript
166
+ import { createHmac, randomUUID } from 'node:crypto'
167
+
168
+ function signAuditAttributes(attrs: Record<string, unknown>): string {
169
+ const key = process.env.AUDIT_HMAC_KEY!
170
+ const payload = JSON.stringify(Object.fromEntries(Object.entries(attrs).sort()))
171
+ return createHmac('sha256', key).update(payload).digest('hex')
172
+ }
173
+
174
+ export function audit(payload: AuditPayload): void {
175
+ const id = randomUUID()
176
+ const attributes = { /* … as before … */ 'audit.id': id }
177
+ const signature = signAuditAttributes(attributes)
178
+ attributes['audit.signature.alg'] = 'HMAC-SHA256'
179
+ attributes['audit.signature.value'] = signature
180
+ // … startSpan …
181
+ }
182
+ ```
183
+
184
+ Verify on the read side: recompute the HMAC over the same sorted attribute set (excluding `audit.signature.value` itself); a mismatch ⇒ tampering.
185
+
186
+ For multi-tenant or extra-strict (HIPAA), use Ed25519 with per-environment keys and rotate.
187
+
188
+ ## Step 5: Redaction — what stays and what goes
189
+
190
+ | Field | In audit span? | Notes |
191
+ | ------------------------ | -------------- | ------------------------------------------------------------------------------------- |
192
+ | `enduser.id` | ✅ | Internal user id; never the email |
193
+ | `audit.resource.id` | ✅ | Required for forensics |
194
+ | `client.address` | ✅ | Last-octet redaction acceptable for IPv4 |
195
+ | Free-form payload bodies | ❌ | Never inline raw input — link by id (`audit.evidence.id`) |
196
+ | Secret values | ❌ | Use `audit.action=secret.read` + `audit.resource.id=sec_abc`, never the secret itself |
197
+ | Authorization headers | ❌ | Token names ok (`bearer.*`), values never |
198
+
199
+ `attributeRedactor` defaults are too aggressive for audit (you may need `enduser.id` literal, not masked). Disable redaction selectively:
200
+
201
+ ```typescript
202
+ spanProcessors: composeSpanProcessors([
203
+ // No redactor on the audit branch — keys are already conservative
204
+ auditOnly,
205
+ // Strict redactor on ops
206
+ new AttributeRedactingProcessor(opsOnly, { redactor: 'strict' }),
207
+ ]),
208
+ ```
209
+
210
+ ## Step 6: Retention
211
+
212
+ Audit retention is set by regulation, not engineering taste. Common minimums:
213
+
214
+ | Regulation | Minimum retention |
215
+ | -------------------- | -------------------------------------------- |
216
+ | GDPR | 6 years (financial), 12 months (operational) |
217
+ | HIPAA | 6 years |
218
+ | PCI-DSS | 1 year (online), 3 months hot |
219
+ | SOX | 7 years |
220
+ | GxP / 21 CFR Part 11 | Lifetime of product + 10 years |
221
+
222
+ Express retention as a backend lifecycle policy (S3 Object Lock COMPLIANCE mode, BigQuery `--time_partitioning_expiration`), not application code.
223
+
224
+ ## Step 7: Framework wiring
225
+
226
+ ### Next.js
227
+
228
+ ```typescript
229
+ // app/admin/users/[id]/route.ts
230
+ import { withAuthz, audit } from '@/lib/audit';
231
+
232
+ export async function DELETE(
233
+ req: Request,
234
+ { params }: { params: { id: string } },
235
+ ) {
236
+ return withAuthz(
237
+ {
238
+ action: 'user.delete',
239
+ resource: { type: 'user', id: params.id },
240
+ actor: { id: req.headers.get('x-user-id')!, role: 'admin' },
241
+ },
242
+ async () => ({ allow: await canDelete(req, params.id) }),
243
+ async () => {
244
+ await db.user.delete({ where: { id: params.id } });
245
+ return Response.json({ ok: true });
246
+ },
247
+ );
248
+ }
249
+ ```
250
+
251
+ ### Hono
252
+
253
+ ```typescript
254
+ import { audit, withAuthz } from './audit';
255
+ app.post('/secrets/:id/read', async (c) => {
256
+ return withAuthz(
257
+ {
258
+ action: 'secret.read',
259
+ resource: { type: 'secret', id: c.req.param('id') },
260
+ actor: { id: c.var.user.id, role: c.var.user.role },
261
+ },
262
+ () => requireScope(c, 'secrets:read'),
263
+ async () => c.json({ value: await secrets.read(c.req.param('id')) }),
264
+ );
265
+ });
266
+ ```
267
+
268
+ ### Cloudflare Workers
269
+
270
+ Call `audit()` from inside `defineWorkerFetch` — `ctx.waitUntil` makes sure the audit span is exported before the response returns:
271
+
272
+ ```typescript
273
+ export default defineWorkerFetch(
274
+ { service: { name: 'admin-api' } },
275
+ async (request, env, ctx, log) => {
276
+ return withAuthz(
277
+ {
278
+ action: 'data.export',
279
+ resource: { type: 'project', id: 'p_123' },
280
+ actor: { id: 'usr_42' },
281
+ },
282
+ async () => ({ allow: true }),
283
+ async () => Response.json({ ok: true }),
284
+ );
285
+ },
286
+ );
287
+ ```
288
+
289
+ ## Anti-patterns
290
+
291
+ | Anti-pattern | Fix |
292
+ | ---------------------------------------------- | -------------------------------------------------------------- |
293
+ | Audit logs in `console.log` / unstructured | Use `audit()` so every event has the same shape |
294
+ | Same backend for audit and ops | Separate processors, separate retention |
295
+ | Audit subject to sampling | `FilteringSpanProcessor` with `include: span.attributes.audit` |
296
+ | Logging only successes | Always log denials too |
297
+ | Putting secrets / payloads in audit attributes | Reference by id only (`audit.evidence.id`) |
298
+ | No tamper detection | HMAC signature on critical environments |
299
+ | Custom retention in code | Express via storage-layer lifecycle policy |
300
+ | Audit on every read of harmless data | Audit _meaningful_ events; not every list call |
301
+ | Audit row tied to a specific framework | The `audit()` function is framework-agnostic |
302
+ | `enduser.id` = email | Use the internal id; emails go in a separate identity table |
@@ -0,0 +1,248 @@
1
+ ---
2
+ name: debug-missing-spans
3
+ description: >
4
+ Troubleshoot when expected OpenTelemetry spans don't reach the backend.
5
+ Walks the chain top-to-bottom — code → SDK init → processor → exporter →
6
+ network → backend ingest — with concrete tests at each step. Covers head
7
+ sampling, ctx.waitUntil drops on Cloudflare, init-order races, runtime
8
+ detection failures, propagation breaks, exporter auth errors, and
9
+ silent ratelimits.
10
+ license: MIT
11
+ ---
12
+
13
+ # Debug missing spans
14
+
15
+ When a span you expect isn't in the backend, the cause is somewhere in this chain:
16
+
17
+ ```
18
+ code → SDK init → head sampler → processor → exporter → network → backend ingest → backend index
19
+ ```
20
+
21
+ This skill walks each link in order with a quick check you can run. Don't skip steps — the cause is rarely where you'd guess.
22
+
23
+ ## Step 0: Reproduce locally with the pretty exporter
24
+
25
+ Before chasing remote backends, confirm the span exists at all:
26
+
27
+ ```typescript
28
+ init({
29
+ service: 'my-app',
30
+ debug: 'pretty', // hierarchical colourised output to stdout
31
+ });
32
+ ```
33
+
34
+ If you see the span in stdout, the SDK + sampler are fine — skip to "exporter / network". If you don't, keep reading.
35
+
36
+ ## Step 1: Is the SDK actually initialised?
37
+
38
+ Common failure: `init()` runs after the first request because of import-order.
39
+
40
+ ```typescript
41
+ import { trace } from '@opentelemetry/api';
42
+
43
+ const tracer = trace.getTracer('autotel-debug');
44
+ console.log(
45
+ '[autotel-debug] tracer is no-op:',
46
+ tracer.constructor.name === 'NoopTracer',
47
+ );
48
+ ```
49
+
50
+ If `true`, `init()` ran too late. Move it to the very top of the entry file (or to `instrumentation.ts` for Next.js).
51
+
52
+ ## Step 2: Head sampler
53
+
54
+ Print the effective head rate:
55
+
56
+ ```typescript
57
+ import { getActiveConfig } from 'autotel-edge';
58
+ console.log('[autotel-debug] sampling:', getActiveConfig()?.sampling);
59
+ ```
60
+
61
+ Common gotchas:
62
+
63
+ - `sampling.rates: { server: 5 }` — 5 % means 95 % of spans never start.
64
+ - Inheriting `OTEL_TRACES_SAMPLER_ARG=0.01` from the environment via the OTel default sampler.
65
+ - Your test happens to hit the unsampled branch — instrument with `sampling: { rates: { server: 100 } }` while reproducing.
66
+
67
+ To force sampling for one request, send a `traceparent` with the sampled flag set:
68
+
69
+ ```
70
+ traceparent: 00-<traceid>-<spanid>-01
71
+ ```
72
+
73
+ (`-01` at the end = sampled.) autotel's parent-based sampler will respect it.
74
+
75
+ ## Step 3: Cloudflare Workers — `ctx.waitUntil`
76
+
77
+ The single biggest cause of missing spans on the edge: **the response returned before the exporter flushed**.
78
+
79
+ If you're using `addEventListener('fetch', …)` or a hand-rolled `fetch` in a module worker without wiring `ctx.waitUntil(…)` to the export call, async drains drop silently.
80
+
81
+ Fix — switch to `defineWorkerFetch` or `wrapModule`, both of which wire `waitUntil` automatically:
82
+
83
+ ```typescript
84
+ import { defineWorkerFetch } from 'autotel-cloudflare';
85
+
86
+ export default defineWorkerFetch(
87
+ { service: { name: 'edge' } },
88
+ async (request, env, ctx, log) => {
89
+ // log.set / spans here all flush via ctx.waitUntil before response returns
90
+ return new Response('ok');
91
+ },
92
+ );
93
+ ```
94
+
95
+ ## Step 4: Processor pipeline
96
+
97
+ Print what's wired:
98
+
99
+ ```typescript
100
+ import { trace } from '@opentelemetry/api';
101
+ const provider = trace.getTracerProvider();
102
+ console.log('[autotel-debug] provider:', provider.constructor.name);
103
+ console.log(
104
+ '[autotel-debug] processors:',
105
+ (provider as any)._registeredSpanProcessors?.map(
106
+ (p: any) => p.constructor.name,
107
+ ),
108
+ );
109
+ ```
110
+
111
+ Common issues:
112
+
113
+ - **A `FilteringSpanProcessor` excludes your span.** Check the `include` / `exclude` predicates.
114
+ - **A `TailSamplingProcessor` dropped the trace** (no error, no slow root, no debug header).
115
+ - **A `composePostProcessors` step returns `[]` for your span.**
116
+
117
+ To bisect, temporarily strip post-processors:
118
+
119
+ ```typescript
120
+ init({
121
+ service: 'my-app',
122
+ exporter: { url: process.env.OTLP_ENDPOINT! },
123
+ // no postProcessor, no tail sampler, no filter
124
+ });
125
+ ```
126
+
127
+ If the span shows up now, add back the processors one at a time.
128
+
129
+ ## Step 5: Exporter
130
+
131
+ Tail the SDK's diagnostic log:
132
+
133
+ ```typescript
134
+ import { diag, DiagConsoleLogger, DiagLogLevel } from '@opentelemetry/api';
135
+ diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG);
136
+ ```
137
+
138
+ Look for:
139
+
140
+ ```
141
+ @opentelemetry/api: ... OTLPExporter: failed to send 4 traces, status: 401, error: ...
142
+ ```
143
+
144
+ Common exporter errors:
145
+
146
+ | Status | Meaning | Fix |
147
+ | ------------- | ------------------------- | ----------------------------------------------------------- |
148
+ | `401` | Bad / missing auth header | Check `OTLP_HEADERS` / vendor token name |
149
+ | `403` | Token has no write scope | Issue a token with the right scope |
150
+ | `404` | Wrong endpoint URL | Check region (`api.honeycomb.io` vs `api.eu1.honeycomb.io`) |
151
+ | `413` | Batch too big | Lower `BatchSpanProcessor` `maxExportBatchSize` |
152
+ | `429` | Rate-limited | Reduce head/tail rates; honour `retry-after` |
153
+ | `502/503/504` | Upstream unhealthy | Often transient; add retries; check backend status |
154
+ | Network error | DNS / firewall | `curl -v <url>` from the same network |
155
+
156
+ ## Step 6: Network / TLS
157
+
158
+ For self-hosted Collectors:
159
+
160
+ ```bash
161
+ curl -v -X POST $OTLP_ENDPOINT \
162
+ -H 'content-type: application/json' \
163
+ -H "$AUTH_HEADER" \
164
+ -d '{"resourceSpans":[]}'
165
+ ```
166
+
167
+ Should return `200`. If it doesn't, the problem is between you and the Collector — not autotel.
168
+
169
+ For Cloudflare Workers, run `wrangler tail` and look for `OTLPExporter` errors.
170
+
171
+ ## Step 7: Backend ingest — silent rejection
172
+
173
+ Some backends accept the request with a 200 but drop the events:
174
+
175
+ - **Honeycomb**: dataset must exist _and_ the API key must have write access to it. Mismatched key/dataset → silent drop.
176
+ - **Datadog**: check `service` is set (resource attribute `service.name`) — they ignore spans without it.
177
+ - **Sentry**: SDK version mismatch on envelope → 200 but events disappear.
178
+ - **Grafana Cloud Tempo**: spans without `service.name` go to a fallback service called `unknown_service`.
179
+
180
+ For each backend, the dataset / index / project where you'd expect the span:
181
+
182
+ | Backend | Where the span lands |
183
+ | ------------- | --------------------------------------- |
184
+ | Honeycomb | dataset = `service.name` (auto-created) |
185
+ | Datadog | `service:<name>` filter |
186
+ | Grafana Tempo | search by `traceId` |
187
+ | Jaeger | service dropdown = `service.name` |
188
+ | Sentry | project linked to the DSN |
189
+
190
+ ## Step 8: Backend index lag
191
+
192
+ After a 200, expect ingestion lag of:
193
+
194
+ | Backend | Typical lag |
195
+ | ------------------ | ----------- |
196
+ | Honeycomb | < 5 s |
197
+ | Datadog | 30–60 s |
198
+ | Grafana Tempo | 10–30 s |
199
+ | Sentry | 30–120 s |
200
+ | Self-hosted Jaeger | < 1 s |
201
+
202
+ Don't conclude the span is missing until you've waited > 2× the expected lag.
203
+
204
+ ## Step-by-step checklist
205
+
206
+ ```
207
+ [ ] Span shows in `debug: 'pretty'` stdout
208
+ [ ] `tracer.constructor.name !== 'NoopTracer'` (SDK initialised)
209
+ [ ] Head rate is high enough to allow the request
210
+ [ ] Workers handler uses defineWorkerFetch / wrapModule
211
+ [ ] No post-processor / tail sampler / filter strips it
212
+ [ ] Exporter logs no 4xx/5xx
213
+ [ ] Curl to OTLP endpoint returns 200
214
+ [ ] Backend has the right service.name / dataset / project
215
+ [ ] Waited 2× expected ingest lag
216
+ ```
217
+
218
+ ## When the trace partially shows up
219
+
220
+ Some spans land, some don't:
221
+
222
+ - **Trace context broken between services** — outbound HTTP calls aren't propagating `traceparent`. Confirm autotel's global fetch instrumentation is on (`instrumentation.instrumentGlobalFetch: true`, default).
223
+ - **Async boundary loses context** — a `setTimeout` / queue callback ran outside the AsyncLocalStorage scope. Wrap with `trace()` or use `context.with()`.
224
+ - **Cross-runtime call** — Node service → Workers → browser; verify `traceparent` arrives at each leg via response headers / network panel.
225
+
226
+ ## When the SDK itself crashes
227
+
228
+ ```
229
+ TypeError: Cannot read properties of undefined (reading 'startActiveSpan')
230
+ ```
231
+
232
+ Usually means the API version (`@opentelemetry/api`) and SDK version (`@opentelemetry/sdk-trace-base`) drifted. Run:
233
+
234
+ ```bash
235
+ pnpm why @opentelemetry/api
236
+ ```
237
+
238
+ There should be exactly one resolved version. If there are two, dedup via `pnpm.overrides`.
239
+
240
+ ## Anti-patterns to fix as you debug
241
+
242
+ | Anti-pattern | Why it loses spans |
243
+ | --------------------------------------------------------- | ------------------------------------------------------------------ |
244
+ | `init()` after the first import that uses tracing | Spans before `init()` are no-ops |
245
+ | `addEventListener('fetch', …)` on Workers | Pre-module-worker style; no `ctx.waitUntil` to wire |
246
+ | Single `OTLP_ENDPOINT` env var with `?` chars URL-encoded | Auth gets parsed as part of the path |
247
+ | Importing both `@sentry/tracing` and `autotel` | Double-instrumentation eats spans |
248
+ | `process.exit(0)` immediately after the work | The exporter never flushed; call `await provider.shutdown()` first |