autotel 3.0.0 → 3.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/README.md +21 -4
  2. package/dist/attribute-redacting-processor.cjs +8 -8
  3. package/dist/attribute-redacting-processor.d.cts +10 -1
  4. package/dist/attribute-redacting-processor.d.ts +10 -1
  5. package/dist/attribute-redacting-processor.js +1 -1
  6. package/dist/attributes.cjs +21 -21
  7. package/dist/attributes.js +2 -2
  8. package/dist/auto.cjs +3 -3
  9. package/dist/auto.js +2 -2
  10. package/dist/{chunk-7HNQYHK4.js → chunk-52PUSFC2.js} +3 -3
  11. package/dist/{chunk-7HNQYHK4.js.map → chunk-52PUSFC2.js.map} +1 -1
  12. package/dist/{chunk-L7JDUDJD.cjs → chunk-7SMNC4LS.cjs} +7 -7
  13. package/dist/{chunk-L7JDUDJD.cjs.map → chunk-7SMNC4LS.cjs.map} +1 -1
  14. package/dist/{chunk-563EL6O6.cjs → chunk-BPO2PQ3T.cjs} +12 -8
  15. package/dist/chunk-BPO2PQ3T.cjs.map +1 -0
  16. package/dist/{chunk-ZSABTI3C.cjs → chunk-DAZ7EGR4.cjs} +17 -17
  17. package/dist/{chunk-ZSABTI3C.cjs.map → chunk-DAZ7EGR4.cjs.map} +1 -1
  18. package/dist/{chunk-ER43K7ES.js → chunk-DDXIUZEG.js} +3 -3
  19. package/dist/{chunk-ER43K7ES.js.map → chunk-DDXIUZEG.js.map} +1 -1
  20. package/dist/{chunk-JKIMEPI2.cjs → chunk-DQ2SUROF.cjs} +4 -4
  21. package/dist/{chunk-JKIMEPI2.cjs.map → chunk-DQ2SUROF.cjs.map} +1 -1
  22. package/dist/{chunk-KHGA4OST.cjs → chunk-HKZHUGGN.cjs} +5 -5
  23. package/dist/{chunk-KHGA4OST.cjs.map → chunk-HKZHUGGN.cjs.map} +1 -1
  24. package/dist/{chunk-TDNKIHKT.js → chunk-JVWJDHDB.js} +13 -4
  25. package/dist/chunk-JVWJDHDB.js.map +1 -0
  26. package/dist/{chunk-3QMFLJHJ.js → chunk-K7HSRLP5.js} +3 -3
  27. package/dist/{chunk-3QMFLJHJ.js.map → chunk-K7HSRLP5.js.map} +1 -1
  28. package/dist/{chunk-CJ4PD2TZ.cjs → chunk-KKGM42RQ.cjs} +13 -13
  29. package/dist/{chunk-CJ4PD2TZ.cjs.map → chunk-KKGM42RQ.cjs.map} +1 -1
  30. package/dist/{chunk-DWOBIBLY.cjs → chunk-MOO75VE4.cjs} +5 -5
  31. package/dist/{chunk-DWOBIBLY.cjs.map → chunk-MOO75VE4.cjs.map} +1 -1
  32. package/dist/{chunk-CMNGGTQL.cjs → chunk-NXLRY2CE.cjs} +13 -4
  33. package/dist/chunk-NXLRY2CE.cjs.map +1 -0
  34. package/dist/{chunk-4DAG3RFS.js → chunk-OM4OSBOP.js} +4 -4
  35. package/dist/{chunk-4DAG3RFS.js.map → chunk-OM4OSBOP.js.map} +1 -1
  36. package/dist/{chunk-DAAJLUTO.js → chunk-PMRWMRXY.js} +4 -4
  37. package/dist/{chunk-DAAJLUTO.js.map → chunk-PMRWMRXY.js.map} +1 -1
  38. package/dist/{chunk-MOK3E54E.cjs → chunk-QPH5ZKP5.cjs} +32 -32
  39. package/dist/{chunk-MOK3E54E.cjs.map → chunk-QPH5ZKP5.cjs.map} +1 -1
  40. package/dist/{chunk-IUDXKLS4.js → chunk-TFRZOUTV.js} +3 -3
  41. package/dist/{chunk-IUDXKLS4.js.map → chunk-TFRZOUTV.js.map} +1 -1
  42. package/dist/{chunk-QG3U5ONP.js → chunk-Z7VAOK5X.js} +3 -3
  43. package/dist/{chunk-QG3U5ONP.js.map → chunk-Z7VAOK5X.js.map} +1 -1
  44. package/dist/{chunk-W35FVJBC.js → chunk-ZDPIWKWD.js} +9 -5
  45. package/dist/chunk-ZDPIWKWD.js.map +1 -0
  46. package/dist/correlation-id.cjs +11 -11
  47. package/dist/correlation-id.js +3 -3
  48. package/dist/decorators.cjs +5 -5
  49. package/dist/decorators.js +4 -4
  50. package/dist/event.cjs +7 -7
  51. package/dist/event.js +4 -4
  52. package/dist/functional.cjs +11 -11
  53. package/dist/functional.js +4 -4
  54. package/dist/http.cjs +4 -4
  55. package/dist/http.js +3 -3
  56. package/dist/index.cjs +226 -92
  57. package/dist/index.cjs.map +1 -1
  58. package/dist/index.d.cts +67 -3
  59. package/dist/index.d.ts +67 -3
  60. package/dist/index.js +138 -15
  61. package/dist/index.js.map +1 -1
  62. package/dist/instrumentation.cjs +9 -9
  63. package/dist/instrumentation.js +2 -2
  64. package/dist/messaging.cjs +8 -8
  65. package/dist/messaging.js +5 -5
  66. package/dist/semantic-helpers.cjs +9 -9
  67. package/dist/semantic-helpers.js +5 -5
  68. package/dist/webhook.cjs +6 -6
  69. package/dist/webhook.js +4 -4
  70. package/dist/workflow-distributed.cjs +6 -6
  71. package/dist/workflow-distributed.js +4 -4
  72. package/dist/workflow.cjs +9 -9
  73. package/dist/workflow.js +5 -5
  74. package/package.json +43 -45
  75. package/skills/analyze-traces/SKILL.md +178 -0
  76. package/skills/autotel-core/SKILL.md +0 -7
  77. package/skills/autotel-events/SKILL.md +0 -6
  78. package/skills/autotel-frameworks/SKILL.md +0 -9
  79. package/skills/autotel-instrumentation/SKILL.md +0 -7
  80. package/skills/autotel-request-logging/SKILL.md +0 -8
  81. package/skills/autotel-structured-errors/SKILL.md +0 -7
  82. package/skills/build-audit-trails/SKILL.md +302 -0
  83. package/skills/debug-missing-spans/SKILL.md +248 -0
  84. package/skills/migrate-to-autotel/SKILL.md +268 -0
  85. package/skills/review-otel-patterns/SKILL.md +488 -0
  86. package/skills/review-otel-patterns/references/code-review.md +75 -0
  87. package/skills/review-otel-patterns/references/processor-pipeline.md +205 -0
  88. package/skills/review-otel-patterns/references/structured-errors.md +102 -0
  89. package/skills/review-otel-patterns/references/wide-spans.md +85 -0
  90. package/skills/tune-sampling/SKILL.md +210 -0
  91. package/src/attribute-redacting-processor.test.ts +6 -4
  92. package/src/attribute-redacting-processor.ts +11 -2
  93. package/src/drain-toolkit.test.ts +113 -0
  94. package/src/drain-toolkit.ts +129 -0
  95. package/src/enricher-toolkit.test.ts +67 -0
  96. package/src/enricher-toolkit.ts +79 -0
  97. package/src/index.ts +19 -0
  98. package/src/redact-values.test.ts +24 -10
  99. package/src/redact-values.ts +9 -2
  100. package/src/request-logger.test.ts +91 -0
  101. package/src/request-logger.ts +36 -2
  102. package/src/structured-error.test.ts +4 -1
  103. package/bin/intent.js +0 -6
  104. package/dist/chunk-563EL6O6.cjs.map +0 -1
  105. package/dist/chunk-CMNGGTQL.cjs.map +0 -1
  106. package/dist/chunk-TDNKIHKT.js.map +0 -1
  107. package/dist/chunk-W35FVJBC.js.map +0 -1
  108. package/src/package-manifest.test.ts +0 -24
@@ -0,0 +1,205 @@
1
+ # Processor pipeline cookbook
2
+
3
+ Composable building blocks for the autotel pipeline. Each helper is small enough to reason about in isolation and isolates errors so a single bad processor cannot break the others.
4
+
5
+ ## Primitives
6
+
7
+ | Helper | Type | Purpose |
8
+ | ---------------------------- | ----------------- | ------------------------------------------------------------ |
9
+ | `defineConfig(config)` | identity | Authoring helper for typed config |
10
+ | `composeSpanProcessors([…])` | `SpanProcessor` | Fan span lifecycle to multiple processors |
11
+ | `composePostProcessors([…])` | `PostProcessorFn` | Chain post-processors (each sees the output of the previous) |
12
+ | `composeSubscribers([…])` | `EdgeSubscriber` | Fire in-process side effects in order |
13
+
14
+ All of these helpers are exported from `autotel-edge`.
15
+
16
+ ## Multi-backend export
17
+
18
+ ```typescript
19
+ import { BatchSpanProcessor } from 'autotel/processors';
20
+ import { OTLPHttpJsonExporter } from 'autotel/exporters';
21
+ import { composeSpanProcessors, defineConfig } from 'autotel-edge';
22
+
23
+ const honeycomb = new BatchSpanProcessor(
24
+ new OTLPHttpJsonExporter({
25
+ url: 'https://api.honeycomb.io/v1/traces',
26
+ headers: { 'x-honeycomb-team': process.env.HONEYCOMB_KEY! },
27
+ }),
28
+ );
29
+
30
+ const grafana = new BatchSpanProcessor(
31
+ new OTLPHttpJsonExporter({
32
+ url: process.env.GRAFANA_OTLP_URL!,
33
+ headers: { authorization: `Basic ${process.env.GRAFANA_AUTH!}` },
34
+ }),
35
+ );
36
+
37
+ export const config = defineConfig({
38
+ service: { name: 'checkout' },
39
+ spanProcessors: composeSpanProcessors([honeycomb, grafana]),
40
+ });
41
+ ```
42
+
43
+ ## Tail sampling: keep errors + slow + 10% otherwise
44
+
45
+ ```typescript
46
+ import { TailSamplingProcessor } from 'autotel/processors';
47
+ import { composeSpanProcessors } from 'autotel-edge';
48
+
49
+ const tail = new TailSamplingProcessor({
50
+ keep: (trace) => {
51
+ if (trace.localRootSpan.status?.code === SpanStatusCode.ERROR) return true;
52
+ if (trace.localRootSpan.duration[0] >= 1) return true; // ≥ 1s — duration is HrTime [seconds, nanoseconds]
53
+ return Math.random() < 0.1;
54
+ },
55
+ });
56
+
57
+ spanProcessors: composeSpanProcessors([new BatchSpanProcessor(otlp), tail]);
58
+ ```
59
+
60
+ ## Drop noisy spans before they reach the batcher
61
+
62
+ ```typescript
63
+ import { FilteringSpanProcessor } from 'autotel/processors';
64
+
65
+ const dropHealth = new FilteringSpanProcessor({
66
+ exclude: (span) => /^GET \/(healthz|ready)$/.test(span.name),
67
+ });
68
+
69
+ spanProcessors: composeSpanProcessors([
70
+ dropHealth,
71
+ new BatchSpanProcessor(otlp),
72
+ ]);
73
+ ```
74
+
75
+ ## Bound URL cardinality
76
+
77
+ ```typescript
78
+ import { SpanNameNormalizingProcessor } from 'autotel/processors';
79
+
80
+ const normalise = new SpanNameNormalizingProcessor({
81
+ // Replace UUIDs and 24-char hex ids with placeholders
82
+ replacements: [
83
+ {
84
+ match: /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g,
85
+ with: ':id',
86
+ },
87
+ { match: /[0-9a-f]{24}/g, with: ':id' },
88
+ ],
89
+ });
90
+ ```
91
+
92
+ Now `GET /users/123e4567-e89b-12d3-a456-426614174000/orders` becomes `GET /users/:id/orders` in your traces — fewer unique span names, dramatically faster queries.
93
+
94
+ ## Lift baggage onto every span
95
+
96
+ ```typescript
97
+ import { BaggageSpanProcessor } from 'autotel/processors';
98
+
99
+ // The listed baggage keys, when set upstream, become attributes on every child span
100
+ const baggage = new BaggageSpanProcessor({ keys: ['tenant', 'feature_flags'] });
101
+ spanProcessors: composeSpanProcessors([baggage, new BatchSpanProcessor(otlp)]);
102
+ ```
103
+
104
+ ## Subscribers for in-process side effects
105
+
106
+ Subscribers run synchronously in the parent context — ideal for metrics, audit, and cost calculation that you want recorded **before** the span goes to the batcher.
107
+
108
+ ```typescript
109
+ import type { EdgeSubscriber } from 'autotel-edge';
110
+ import { composeSubscribers } from 'autotel-edge';
111
+
112
+ const metricsSubscriber: EdgeSubscriber = (event) => {
113
+ if (
114
+ event.kind === 'span.end' &&
115
+ event.span.attributes['http.response.status_code'] >= 500
116
+ ) {
117
+ metrics.errorCounter.add(1, { route: event.span.name });
118
+ }
119
+ };
120
+
121
+ const auditSubscriber: EdgeSubscriber = (event) => {
122
+ if (event.kind === 'span.end' && event.span.name.startsWith('admin.')) {
123
+ audit.write({
124
+ kind: event.span.name,
125
+ actor: event.span.attributes['user.id'],
126
+ });
127
+ }
128
+ };
129
+
130
+ subscribers: [composeSubscribers([metricsSubscriber, auditSubscriber])];
131
+ ```
132
+
133
+ ## Post-processors for last-mile rewrites
134
+
135
+ Post-processors mutate the array of spans **after** sampling, just before export. Use them for redacting stack traces, dropping fields, or annotating spans with deployment info.
136
+
137
+ ```typescript
138
+ import type { PostProcessorFn } from 'autotel-edge';
139
+ import { composePostProcessors } from 'autotel-edge';
140
+ import { createStringRedactor } from 'autotel';
141
+
142
+ const redactStacks = createStringRedactor('strict');
143
+
144
+ const cleanStacks: PostProcessorFn = (spans) =>
145
+ spans.map((s) => {
146
+ if (typeof s.attributes['exception.stacktrace'] === 'string') {
147
+ s.attributes['exception.stacktrace'] = redactStacks(
148
+ s.attributes['exception.stacktrace'],
149
+ );
150
+ }
151
+ return s;
152
+ });
153
+
154
+ const tagDeploy: PostProcessorFn = (spans) =>
155
+ spans.map((s) => ({
156
+ ...s,
157
+ attributes: { ...s.attributes, 'deploy.id': process.env.RELEASE! },
158
+ }));
159
+
160
+ postProcessor: composePostProcessors([cleanStacks, tagDeploy]);
161
+ ```
162
+
163
+ ## Putting it all together
164
+
165
+ ```typescript
166
+ import {
167
+ defineConfig,
168
+ composeSpanProcessors,
169
+ composeSubscribers,
170
+ composePostProcessors,
171
+ } from 'autotel-edge';
172
+
173
+ export const otelConfig = defineConfig({
174
+ service: { name: 'checkout' },
175
+ attributeRedactor: 'strict',
176
+
177
+ spanProcessors: composeSpanProcessors([
178
+ dropHealth, // 1. drop spans we never want
179
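+ normalise, // 2. bound cardinality (SpanNameNormalizingProcessor from "Bound URL cardinality")
180
+ honeycomb, // already a BatchSpanProcessor (see "Multi-backend export")
181
+ grafana, // already a BatchSpanProcessor (see "Multi-backend export")
182
+ tail, // 3. keep errors + slow + 10%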
183
+ ]),
184
+
185
+ subscribers: [
186
+ composeSubscribers([metricsSubscriber, auditSubscriber, aiCostSubscriber]),
187
+ ],
188
+
189
+ postProcessor: composePostProcessors([cleanStacks, tagDeploy]),
190
+ });
191
+ ```
192
+
193
+ ## Error isolation
194
+
195
+ Every compose helper catches errors per item and logs to `console.error` with the helper name. A single bad processor cannot break the others — important when one of your subscribers is a third-party integration (Datadog, PagerDuty, …) that can rate-limit or 502.
196
+
197
+ ## Choosing between subscribers and post-processors
198
+
199
+ | You want… | Use |
200
+ | ----------------------------------------- | ---------------------------------------------------------- |
201
+ | Mutate exported span attributes | `postProcessor` |
202
+ | Drop spans entirely | `FilteringSpanProcessor` (early) or tail sampler |
203
+ | Update an in-process metric on every span | `subscribers` |
204
+ | Send an audit log to a DB | `subscribers` (use `log.fork('audit')` if writes are slow) |
205
+ | Re-emit spans to a second backend | second `BatchSpanProcessor` in `composeSpanProcessors` |
@@ -0,0 +1,102 @@
1
+ # Structured errors
2
+
3
+ `createStructuredError` produces an `Error` carrying enough context to be:
4
+
5
+ - **Recorded onto the active span** (`exception.type`, `exception.message`, `exception.stacktrace`, `span.status = ERROR`).
6
+ - **Returned to clients safely** (`internal` is stripped by `parseError`).
7
+ - **Self-documenting** (`why` explains the cause, `fix` tells the caller what to do, `link` points at runbook docs).
8
+
9
+ ## Field reference
10
+
11
+ | Field | Audience | Purpose |
12
+ | ---------- | ----------- | ---------------------------------------------------------- |
13
+ | `message` | Both | Short, stable summary |
14
+ | `status` | Both | HTTP status (drives client behaviour and span status code) |
15
+ | `why` | Both | Human-readable cause (`"Card declined by issuer"`) |
16
+ | `fix` | Client | Remediation hint (`"Use a different payment method"`) |
17
+ | `link` | Client | URL to docs / runbook |
18
+ | `code` | Both | Machine-readable code (`"PAYMENT_DECLINED"`) |
19
+ | `cause` | Server only | The underlying error |
20
+ | `internal` | Server only | Diagnostic metadata (`{ correlationId, resourceId }`) |
21
+ | `details` | Both | Structured payload (e.g. validation errors per field) |
22
+
23
+ ## Templates
24
+
25
+ ### Validation (400)
26
+
27
+ ```typescript
28
+ throw createStructuredError({
29
+ status: 400,
30
+ code: 'VALIDATION_ERROR',
31
+ message: 'Invalid request body',
32
+ why: 'One or more fields failed validation',
33
+ fix: 'Check the `details` field for per-field errors',
34
+ details: { email: 'must be a valid email', age: 'must be ≥ 18' },
35
+ });
36
+ ```
37
+
38
+ ### Auth (401 / 403)
39
+
40
+ ```typescript
41
+ throw createStructuredError({
42
+ status: 403,
43
+ code: 'FORBIDDEN',
44
+ message: 'Not allowed',
45
+ why: 'You do not have access to this resource',
46
+ fix: 'Ask the workspace owner for access',
47
+ link: 'https://docs.example.com/permissions',
48
+ internal: { resourceId: 'proj_123', userRole: 'member' },
49
+ });
50
+ ```
51
+
52
+ ### Payment (402)
53
+
54
+ ```typescript
55
+ throw createStructuredError({
56
+ status: 402,
57
+ code: 'PAYMENT_DECLINED',
58
+ message: 'Payment declined',
59
+ why: 'Card declined by issuer — insufficient funds',
60
+ fix: 'Use a different payment method or contact your bank',
61
+ link: 'https://docs.example.com/payments/declined',
62
+ cause: stripeError,
63
+ internal: { stripeChargeId: 'ch_…', riskScore: stripeError.risk_level },
64
+ });
65
+ ```
66
+
67
+ ### Upstream failure (502 / 503 / 504)
68
+
69
+ ```typescript
70
+ throw createStructuredError({
71
+ status: 502,
72
+ code: 'UPSTREAM_FAILED',
73
+ message: 'Inventory service is unavailable',
74
+ why: 'Could not reach the inventory service',
75
+ fix: 'Retry in a few minutes',
76
+ cause: fetchError,
77
+ internal: { upstream: 'inventory-svc', retryAttempt: 3 },
78
+ });
79
+ ```
80
+
81
+ ## At HTTP boundaries
82
+
83
+ ```typescript
84
+ import { parseError } from 'autotel';
85
+
86
+ app.onError((error, c) => {
87
+ // span.status is already ERROR with exception fields recorded
88
+ const parsed = parseError(error);
89
+ // `internal` and `cause` are stripped here — never leak them to clients
90
+ return c.json(parsed, parsed.status);
91
+ });
92
+ ```
93
+
94
+ ## Anti-patterns
95
+
96
+ | Anti-pattern | Fix |
97
+ | ------------------------------------------------------------ | -------------------------------------------------------------------------------------- |
98
+ | `throw new Error('something went wrong')` | `createStructuredError({ message, status, why, fix })` |
99
+ | Putting support IDs in `message` (`"Failed for user 42"`) | Use `internal: { userId: 42 }` |
100
+ | Returning `details: { error: stack }` to clients | Stack traces stay in `cause` / span; never serialise them out |
101
+ | `console.error(e); throw e` | Just throw — autotel's span will pick up the exception |
102
+ | Two callers throwing different shapes for the same condition | Centralise: `function declined(reason: string) { throw createStructuredError({ … }) }` |
@@ -0,0 +1,85 @@
1
+ # Designing wide spans
2
+
3
+ A wide span is a single span per logical unit of work (request, job, message, fork) carrying _all_ the fields you'd ever want to filter or group by. autotel lets you build them with `useLogger().set({ … })` — fields are flattened to OTel attributes with stable dotted keys.
4
+
5
+ ## Anatomy
6
+
7
+ ```typescript
8
+ import { useLogger } from 'autotel';
9
+
10
+ export const POST = withAutotel(async (request) => {
11
+ const log = useLogger();
12
+
13
+ // Identity
14
+ log.set({ user: { id: 'usr_123', plan: 'enterprise', role: 'admin' } });
15
+
16
+ // Inputs
17
+ log.set({ cart: { items: 3, total: 14_999, currency: 'USD' } });
18
+
19
+ // Decisions / branches
20
+ log.set({ promo: { applied: 'SUMMER10', discount: 1_500 } });
21
+
22
+ // Outputs
23
+ log.set({
24
+ payment: { provider: 'stripe', method: 'card', authCode: 'auth_x' },
25
+ });
26
+
27
+ return Response.json({ ok: true });
28
+ });
29
+ ```
30
+
31
+ OTel attributes recorded:
32
+
33
+ ```
34
+ user.id=usr_123
35
+ user.plan=enterprise
36
+ user.role=admin
37
+ cart.items=3
38
+ cart.total=14999
39
+ cart.currency=USD
40
+ promo.applied=SUMMER10
41
+ promo.discount=1500
42
+ payment.provider=stripe
43
+ payment.method=card
44
+ payment.authCode=auth_x
45
+ ```
46
+
47
+ ## Rules of thumb
48
+
49
+ 1. **One wide span per logical unit of work.** Many tiny spans hurt query speed; deep call trees can be opt-in (`autotel-drizzle`, `autotel-mongoose`).
50
+ 2. **Group with objects.** `{ user: { id, plan } }` not `userId` / `userPlan`. The flatten step keeps the key shape stable.
51
+ 3. **Capture decisions, not just inputs.** Which branch ran, which promo applied, which fallback fired.
52
+ 4. **Keep cardinality bounded.** Don't put per-request UUIDs in `span.name`; use `SpanNameNormalizingProcessor`. Free-text labels go in attributes.
53
+ 5. **Avoid raw bodies.** Pick the shape: `{ user: { id, plan } }` — never `log.set({ user: requestBody })`.
54
+ 6. **Trust the redactor.** PII you forgot to think about (emails, JWTs, cards) gets masked in production. See `attributeRedactor: 'default'`.
55
+
56
+ ## When you need correlated child spans
57
+
58
+ Use `trace()` to wrap discrete sub-operations whose duration matters:
59
+
60
+ ```typescript
61
+ import { trace } from 'autotel';
62
+
63
+ const fetchInventory = trace(async (sku: string) => {
64
+ /* … */
65
+ });
66
+ const reserveStock = trace(async (sku: string, qty: number) => {
67
+ /* … */
68
+ });
69
+
70
+ await fetchInventory(sku);
71
+ await reserveStock(sku, qty);
72
+ ```
73
+
74
+ Each gets its own span with the function name; both are children of the active request span.
75
+
76
+ ## When you need background work
77
+
78
+ `log.fork('label', fn)` spawns a child span that emits its own wide event with `_parentCorrelationId` set, even after the parent response has been returned. Pass `lifecycle.onChildEnter / onChildExit` if your framework tracks active loggers (Elysia, etc.).
79
+
80
+ ```typescript
81
+ log.fork('audit-write', async () => {
82
+ await audit.write({ kind: 'order.created', orderId });
83
+ });
84
+ return Response.json({ ok: true }); // parent returns immediately
85
+ ```
@@ -0,0 +1,210 @@
1
+ ---
2
+ name: tune-sampling
3
+ description: >
4
+ Choose a sampling strategy for an autotel-instrumented service. Covers
5
+ head sampling (per-span-kind rates, parent-based, ratio), tail sampling
6
+ (keep errors, slow, AI-aware, debug-headers), cost vs cardinality
7
+ tradeoffs, and the math for picking rates that hit a target spans/second
8
+ budget. Includes recipes for low-volume admin services, high-volume APIs,
9
+ AI agents, and Cloudflare Workers.
10
+ license: MIT
11
+ ---
12
+
13
+ # Tune sampling
14
+
15
+ Untuned tracing is either expensive (100 % at scale costs money + drowns dashboards) or unhelpful (1 % loses the failure modes you need to see). The right answer is almost always **head sample most of the boring traffic, tail keep all the interesting traffic**, with explicit overrides for AI calls and customer escalations.
16
+
17
+ ## When to use
18
+
19
+ - Hitting your observability budget
20
+ - Dashboards too sparse to spot anomalies
21
+ - "We have the trace IDs but the spans are gone" complaints
22
+ - New service launching at scale
23
+ - Long-running AI agents producing 50+ spans per request
24
+
25
+ ## The mental model
26
+
27
+ ```
28
+ Total cost = (spans/sec × $/span) + (storage_GB × $/GB-month)
29
+
30
+ Head sampling reduces this directly.
31
+ ```
32
+
33
+ Head sampling makes a decision **at span start** — fast, but coarse (it doesn't know if the span will fail).
34
+ Tail sampling makes the decision **at span end** — slower, more storage upfront, but precise.
35
+
36
+ The right mix:
37
+
38
+ - **Head sample at the entry point** to keep volume tractable.
39
+ - **Tail keep** the high-value subset (errors, slow, AI, debug-headered).
40
+ - **Don't sample audit spans** — separate processor, see [`build-audit-trails`](../build-audit-trails/SKILL.md).
41
+
42
+ ## Head sampling recipes
43
+
44
+ ### Default for a typical web service
45
+
46
+ ```typescript
47
+ init({
48
+ service: 'my-app',
49
+ sampling: {
50
+ rates: {
51
+ server: 25, // server entry spans — sample ¼
52
+ client: 5, // outbound HTTP — sample 1/20
53
+ internal: 5, // internal sub-spans — sample 1/20
54
+ },
55
+ },
56
+ });
57
+ ```
58
+
59
+ Children of a sampled root are **all** kept (parent-based propagation is the default). So `server: 25` means 25 % of _user requests_ are sampled, and each sampled request is recorded as a complete trace.
60
+
61
+ ### High-volume API (>1 k req/s)
62
+
63
+ ```typescript
64
+ sampling: {
65
+ rates: { server: 5, client: 1, internal: 1 }, // 5 % → tail keeps errors anyway
66
+ tail: keepInterestingTraces,
67
+ },
68
+ ```
69
+
70
+ ### Low-volume admin / internal service (<10 req/s)
71
+
72
+ 100 % is fine. Don't penalise yourself for a service that produces 1 GB of traces a week.
73
+
74
+ ### Cloudflare Workers (per-colo budget)
75
+
76
+ Workers run distributed — head sampling is your friend because there's no central queue:
77
+
78
+ ```typescript
79
+ defineWorkerFetch(
80
+ {
81
+ service: { name: 'edge' },
82
+ sampling: { rates: { server: 10 } }, // 10 % per colo, scales naturally
83
+ },
84
+ handler,
85
+ );
86
+ ```
87
+
88
+ ## Tail sampling — keep interesting traces
89
+
90
+ Tail sampling looks at the full trace (root span + children) before deciding. autotel ships `TailSamplingProcessor`:
91
+
92
+ ```typescript
93
+ import { TailSamplingProcessor } from 'autotel/processors';
94
+ import { SpanStatusCode } from '@opentelemetry/api';
95
+
96
+ const tail = new TailSamplingProcessor({
97
+ keep: (trace) => {
98
+ // 1. Always keep errors
99
+ if (trace.localRootSpan.status?.code === SpanStatusCode.ERROR) return true;
100
+ if (trace.spans.some((s) => s.status?.code === SpanStatusCode.ERROR))
101
+ return true;
102
+
103
+ // 2. Always keep slow traces (configurable threshold)
104
+ if (durationMs(trace.localRootSpan) > 1_000) return true;
105
+
106
+ // 3. Always keep customer-marked traces
107
+ if (trace.localRootSpan.attributes['debug.trace'] === true) return true;
108
+
109
+ // 4. Always keep AI traces (rare + expensive — full visibility helps)
110
+ if (
111
+ trace.spans.some((s) => typeof s.attributes['gen_ai.system'] === 'string')
112
+ )
113
+ return true;
114
+
115
+ // 5. Otherwise: respect head sampling decision
116
+ return false;
117
+ },
118
+ });
119
+ ```
120
+
121
+ ### Combining with multi-backend
122
+
123
+ ```typescript
124
+ spanProcessors: composeSpanProcessors([
125
+ // Drop nothing here — we want the tail processor to see the full trace
126
+ new BatchSpanProcessor(localExporter),
127
+ tail, // filters before remote export
128
+ new BatchSpanProcessor(expensiveRemoteExporter),
129
+ ]);
130
+ ```
131
+
132
+ ## AI / LLM-aware sampling
133
+
134
+ LLM calls produce 5–50 spans per request and are 100× more expensive than a typical handler call. Tradeoffs:
135
+
136
+ - **Don't head-sample AI handlers below 50 %** — debugging "why did the model loop" requires the full chain.
137
+ - **Always tail-keep AI traces** — the `gen_ai.*` attributes flag them.
138
+ - **Cost-aware sampling** — keep all calls above a $ threshold:
139
+
140
+ ```typescript
141
+ keep: (trace) => {
142
+ const cost = trace.spans.reduce(
143
+ (acc, s) =>
144
+ acc +
145
+ (typeof s.attributes['gen_ai.cost.usd'] === 'number'
146
+ ? (s.attributes['gen_ai.cost.usd'] as number)
147
+ : 0),
148
+ 0,
149
+ );
150
+ if (cost > 0.1) return true; // any trace > $0.10 → keep
151
+ if (cost > 0.01) return Math.random() < 0.5; // > $0.01 → 50 %
152
+ return Math.random() < 0.1; // ≤ $0.01 → 10 %
153
+ };
154
+ ```
155
+
156
+ ## Customer-driven sampling (debug header)
157
+
158
+ Let support flip on full tracing per request:
159
+
160
+ ```typescript
161
+ const tail = new TailSamplingProcessor({
162
+ keep: (trace) => trace.localRootSpan.attributes['x-debug-trace'] === '1' || /* … */,
163
+ })
164
+ ```
165
+
166
+ In your middleware:
167
+
168
+ ```typescript
169
+ if (request.headers.get('x-debug-trace') === '1') {
170
+ useLogger().set({ 'x-debug-trace': '1' });
171
+ }
172
+ ```
173
+
174
+ Now any user can mark a request as "trace this fully" by sending the header — invaluable for reproducing customer reports.
175
+
176
+ ## Sizing the rate
177
+
178
+ Target volume:
179
+
180
+ ```
181
+ spans/sec ≈ requests/sec × spans_per_request × head_rate × tail_keep_rate
182
+ ```
183
+
184
+ Worked example for a 100 req/s API with 8 spans/req:
185
+
186
+ | Head rate | Tail keep | Result |
187
+ | --------- | -------------------------------- | -------------------------------------------- |
188
+ | 100 % | 100 % | 800 spans/sec — expensive |
189
+ | 10 % | 100 % (errors + slow + AI ≈ 5 %) | ≈ 110 spans/sec — sweet spot |
190
+ | 1 % | 100 % | ≈ 18 spans/sec — too sparse for p99 alerting |
191
+
192
+ For per-vendor pricing:
193
+
194
+ - **Honeycomb**: $0.000005 / event for paid plans. 110 spans/sec × 86 400 s = 9.5 M events/day = $48/day.
195
+ - **Datadog APM**: ~$1.27/M spans ingested (varies by region). Same volume → ~$12/day.
196
+ - **Grafana Cloud**: 100 GB free tier; 110 spans/sec ≈ 5 GB/day.
197
+
198
+ ## Anti-patterns
199
+
200
+ | Anti-pattern | Fix |
201
+ | -------------------------------------------- | ------------------------------------------------------------- |
202
+ | 100 % sampling at scale "to be safe" | You're paying 10–100× without proportional value |
203
+ | 1 % sampling with no tail keep | You'll miss every interesting failure |
204
+ | Forgetting to tail-keep errors               | Error traces get sampled out → silent customer pain           |
205
+ | Same rate for `server` and `internal` | Internal sub-spans are 5–20× more numerous; sample harder |
206
+ | Ratio-based sampling on service entry point | Use parent-based — children of a sampled trace stay together |
207
+ | Head-sampling AI calls below 50 % | Debugging tool loops requires the full chain |
208
+ | Audit spans subject to sampling | Route them to a separate processor (see `build-audit-trails`) |
209
+ | Tail processor before exporter (loses spans) | Tail processor goes between head sampler and remote exporter |
210
+ | Rate-by-route hand-coded in handlers | Use head sampler + tail keep — declarative, one place |
@@ -184,8 +184,9 @@ describe('AttributeRedactingProcessor', () => {
184
184
  });
185
185
  processor.onEnd(span);
186
186
 
187
+ // SSN has no smart mask; falls back to the default replacement.
187
188
  expect(mockProcessor.endedSpans[0]!.attributes['user.ssn']).toBe(
188
- '*******89',
189
+ '[REDACTED]',
189
190
  );
190
191
  });
191
192
 
@@ -199,8 +200,9 @@ describe('AttributeRedactingProcessor', () => {
199
200
  });
200
201
  processor.onEnd(span);
201
202
 
203
+ // PCI-DSS compliant: last 4 digits preserved.
202
204
  expect(mockProcessor.endedSpans[0]!.attributes['payment.card']).toBe(
203
- '**************11',
205
+ '****1111',
204
206
  );
205
207
  });
206
208
 
@@ -710,12 +712,12 @@ describe('edge cases', () => {
710
712
  });
711
713
 
712
714
  const span = createMockReadableSpan({
713
- contacts: 'Email: john@example.com, Phone: 555-123-4567',
715
+ contacts: 'Email: john@example.com, Phone: +1 555-123-4567',
714
716
  });
715
717
  processor.onEnd(span);
716
718
 
717
719
  expect(mockProcessor.endedSpans[0]!.attributes.contacts).toBe(
718
- 'Email: j***@***.com, Phone: ********67',
720
+ 'Email: j***@***.com, Phone: +1******67',
719
721
  );
720
722
  });
721
723
  });
@@ -147,10 +147,19 @@ export const builtinPatterns = {
147
147
  /\b(?!0\.0\.0\.0\b)(?!127\.0\.0\.1\b)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
148
148
  mask: (m: string) => `***.***.***.${m.split('.').pop()}`,
149
149
  },
150
- /** International phone numbers → +33******78 (country code + last 2 digits) */
150
+ /**
151
+ * International / formatted phone numbers.
152
+ *
153
+ * Matches:
154
+ * - `+33 1 23 45 67 89` -> `+33******89`
155
+ * - `(415) 555-1234` -> `********34`
156
+ * - `555-123-4567` / `555.123.4567` / `5551234567` -> `********67`
157
+ *
158
+ * Bare short digit runs like `12345678` are intentionally not matched.
159
+ */
151
160
  phone: {
152
161
  pattern:
153
- /(?:\+\d{1,3}[\s.-]?)?\(?\d{1,4}\)?[\s.-]?\d{2,4}[\s.-]?\d{2,4}[\s.-]?\d{2,4}\b/g,
162
+ /(?:\+\d{1,3}[\s.-]?\(?\d{1,4}\)?(?:[\s.-]?\d{2,4}){2,4}|\(\d{1,4}\)(?:[\s.-]?\d{2,4}){2,4}|\b\d{3}[-.]?\d{3}[-.]?\d{4}\b)/g,
154
163
  mask: (m: string) => {
155
164
  const digits = m.replace(/[^\d]/g, '');
156
165
  const hasPlus = m.startsWith('+');