autotel 3.0.0 → 3.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -4
- package/dist/attribute-redacting-processor.cjs +8 -8
- package/dist/attribute-redacting-processor.d.cts +10 -1
- package/dist/attribute-redacting-processor.d.ts +10 -1
- package/dist/attribute-redacting-processor.js +1 -1
- package/dist/attributes.cjs +21 -21
- package/dist/attributes.js +2 -2
- package/dist/auto.cjs +3 -3
- package/dist/auto.js +2 -2
- package/dist/{chunk-IUDXKLS4.js → chunk-34X3TKHA.js} +3 -3
- package/dist/{chunk-IUDXKLS4.js.map → chunk-34X3TKHA.js.map} +1 -1
- package/dist/{chunk-3QMFLJHJ.js → chunk-4LF6FV2V.js} +3 -3
- package/dist/{chunk-3QMFLJHJ.js.map → chunk-4LF6FV2V.js.map} +1 -1
- package/dist/{chunk-L7JDUDJD.cjs → chunk-AAYCDHH6.cjs} +7 -7
- package/dist/{chunk-L7JDUDJD.cjs.map → chunk-AAYCDHH6.cjs.map} +1 -1
- package/dist/{chunk-DWOBIBLY.cjs → chunk-AY2SY3MO.cjs} +5 -5
- package/dist/{chunk-DWOBIBLY.cjs.map → chunk-AY2SY3MO.cjs.map} +1 -1
- package/dist/{chunk-563EL6O6.cjs → chunk-BPO2PQ3T.cjs} +12 -8
- package/dist/chunk-BPO2PQ3T.cjs.map +1 -0
- package/dist/{chunk-ZSABTI3C.cjs → chunk-DAZ7EGR4.cjs} +17 -17
- package/dist/{chunk-ZSABTI3C.cjs.map → chunk-DAZ7EGR4.cjs.map} +1 -1
- package/dist/{chunk-ER43K7ES.js → chunk-DDXIUZEG.js} +3 -3
- package/dist/{chunk-ER43K7ES.js.map → chunk-DDXIUZEG.js.map} +1 -1
- package/dist/{chunk-JKIMEPI2.cjs → chunk-DQ2SUROF.cjs} +4 -4
- package/dist/{chunk-JKIMEPI2.cjs.map → chunk-DQ2SUROF.cjs.map} +1 -1
- package/dist/{chunk-DAAJLUTO.js → chunk-F3TNRW2P.js} +6 -5
- package/dist/chunk-F3TNRW2P.js.map +1 -0
- package/dist/{chunk-7HNQYHK4.js → chunk-HBLWOI6P.js} +3 -3
- package/dist/{chunk-7HNQYHK4.js.map → chunk-HBLWOI6P.js.map} +1 -1
- package/dist/{chunk-TDNKIHKT.js → chunk-JVWJDHDB.js} +13 -4
- package/dist/chunk-JVWJDHDB.js.map +1 -0
- package/dist/{chunk-CJ4PD2TZ.cjs → chunk-KKGM42RQ.cjs} +13 -13
- package/dist/{chunk-CJ4PD2TZ.cjs.map → chunk-KKGM42RQ.cjs.map} +1 -1
- package/dist/{chunk-KHGA4OST.cjs → chunk-LMFPZHI4.cjs} +5 -5
- package/dist/{chunk-KHGA4OST.cjs.map → chunk-LMFPZHI4.cjs.map} +1 -1
- package/dist/{chunk-CMNGGTQL.cjs → chunk-NXLRY2CE.cjs} +13 -4
- package/dist/chunk-NXLRY2CE.cjs.map +1 -0
- package/dist/{chunk-4DAG3RFS.js → chunk-OM4OSBOP.js} +4 -4
- package/dist/{chunk-4DAG3RFS.js.map → chunk-OM4OSBOP.js.map} +1 -1
- package/dist/{chunk-MOK3E54E.cjs → chunk-WSGAHSZQ.cjs} +34 -33
- package/dist/chunk-WSGAHSZQ.cjs.map +1 -0
- package/dist/{chunk-QG3U5ONP.js → chunk-Z7VAOK5X.js} +3 -3
- package/dist/{chunk-QG3U5ONP.js.map → chunk-Z7VAOK5X.js.map} +1 -1
- package/dist/{chunk-W35FVJBC.js → chunk-ZDPIWKWD.js} +9 -5
- package/dist/chunk-ZDPIWKWD.js.map +1 -0
- package/dist/correlation-id.cjs +11 -11
- package/dist/correlation-id.js +3 -3
- package/dist/decorators.cjs +5 -5
- package/dist/decorators.js +4 -4
- package/dist/event.cjs +7 -7
- package/dist/event.js +4 -4
- package/dist/functional.cjs +11 -11
- package/dist/functional.d.cts +20 -17
- package/dist/functional.d.ts +20 -17
- package/dist/functional.js +4 -4
- package/dist/http.cjs +4 -4
- package/dist/http.js +3 -3
- package/dist/index.cjs +226 -92
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -3
- package/dist/index.d.ts +67 -3
- package/dist/index.js +138 -15
- package/dist/index.js.map +1 -1
- package/dist/instrumentation.cjs +9 -9
- package/dist/instrumentation.js +2 -2
- package/dist/messaging.cjs +8 -8
- package/dist/messaging.js +5 -5
- package/dist/semantic-helpers.cjs +9 -9
- package/dist/semantic-helpers.js +5 -5
- package/dist/webhook.cjs +6 -6
- package/dist/webhook.js +4 -4
- package/dist/workflow-distributed.cjs +6 -6
- package/dist/workflow-distributed.js +4 -4
- package/dist/workflow.cjs +9 -9
- package/dist/workflow.js +5 -5
- package/package.json +43 -45
- package/skills/analyze-traces/SKILL.md +178 -0
- package/skills/autotel-core/SKILL.md +0 -7
- package/skills/autotel-events/SKILL.md +0 -6
- package/skills/autotel-frameworks/SKILL.md +0 -9
- package/skills/autotel-instrumentation/SKILL.md +0 -7
- package/skills/autotel-request-logging/SKILL.md +0 -8
- package/skills/autotel-structured-errors/SKILL.md +0 -7
- package/skills/build-audit-trails/SKILL.md +302 -0
- package/skills/debug-missing-spans/SKILL.md +248 -0
- package/skills/migrate-to-autotel/SKILL.md +268 -0
- package/skills/review-otel-patterns/SKILL.md +488 -0
- package/skills/review-otel-patterns/references/code-review.md +75 -0
- package/skills/review-otel-patterns/references/processor-pipeline.md +205 -0
- package/skills/review-otel-patterns/references/structured-errors.md +102 -0
- package/skills/review-otel-patterns/references/wide-spans.md +85 -0
- package/skills/tune-sampling/SKILL.md +210 -0
- package/src/attribute-redacting-processor.test.ts +6 -4
- package/src/attribute-redacting-processor.ts +11 -2
- package/src/drain-toolkit.test.ts +113 -0
- package/src/drain-toolkit.ts +129 -0
- package/src/enricher-toolkit.test.ts +67 -0
- package/src/enricher-toolkit.ts +79 -0
- package/src/functional.test.ts +18 -0
- package/src/functional.ts +32 -20
- package/src/index.ts +19 -0
- package/src/redact-values.test.ts +24 -10
- package/src/redact-values.ts +9 -2
- package/src/request-logger.test.ts +91 -0
- package/src/request-logger.ts +36 -2
- package/src/structured-error.test.ts +4 -1
- package/bin/intent.js +0 -6
- package/dist/chunk-563EL6O6.cjs.map +0 -1
- package/dist/chunk-CMNGGTQL.cjs.map +0 -1
- package/dist/chunk-DAAJLUTO.js.map +0 -1
- package/dist/chunk-MOK3E54E.cjs.map +0 -1
- package/dist/chunk-TDNKIHKT.js.map +0 -1
- package/dist/chunk-W35FVJBC.js.map +0 -1
- package/src/package-manifest.test.ts +0 -24
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
# Processor pipeline cookbook
|
|
2
|
+
|
|
3
|
+
Composable building blocks for the autotel pipeline. Each helper is small enough to reason about in isolation and isolates errors so a single bad processor cannot break the others.
|
|
4
|
+
|
|
5
|
+
## Primitives
|
|
6
|
+
|
|
7
|
+
| Helper | Type | Purpose |
|
|
8
|
+
| ---------------------------- | ----------------- | ------------------------------------------------------------ |
|
|
9
|
+
| `defineConfig(config)` | identity | Authoring helper for typed config |
|
|
10
|
+
| `composeSpanProcessors([…])` | `SpanProcessor` | Fan span lifecycle to multiple processors |
|
|
11
|
+
| `composePostProcessors([…])` | `PostProcessorFn` | Chain post-processors (each sees the output of the previous) |
|
|
12
|
+
| `composeSubscribers([…])` | `EdgeSubscriber` | Fire in-process side effects in order |
|
|
13
|
+
|
|
14
|
+
All from `autotel-edge`.
|
|
15
|
+
|
|
16
|
+
## Multi-backend export
|
|
17
|
+
|
|
18
|
+
```typescript
|
|
19
|
+
import { BatchSpanProcessor } from 'autotel/processors';
|
|
20
|
+
import { OTLPHttpJsonExporter } from 'autotel/exporters';
|
|
21
|
+
import { composeSpanProcessors, defineConfig } from 'autotel-edge';
|
|
22
|
+
|
|
23
|
+
const honeycomb = new BatchSpanProcessor(
|
|
24
|
+
new OTLPHttpJsonExporter({
|
|
25
|
+
url: 'https://api.honeycomb.io/v1/traces',
|
|
26
|
+
headers: { 'x-honeycomb-team': process.env.HONEYCOMB_KEY! },
|
|
27
|
+
}),
|
|
28
|
+
);
|
|
29
|
+
|
|
30
|
+
const grafana = new BatchSpanProcessor(
|
|
31
|
+
new OTLPHttpJsonExporter({
|
|
32
|
+
url: process.env.GRAFANA_OTLP_URL!,
|
|
33
|
+
headers: { authorization: `Basic ${process.env.GRAFANA_AUTH!}` },
|
|
34
|
+
}),
|
|
35
|
+
);
|
|
36
|
+
|
|
37
|
+
export const config = defineConfig({
|
|
38
|
+
service: { name: 'checkout' },
|
|
39
|
+
spanProcessors: composeSpanProcessors([honeycomb, grafana]),
|
|
40
|
+
});
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Tail sampling: keep errors + slow + 10% otherwise
|
|
44
|
+
|
|
45
|
+
```typescript
|
|
46
|
+
import { TailSamplingProcessor } from 'autotel/processors';
|
|
47
|
+
import { composeSpanProcessors } from 'autotel-edge';
|
|
48
|
+
|
|
49
|
+
const tail = new TailSamplingProcessor({
|
|
50
|
+
keep: (trace) => {
|
|
51
|
+
if (trace.localRootSpan.status?.code === SpanStatusCode.ERROR) return true;
|
|
52
|
+
if (trace.localRootSpan.duration[0] >= 1) return true; // ≥ 1s (duration is HrTime: [seconds, nanoseconds])
|
|
53
|
+
return Math.random() < 0.1;
|
|
54
|
+
},
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
spanProcessors: composeSpanProcessors([new BatchSpanProcessor(otlp), tail]);
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Drop noisy spans before they reach the batcher
|
|
61
|
+
|
|
62
|
+
```typescript
|
|
63
|
+
import { FilteringSpanProcessor } from 'autotel/processors';
|
|
64
|
+
|
|
65
|
+
const dropHealth = new FilteringSpanProcessor({
|
|
66
|
+
exclude: (span) => /^GET \/(healthz|ready)$/.test(span.name),
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
spanProcessors: composeSpanProcessors([
|
|
70
|
+
dropHealth,
|
|
71
|
+
new BatchSpanProcessor(otlp),
|
|
72
|
+
]);
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Bound URL cardinality
|
|
76
|
+
|
|
77
|
+
```typescript
|
|
78
|
+
import { SpanNameNormalizingProcessor } from 'autotel/processors';
|
|
79
|
+
|
|
80
|
+
const normalise = new SpanNameNormalizingProcessor({
|
|
81
|
+
// Replace UUIDs and 24-char hex ids with placeholders
|
|
82
|
+
replacements: [
|
|
83
|
+
{
|
|
84
|
+
match: /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/g,
|
|
85
|
+
with: ':id',
|
|
86
|
+
},
|
|
87
|
+
{ match: /[0-9a-f]{24}/g, with: ':id' },
|
|
88
|
+
],
|
|
89
|
+
});
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Now `GET /users/123e4567-e89b-12d3-a456-426614174000/orders` becomes `GET /users/:id/orders` in your traces — fewer unique span names, dramatically faster queries.
|
|
93
|
+
|
|
94
|
+
## Lift baggage onto every span
|
|
95
|
+
|
|
96
|
+
```typescript
|
|
97
|
+
import { BaggageSpanProcessor } from 'autotel/processors';
|
|
98
|
+
|
|
99
|
+
// Anything placed in baggage upstream becomes an attribute on every child span
|
|
100
|
+
const baggage = new BaggageSpanProcessor({ keys: ['tenant', 'feature_flags'] });
|
|
101
|
+
spanProcessors: composeSpanProcessors([baggage, new BatchSpanProcessor(otlp)]);
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Subscribers for in-process side effects
|
|
105
|
+
|
|
106
|
+
Subscribers run synchronously in the parent context — ideal for metrics, audit, and cost calculation that you want recorded **before** the span goes to the batcher.
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
import type { EdgeSubscriber } from 'autotel-edge';
|
|
110
|
+
import { composeSubscribers } from 'autotel-edge';
|
|
111
|
+
|
|
112
|
+
const metricsSubscriber: EdgeSubscriber = (event) => {
|
|
113
|
+
if (
|
|
114
|
+
event.kind === 'span.end' &&
|
|
115
|
+
event.span.attributes['http.response.status_code'] >= 500
|
|
116
|
+
) {
|
|
117
|
+
metrics.errorCounter.add(1, { route: event.span.name });
|
|
118
|
+
}
|
|
119
|
+
};
|
|
120
|
+
|
|
121
|
+
const auditSubscriber: EdgeSubscriber = (event) => {
|
|
122
|
+
if (event.kind === 'span.end' && event.span.name.startsWith('admin.')) {
|
|
123
|
+
audit.write({
|
|
124
|
+
kind: event.span.name,
|
|
125
|
+
actor: event.span.attributes['user.id'],
|
|
126
|
+
});
|
|
127
|
+
}
|
|
128
|
+
};
|
|
129
|
+
|
|
130
|
+
subscribers: [composeSubscribers([metricsSubscriber, auditSubscriber])];
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## Post-processors for last-mile rewrites
|
|
134
|
+
|
|
135
|
+
Post-processors mutate the array of spans **after** sampling, just before export. Use for redacting stack traces, dropping fields, or annotating with deployment info.
|
|
136
|
+
|
|
137
|
+
```typescript
|
|
138
|
+
import type { PostProcessorFn } from 'autotel-edge';
|
|
139
|
+
import { composePostProcessors } from 'autotel-edge';
|
|
140
|
+
import { createStringRedactor } from 'autotel';
|
|
141
|
+
|
|
142
|
+
const redactStacks = createStringRedactor('strict');
|
|
143
|
+
|
|
144
|
+
const cleanStacks: PostProcessorFn = (spans) =>
|
|
145
|
+
spans.map((s) => {
|
|
146
|
+
if (typeof s.attributes['exception.stacktrace'] === 'string') {
|
|
147
|
+
s.attributes['exception.stacktrace'] = redactStacks(
|
|
148
|
+
s.attributes['exception.stacktrace'],
|
|
149
|
+
);
|
|
150
|
+
}
|
|
151
|
+
return s;
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
const tagDeploy: PostProcessorFn = (spans) =>
|
|
155
|
+
spans.map((s) => ({
|
|
156
|
+
...s,
|
|
157
|
+
attributes: { ...s.attributes, 'deploy.id': process.env.RELEASE! },
|
|
158
|
+
}));
|
|
159
|
+
|
|
160
|
+
postProcessor: composePostProcessors([cleanStacks, tagDeploy]);
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Putting it all together
|
|
164
|
+
|
|
165
|
+
```typescript
|
|
166
|
+
import {
|
|
167
|
+
defineConfig,
|
|
168
|
+
composeSpanProcessors,
|
|
169
|
+
composeSubscribers,
|
|
170
|
+
composePostProcessors,
|
|
171
|
+
} from 'autotel-edge';
|
|
172
|
+
|
|
173
|
+
export const otelConfig = defineConfig({
|
|
174
|
+
service: { name: 'checkout' },
|
|
175
|
+
attributeRedactor: 'strict',
|
|
176
|
+
|
|
177
|
+
spanProcessors: composeSpanProcessors([
|
|
178
|
+
dropHealth, // 1. drop spans we never want
|
|
179
|
+
normalise, // 2. bound cardinality
|
|
180
|
+
honeycomb, // already a BatchSpanProcessor (see "Multi-backend export")
|
|
181
|
+
grafana, // already a BatchSpanProcessor (see "Multi-backend export")
|
|
182
|
+
tail, // 3. keep errors + slow + 10%
|
|
183
|
+
]),
|
|
184
|
+
|
|
185
|
+
subscribers: [
|
|
186
|
+
composeSubscribers([metricsSubscriber, auditSubscriber, aiCostSubscriber]),
|
|
187
|
+
],
|
|
188
|
+
|
|
189
|
+
postProcessor: composePostProcessors([cleanStacks, tagDeploy]),
|
|
190
|
+
});
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Error isolation
|
|
194
|
+
|
|
195
|
+
Every compose helper catches errors per item and logs to `console.error` with the helper name. A single bad processor cannot break the others — important when one of your subscribers is a third-party integration (Datadog, PagerDuty, …) that can rate-limit or 502.
|
|
196
|
+
|
|
197
|
+
## Choosing between subscribers and post-processors
|
|
198
|
+
|
|
199
|
+
| You want… | Use |
|
|
200
|
+
| ----------------------------------------- | ---------------------------------------------------------- |
|
|
201
|
+
| Mutate exported span attributes | `postProcessor` |
|
|
202
|
+
| Drop spans entirely | `FilteringSpanProcessor` (early) or tail sampler |
|
|
203
|
+
| Update an in-process metric on every span | `subscribers` |
|
|
204
|
+
| Send an audit log to a DB | `subscribers` (use `log.fork('audit')` if writes are slow) |
|
|
205
|
+
| Re-emit spans to a second backend | second `BatchSpanProcessor` in `composeSpanProcessors` |
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Structured errors
|
|
2
|
+
|
|
3
|
+
`createStructuredError` produces an `Error` carrying enough context to be:
|
|
4
|
+
|
|
5
|
+
- **Recorded onto the active span** (`exception.type`, `exception.message`, `exception.stacktrace`, `span.status = ERROR`).
|
|
6
|
+
- **Returned to clients safely** (`internal` is stripped by `parseError`).
|
|
7
|
+
- **Self-documenting** (`why` explains the cause, `fix` tells the caller what to do, `link` points at runbook docs).
|
|
8
|
+
|
|
9
|
+
## Field reference
|
|
10
|
+
|
|
11
|
+
| Field | Audience | Purpose |
|
|
12
|
+
| ---------- | ----------- | ---------------------------------------------------------- |
|
|
13
|
+
| `message` | Both | Short, stable summary |
|
|
14
|
+
| `status` | Both | HTTP status (drives client behaviour and span status code) |
|
|
15
|
+
| `why` | Both | Human-readable cause (`"Card declined by issuer"`) |
|
|
16
|
+
| `fix` | Client | Remediation hint (`"Use a different payment method"`) |
|
|
17
|
+
| `link` | Client | URL to docs / runbook |
|
|
18
|
+
| `code` | Both | Machine-readable code (`"PAYMENT_DECLINED"`) |
|
|
19
|
+
| `cause` | Server only | The underlying error |
|
|
20
|
+
| `internal` | Server only | Diagnostic metadata (`{ correlationId, resourceId }`) |
|
|
21
|
+
| `details` | Both | Structured payload (e.g. validation errors per field) |
|
|
22
|
+
|
|
23
|
+
## Templates
|
|
24
|
+
|
|
25
|
+
### Validation (400)
|
|
26
|
+
|
|
27
|
+
```typescript
|
|
28
|
+
throw createStructuredError({
|
|
29
|
+
status: 400,
|
|
30
|
+
code: 'VALIDATION_ERROR',
|
|
31
|
+
message: 'Invalid request body',
|
|
32
|
+
why: 'One or more fields failed validation',
|
|
33
|
+
fix: 'Check the `details` field for per-field errors',
|
|
34
|
+
details: { email: 'must be a valid email', age: 'must be ≥ 18' },
|
|
35
|
+
});
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Auth (401 / 403)
|
|
39
|
+
|
|
40
|
+
```typescript
|
|
41
|
+
throw createStructuredError({
|
|
42
|
+
status: 403,
|
|
43
|
+
code: 'FORBIDDEN',
|
|
44
|
+
message: 'Not allowed',
|
|
45
|
+
why: 'You do not have access to this resource',
|
|
46
|
+
fix: 'Ask the workspace owner for access',
|
|
47
|
+
link: 'https://docs.example.com/permissions',
|
|
48
|
+
internal: { resourceId: 'proj_123', userRole: 'member' },
|
|
49
|
+
});
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Payment (402)
|
|
53
|
+
|
|
54
|
+
```typescript
|
|
55
|
+
throw createStructuredError({
|
|
56
|
+
status: 402,
|
|
57
|
+
code: 'PAYMENT_DECLINED',
|
|
58
|
+
message: 'Payment declined',
|
|
59
|
+
why: 'Card declined by issuer — insufficient funds',
|
|
60
|
+
fix: 'Use a different payment method or contact your bank',
|
|
61
|
+
link: 'https://docs.example.com/payments/declined',
|
|
62
|
+
cause: stripeError,
|
|
63
|
+
internal: { stripeChargeId: 'ch_…', riskScore: stripeError.risk_level },
|
|
64
|
+
});
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Upstream failure (502 / 503 / 504)
|
|
68
|
+
|
|
69
|
+
```typescript
|
|
70
|
+
throw createStructuredError({
|
|
71
|
+
status: 502,
|
|
72
|
+
code: 'UPSTREAM_FAILED',
|
|
73
|
+
message: 'Inventory service is unavailable',
|
|
74
|
+
why: 'Could not reach the inventory service',
|
|
75
|
+
fix: 'Retry in a few minutes',
|
|
76
|
+
cause: fetchError,
|
|
77
|
+
internal: { upstream: 'inventory-svc', retryAttempt: 3 },
|
|
78
|
+
});
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## At HTTP boundaries
|
|
82
|
+
|
|
83
|
+
```typescript
|
|
84
|
+
import { parseError } from 'autotel';
|
|
85
|
+
|
|
86
|
+
app.onError((error, c) => {
|
|
87
|
+
// span.status is already ERROR with exception fields recorded
|
|
88
|
+
const parsed = parseError(error);
|
|
89
|
+
// `internal` and `cause` are stripped here — never leak them to clients
|
|
90
|
+
return c.json(parsed, parsed.status);
|
|
91
|
+
});
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Anti-patterns
|
|
95
|
+
|
|
96
|
+
| Anti-pattern | Fix |
|
|
97
|
+
| ------------------------------------------------------------ | -------------------------------------------------------------------------------------- |
|
|
98
|
+
| `throw new Error('something went wrong')` | `createStructuredError({ message, status, why, fix })` |
|
|
99
|
+
| Putting support IDs in `message` (`"Failed for user 42"`) | Use `internal: { userId: 42 }` |
|
|
100
|
+
| Returning `details: { error: stack }` to clients | Stack traces stay in `cause` / span; never serialise them out |
|
|
101
|
+
| `console.error(e); throw e` | Just throw — autotel's span will pick up the exception |
|
|
102
|
+
| Two callers throwing different shapes for the same condition | Centralise: `function declined(reason: string) { throw createStructuredError({ … }) }` |
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Designing wide spans
|
|
2
|
+
|
|
3
|
+
A wide span is a single span per logical unit of work (request, job, message, fork) carrying _all_ the fields you'd ever want to filter or group by. autotel lets you build them with `useLogger().set({ … })` — fields are flattened to OTel attributes with stable dotted keys.
|
|
4
|
+
|
|
5
|
+
## Anatomy
|
|
6
|
+
|
|
7
|
+
```typescript
|
|
8
|
+
import { useLogger } from 'autotel';
|
|
9
|
+
|
|
10
|
+
export const POST = withAutotel(async (request) => {
|
|
11
|
+
const log = useLogger();
|
|
12
|
+
|
|
13
|
+
// Identity
|
|
14
|
+
log.set({ user: { id: 'usr_123', plan: 'enterprise', role: 'admin' } });
|
|
15
|
+
|
|
16
|
+
// Inputs
|
|
17
|
+
log.set({ cart: { items: 3, total: 14_999, currency: 'USD' } });
|
|
18
|
+
|
|
19
|
+
// Decisions / branches
|
|
20
|
+
log.set({ promo: { applied: 'SUMMER10', discount: 1_500 } });
|
|
21
|
+
|
|
22
|
+
// Outputs
|
|
23
|
+
log.set({
|
|
24
|
+
payment: { provider: 'stripe', method: 'card', authCode: 'auth_x' },
|
|
25
|
+
});
|
|
26
|
+
|
|
27
|
+
return Response.json({ ok: true });
|
|
28
|
+
});
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
OTel attributes recorded:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
user.id=usr_123
|
|
35
|
+
user.plan=enterprise
|
|
36
|
+
user.role=admin
|
|
37
|
+
cart.items=3
|
|
38
|
+
cart.total=14999
|
|
39
|
+
cart.currency=USD
|
|
40
|
+
promo.applied=SUMMER10
|
|
41
|
+
promo.discount=1500
|
|
42
|
+
payment.provider=stripe
|
|
43
|
+
payment.method=card
|
|
44
|
+
payment.authCode=auth_x
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Rules of thumb
|
|
48
|
+
|
|
49
|
+
1. **One wide span per logical unit of work.** Many tiny spans hurt query speed; deep call trees can be opt-in (`autotel-drizzle`, `autotel-mongoose`).
|
|
50
|
+
2. **Group with objects.** `{ user: { id, plan } }` not `userId` / `userPlan`. The flatten step keeps the key shape stable.
|
|
51
|
+
3. **Capture decisions, not just inputs.** Which branch ran, which promo applied, which fallback fired.
|
|
52
|
+
4. **Keep cardinality bounded.** Don't put per-request UUIDs in `span.name`; use `SpanNameNormalizingProcessor`. Free-text labels go in attributes.
|
|
53
|
+
5. **Avoid raw bodies.** Pick the shape: `{ user: { id, plan } }` — never `log.set({ user: requestBody })`.
|
|
54
|
+
6. **Trust the redactor.** PII you forgot to think about (emails, JWTs, cards) gets masked in production. See `attributeRedactor: 'default'`.
|
|
55
|
+
|
|
56
|
+
## When you need correlated child spans
|
|
57
|
+
|
|
58
|
+
Use `trace()` to wrap discrete sub-operations whose duration matters:
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
import { trace } from 'autotel';
|
|
62
|
+
|
|
63
|
+
const fetchInventory = trace(async (sku: string) => {
|
|
64
|
+
/* … */
|
|
65
|
+
});
|
|
66
|
+
const reserveStock = trace(async (sku: string, qty: number) => {
|
|
67
|
+
/* … */
|
|
68
|
+
});
|
|
69
|
+
|
|
70
|
+
await fetchInventory(sku);
|
|
71
|
+
await reserveStock(sku, qty);
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Each gets its own span with the function name; both are children of the active request span.
|
|
75
|
+
|
|
76
|
+
## When you need background work
|
|
77
|
+
|
|
78
|
+
`log.fork('label', fn)` spawns a child span that emits its own wide event with `_parentCorrelationId` set, even after the parent response has been returned. Pass `lifecycle.onChildEnter / onChildExit` if your framework tracks active loggers (Elysia, etc.).
|
|
79
|
+
|
|
80
|
+
```typescript
|
|
81
|
+
log.fork('audit-write', async () => {
|
|
82
|
+
await audit.write({ kind: 'order.created', orderId });
|
|
83
|
+
});
|
|
84
|
+
return Response.json({ ok: true }); // parent returns immediately
|
|
85
|
+
```
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: tune-sampling
|
|
3
|
+
description: >
|
|
4
|
+
Choose a sampling strategy for an autotel-instrumented service. Covers
|
|
5
|
+
head sampling (per-span-kind rates, parent-based, ratio), tail sampling
|
|
6
|
+
(keep errors, slow, AI-aware, debug-headers), cost vs cardinality
|
|
7
|
+
tradeoffs, and the math for picking rates that hit a target spans/second
|
|
8
|
+
budget. Includes recipes for low-volume admin services, high-volume APIs,
|
|
9
|
+
AI agents, and Cloudflare Workers.
|
|
10
|
+
license: MIT
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# Tune sampling
|
|
14
|
+
|
|
15
|
+
Untuned tracing is either expensive (100 % at scale costs money + drowns dashboards) or unhelpful (1 % loses the failure modes you need to see). The right answer is almost always **head sample most of the boring traffic, tail keep all the interesting traffic**, with explicit overrides for AI calls and customer escalations.
|
|
16
|
+
|
|
17
|
+
## When to use
|
|
18
|
+
|
|
19
|
+
- Hitting your observability budget
|
|
20
|
+
- Dashboards too sparse to spot anomalies
|
|
21
|
+
- "We have the trace IDs but the spans are gone" complaints
|
|
22
|
+
- New service launching at scale
|
|
23
|
+
- Long-running AI agents producing 50+ spans per request
|
|
24
|
+
|
|
25
|
+
## The mental model
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
Total cost = (spans/sec × $/span) + (storage_GB × $/GB-month)
|
|
29
|
+
↑
|
|
30
|
+
Head sampling reduces this directly.
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Head sampling makes a decision **at span start** — fast, but coarse (it doesn't know if the span will fail).
|
|
34
|
+
Tail sampling makes the decision **at span end** — slower, more storage upfront, but precise.
|
|
35
|
+
|
|
36
|
+
The right mix:
|
|
37
|
+
|
|
38
|
+
- **Head sample at the entry point** to keep volume tractable.
|
|
39
|
+
- **Tail keep** the high-value subset (errors, slow, AI, debug-headered).
|
|
40
|
+
- **Don't sample audit spans** — separate processor, see [`build-audit-trails`](../build-audit-trails/SKILL.md).
|
|
41
|
+
|
|
42
|
+
## Head sampling recipes
|
|
43
|
+
|
|
44
|
+
### Default for a typical web service
|
|
45
|
+
|
|
46
|
+
```typescript
|
|
47
|
+
init({
|
|
48
|
+
service: 'my-app',
|
|
49
|
+
sampling: {
|
|
50
|
+
rates: {
|
|
51
|
+
server: 25, // server entry spans — sample ¼
|
|
52
|
+
client: 5, // outbound HTTP — sample 1/20
|
|
53
|
+
internal: 5, // internal sub-spans — sample 1/20
|
|
54
|
+
},
|
|
55
|
+
},
|
|
56
|
+
});
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Children of a sampled root are **all** kept (parent-based propagation is the default). So `server: 25` means 25 % of _user requests_, complete trace each.
|
|
60
|
+
|
|
61
|
+
### High-volume API (>1 k req/s)
|
|
62
|
+
|
|
63
|
+
```typescript
|
|
64
|
+
sampling: {
|
|
65
|
+
rates: { server: 5, client: 1, internal: 1 }, // 5 % → tail keeps errors anyway
|
|
66
|
+
tail: keepInterestingTraces,
|
|
67
|
+
},
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Low-volume admin / internal service (<10 req/s)
|
|
71
|
+
|
|
72
|
+
100 % is fine. Don't penalise yourself for a service that produces 1 GB of traces a week.
|
|
73
|
+
|
|
74
|
+
### Cloudflare Workers (per-colo budget)
|
|
75
|
+
|
|
76
|
+
Workers run distributed — head sampling is your friend because there's no central queue:
|
|
77
|
+
|
|
78
|
+
```typescript
|
|
79
|
+
defineWorkerFetch(
|
|
80
|
+
{
|
|
81
|
+
service: { name: 'edge' },
|
|
82
|
+
sampling: { rates: { server: 10 } }, // 10 % per colo, scales naturally
|
|
83
|
+
},
|
|
84
|
+
handler,
|
|
85
|
+
);
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Tail sampling — keep interesting traces
|
|
89
|
+
|
|
90
|
+
Tail sampling looks at the full trace (root span + children) before deciding. autotel ships `TailSamplingProcessor`:
|
|
91
|
+
|
|
92
|
+
```typescript
|
|
93
|
+
import { TailSamplingProcessor } from 'autotel/processors';
|
|
94
|
+
import { SpanStatusCode } from '@opentelemetry/api';
|
|
95
|
+
|
|
96
|
+
const tail = new TailSamplingProcessor({
|
|
97
|
+
keep: (trace) => {
|
|
98
|
+
// 1. Always keep errors
|
|
99
|
+
if (trace.localRootSpan.status?.code === SpanStatusCode.ERROR) return true;
|
|
100
|
+
if (trace.spans.some((s) => s.status?.code === SpanStatusCode.ERROR))
|
|
101
|
+
return true;
|
|
102
|
+
|
|
103
|
+
// 2. Always keep slow traces (configurable threshold)
|
|
104
|
+
if (durationMs(trace.localRootSpan) > 1_000) return true;
|
|
105
|
+
|
|
106
|
+
// 3. Always keep customer-marked traces
|
|
107
|
+
if (trace.localRootSpan.attributes['debug.trace'] === true) return true;
|
|
108
|
+
|
|
109
|
+
// 4. Always keep AI traces (rare + expensive — full visibility helps)
|
|
110
|
+
if (
|
|
111
|
+
trace.spans.some((s) => typeof s.attributes['gen_ai.system'] === 'string')
|
|
112
|
+
)
|
|
113
|
+
return true;
|
|
114
|
+
|
|
115
|
+
// 5. Otherwise: respect head sampling decision
|
|
116
|
+
return false;
|
|
117
|
+
},
|
|
118
|
+
});
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Combining with multi-backend
|
|
122
|
+
|
|
123
|
+
```typescript
|
|
124
|
+
spanProcessors: composeSpanProcessors([
|
|
125
|
+
// Drop nothing here — we want the tail processor to see the full trace
|
|
126
|
+
new BatchSpanProcessor(localExporter),
|
|
127
|
+
tail, // filters before remote export
|
|
128
|
+
new BatchSpanProcessor(expensiveRemoteExporter),
|
|
129
|
+
]);
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## AI / LLM-aware sampling
|
|
133
|
+
|
|
134
|
+
LLM calls produce 5–50 spans per request and are 100× more expensive than a typical handler call. Tradeoffs:
|
|
135
|
+
|
|
136
|
+
- **Don't head-sample AI handlers below 50 %** — debugging "why did the model loop" requires the full chain.
|
|
137
|
+
- **Always tail-keep AI traces** — the `gen_ai.*` attributes flag them.
|
|
138
|
+
- **Cost-aware sampling** — keep all calls above a $ threshold:
|
|
139
|
+
|
|
140
|
+
```typescript
|
|
141
|
+
keep: (trace) => {
|
|
142
|
+
const cost = trace.spans.reduce(
|
|
143
|
+
(acc, s) =>
|
|
144
|
+
acc +
|
|
145
|
+
(typeof s.attributes['gen_ai.cost.usd'] === 'number'
|
|
146
|
+
? (s.attributes['gen_ai.cost.usd'] as number)
|
|
147
|
+
: 0),
|
|
148
|
+
0,
|
|
149
|
+
);
|
|
150
|
+
if (cost > 0.1) return true; // any trace > $0.10 → keep
|
|
151
|
+
if (cost > 0.01) return Math.random() < 0.5; // > $0.01 → 50 %
|
|
152
|
+
return Math.random() < 0.1; // < $0.01 → 10 %
|
|
153
|
+
};
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Customer-driven sampling (debug header)
|
|
157
|
+
|
|
158
|
+
Let support flip on full tracing per request:
|
|
159
|
+
|
|
160
|
+
```typescript
|
|
161
|
+
const tail = new TailSamplingProcessor({
|
|
162
|
+
keep: (trace) => trace.localRootSpan.attributes['x-debug-trace'] === '1' || /* … */,
|
|
163
|
+
})
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
In your middleware:
|
|
167
|
+
|
|
168
|
+
```typescript
|
|
169
|
+
if (request.headers.get('x-debug-trace') === '1') {
|
|
170
|
+
useLogger().set({ 'x-debug-trace': '1' });
|
|
171
|
+
}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Now any user can mark a request as "trace this fully" by sending the header — invaluable for reproducing customer reports.
|
|
175
|
+
|
|
176
|
+
## Sizing the rate
|
|
177
|
+
|
|
178
|
+
Target volume:
|
|
179
|
+
|
|
180
|
+
```
|
|
181
|
+
spans/sec ≈ requests/sec × spans_per_request × head_rate × tail_keep_rate
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Worked example for a 100 req/s API with 8 spans/req:
|
|
185
|
+
|
|
186
|
+
| Head rate | Tail keep | Result |
|
|
187
|
+
| --------- | -------------------------------- | -------------------------------------------- |
|
|
188
|
+
| 100 % | 100 % | 800 spans/sec — expensive |
|
|
189
|
+
| 10 % | 100 % (errors + slow + AI ≈ 5 %) | ≈ 110 spans/sec — sweet spot |
|
|
190
|
+
| 1 % | 100 % | ≈ 18 spans/sec — too sparse for p99 alerting |
|
|
191
|
+
|
|
192
|
+
For per-vendor pricing:
|
|
193
|
+
|
|
194
|
+
- **Honeycomb**: $0.000005 / event for paid plans. 110 spans/sec × 86 400 s = 9.5 M events/day = $48/day.
|
|
195
|
+
- **Datadog APM**: ~$1.27/M spans ingested (varies by region). Same volume → ~$12/day.
|
|
196
|
+
- **Grafana Cloud**: 100 GB free tier; 110 spans/sec ≈ 5 GB/day.
|
|
197
|
+
|
|
198
|
+
## Anti-patterns
|
|
199
|
+
|
|
200
|
+
| Anti-pattern | Fix |
|
|
201
|
+
| -------------------------------------------- | ------------------------------------------------------------- |
|
|
202
|
+
| 100 % sampling at scale "to be safe" | You're paying 10–100× without proportional value |
|
|
203
|
+
| 1 % sampling with no tail keep | You'll miss every interesting failure |
|
|
204
|
+
| Forgetting to tail-keep errors               | Sampled-out traces with errors → silent customer pain         |
|
|
205
|
+
| Same rate for `server` and `internal` | Internal sub-spans are 5–20× more numerous; sample harder |
|
|
206
|
+
| Ratio-based sampling on service entry point | Use parent-based — children of a sampled trace stay together |
|
|
207
|
+
| Head-sampling AI calls below 50 % | Debugging tool loops requires the full chain |
|
|
208
|
+
| Audit spans subject to sampling | Route them to a separate processor (see `build-audit-trails`) |
|
|
209
|
+
| Tail processor before exporter (loses spans) | Tail processor goes between head sampler and remote exporter |
|
|
210
|
+
| Rate-by-route hand-coded in handlers | Use head sampler + tail keep — declarative, one place |
|
|
@@ -184,8 +184,9 @@ describe('AttributeRedactingProcessor', () => {
|
|
|
184
184
|
});
|
|
185
185
|
processor.onEnd(span);
|
|
186
186
|
|
|
187
|
+
// SSN has no smart mask; falls back to the default replacement.
|
|
187
188
|
expect(mockProcessor.endedSpans[0]!.attributes['user.ssn']).toBe(
|
|
188
|
-
'
|
|
189
|
+
'[REDACTED]',
|
|
189
190
|
);
|
|
190
191
|
});
|
|
191
192
|
|
|
@@ -199,8 +200,9 @@ describe('AttributeRedactingProcessor', () => {
|
|
|
199
200
|
});
|
|
200
201
|
processor.onEnd(span);
|
|
201
202
|
|
|
203
|
+
// PCI-DSS compliant: last 4 digits preserved.
|
|
202
204
|
expect(mockProcessor.endedSpans[0]!.attributes['payment.card']).toBe(
|
|
203
|
-
'
|
|
205
|
+
'****1111',
|
|
204
206
|
);
|
|
205
207
|
});
|
|
206
208
|
|
|
@@ -710,12 +712,12 @@ describe('edge cases', () => {
|
|
|
710
712
|
});
|
|
711
713
|
|
|
712
714
|
const span = createMockReadableSpan({
|
|
713
|
-
contacts: 'Email: john@example.com, Phone: 555-123-4567',
|
|
715
|
+
contacts: 'Email: john@example.com, Phone: +1 555-123-4567',
|
|
714
716
|
});
|
|
715
717
|
processor.onEnd(span);
|
|
716
718
|
|
|
717
719
|
expect(mockProcessor.endedSpans[0]!.attributes.contacts).toBe(
|
|
718
|
-
'Email: j***@***.com, Phone:
|
|
720
|
+
'Email: j***@***.com, Phone: +1******67',
|
|
719
721
|
);
|
|
720
722
|
});
|
|
721
723
|
});
|
|
@@ -147,10 +147,19 @@ export const builtinPatterns = {
|
|
|
147
147
|
/\b(?!0\.0\.0\.0\b)(?!127\.0\.0\.1\b)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b/g,
|
|
148
148
|
mask: (m: string) => `***.***.***.${m.split('.').pop()}`,
|
|
149
149
|
},
|
|
150
|
-
/**
|
|
150
|
+
/**
|
|
151
|
+
* International / formatted phone numbers.
|
|
152
|
+
*
|
|
153
|
+
* Matches:
|
|
154
|
+
* - `+33 1 23 45 67 89` -> `+33******89`
|
|
155
|
+
* - `(415) 555-1234` -> `********34`
|
|
156
|
+
* - `555-123-4567` / `555.123.4567` / `5551234567` -> `********67`
|
|
157
|
+
*
|
|
158
|
+
* Bare short digit runs like `12345678` are intentionally not matched.
|
|
159
|
+
*/
|
|
151
160
|
phone: {
|
|
152
161
|
pattern:
|
|
153
|
-
/(?:\+\d{1,3}[\s.-]
|
|
162
|
+
/(?:\+\d{1,3}[\s.-]?\(?\d{1,4}\)?(?:[\s.-]?\d{2,4}){2,4}|\(\d{1,4}\)(?:[\s.-]?\d{2,4}){2,4}|\b\d{3}[-.]?\d{3}[-.]?\d{4}\b)/g,
|
|
154
163
|
mask: (m: string) => {
|
|
155
164
|
const digits = m.replace(/[^\d]/g, '');
|
|
156
165
|
const hasPlus = m.startsWith('+');
|