@luanpdd/kit-mcp 1.35.0 → 1.36.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +2 -2
- package/bin/mcp.js +6 -6
- package/bin/ui.js +74 -74
- package/gates/ai-prompt-stability.md +120 -120
- package/gates/budget-description.md +68 -68
- package/gates/confidence.md +29 -29
- package/gates/dependency-check.md +33 -33
- package/gates/dept-cycle-prevention.md +179 -179
- package/gates/golden-signals-coverage.md +133 -133
- package/gates/legacy-refactor-safety.md +178 -178
- package/gates/multi-tenant-rls-coverage.md +102 -102
- package/gates/no-personal-uuid.md +72 -72
- package/gates/obs-agents-mcp-supabase.md +86 -86
- package/gates/obs-skills-frontmatter.md +76 -76
- package/gates/observability-coverage.md +151 -151
- package/gates/omm-no-regression.md +83 -83
- package/gates/postmortem-template-required.md +127 -127
- package/gates/prr-checklist-coverage.md +128 -128
- package/gates/regression.md +32 -32
- package/gates/release-pipeline-policy.md +132 -132
- package/gates/secrets-scan.md +33 -33
- package/gates/service-role-not-in-user-facing.md +113 -113
- package/gates/skill-must-include.md +71 -71
- package/gates/sync-idempotent.md +62 -62
- package/gates/verify-phase-goal.md +34 -34
- package/kit/agents/designer-ui.md +216 -216
- package/kit/agents/workflow-generator.md +537 -167
- package/kit/commands/adicionar-backlog.md +1 -1
- package/kit/commands/adicionar-fase.md +1 -1
- package/kit/commands/adicionar-tarefa.md +1 -1
- package/kit/commands/auditar-observabilidade.md +103 -103
- package/kit/commands/auditar-toil.md +129 -129
- package/kit/commands/caracterizar-prompt.md +195 -195
- package/kit/commands/criar-workflow.md +158 -158
- package/kit/commands/definir-perfil.md +1 -1
- package/kit/commands/definir-slo.md +108 -108
- package/kit/commands/fio.md +1 -1
- package/kit/commands/golden-signals.md +142 -142
- package/kit/commands/instrumentar-fase.md +200 -200
- package/kit/commands/investigar-producao.md +162 -162
- package/kit/commands/observabilidade.md +118 -118
- package/kit/commands/postmortem.md +179 -179
- package/kit/commands/prr.md +205 -205
- package/kit/commands/publicar-rapido.md +207 -207
- package/kit/commands/risk-budget.md +220 -220
- package/kit/commands/sre.md +230 -230
- package/kit/file-manifest.json +424 -424
- package/kit/framework/references/output-style.md +22 -22
- package/kit/hooks/post-apply-migration.js +199 -199
- package/kit/hooks/sidecar-tool-publisher.js +210 -210
- package/kit/skills/_shared-dados-distribuidos/glossary.md +224 -224
- package/kit/skills/_shared-legacy/glossary.md +389 -389
- package/kit/skills/_shared-multi-tenant/glossary.md +186 -186
- package/kit/skills/_shared-observability/glossary.md +396 -396
- package/kit/skills/_shared-sre/glossary.md +712 -712
- package/kit/skills/_shared-supabase/glossary.md +234 -234
- package/kit/skills/blameless-postmortems/SKILL.md +340 -340
- package/kit/skills/burn-rate-alerting/SKILL.md +258 -258
- package/kit/skills/cascading-failures/SKILL.md +311 -311
- package/kit/skills/core-analysis-loop/SKILL.md +352 -352
- package/kit/skills/distributed-tracing/SKILL.md +362 -362
- package/kit/skills/dynamic-workflow-authoring/SKILL.md +327 -223
- package/kit/skills/eliminating-toil/SKILL.md +243 -243
- package/kit/skills/event-based-slos/SKILL.md +296 -296
- package/kit/skills/four-golden-signals/SKILL.md +314 -314
- package/kit/skills/hermetic-builds/SKILL.md +323 -323
- package/kit/skills/legacy-monster-methods/SKILL.md +444 -444
- package/kit/skills/llm-as-dependency/SKILL.md +436 -436
- package/kit/skills/load-shedding-graceful-degradation/SKILL.md +396 -396
- package/kit/skills/observability-driven-development/SKILL.md +315 -315
- package/kit/skills/observability-maturity-model/SKILL.md +222 -222
- package/kit/skills/opentelemetry-standard/SKILL.md +351 -351
- package/kit/skills/production-readiness-review/SKILL.md +305 -305
- package/kit/skills/release-engineering/SKILL.md +367 -367
- package/kit/skills/retry-strategies/SKILL.md +372 -372
- package/kit/skills/sre-risk-management/SKILL.md +221 -221
- package/kit/skills/structured-events/SKILL.md +265 -265
- package/kit/skills/supabase-cron-queues/SKILL.md +275 -275
- package/kit/skills/supabase-database-functions/SKILL.md +332 -332
- package/kit/skills/supabase-declarative-schema/SKILL.md +183 -183
- package/kit/skills/supabase-pgvector-rag/SKILL.md +253 -253
- package/kit/skills/supabase-postgres-style/SKILL.md +138 -138
- package/kit/skills/supabase-storage/SKILL.md +234 -234
- package/kit/skills/telemetry-pipelines/SKILL.md +259 -259
- package/kit/skills/telemetry-sampling/SKILL.md +256 -256
- package/kit/skills/ui-anti-padroes-ia/SKILL.md +261 -261
- package/kit/skills/ui-contexto-produto/SKILL.md +248 -248
- package/kit/skills/ui-cor-estrategia/SKILL.md +213 -213
- package/kit/skills/ui-critica-auditoria/SKILL.md +260 -260
- package/kit/skills/ui-motion-funcional/SKILL.md +264 -264
- package/kit/skills/ui-ritmo-espacial/SKILL.md +259 -259
- package/kit/skills/ui-tipografia/SKILL.md +211 -211
- package/package.json +1 -1
- package/src/cli/index.js +1114 -1114
- package/src/cli/render.js +194 -194
- package/src/cli/upgrade-check.js +135 -135
- package/src/core/error-redaction.js +76 -76
- package/src/core/failures.js +153 -153
- package/src/core/gate-runner.js +205 -205
- package/src/core/gates.js +82 -82
- package/src/core/logger.js +170 -170
- package/src/core/manifest-verify.js +174 -174
- package/src/core/metrics.js +268 -268
- package/src/core/notify.js +60 -60
- package/src/core/path-safety.js +141 -141
- package/src/core/replays.js +120 -120
- package/src/core/ui.js +185 -185
- package/src/mcp-server/install.js +149 -149
- package/src/mcp-server/roots.js +124 -124
- package/src/ui/auto-spawn.js +113 -113
- package/src/ui/browser.js +78 -78
- package/src/ui/client.js +130 -130
- package/src/ui/events.js +65 -65
- package/src/ui/lockfile.js +191 -191
- package/src/ui/port.js +67 -67
- package/src/ui/server.js +547 -547
- package/src/ui/wrapper.js +129 -129
|
@@ -1,362 +1,362 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: distributed-tracing
|
|
3
|
-
description: Use ao instrumentar tracing — trace_id/span_id/parent_id, propagar W3C TraceContext via header traceparent, stitching além de RPCs (batch, lambda, queue).
|
|
4
|
-
---
|
|
5
|
-
|
|
6
|
-
# Observabilidade — Distributed Tracing
|
|
7
|
-
|
|
8
|
-
## Quando usar
|
|
9
|
-
|
|
10
|
-
LLM carrega esta skill ao instrumentar tracing distribuído ou stitching de spans. Trigger phrases:
|
|
11
|
-
|
|
12
|
-
- "distributed tracing", "traces", "spans"
|
|
13
|
-
- "propagar contexto entre serviços", "trace cross-service"
|
|
14
|
-
- "W3C TraceContext", "traceparent header"
|
|
15
|
-
- "trace_id span_id parent_span_id"
|
|
16
|
-
- "ligar lambda batch job ao trace"
|
|
17
|
-
- "stitching de eventos"
|
|
18
|
-
|
|
19
|
-
## Regras absolutas
|
|
20
|
-
|
|
21
|
-
- **trace_id é compartilhado** entre todos os spans de um único request distribuído. **NÃO** mude por hop.
|
|
22
|
-
- **span_id é único por span** — gere novo a cada `startSpan()`. 16 hex chars (8 bytes).
|
|
23
|
-
- **parent_span_id aponta para span pai** — null no root span. Define a árvore.
|
|
24
|
-
- **W3C TraceContext é o padrão** — header HTTP `traceparent: 00-{trace_id}-{span_id}-{flags}`. Adote sempre. B3 é fallback para legacy.
|
|
25
|
-
- **Propague ANTES de fazer call cross-service** — extrair contexto do request inbound, propagar no request outbound. Sem isso, trace quebra.
|
|
26
|
-
- **Stitching ≠ apenas RPC** — também batch jobs, queue messages, lambda invocations, S3 uploads. Carregue `traceparent` em metadata da queue, env var do lambda, header da Step Function.
|
|
27
|
-
- **Sample decision propaga** — bit `01` em flags de `traceparent` significa "sample=true". Decisão tomada no head propaga downstream.
|
|
28
|
-
- **Não invente trace_id** — sempre derive do contexto inbound ou gere via SDK (não `crypto.randomUUID()`).
|
|
29
|
-
- **Spans devem ter `kind`** — `SERVER` (handler de inbound), `CLIENT` (call outbound), `PRODUCER`/`CONSUMER` (queue), `INTERNAL` (subspan dentro do mesmo process).
|
|
30
|
-
|
|
31
|
-
## Patterns canônicos
|
|
32
|
-
|
|
33
|
-
### Pattern: extrair contexto inbound + propagar outbound (Node)
|
|
34
|
-
|
|
35
|
-
```ts
|
|
36
|
-
// PT-BR: handler HTTP — extrai traceparent do request inbound, propaga em call outbound
|
|
37
|
-
import { trace, context, propagation } from '@opentelemetry/api'
|
|
38
|
-
|
|
39
|
-
const tracer = trace.getTracer('orders-service')
|
|
40
|
-
|
|
41
|
-
export async function placeOrder(req: Request) {
|
|
42
|
-
// PT-BR: 1 — extrair contexto inbound do header traceparent
|
|
43
|
-
const inboundContext = propagation.extract(context.active(), req.headers)
|
|
44
|
-
|
|
45
|
-
return tracer.startActiveSpan(
|
|
46
|
-
'place_order',
|
|
47
|
-
{ kind: SpanKind.SERVER },
|
|
48
|
-
inboundContext,
|
|
49
|
-
async (span) => {
|
|
50
|
-
span.setAttribute('user.id', req.user.id)
|
|
51
|
-
|
|
52
|
-
// PT-BR: 2 — fazer call outbound — inject manual abaixo; a propagação só é automática
|
|
53
|
-
// se você usar fetch/grpc instrumentados (ver skill opentelemetry-standard)
|
|
54
|
-
const outboundHeaders: Record<string, string> = {}
|
|
55
|
-
propagation.inject(context.active(), outboundHeaders)
|
|
56
|
-
|
|
57
|
-
const inventoryRes = await fetch('http://inventory/check', {
|
|
58
|
-
headers: outboundHeaders, // PT-BR: traceparent injetado aqui
|
|
59
|
-
body: JSON.stringify({ items: req.items })
|
|
60
|
-
})
|
|
61
|
-
|
|
62
|
-
span.end()
|
|
63
|
-
return inventoryRes.json()
|
|
64
|
-
}
|
|
65
|
-
)
|
|
66
|
-
}
|
|
67
|
-
```
|
|
68
|
-
|
|
69
|
-
### Pattern: traceparent format
|
|
70
|
-
|
|
71
|
-
```text
|
|
72
|
-
traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
|
|
73
|
-
^ ^ ^ ^
|
|
74
|
-
| | | |
|
|
75
|
-
version | flags (sampled bit)
|
|
76
|
-
trace_id (32 hex / 16 bytes) |
|
|
77
|
-
span_id (16 hex / 8 bytes)
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
```text
|
|
81
|
-
flags:
|
|
82
|
-
01 = sampled (decisão upstream: capture este trace)
|
|
83
|
-
00 = not sampled (decisão upstream: skip)
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
### Pattern: trace cross-service via Supabase Edge Function
|
|
87
|
-
|
|
88
|
-
```ts
|
|
89
|
-
// PT-BR: Edge Function recebe request → propaga para outro service
|
|
90
|
-
import { trace, context, propagation } from 'npm:@opentelemetry/api@1.9.0'
|
|
91
|
-
import { W3CTraceContextPropagator } from 'npm:@opentelemetry/core@1.27.0'
|
|
92
|
-
|
|
93
|
-
propagation.setGlobalPropagator(new W3CTraceContextPropagator())
|
|
94
|
-
|
|
95
|
-
const tracer = trace.getTracer('edge-orders')
|
|
96
|
-
|
|
97
|
-
Deno.serve(async (req) => {
|
|
98
|
-
// PT-BR: extrair traceparent inbound
|
|
99
|
-
const inboundCtx = propagation.extract(context.active(), {
|
|
100
|
-
traceparent: req.headers.get('traceparent') ?? '',
|
|
101
|
-
})
|
|
102
|
-
|
|
103
|
-
return tracer.startActiveSpan(
|
|
104
|
-
'edge_handler',
|
|
105
|
-
{ kind: 1 /* SERVER */ },
|
|
106
|
-
inboundCtx,
|
|
107
|
-
async (span) => {
|
|
108
|
-
span.setAttribute('endpoint', new URL(req.url).pathname)
|
|
109
|
-
|
|
110
|
-
// PT-BR: call outbound para Postgres via PostgREST — injeta traceparent
|
|
111
|
-
const outHeaders: Record<string, string> = {}
|
|
112
|
-
propagation.inject(context.active(), outHeaders)
|
|
113
|
-
|
|
114
|
-
const dbRes = await fetch(Deno.env.get('SUPABASE_URL') + '/rest/v1/orders', {
|
|
115
|
-
method: 'POST',
|
|
116
|
-
headers: {
|
|
117
|
-
...outHeaders,
|
|
118
|
-
'apikey': Deno.env.get('SUPABASE_ANON_KEY')!,
|
|
119
|
-
'content-type': 'application/json',
|
|
120
|
-
},
|
|
121
|
-
body: await req.text(),
|
|
122
|
-
})
|
|
123
|
-
|
|
124
|
-
span.setAttribute('db.status_code', dbRes.status)
|
|
125
|
-
span.end()
|
|
126
|
-
return dbRes
|
|
127
|
-
}
|
|
128
|
-
)
|
|
129
|
-
})
|
|
130
|
-
```
|
|
131
|
-
|
|
132
|
-
### Pattern: stitching além de RPC — queue message (não-RPC)
|
|
133
|
-
|
|
134
|
-
```ts
|
|
135
|
-
// PT-BR: producer — anexa traceparent ao payload da queue (pgmq, SQS, RabbitMQ)
|
|
136
|
-
import { trace, context, propagation } from '@opentelemetry/api'
|
|
137
|
-
|
|
138
|
-
const tracer = trace.getTracer('producer')
|
|
139
|
-
|
|
140
|
-
export async function enqueueEmail(emailJob: EmailJob) {
|
|
141
|
-
return tracer.startActiveSpan(
|
|
142
|
-
'enqueue_email',
|
|
143
|
-
{ kind: SpanKind.PRODUCER },
|
|
144
|
-
async (span) => {
|
|
145
|
-
span.setAttribute('queue.name', 'emails')
|
|
146
|
-
span.setAttribute('email.recipient', emailJob.to)
|
|
147
|
-
|
|
148
|
-
// PT-BR: serializar contexto no payload da mensagem
|
|
149
|
-
const carrier: Record<string, string> = {}
|
|
150
|
-
propagation.inject(context.active(), carrier)
|
|
151
|
-
|
|
152
|
-
await pgmqEnqueue('emails', {
|
|
153
|
-
...emailJob,
|
|
154
|
-
_trace_context: carrier, // PT-BR: viaja com o job
|
|
155
|
-
})
|
|
156
|
-
|
|
157
|
-
span.end()
|
|
158
|
-
}
|
|
159
|
-
)
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
// PT-BR: consumer — extrai traceparent do payload, continua o trace
|
|
163
|
-
export async function processEmailJob(job: EmailJobWithContext) {
|
|
164
|
-
const inboundCtx = propagation.extract(
|
|
165
|
-
context.active(),
|
|
166
|
-
job._trace_context ?? {} // PT-BR: se vazio, novo trace
|
|
167
|
-
)
|
|
168
|
-
|
|
169
|
-
return tracer.startActiveSpan(
|
|
170
|
-
'process_email',
|
|
171
|
-
{ kind: SpanKind.CONSUMER },
|
|
172
|
-
inboundCtx,
|
|
173
|
-
async (span) => {
|
|
174
|
-
span.setAttribute('email.recipient', job.to)
|
|
175
|
-
// PT-BR: agora o span do worker faz parte do mesmo trace do producer
|
|
176
|
-
await sendEmail(job)
|
|
177
|
-
span.end()
|
|
178
|
-
}
|
|
179
|
-
)
|
|
180
|
-
}
|
|
181
|
-
```
|
|
182
|
-
|
|
183
|
-
### Pattern: stitching de batch job (não-RPC)
|
|
184
|
-
|
|
185
|
-
```ts
|
|
186
|
-
// PT-BR: cron job processa N items — 1 span por item, todos com mesmo trace_id
|
|
187
|
-
const tracer = trace.getTracer('billing-cron')
|
|
188
|
-
|
|
189
|
-
export async function dailyBillingJob() {
|
|
190
|
-
return tracer.startActiveSpan('daily_billing', async (rootSpan) => {
|
|
191
|
-
rootSpan.setAttribute('job.type', 'cron')
|
|
192
|
-
rootSpan.setAttribute('build_id', BUILD_ID)
|
|
193
|
-
|
|
194
|
-
const customers = await db.getCustomersDueForBilling()
|
|
195
|
-
rootSpan.setAttribute('customers.count', customers.length)
|
|
196
|
-
|
|
197
|
-
// PT-BR: cada customer vira span filho com mesmo trace_id
|
|
198
|
-
for (const customer of customers) {
|
|
199
|
-
await tracer.startActiveSpan(
|
|
200
|
-
'bill_customer',
|
|
201
|
-
{ kind: SpanKind.INTERNAL },
|
|
202
|
-
async (span) => {
|
|
203
|
-
span.setAttribute('customer.id', customer.id)
|
|
204
|
-
span.setAttribute('customer.tier', customer.tier)
|
|
205
|
-
try {
|
|
206
|
-
await chargeCustomer(customer)
|
|
207
|
-
span.setAttribute('result.success', true)
|
|
208
|
-
} catch (e) {
|
|
209
|
-
span.setAttribute('result.success', false)
|
|
210
|
-
span.setAttribute('error.type', classify(e))
|
|
211
|
-
} finally {
|
|
212
|
-
span.end()
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
)
|
|
216
|
-
}
|
|
217
|
-
|
|
218
|
-
rootSpan.end()
|
|
219
|
-
})
|
|
220
|
-
}
|
|
221
|
-
```
|
|
222
|
-
|
|
223
|
-
### Pattern: span kinds
|
|
224
|
-
|
|
225
|
-
| Kind | Quando usar | Exemplo |
|
|
226
|
-
|---|---|---|
|
|
227
|
-
| `SERVER` | Recebendo request inbound | Handler HTTP, gRPC server method |
|
|
228
|
-
| `CLIENT` | Fazendo call outbound | `fetch()`, gRPC client call, DB query |
|
|
229
|
-
| `PRODUCER` | Enviando msg para queue | `pgmq.enqueue()`, SQS publish |
|
|
230
|
-
| `CONSUMER` | Processando msg de queue | Worker recebendo job |
|
|
231
|
-
| `INTERNAL` | Subdivisão dentro do mesmo process | "json_parse", "validation_step" |
|
|
232
|
-
|
|
233
|
-
### Pattern: query traces — montar waterfall
|
|
234
|
-
|
|
235
|
-
```sql
|
|
236
|
-
-- PT-BR: pegar todos os spans de um trace em ordem cronológica
|
|
237
|
-
select
|
|
238
|
-
span_id,
|
|
239
|
-
parent_span_id,
|
|
240
|
-
span_name,
|
|
241
|
-
span_kind,
|
|
242
|
-
service_name,
|
|
243
|
-
duration_ms,
|
|
244
|
-
start_time
|
|
245
|
-
from observability.spans
|
|
246
|
-
where trace_id = '4bf92f3577b34da6a3ce929d0e0e4736'
|
|
247
|
-
order by start_time asc;
|
|
248
|
-
|
|
249
|
-
-- PT-BR: encontrar root span — parent_span_id IS NULL ou span sem parent no mesmo trace
|
|
250
|
-
select *
|
|
251
|
-
from observability.spans
|
|
252
|
-
where trace_id = '4bf92f3577b34da6a3ce929d0e0e4736'
|
|
253
|
-
and parent_span_id is null;
|
|
254
|
-
|
|
255
|
-
-- PT-BR: spans mais lentos cross-trace, última 1h
|
|
256
|
-
select
|
|
257
|
-
service_name,
|
|
258
|
-
span_name,
|
|
259
|
-
percentile_cont(0.99) within group (order by duration_ms) as p99,
|
|
260
|
-
count(*) as samples
|
|
261
|
-
from observability.spans
|
|
262
|
-
where start_time > now() - interval '1 hour'
|
|
263
|
-
group by service_name, span_name
|
|
264
|
-
having count(*) > 100
|
|
265
|
-
order by p99 desc
|
|
266
|
-
limit 20;
|
|
267
|
-
```
|
|
268
|
-
|
|
269
|
-
## Anti-patterns
|
|
270
|
-
|
|
271
|
-
### ANTI: gerar trace_id por hop
|
|
272
|
-
|
|
273
|
-
```ts
|
|
274
|
-
// PT-BR: BAD — quebra a cadeia, cada service vê trace diferente
|
|
275
|
-
const traceId = crypto.randomUUID().replace(/-/g, '').slice(0, 32)
|
|
276
|
-
|
|
277
|
-
// PT-BR: GOOD — extrair do header inbound; deixar SDK gerar root
|
|
278
|
-
const inboundCtx = propagation.extract(context.active(), req.headers)
|
|
279
|
-
tracer.startActiveSpan('handler', {}, inboundCtx, ...)
|
|
280
|
-
```
|
|
281
|
-
|
|
282
|
-
### ANTI: esquecer de propagar em call outbound
|
|
283
|
-
|
|
284
|
-
```ts
|
|
285
|
-
// PT-BR: BAD — outbound call sem traceparent — trace quebra no service B
|
|
286
|
-
await fetch('http://service-b/api', { body: ... })
|
|
287
|
-
|
|
288
|
-
// PT-BR: GOOD — injetar traceparent
|
|
289
|
-
const headers: Record<string, string> = {}
|
|
290
|
-
propagation.inject(context.active(), headers)
|
|
291
|
-
await fetch('http://service-b/api', { headers, body: ... })
|
|
292
|
-
```
|
|
293
|
-
|
|
294
|
-
### ANTI: trace só de RPCs, não de batch/queue
|
|
295
|
-
|
|
296
|
-
```ts
|
|
297
|
-
// PT-BR: BAD — producer/consumer não compartilham trace, debug fica fragmentado
|
|
298
|
-
await pgmqEnqueue('emails', payload) // sem trace context
|
|
299
|
-
// ... depois worker processa sem saber que veio do request X
|
|
300
|
-
|
|
301
|
-
// PT-BR: GOOD — propagar contexto via metadata da queue
|
|
302
|
-
const carrier = {}
|
|
303
|
-
propagation.inject(context.active(), carrier)
|
|
304
|
-
await pgmqEnqueue('emails', { ...payload, _trace_context: carrier })
|
|
305
|
-
```
|
|
306
|
-
|
|
307
|
-
### ANTI: span sem `end()`
|
|
308
|
-
|
|
309
|
-
```ts
|
|
310
|
-
// PT-BR: BAD — span fica aberto forever, duration_ms não calculado, memory leak
|
|
311
|
-
const span = tracer.startSpan('handler')
|
|
312
|
-
// ... handler logic
|
|
313
|
-
return result // PT-BR: ESQUECEU span.end()
|
|
314
|
-
|
|
315
|
-
// PT-BR: GOOD — sempre `try/finally`
|
|
316
|
-
const span = tracer.startSpan('handler')
|
|
317
|
-
try {
|
|
318
|
-
// ... logic
|
|
319
|
-
} finally {
|
|
320
|
-
span.end()
|
|
321
|
-
}
|
|
322
|
-
```
|
|
323
|
-
|
|
324
|
-
### ANTI: span hierarchy errada
|
|
325
|
-
|
|
326
|
-
```ts
|
|
327
|
-
// PT-BR: BAD — usar startSpan sem startActiveSpan, parent não é setado automaticamente
|
|
328
|
-
const parent = tracer.startSpan('parent')
|
|
329
|
-
const child = tracer.startSpan('child') // PT-BR: parent_span_id ficou null
|
|
330
|
-
parent.end()
|
|
331
|
-
child.end()
|
|
332
|
-
|
|
333
|
-
// PT-BR: GOOD — startActiveSpan empurra contexto, child herda parent
|
|
334
|
-
tracer.startActiveSpan('parent', (parent) => {
|
|
335
|
-
tracer.startActiveSpan('child', (child) => {
|
|
336
|
-
// PT-BR: child.parent_span_id === parent.span_id
|
|
337
|
-
child.end()
|
|
338
|
-
})
|
|
339
|
-
parent.end()
|
|
340
|
-
})
|
|
341
|
-
```
|
|
342
|
-
|
|
343
|
-
## Verificação
|
|
344
|
-
|
|
345
|
-
1. **1 trace_id por request** — enviar 1 request, queryar `SELECT DISTINCT trace_id FROM spans WHERE request_id = X` → 1 resultado.
|
|
346
|
-
2. **Cross-service stitching** — request HTTP service A → service B → DB. Queryar `SELECT count(distinct service_name) FROM spans WHERE trace_id = X` → ≥ 3.
|
|
347
|
-
3. **Root span identificável** — `SELECT * FROM spans WHERE trace_id = X AND parent_span_id IS NULL` → 1 row (o root).
|
|
348
|
-
4. **Span hierarchy correta** — graficar via tool (Jaeger UI, Honeycomb, etc.) ou recursivo SQL — deve formar árvore válida (sem ciclos).
|
|
349
|
-
5. **Duration não-negativa** — `SELECT min(duration_ms), max(duration_ms) FROM spans` — min ≥ 0, max razoável.
|
|
350
|
-
6. **Sampled flag respeitado** — verificar que se traceparent inbound = `01`, downstream também sample=true.
|
|
351
|
-
7. **Queue stitching funciona** — enqueue + consume → mesmo `trace_id` em ambos os spans.
|
|
352
|
-
|
|
353
|
-
---
|
|
354
|
-
|
|
355
|
-
## Ver também
|
|
356
|
-
|
|
357
|
-
- `kit/skills/_shared-observability/glossary.md` — W3C TraceContext, B3, span kinds
|
|
358
|
-
- `kit/skills/structured-events/SKILL.md` — atributos canônicos por span
|
|
359
|
-
- `kit/skills/opentelemetry-standard/SKILL.md` — SDK que faz extract/inject
|
|
360
|
-
- `kit/skills/telemetry-sampling/SKILL.md` *(Phase 34)* — decisão entre head e tail sampling
|
|
361
|
-
|
|
362
|
-
*Material-fonte: Observability Engineering (O'Reilly, 2022) — Cap 6: "Stitching Events into Traces".*
|
|
1
|
+
---
|
|
2
|
+
name: distributed-tracing
|
|
3
|
+
description: Use ao instrumentar tracing — trace_id/span_id/parent_id, propagar W3C TraceContext via header traceparent, stitching além de RPCs (batch, lambda, queue).
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Observabilidade — Distributed Tracing
|
|
7
|
+
|
|
8
|
+
## Quando usar
|
|
9
|
+
|
|
10
|
+
LLM carrega esta skill ao instrumentar tracing distribuído ou stitching de spans. Trigger phrases:
|
|
11
|
+
|
|
12
|
+
- "distributed tracing", "traces", "spans"
|
|
13
|
+
- "propagar contexto entre serviços", "trace cross-service"
|
|
14
|
+
- "W3C TraceContext", "traceparent header"
|
|
15
|
+
- "trace_id span_id parent_span_id"
|
|
16
|
+
- "ligar lambda batch job ao trace"
|
|
17
|
+
- "stitching de eventos"
|
|
18
|
+
|
|
19
|
+
## Regras absolutas
|
|
20
|
+
|
|
21
|
+
- **trace_id é compartilhado** entre todos os spans de um único request distribuído. **NÃO** mude por hop.
|
|
22
|
+
- **span_id é único por span** — gere novo a cada `startSpan()`. 16 hex chars (8 bytes).
|
|
23
|
+
- **parent_span_id aponta para span pai** — null no root span. Define a árvore.
|
|
24
|
+
- **W3C TraceContext é o padrão** — header HTTP `traceparent: 00-{trace_id}-{span_id}-{flags}`. Adote sempre. B3 é fallback para legacy.
|
|
25
|
+
- **Propague ANTES de fazer call cross-service** — extrair contexto do request inbound, propagar no request outbound. Sem isso, trace quebra.
|
|
26
|
+
- **Stitching ≠ apenas RPC** — também batch jobs, queue messages, lambda invocations, S3 uploads. Carregue `traceparent` em metadata da queue, env var do lambda, header da Step Function.
|
|
27
|
+
- **Sample decision propaga** — bit `01` em flags de `traceparent` significa "sample=true". Decisão tomada no head propaga downstream.
|
|
28
|
+
- **Não invente trace_id** — sempre derive do contexto inbound ou gere via SDK (não `crypto.randomUUID()`).
|
|
29
|
+
- **Spans devem ter `kind`** — `SERVER` (handler de inbound), `CLIENT` (call outbound), `PRODUCER`/`CONSUMER` (queue), `INTERNAL` (subspan dentro do mesmo process).
|
|
30
|
+
|
|
31
|
+
## Patterns canônicos
|
|
32
|
+
|
|
33
|
+
### Pattern: extrair contexto inbound + propagar outbound (Node)
|
|
34
|
+
|
|
35
|
+
```ts
|
|
36
|
+
// PT-BR: handler HTTP — extrai traceparent do request inbound, propaga em call outbound
|
|
37
|
+
import { trace, context, propagation } from '@opentelemetry/api'
|
|
38
|
+
|
|
39
|
+
const tracer = trace.getTracer('orders-service')
|
|
40
|
+
|
|
41
|
+
export async function placeOrder(req: Request) {
|
|
42
|
+
// PT-BR: 1 — extrair contexto inbound do header traceparent
|
|
43
|
+
const inboundContext = propagation.extract(context.active(), req.headers)
|
|
44
|
+
|
|
45
|
+
return tracer.startActiveSpan(
|
|
46
|
+
'place_order',
|
|
47
|
+
{ kind: SpanKind.SERVER },
|
|
48
|
+
inboundContext,
|
|
49
|
+
async (span) => {
|
|
50
|
+
span.setAttribute('user.id', req.user.id)
|
|
51
|
+
|
|
52
|
+
// PT-BR: 2 — fazer call outbound — inject manual abaixo; a propagação só é automática
|
|
53
|
+
// se você usar fetch/grpc instrumentados (ver skill opentelemetry-standard)
|
|
54
|
+
const outboundHeaders: Record<string, string> = {}
|
|
55
|
+
propagation.inject(context.active(), outboundHeaders)
|
|
56
|
+
|
|
57
|
+
const inventoryRes = await fetch('http://inventory/check', {
|
|
58
|
+
headers: outboundHeaders, // PT-BR: traceparent injetado aqui
|
|
59
|
+
body: JSON.stringify({ items: req.items })
|
|
60
|
+
})
|
|
61
|
+
|
|
62
|
+
span.end()
|
|
63
|
+
return inventoryRes.json()
|
|
64
|
+
}
|
|
65
|
+
)
|
|
66
|
+
}
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Pattern: traceparent format
|
|
70
|
+
|
|
71
|
+
```text
|
|
72
|
+
traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
|
|
73
|
+
^ ^ ^ ^
|
|
74
|
+
| | | |
|
|
75
|
+
version | flags (sampled bit)
|
|
76
|
+
trace_id (32 hex / 16 bytes) |
|
|
77
|
+
span_id (16 hex / 8 bytes)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
flags:
|
|
82
|
+
01 = sampled (decisão upstream: capture este trace)
|
|
83
|
+
00 = not sampled (decisão upstream: skip)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Pattern: trace cross-service via Supabase Edge Function
|
|
87
|
+
|
|
88
|
+
```ts
|
|
89
|
+
// PT-BR: Edge Function recebe request → propaga para outro service
|
|
90
|
+
import { trace, context, propagation } from 'npm:@opentelemetry/api@1.9.0'
|
|
91
|
+
import { W3CTraceContextPropagator } from 'npm:@opentelemetry/core@1.27.0'
|
|
92
|
+
|
|
93
|
+
propagation.setGlobalPropagator(new W3CTraceContextPropagator())
|
|
94
|
+
|
|
95
|
+
const tracer = trace.getTracer('edge-orders')
|
|
96
|
+
|
|
97
|
+
Deno.serve(async (req) => {
|
|
98
|
+
// PT-BR: extrair traceparent inbound
|
|
99
|
+
const inboundCtx = propagation.extract(context.active(), {
|
|
100
|
+
traceparent: req.headers.get('traceparent') ?? '',
|
|
101
|
+
})
|
|
102
|
+
|
|
103
|
+
return tracer.startActiveSpan(
|
|
104
|
+
'edge_handler',
|
|
105
|
+
{ kind: 1 /* SERVER */ },
|
|
106
|
+
inboundCtx,
|
|
107
|
+
async (span) => {
|
|
108
|
+
span.setAttribute('endpoint', new URL(req.url).pathname)
|
|
109
|
+
|
|
110
|
+
// PT-BR: call outbound para Postgres via PostgREST — injeta traceparent
|
|
111
|
+
const outHeaders: Record<string, string> = {}
|
|
112
|
+
propagation.inject(context.active(), outHeaders)
|
|
113
|
+
|
|
114
|
+
const dbRes = await fetch(Deno.env.get('SUPABASE_URL') + '/rest/v1/orders', {
|
|
115
|
+
method: 'POST',
|
|
116
|
+
headers: {
|
|
117
|
+
...outHeaders,
|
|
118
|
+
'apikey': Deno.env.get('SUPABASE_ANON_KEY')!,
|
|
119
|
+
'content-type': 'application/json',
|
|
120
|
+
},
|
|
121
|
+
body: await req.text(),
|
|
122
|
+
})
|
|
123
|
+
|
|
124
|
+
span.setAttribute('db.status_code', dbRes.status)
|
|
125
|
+
span.end()
|
|
126
|
+
return dbRes
|
|
127
|
+
}
|
|
128
|
+
)
|
|
129
|
+
})
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Pattern: stitching além de RPC — queue message (não-RPC)
|
|
133
|
+
|
|
134
|
+
```ts
|
|
135
|
+
// PT-BR: producer — anexa traceparent ao payload da queue (pgmq, SQS, RabbitMQ)
|
|
136
|
+
import { trace, context, propagation } from '@opentelemetry/api'
|
|
137
|
+
|
|
138
|
+
const tracer = trace.getTracer('producer')
|
|
139
|
+
|
|
140
|
+
export async function enqueueEmail(emailJob: EmailJob) {
|
|
141
|
+
return tracer.startActiveSpan(
|
|
142
|
+
'enqueue_email',
|
|
143
|
+
{ kind: SpanKind.PRODUCER },
|
|
144
|
+
async (span) => {
|
|
145
|
+
span.setAttribute('queue.name', 'emails')
|
|
146
|
+
span.setAttribute('email.recipient', emailJob.to)
|
|
147
|
+
|
|
148
|
+
// PT-BR: serializar contexto no payload da mensagem
|
|
149
|
+
const carrier: Record<string, string> = {}
|
|
150
|
+
propagation.inject(context.active(), carrier)
|
|
151
|
+
|
|
152
|
+
await pgmqEnqueue('emails', {
|
|
153
|
+
...emailJob,
|
|
154
|
+
_trace_context: carrier, // PT-BR: viaja com o job
|
|
155
|
+
})
|
|
156
|
+
|
|
157
|
+
span.end()
|
|
158
|
+
}
|
|
159
|
+
)
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// PT-BR: consumer — extrai traceparent do payload, continua o trace
|
|
163
|
+
export async function processEmailJob(job: EmailJobWithContext) {
|
|
164
|
+
const inboundCtx = propagation.extract(
|
|
165
|
+
context.active(),
|
|
166
|
+
job._trace_context ?? {} // PT-BR: se vazio, novo trace
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
return tracer.startActiveSpan(
|
|
170
|
+
'process_email',
|
|
171
|
+
{ kind: SpanKind.CONSUMER },
|
|
172
|
+
inboundCtx,
|
|
173
|
+
async (span) => {
|
|
174
|
+
span.setAttribute('email.recipient', job.to)
|
|
175
|
+
// PT-BR: agora o span do worker faz parte do mesmo trace do producer
|
|
176
|
+
await sendEmail(job)
|
|
177
|
+
span.end()
|
|
178
|
+
}
|
|
179
|
+
)
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Pattern: stitching de batch job (não-RPC)
|
|
184
|
+
|
|
185
|
+
```ts
|
|
186
|
+
// PT-BR: cron job processa N items — 1 span por item, todos com mesmo trace_id
|
|
187
|
+
const tracer = trace.getTracer('billing-cron')
|
|
188
|
+
|
|
189
|
+
export async function dailyBillingJob() {
|
|
190
|
+
return tracer.startActiveSpan('daily_billing', async (rootSpan) => {
|
|
191
|
+
rootSpan.setAttribute('job.type', 'cron')
|
|
192
|
+
rootSpan.setAttribute('build_id', BUILD_ID)
|
|
193
|
+
|
|
194
|
+
const customers = await db.getCustomersDueForBilling()
|
|
195
|
+
rootSpan.setAttribute('customers.count', customers.length)
|
|
196
|
+
|
|
197
|
+
// PT-BR: cada customer vira span filho com mesmo trace_id
|
|
198
|
+
for (const customer of customers) {
|
|
199
|
+
await tracer.startActiveSpan(
|
|
200
|
+
'bill_customer',
|
|
201
|
+
{ kind: SpanKind.INTERNAL },
|
|
202
|
+
async (span) => {
|
|
203
|
+
span.setAttribute('customer.id', customer.id)
|
|
204
|
+
span.setAttribute('customer.tier', customer.tier)
|
|
205
|
+
try {
|
|
206
|
+
await chargeCustomer(customer)
|
|
207
|
+
span.setAttribute('result.success', true)
|
|
208
|
+
} catch (e) {
|
|
209
|
+
span.setAttribute('result.success', false)
|
|
210
|
+
span.setAttribute('error.type', classify(e))
|
|
211
|
+
} finally {
|
|
212
|
+
span.end()
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
)
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
rootSpan.end()
|
|
219
|
+
})
|
|
220
|
+
}
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
### Pattern: span kinds
|
|
224
|
+
|
|
225
|
+
| Kind | Quando usar | Exemplo |
|
|
226
|
+
|---|---|---|
|
|
227
|
+
| `SERVER` | Recebendo request inbound | Handler HTTP, gRPC server method |
|
|
228
|
+
| `CLIENT` | Fazendo call outbound | `fetch()`, gRPC client call, DB query |
|
|
229
|
+
| `PRODUCER` | Enviando msg para queue | `pgmq.enqueue()`, SQS publish |
|
|
230
|
+
| `CONSUMER` | Processando msg de queue | Worker recebendo job |
|
|
231
|
+
| `INTERNAL` | Subdivisão dentro do mesmo process | "json_parse", "validation_step" |
|
|
232
|
+
|
|
233
|
+
### Pattern: query traces — montar waterfall
|
|
234
|
+
|
|
235
|
+
```sql
|
|
236
|
+
-- PT-BR: pegar todos os spans de um trace em ordem cronológica
|
|
237
|
+
select
|
|
238
|
+
span_id,
|
|
239
|
+
parent_span_id,
|
|
240
|
+
span_name,
|
|
241
|
+
span_kind,
|
|
242
|
+
service_name,
|
|
243
|
+
duration_ms,
|
|
244
|
+
start_time
|
|
245
|
+
from observability.spans
|
|
246
|
+
where trace_id = '4bf92f3577b34da6a3ce929d0e0e4736'
|
|
247
|
+
order by start_time asc;
|
|
248
|
+
|
|
249
|
+
-- PT-BR: encontrar root span — parent_span_id IS NULL ou span sem parent no mesmo trace
|
|
250
|
+
select *
|
|
251
|
+
from observability.spans
|
|
252
|
+
where trace_id = '4bf92f3577b34da6a3ce929d0e0e4736'
|
|
253
|
+
and parent_span_id is null;
|
|
254
|
+
|
|
255
|
+
-- PT-BR: spans mais lentos cross-trace, na última 1h
|
|
256
|
+
select
|
|
257
|
+
service_name,
|
|
258
|
+
span_name,
|
|
259
|
+
percentile_cont(0.99) within group (order by duration_ms) as p99,
|
|
260
|
+
count(*) as samples
|
|
261
|
+
from observability.spans
|
|
262
|
+
where start_time > now() - interval '1 hour'
|
|
263
|
+
group by service_name, span_name
|
|
264
|
+
having count(*) > 100
|
|
265
|
+
order by p99 desc
|
|
266
|
+
limit 20;
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
## Anti-patterns
|
|
270
|
+
|
|
271
|
+
### ANTI: gerar trace_id por hop
|
|
272
|
+
|
|
273
|
+
```ts
|
|
274
|
+
// PT-BR: BAD — quebra a cadeia, cada service vê trace diferente
|
|
275
|
+
const traceId = crypto.randomUUID().replace(/-/g, '').slice(0, 32)
|
|
276
|
+
|
|
277
|
+
// PT-BR: GOOD — extrair do header inbound; deixar SDK gerar root
|
|
278
|
+
const inboundCtx = propagation.extract(context.active(), req.headers)
|
|
279
|
+
tracer.startActiveSpan('handler', {}, inboundCtx, ...)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### ANTI: esquecer de propagar em call outbound
|
|
283
|
+
|
|
284
|
+
```ts
|
|
285
|
+
// PT-BR: BAD — outbound call sem traceparent — trace quebra no service B
|
|
286
|
+
await fetch('http://service-b/api', { body: ... })
|
|
287
|
+
|
|
288
|
+
// PT-BR: GOOD — injetar traceparent
|
|
289
|
+
const headers: Record<string, string> = {}
|
|
290
|
+
propagation.inject(context.active(), headers)
|
|
291
|
+
await fetch('http://service-b/api', { headers, body: ... })
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### ANTI: trace só de RPCs, não de batch/queue
|
|
295
|
+
|
|
296
|
+
```ts
|
|
297
|
+
// PT-BR: BAD — producer/consumer não compartilham trace, debug fica fragmentado
|
|
298
|
+
await pgmqEnqueue('emails', payload) // sem trace context
|
|
299
|
+
// ... depois worker processa sem saber que veio do request X
|
|
300
|
+
|
|
301
|
+
// PT-BR: GOOD — propagar contexto via metadata da queue
|
|
302
|
+
const carrier = {}
|
|
303
|
+
propagation.inject(context.active(), carrier)
|
|
304
|
+
await pgmqEnqueue('emails', { ...payload, _trace_context: carrier })
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
### ANTI: span sem `end()`
|
|
308
|
+
|
|
309
|
+
```ts
|
|
310
|
+
// PT-BR: BAD — span fica aberto forever, duration_ms não calculado, memory leak
|
|
311
|
+
const span = tracer.startSpan('handler')
|
|
312
|
+
// ... handler logic
|
|
313
|
+
return result // PT-BR: ESQUECEU span.end()
|
|
314
|
+
|
|
315
|
+
// PT-BR: GOOD — sempre `try/finally`
|
|
316
|
+
const span = tracer.startSpan('handler')
|
|
317
|
+
try {
|
|
318
|
+
// ... logic
|
|
319
|
+
} finally {
|
|
320
|
+
span.end()
|
|
321
|
+
}
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
### ANTI: span hierarchy errada
|
|
325
|
+
|
|
326
|
+
```ts
|
|
327
|
+
// PT-BR: BAD — usar startSpan em vez de startActiveSpan; o parent não é definido automaticamente
|
|
328
|
+
const parent = tracer.startSpan('parent')
|
|
329
|
+
const child = tracer.startSpan('child') // PT-BR: parent_span_id ficou null
|
|
330
|
+
parent.end()
|
|
331
|
+
child.end()
|
|
332
|
+
|
|
333
|
+
// PT-BR: GOOD — startActiveSpan empurra contexto, child herda parent
|
|
334
|
+
tracer.startActiveSpan('parent', (parent) => {
|
|
335
|
+
tracer.startActiveSpan('child', (child) => {
|
|
336
|
+
// PT-BR: child.parent_span_id === parent.span_id
|
|
337
|
+
child.end()
|
|
338
|
+
})
|
|
339
|
+
parent.end()
|
|
340
|
+
})
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
## Verificação
|
|
344
|
+
|
|
345
|
+
1. **1 trace_id por request** — enviar 1 request, queryar `SELECT DISTINCT trace_id FROM spans WHERE request_id = X` → 1 resultado.
|
|
346
|
+
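2. **Cross-service stitching** — request HTTP service A → service B → DB. Queryar `SELECT count(distinct service_name) FROM spans WHERE trace_id = X` → ≥ 2 (A e B; ≥ 3 apenas se o DB também emitir spans próprios, em vez de aparecer só como span `CLIENT` do service B).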
|
|
347
|
+
3. **Root span identificável** — `SELECT * FROM spans WHERE trace_id = X AND parent_span_id IS NULL` → 1 row (o root).
|
|
348
|
+
4. **Span hierarchy correta** — visualizar via tool (Jaeger UI, Honeycomb, etc.) ou via SQL recursivo — deve formar árvore válida (sem ciclos).
|
|
349
|
+
5. **Duration não-negativa** — `SELECT min(duration_ms), max(duration_ms) FROM spans` — min ≥ 0 (nunca negativo), max razoável.
|
|
350
|
+
6. **Sampled flag respeitado** — verificar que, se o campo trace-flags do `traceparent` inbound = `01`, o downstream também propaga `01` (sampled=true).
|
|
351
|
+
7. **Queue stitching funciona** — enqueue + consume → mesmo `trace_id` em ambos os spans.
|
|
352
|
+
|
|
353
|
+
---
|
|
354
|
+
|
|
355
|
+
## Ver também
|
|
356
|
+
|
|
357
|
+
- `kit/skills/_shared-observability/glossary.md` — W3C TraceContext, B3, span kinds
|
|
358
|
+
- `kit/skills/structured-events/SKILL.md` — atributos canônicos por span
|
|
359
|
+
- `kit/skills/opentelemetry-standard/SKILL.md` — SDK que faz extract/inject
|
|
360
|
+
- `kit/skills/telemetry-sampling/SKILL.md` *(Phase 34)* — decisão entre head sampling e tail sampling
|
|
361
|
+
|
|
362
|
+
*Material-fonte: Observability Engineering (O'Reilly, 2022) — Cap 6: "Stitching Events into Traces".*
|