@runtime-digital-twin/sdk 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +214 -0
- package/dist/constants.d.ts +11 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/constants.js +13 -0
- package/dist/db-wrapper.d.ts +258 -0
- package/dist/db-wrapper.d.ts.map +1 -0
- package/dist/db-wrapper.js +636 -0
- package/dist/event-envelope.d.ts +35 -0
- package/dist/event-envelope.d.ts.map +1 -0
- package/dist/event-envelope.js +101 -0
- package/dist/fastify-plugin.d.ts +29 -0
- package/dist/fastify-plugin.d.ts.map +1 -0
- package/dist/fastify-plugin.js +243 -0
- package/dist/http-sentinels.d.ts +39 -0
- package/dist/http-sentinels.d.ts.map +1 -0
- package/dist/http-sentinels.js +169 -0
- package/dist/http-wrapper.d.ts +25 -0
- package/dist/http-wrapper.d.ts.map +1 -0
- package/dist/http-wrapper.js +477 -0
- package/dist/index.d.ts +19 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +93 -0
- package/dist/invariants.d.ts +58 -0
- package/dist/invariants.d.ts.map +1 -0
- package/dist/invariants.js +192 -0
- package/dist/multi-service-edge-builder.d.ts +80 -0
- package/dist/multi-service-edge-builder.d.ts.map +1 -0
- package/dist/multi-service-edge-builder.js +107 -0
- package/dist/outbound-matcher.d.ts +192 -0
- package/dist/outbound-matcher.d.ts.map +1 -0
- package/dist/outbound-matcher.js +457 -0
- package/dist/peer-service-resolver.d.ts +22 -0
- package/dist/peer-service-resolver.d.ts.map +1 -0
- package/dist/peer-service-resolver.js +85 -0
- package/dist/redaction.d.ts +111 -0
- package/dist/redaction.d.ts.map +1 -0
- package/dist/redaction.js +487 -0
- package/dist/replay-logger.d.ts +438 -0
- package/dist/replay-logger.d.ts.map +1 -0
- package/dist/replay-logger.js +434 -0
- package/dist/root-cause-analyzer.d.ts +45 -0
- package/dist/root-cause-analyzer.d.ts.map +1 -0
- package/dist/root-cause-analyzer.js +606 -0
- package/dist/shape-digest-utils.d.ts +45 -0
- package/dist/shape-digest-utils.d.ts.map +1 -0
- package/dist/shape-digest-utils.js +154 -0
- package/dist/trace-bundle-writer.d.ts +52 -0
- package/dist/trace-bundle-writer.d.ts.map +1 -0
- package/dist/trace-bundle-writer.js +267 -0
- package/dist/trace-loader.d.ts +69 -0
- package/dist/trace-loader.d.ts.map +1 -0
- package/dist/trace-loader.js +146 -0
- package/dist/trace-uploader.d.ts +25 -0
- package/dist/trace-uploader.d.ts.map +1 -0
- package/dist/trace-uploader.js +132 -0
- package/package.json +63 -0
|
@@ -0,0 +1,606 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Multi-service root cause analyzer
|
|
4
|
+
*
|
|
5
|
+
* Analyzes distributed traces to determine which upstream service
|
|
6
|
+
* most likely caused a failure.
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.analyzeMultiServiceRootCause = analyzeMultiServiceRootCause;
|
|
10
|
+
const multi_service_edge_builder_1 = require("./multi-service-edge-builder");
|
|
11
|
+
/**
 * Pick the event that best represents the observed failure (the "symptom").
 *
 * Selection rules:
 *  - If the trace contains at least one HTTP response event with a status of
 *    500 or above, the latest of those is used as an anchor and the `error`
 *    event closest to it in time is returned. When there is no `error` event
 *    at all, the anchor response itself is returned.
 *  - Otherwise the latest `error` event is returned.
 *  - `null` when the trace has neither kind of event.
 *
 * Ties are resolved in favour of the event that appears first in `events`.
 *
 * @param events - Trace events to inspect (timestamps are compared and
 *                 subtracted, so they are assumed to be numeric)
 * @returns The symptom event, or null if nothing failed
 */
function findSymptomService(events) {
    const responseTypes = new Set(['http.response.inbound', 'http.response.outbound']);
    // First element with the strictly greatest timestamp wins.
    const pickLatest = (list) => {
        let winner = list[0];
        for (let i = 1; i < list.length; i++) {
            if (list[i].timestamp > winner.timestamp) {
                winner = list[i];
            }
        }
        return winner;
    };
    const failures = events.filter((evt) => evt.type === 'error');
    const serverErrors = events.filter((evt) => responseTypes.has(evt.type) && evt.statusCode && evt.statusCode >= 500);
    if (serverErrors.length === 0) {
        // No 5xx responses: fall back to the most recent error, if any.
        return failures.length > 0 ? pickLatest(failures) : null;
    }
    const anchor = pickLatest(serverErrors);
    let closest = null;
    let smallestGap = Infinity;
    failures.forEach((failure) => {
        const gap = Math.abs(failure.timestamp - anchor.timestamp);
        if (gap < smallestGap) {
            smallestGap = gap;
            closest = failure;
        }
    });
    return closest || anchor;
}
|
|
50
|
+
/**
 * JSONPath-style locations that are treated as "required" fields.
 * A null at (or underneath) any of these paths contributes to scoring.
 */
const COMMON_REQUIRED_FIELDS = [
    '$.price',
    '$.amount',
    '$.total',
    '$.cost',
    '$.id',
    '$.userId',
    '$.customerId',
    '$.orderId',
    '$.productId',
];
/**
 * Tell whether any reported null path hits a common required field.
 *
 * A path matches when it equals a required field exactly, or when it extends
 * one with a property access (`.`) or an index access (`[`).
 *
 * @param nullPaths - Paths whose value was null; may be null/undefined/empty
 * @returns true if at least one path touches a required field
 */
function hasRequiredFieldNulls(nullPaths) {
    const nothingReported = !nullPaths || nullPaths.length === 0;
    if (nothingReported) {
        return false;
    }
    const touchesRequiredField = (candidatePath) => {
        for (const field of COMMON_REQUIRED_FIELDS) {
            if (candidatePath === field) {
                return true;
            }
            if (candidatePath.startsWith(`${field}.`) || candidatePath.startsWith(`${field}[`)) {
                return true;
            }
        }
        return false;
    };
    return nullPaths.some((candidatePath) => touchesRequiredField(candidatePath));
}
|
|
73
|
+
/**
 * Collect the failed invariant checks attached to a span.
 *
 * An event qualifies when it is a `state.invariant` event whose `passed`
 * flag is exactly `false` and whose `spanId` or `parentSpanId` equals the
 * requested span.
 *
 * @param spanId - Span to look up
 * @param events - Events to search
 * @returns Matching invariant events, in their original order
 */
function findInvariantsForSpan(spanId, events) {
    return events.filter((candidate) => {
        if (candidate.type !== 'state.invariant') {
            return false;
        }
        if (candidate.passed !== false) {
            return false;
        }
        return candidate.spanId === spanId || candidate.parentSpanId === spanId;
    });
}
|
|
81
|
+
/**
 * Score a single event as a potential root cause.
 *
 * Outbound HTTP requests accumulate points from several independent signals:
 *  - +7 for every failed `http.response.forbidden_null` or
 *    `http.response.5xx` invariant attached to the span
 *  - +6 when the response shape digest reports nulls on required fields
 *  - +6 for a status of 500 or above, +4 for a 4xx status, +4 when the status
 *    code is missing altogether (null or undefined)
 * `error` events always score 3. Every other event type scores 0.
 *
 * @param event - Event being scored
 * @param allEvents - All events of the trace (used to look up invariants)
 * @returns The total `score` and a `reason` string joining every contributing
 *          factor with '; ' ('No scoring factors' when nothing contributed)
 */
function scoreCandidate(event, allEvents) {
    if (event.type === 'error') {
        return { score: 3, reason: 'Error event (symptom, not root cause)' };
    }
    if (event.type !== 'http.request.outbound') {
        return { score: 0, reason: 'No scoring factors' };
    }
    // Each entry is a [points, explanation] pair, kept in evaluation order.
    const findings = [];
    const invariantReasons = new Map([
        ['http.response.forbidden_null', 'Outbound response has forbidden_null invariant failure'],
        ['http.response.5xx', 'Outbound response has 5xx invariant failure'],
    ]);
    const status = event.statusCode;
    for (const failed of findInvariantsForSpan(event.spanId, allEvents)) {
        const explanation = invariantReasons.get(failed.name);
        if (explanation !== undefined) {
            findings.push([7, explanation]);
        }
    }
    const reportedNulls = event.responseShapeDigest?.nullPaths;
    if (reportedNulls && hasRequiredFieldNulls(reportedNulls)) {
        findings.push([6, 'Response shape digest has nullPaths intersecting required fields']);
    }
    if (status === null || status === undefined) {
        // No status recorded: the request may have thrown before a response arrived.
        findings.push([4, 'Outbound HTTP call failed (missing statusCode, likely threw error)']);
    }
    else if (status >= 500) {
        findings.push([6, `Outbound HTTP call returned 5xx (${status})`]);
    }
    else if (status >= 400 && status < 500) {
        // The downstream-error requirement is enforced later by the caller.
        findings.push([4, `Outbound HTTP call returned 4xx (${status}) with downstream error`]);
    }
    if (findings.length === 0) {
        return { score: 0, reason: 'No scoring factors' };
    }
    return {
        score: findings.reduce((total, [points]) => total + points, 0),
        reason: findings.map(([, explanation]) => explanation).join('; '),
    };
}
|
|
136
|
+
/**
 * Determine whether the service that received an outbound call reported an error.
 *
 * The receiving side is identified by `http.request.inbound` events whose
 * `parentSpanId` equals the outbound span. An error counts when an `error`
 * event has one of those inbound spans as its own span or as its parent.
 *
 * @param outboundSpanId - Span id of the outbound request
 * @param events - Events to search
 * @returns true if a matching downstream error exists
 */
function hasDownstreamError(outboundSpanId, events) {
    const receivingSpanIds = events
        .filter((evt) => evt.type === 'http.request.inbound' && evt.parentSpanId === outboundSpanId)
        .map((evt) => evt.spanId);
    if (receivingSpanIds.length === 0) {
        return false;
    }
    return events.some((evt) => {
        if (evt.type !== 'error') {
            return false;
        }
        return receivingSpanIds.some((inboundId) => evt.parentSpanId === inboundId || evt.spanId === inboundId);
    });
}
|
|
156
|
+
/**
 * Index edges by their destination so callers of a service can be looked up.
 *
 * @param edges - Service-to-service edges (`fromService` -> `toService`)
 * @returns Map keyed by `toService`; each value lists the edges that end at
 *          that service, in their original order
 */
function buildUpstreamGraph(edges) {
    const incomingByService = new Map();
    for (const edge of edges) {
        const destination = edge.toService;
        let bucket = incomingByService.get(destination);
        if (bucket === undefined) {
            bucket = [];
            incomingByService.set(destination, bucket);
        }
        bucket.push(edge);
    }
    return incomingByService;
}
|
|
170
|
+
/**
 * List a service followed by every service that (transitively) calls it.
 *
 * The traversal is depth-first, follows edges in their given order and never
 * visits a service twice, so cycles terminate.
 *
 * @param startService - Service to start from
 * @param edges - Service-to-service edges (`fromService` -> `toService`)
 * @param visited - Services already seen; shared across the recursion and
 *                  mutated in place
 * @returns `startService` first, then its upstream services in visit order;
 *          empty when `startService` was already visited
 */
function walkUpstream(startService, edges, visited = new Set()) {
    if (visited.has(startService)) {
        return [];
    }
    visited.add(startService);
    const collected = [startService];
    for (const { fromService, toService } of edges) {
        // Only follow edges that end here and lead somewhere new.
        if (toService !== startService || visited.has(fromService)) {
            continue;
        }
        for (const service of walkUpstream(fromService, edges, visited)) {
            collected.push(service);
        }
    }
    return collected;
}
|
|
188
|
+
/**
 * Turn trace events into scored root-cause candidates.
 *
 * Every event is scored with `scoreCandidate`. Outbound requests whose score
 * is 4 or less are dropped when the only signal is weak:
 *  - a 4xx status without an error reported by the receiving service
 *  - a missing status code without an `error` event on (or under) the span
 * Remaining events with a positive score become candidates carrying an
 * operation label and the evidence that supports them.
 *
 * @param events - All events of one trace
 * @returns Candidates in event order; each has `spanId`, `serviceName`,
 *          `operationName`, `score`, `reason`, `evidence` and the source `event`
 */
function findCandidateSpans(events) {
    const OUTBOUND = 'http.request.outbound';
    const isMissing = (value) => value === null || value === undefined;
    // Human-readable label for the operation an event represents.
    const describeOperation = (event) => {
        const verb = event.method || 'UNKNOWN';
        switch (event.type) {
            case OUTBOUND:
                return `${verb} ${event.urlTemplate || event.peerHost || 'unknown'}`;
            case 'error':
                return 'error';
            default:
                return `${verb} ${event.path || event.route || 'unknown'}`;
        }
    };
    // Supporting facts attached to a candidate.
    const gatherEvidence = (event) => {
        if (event.type === 'error') {
            return { kind: 'error', message: event.error?.message || 'Unknown error' };
        }
        const evidence = { kind: 'http' };
        if (event.type !== OUTBOUND) {
            return evidence;
        }
        evidence.statusCode = event.statusCode ?? undefined;
        evidence.peerService = event.peerService ?? undefined;
        const failedInvariants = findInvariantsForSpan(event.spanId, events);
        if (failedInvariants.length > 0) {
            evidence.invariants = failedInvariants.map((failed) => ({
                name: failed.name || 'unknown',
                paths: failed.details?.paths,
            }));
        }
        const digest = event.responseShapeDigest;
        if (digest) {
            evidence.responseShapeDigest = {
                hash: digest.hash,
                nullPaths: digest.nullPaths?.slice(0, 5), // keep only the first 5 paths
            };
        }
        return evidence;
    };
    const found = [];
    for (const event of events) {
        const { score, reason } = scoreCandidate(event, events);
        if (event.type === OUTBOUND && score <= 4) {
            // A score this low can only come from the status code itself,
            // so demand corroborating evidence before keeping the span.
            const status = event.statusCode;
            if (isMissing(status)) {
                const threw = events.some((other) => other.type === 'error' &&
                    (other.spanId === event.spanId || other.parentSpanId === event.spanId));
                if (!threw) {
                    continue;
                }
            }
            else if (status >= 400 && status < 500 && !hasDownstreamError(event.spanId, events)) {
                continue;
            }
        }
        if (!(score > 0)) {
            continue;
        }
        found.push({
            spanId: event.spanId,
            serviceName: event.serviceName,
            operationName: describeOperation(event),
            score,
            reason,
            evidence: gatherEvidence(event),
            event,
        });
    }
    return found;
}
|
|
265
|
+
/**
 * Derive the order in which services are reached, starting at the entry service.
 *
 * The entry ("root") service is the first `fromService` that never appears
 * as a `toService`. From there the edges are followed depth-first, in edge
 * order, visiting each service once. Only the first root is walked.
 *
 * @param edges - Service-to-service edges (`fromService` -> `toService`)
 * @returns Service names in visit order; `[]` for no edges; a single-element
 *          list with the first edge's `fromService` when no root exists
 */
function buildPropagationPath(edges) {
    if (edges.length === 0) {
        return [];
    }
    const callees = new Set();
    const downstreamOf = new Map();
    for (const { fromService, toService } of edges) {
        callees.add(toService);
        const targets = downstreamOf.get(fromService);
        if (targets) {
            targets.push(toService);
        }
        else {
            downstreamOf.set(fromService, [toService]);
        }
    }
    const rootEdge = edges.find((edge) => !callees.has(edge.fromService));
    if (rootEdge === undefined) {
        // Every caller is also called by someone: no unambiguous entry point.
        return [edges[0].fromService];
    }
    const ordered = [];
    const seen = new Set();
    const descend = (service) => {
        if (seen.has(service)) {
            return;
        }
        seen.add(service);
        ordered.push(service);
        for (const next of downstreamOf.get(service) ?? []) {
            if (!seen.has(next)) {
                descend(next);
            }
        }
    };
    descend(rootEdge.fromService);
    return ordered;
}
|
|
301
|
+
/**
 * Estimate how confident the analysis is that `culprit` is the root cause.
 *
 * Outbound HTTP culprits:
 *  - 0.9  a failed `http.response.forbidden_null` / `http.response.5xx`
 *         invariant, or a null on a common required field
 *  - 0.9  a status code of 500 or above
 *  - 0.85 no status code (null or undefined) together with an `error` event
 *         on, or directly under, the culprit span
 *  - 0.8  the response shape digest reports any null paths
 * Error culprits: 0.7 when no edge ends at the culprit's service, else 0.65.
 * Anything else: 0.5 when the trace has no edges, otherwise 0.6.
 *
 * @param culprit - Selected candidate (`event`, `evidence`, `spanId`, `serviceName`)
 * @param events - All events of the trace
 * @param edges - Service-to-service edges of the trace
 * @returns Confidence value between 0.5 and 0.9
 */
function calculateConfidence(culprit, events, edges) {
    if (culprit.event.type === 'http.request.outbound') {
        const statusCode = culprit.event.statusCode;
        const statusMissing = statusCode === null || statusCode === undefined;
        // Failed invariants are the strongest signal.
        const hasInvariantFailure = culprit.evidence.invariants?.some((inv) => inv.name === 'http.response.forbidden_null' || inv.name === 'http.response.5xx');
        // The evidence copy of nullPaths is truncated to a handful of entries,
        // so prefer the full list on the source event when it is available.
        const nullPaths = culprit.event.responseShapeDigest?.nullPaths ??
            culprit.evidence.responseShapeDigest?.nullPaths;
        const hasRequiredFieldNulls = nullPaths?.some((path) => COMMON_REQUIRED_FIELDS.some((required) => path === required || path.startsWith(required + '.') || path.startsWith(required + '[')));
        if (hasInvariantFailure || hasRequiredFieldNulls) {
            return 0.9;
        }
        if (!statusMissing && statusCode >= 500) {
            return 0.9;
        }
        // A request without a status most likely threw; an undefined status is
        // treated the same as null, matching how candidates are scored.
        if (statusMissing &&
            events.some((e) => e.type === 'error' &&
                (e.spanId === culprit.spanId || e.parentSpanId === culprit.spanId))) {
            return 0.85;
        }
        // Nulls were reported, but none on a required field.
        if (culprit.evidence.responseShapeDigest?.nullPaths && culprit.evidence.responseShapeDigest.nullPaths.length > 0) {
            return 0.8;
        }
    }
    if (culprit.event.type === 'error') {
        // With nobody calling this service the error is more likely the cause
        // itself rather than a symptom of a caller's problem.
        const hasUpstream = edges.some((e) => e.toService === culprit.serviceName);
        return hasUpstream ? 0.65 : 0.7;
    }
    // No edges at all suggests the trace is incomplete.
    return edges.length === 0 ? 0.5 : 0.6;
}
|
|
346
|
+
/**
 * Sort comparator for candidates: higher score first; on equal scores the
 * candidate with the later event timestamp (closer to the symptom) first.
 */
function compareCandidatesByScore(a, b) {
    if (b.score !== a.score) {
        return b.score - a.score;
    }
    return b.event.timestamp - a.event.timestamp;
}
/**
 * Re-attribute an outbound-call culprit to the service that answered the call.
 *
 * Applies only to `http.request.outbound` culprits that carry a `peerService`.
 * The answering service is the `http.request.inbound` event whose
 * `parentSpanId` equals the outbound span. The culprit's score, reason,
 * evidence and source event are kept; only the identifying fields change.
 *
 * @param culprit - Candidate chosen as culprit
 * @param traceEvents - All events of the trace
 * @returns A re-attributed copy, or `culprit` itself when nothing applies
 */
function attributeToDownstreamService(culprit, traceEvents) {
    const outbound = culprit.event;
    if (outbound.type !== 'http.request.outbound' || !outbound.peerService) {
        return culprit;
    }
    const correspondingInbound = traceEvents.find((e) => e.type === 'http.request.inbound' && e.parentSpanId === outbound.spanId);
    const { invariants, responseShapeDigest } = culprit.evidence;
    // A bad response body or failed invariant points at whoever produced the response.
    const hasInvariantOrShapeDigestIssue = Boolean((invariants && invariants.length > 0) ||
        (responseShapeDigest?.nullPaths && responseShapeDigest.nullPaths.length > 0));
    if (correspondingInbound && correspondingInbound.serviceName !== culprit.serviceName) {
        if (hasInvariantOrShapeDigestIssue || outbound.peerService === correspondingInbound.serviceName) {
            return {
                ...culprit,
                serviceName: correspondingInbound.serviceName,
                spanId: correspondingInbound.spanId,
                operationName: `${correspondingInbound.method || 'UNKNOWN'} ${correspondingInbound.path || correspondingInbound.route || 'unknown'}`,
            };
        }
        return culprit;
    }
    if (hasInvariantOrShapeDigestIssue) {
        // No usable inbound span: name the peer service but keep the outbound span id.
        return {
            ...culprit,
            serviceName: outbound.peerService,
        };
    }
    return culprit;
}
/**
 * Analyze multi-service root cause
 *
 * Steps:
 *  1. Keep only the events of `traceId` and build the service edges.
 *  2. Locate the symptom event. Without one, the first event of the trace is
 *     reported with confidence 0.5.
 *  3. Score all events and gather "upstream" candidates: candidates on the
 *     symptom's parent-span chain, candidates whose span is the parent of a
 *     chain span, and candidates belonging to any service that transitively
 *     calls the symptom's service.
 *  4. Pick the best upstream candidate (or the best candidate overall when
 *     there is no upstream one), re-attributing outbound calls to the service
 *     that answered them where appropriate.
 *  5. Without any candidate, report the symptom event with confidence 0.5.
 *
 * @param traceId - The trace ID
 * @param events - All events; events of other traces are ignored
 * @returns Root cause analysis result: `culprit` (service, span, operation,
 *          evidence, confidence), `propagationPath` and up to ten
 *          `rankedFindings` taken from the upstream candidates
 * @throws Error when no event carries `traceId`
 */
function analyzeMultiServiceRootCause(traceId, events) {
    const traceEvents = events.filter((e) => e.traceId === traceId);
    if (traceEvents.length === 0) {
        throw new Error(`No events found for traceId: ${traceId}`);
    }
    const edges = (0, multi_service_edge_builder_1.buildEdges)(traceEvents);
    const symptomEvent = findSymptomService(traceEvents);
    if (!symptomEvent) {
        // No errors found - return analysis with first service as culprit
        const firstEvent = traceEvents[0];
        return {
            culprit: {
                serviceName: firstEvent.serviceName,
                spanId: firstEvent.spanId,
                operationName: `${firstEvent.method || 'UNKNOWN'} ${firstEvent.path || 'unknown'}`,
                evidence: {
                    kind: 'http',
                },
                confidence: 0.5,
            },
            propagationPath: buildPropagationPath(edges),
            rankedFindings: [],
        };
    }
    const candidates = findCandidateSpans(traceEvents);
    // child spanId -> parent spanId (a later event overrides an earlier one)
    const parentChain = new Map();
    for (const event of traceEvents) {
        if (event.parentSpanId) {
            parentChain.set(event.spanId, event.parentSpanId);
        }
    }
    // spanId -> first candidate recorded for that span
    const firstCandidateBySpan = new Map();
    for (const candidate of candidates) {
        if (!firstCandidateBySpan.has(candidate.spanId)) {
            firstCandidateBySpan.set(candidate.spanId, candidate);
        }
    }
    // Upstream candidates are unique per spanId; the first one added wins.
    const upstreamCandidates = [];
    const includedSpanIds = new Set();
    const addUpstreamCandidate = (candidate) => {
        if (includedSpanIds.has(candidate.spanId)) {
            return;
        }
        includedSpanIds.add(candidate.spanId);
        upstreamCandidates.push(candidate);
    };
    // 1) Follow parentSpanId links from the symptom span towards the trace root.
    const chainSpanIds = new Set();
    let currentSpanId = symptomEvent.spanId;
    while (currentSpanId && !chainSpanIds.has(currentSpanId)) {
        chainSpanIds.add(currentSpanId);
        const candidate = firstCandidateBySpan.get(currentSpanId);
        if (candidate) {
            addUpstreamCandidate(candidate);
        }
        currentSpanId = parentChain.get(currentSpanId) || null;
    }
    // 2) Candidates whose span is the parent of a chain span. This catches
    //    outbound calls whose child spans are on the chain.
    const parentIdsOfChainSpans = new Set();
    for (const event of traceEvents) {
        if (chainSpanIds.has(event.spanId)) {
            parentIdsOfChainSpans.add(event.parentSpanId);
        }
    }
    for (const candidate of candidates) {
        if (parentIdsOfChainSpans.has(candidate.spanId)) {
            addUpstreamCandidate(candidate);
        }
    }
    // 3) Candidates from every service that directly or transitively calls
    //    the symptom's service, plus any remaining candidate on the chain.
    const allUpstreamServices = new Set();
    const pendingServices = [symptomEvent.serviceName];
    while (pendingServices.length > 0) {
        const service = pendingServices.pop();
        for (const edge of edges) {
            if (edge.toService === service && !allUpstreamServices.has(edge.fromService)) {
                allUpstreamServices.add(edge.fromService);
                pendingServices.push(edge.fromService);
            }
        }
    }
    for (const candidate of candidates) {
        if (allUpstreamServices.has(candidate.serviceName) || chainSpanIds.has(candidate.spanId)) {
            addUpstreamCandidate(candidate);
        }
    }
    upstreamCandidates.sort(compareCandidatesByScore);
    // Prefer upstream candidates, but fall back to all candidates if none found
    let culprit = null;
    if (upstreamCandidates.length > 0) {
        culprit = upstreamCandidates[0];
    }
    else if (candidates.length > 0) {
        culprit = [...candidates].sort(compareCandidatesByScore)[0];
    }
    if (!culprit) {
        // Fallback - use symptom event
        const operationName = symptomEvent.type === 'http.request.outbound'
            ? `${symptomEvent.method || 'UNKNOWN'} ${symptomEvent.urlTemplate || symptomEvent.peerHost || 'unknown'}`
            : symptomEvent.type === 'error'
                ? 'error'
                : `${symptomEvent.method || 'UNKNOWN'} ${symptomEvent.path || symptomEvent.route || 'unknown'}`;
        return {
            culprit: {
                serviceName: symptomEvent.serviceName,
                spanId: symptomEvent.spanId,
                operationName,
                evidence: {
                    kind: symptomEvent.type === 'error' ? 'error' : 'http',
                    message: symptomEvent.type === 'error' ? (symptomEvent.error?.message || 'Unknown error') : undefined,
                },
                confidence: 0.5,
            },
            propagationPath: buildPropagationPath(edges),
            rankedFindings: [],
        };
    }
    // Name the service that produced the bad response (e.g. pricing-service)
    // rather than the caller that observed it (e.g. checkout-service).
    culprit = attributeToDownstreamService(culprit, traceEvents);
    const confidence = calculateConfidence(culprit, traceEvents, edges);
    const rankedFindings = upstreamCandidates
        .slice(0, 10) // Top 10
        .map((c) => ({
        score: c.score,
        reason: c.reason,
        serviceName: c.serviceName,
        spanId: c.spanId,
    }));
    const propagationPath = buildPropagationPath(edges);
    return {
        culprit: {
            serviceName: culprit.serviceName,
            spanId: culprit.spanId,
            operationName: culprit.operationName,
            evidence: culprit.evidence,
            confidence,
        },
        propagationPath,
        rankedFindings,
    };
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shape Digest Utilities
|
|
3
|
+
*
|
|
4
|
+
* Utilities for computing shape digests with sampling and feature flags.
|
|
5
|
+
*/
|
|
6
|
+
import { type ShapeDigest } from "@runtime-digital-twin/core";
|
|
7
|
+
export type ShapeDigestMode = "off" | "sampled" | "on";
|
|
8
|
+
/**
|
|
9
|
+
* Get shape digest mode from environment
|
|
10
|
+
*/
|
|
11
|
+
export declare function getShapeDigestMode(): ShapeDigestMode;
|
|
12
|
+
/**
|
|
13
|
+
* Get sample rate from environment
|
|
14
|
+
*/
|
|
15
|
+
export declare function getShapeDigestSampleRate(): number;
|
|
16
|
+
/**
|
|
17
|
+
* Determine if shape digest should be computed
|
|
18
|
+
*/
|
|
19
|
+
export declare function shouldComputeShapeDigest(mode: ShapeDigestMode, sampleRate: number, statusCode?: number | null, hasError?: boolean): boolean;
|
|
20
|
+
/**
|
|
21
|
+
* Check if content type indicates JSON
|
|
22
|
+
*/
|
|
23
|
+
export declare function isJsonContentType(contentType: string | null | undefined): boolean;
|
|
24
|
+
/**
|
|
25
|
+
* Attempt to parse body as JSON
|
|
26
|
+
*/
|
|
27
|
+
export declare function tryParseJson(body: string | Buffer | object | null | undefined): unknown;
|
|
28
|
+
/**
|
|
29
|
+
* Compute shape digest for a body if conditions are met
|
|
30
|
+
*/
|
|
31
|
+
export declare function computeBodyShapeDigest(body: string | Buffer | object | null | undefined, contentType: string | null | undefined, options: {
|
|
32
|
+
mode: ShapeDigestMode;
|
|
33
|
+
sampleRate: number;
|
|
34
|
+
statusCode?: number | null;
|
|
35
|
+
hasError?: boolean;
|
|
36
|
+
}): Promise<ShapeDigest | null>;
|
|
37
|
+
/**
|
|
38
|
+
* Get content type from headers
|
|
39
|
+
*/
|
|
40
|
+
export declare function getContentType(headers: Record<string, string | string[] | undefined> | Headers | null): string | null;
|
|
41
|
+
/**
|
|
42
|
+
* Get size in bytes for a body
|
|
43
|
+
*/
|
|
44
|
+
export declare function getBodySizeBytes(body: string | Buffer | object | null | undefined): number | undefined;
|
|
45
|
+
//# sourceMappingURL=shape-digest-utils.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"shape-digest-utils.d.ts","sourceRoot":"","sources":["../src/shape-digest-utils.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAsB,KAAK,WAAW,EAAE,MAAM,4BAA4B,CAAC;AAElF,MAAM,MAAM,eAAe,GAAG,KAAK,GAAG,SAAS,GAAG,IAAI,CAAC;AAEvD;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,eAAe,CAMpD;AAED;;GAEG;AACH,wBAAgB,wBAAwB,IAAI,MAAM,CAMjD;AAED;;GAEG;AACH,wBAAgB,wBAAwB,CACtC,IAAI,EAAE,eAAe,EACrB,UAAU,EAAE,MAAM,EAClB,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,EAC1B,QAAQ,CAAC,EAAE,OAAO,GACjB,OAAO,CAcT;AAED;;GAEG;AACH,wBAAgB,iBAAiB,CAAC,WAAW,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,OAAO,CASjF;AAED;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,OAAO,CAoBvF;AAED;;GAEG;AACH,wBAAsB,sBAAsB,CAC1C,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,IAAI,GAAG,SAAS,EACjD,WAAW,EAAE,MAAM,GAAG,IAAI,GAAG,SAAS,EACtC,OAAO,EAAE;IACP,IAAI,EAAE,eAAe,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB,GACA,OAAO,CAAC,WAAW,GAAG,IAAI,CAAC,CAmB7B;AAED;;GAEG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,SAAS,CAAC,GAAG,OAAO,GAAG,IAAI,GAAG,MAAM,GAAG,IAAI,CAgBrH;AAED;;GAEG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,IAAI,GAAG,SAAS,GAAG,MAAM,GAAG,SAAS,CAgBtG"}
|