@dipseth/opensearch-logs 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +14 -0
- package/alerts/langfuse-usage.yaml +142 -0
- package/alerts/production-incidents.yaml +280 -0
- package/alerts/service-health.yaml +98 -0
- package/dashboards/langfuse-usage.yaml +57 -0
- package/dist/create-dashboards.d.ts +10 -0
- package/dist/create-dashboards.js +38 -0
- package/dist/create-dashboards.js.map +1 -0
- package/dist/interfaces/alert.interfaces.d.ts +323 -0
- package/dist/interfaces/alert.interfaces.js +6 -0
- package/dist/interfaces/alert.interfaces.js.map +1 -0
- package/dist/interfaces/dashboard-gen.interfaces.d.ts +33 -0
- package/dist/interfaces/dashboard-gen.interfaces.js +3 -0
- package/dist/interfaces/dashboard-gen.interfaces.js.map +1 -0
- package/dist/interfaces/interfaces.d.ts +312 -0
- package/dist/interfaces/interfaces.js +3 -0
- package/dist/interfaces/interfaces.js.map +1 -0
- package/dist/interfaces/playbook.interfaces.d.ts +140 -0
- package/dist/interfaces/playbook.interfaces.js +3 -0
- package/dist/interfaces/playbook.interfaces.js.map +1 -0
- package/dist/os-alert.d.ts +17 -0
- package/dist/os-alert.js +245 -0
- package/dist/os-alert.js.map +1 -0
- package/dist/os-dash.d.ts +9 -0
- package/dist/os-dash.js +53 -0
- package/dist/os-dash.js.map +1 -0
- package/dist/os-monitor.d.ts +12 -0
- package/dist/os-monitor.js +59 -0
- package/dist/os-monitor.js.map +1 -0
- package/dist/os-playbook.d.ts +9 -0
- package/dist/os-playbook.js +71 -0
- package/dist/os-playbook.js.map +1 -0
- package/dist/os-search.d.ts +11 -0
- package/dist/os-search.js +84 -0
- package/dist/os-search.js.map +1 -0
- package/dist/repositories/index.d.ts +1 -0
- package/dist/repositories/index.js +2 -0
- package/dist/repositories/index.js.map +1 -0
- package/dist/repositories/opensearch.repository.d.ts +51 -0
- package/dist/repositories/opensearch.repository.js +167 -0
- package/dist/repositories/opensearch.repository.js.map +1 -0
- package/dist/services/alert.service.d.ts +73 -0
- package/dist/services/alert.service.js +503 -0
- package/dist/services/alert.service.js.map +1 -0
- package/dist/services/dashboard-gen.service.d.ts +36 -0
- package/dist/services/dashboard-gen.service.js +162 -0
- package/dist/services/dashboard-gen.service.js.map +1 -0
- package/dist/services/dashboard.service.d.ts +33 -0
- package/dist/services/dashboard.service.js +428 -0
- package/dist/services/dashboard.service.js.map +1 -0
- package/dist/services/gchat.service.d.ts +45 -0
- package/dist/services/gchat.service.js +228 -0
- package/dist/services/gchat.service.js.map +1 -0
- package/dist/services/index.d.ts +8 -0
- package/dist/services/index.js +9 -0
- package/dist/services/index.js.map +1 -0
- package/dist/services/monitor.service.d.ts +18 -0
- package/dist/services/monitor.service.js +342 -0
- package/dist/services/monitor.service.js.map +1 -0
- package/dist/services/panel-layout.d.ts +21 -0
- package/dist/services/panel-layout.js +33 -0
- package/dist/services/panel-layout.js.map +1 -0
- package/dist/services/playbook-dashboard.service.d.ts +19 -0
- package/dist/services/playbook-dashboard.service.js +434 -0
- package/dist/services/playbook-dashboard.service.js.map +1 -0
- package/dist/services/playbook.service.d.ts +13 -0
- package/dist/services/playbook.service.js +621 -0
- package/dist/services/playbook.service.js.map +1 -0
- package/dist/services/search.service.d.ts +30 -0
- package/dist/services/search.service.js +885 -0
- package/dist/services/search.service.js.map +1 -0
- package/dist/utils/cli.d.ts +14 -0
- package/dist/utils/cli.js +90 -0
- package/dist/utils/cli.js.map +1 -0
- package/dist/utils/config.d.ts +20 -0
- package/dist/utils/config.js +104 -0
- package/dist/utils/config.js.map +1 -0
- package/dist/utils/index.d.ts +5 -0
- package/dist/utils/index.js +5 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/service-registry.d.ts +15 -0
- package/dist/utils/service-registry.js +56 -0
- package/dist/utils/service-registry.js.map +1 -0
- package/dist/utils/template.d.ts +18 -0
- package/dist/utils/template.js +66 -0
- package/dist/utils/template.js.map +1 -0
- package/package.json +76 -0
- package/playbooks/error-investigation.yaml +45 -0
- package/playbooks/incident-triage.yaml +32 -0
- package/playbooks/post-deploy-validation.yaml +24 -0
- package/playbooks/service-deep-dive.yaml +42 -0
|
@@ -0,0 +1,885 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Search service — 9 modes of log search and analysis.
|
|
3
|
+
*
|
|
4
|
+
* Modes: search, count, histogram, timeline, dashboard, latency, errors, services, report
|
|
5
|
+
*/
|
|
6
|
+
import { resolveServiceName, SERVICE_REGISTRY } from "../utils/index.js";
|
|
7
|
+
// ── Report Mode Constants ────────────────────────────────────────────────
|
|
8
|
+
/**
 * Incident signatures counted by report mode.
 *
 * Each entry is [label, query]: the label is the human-readable name and the
 * query is passed as a `query_string` expression. Entry order is preserved in
 * the resulting object, so iteration order matches the list below.
 */
const REPORT_PATTERNS = Object.fromEntries([
    ["HTTP 500s", '"500 Internal Server Error"'],
    ["HTTP 503s", 'log:"status=503"'],
    ["HTTP 502s", 'log:"status=502"'],
    ["Connection Refused", '"Connection refused"'],
    ["Worker Shutdowns", '"Shutting down"'],
    ["Worker Startups", '"Application startup complete"'],
    ["Max Request Limit", '"Maximum request limit"'],
    ["Playwright Failures", '"Failed to launch browser"'],
    ["OOM / Memory", 'OOM OR "Cannot allocate memory"'],
    ["API Key Missing", '"ENCORE_G_API_KEY: None" OR "ENCORE_G_API_KEY: NOT SET"'],
    ["Client Timeouts (499)", 'log:"status=499"'],
]);
|
|
21
|
+
// Painless scripts for parsing latency from correlation middleware messages.
// Message format: "METHOD /endpoint STATUS DURATIONms"
//
// init:    starts each script state with an empty list.
// map:     per document, takes the first run of digits that is immediately
//          followed by "ms" in message.keyword and stores it as an int.
//          Documents without a message.keyword value are skipped up front:
//          Painless raises an error when `.value` is read from a field that
//          has no value for the document, and a null check on the result
//          cannot catch that.
// combine: hands the collected list back unchanged.
// reduce:  concatenates every list it receives, sorts the values, and returns
//          a map with count, avg, p50, p95 and p99 (index-based percentiles,
//          clamped to the last element). `avg` is `long sum / int size`, i.e.
//          integer division, so it is truncated rather than rounded.
const PAINLESS_LATENCY = {
    init: "state.values = []",
    map: String.raw`if (doc.containsKey("message.keyword") && doc["message.keyword"].size() > 0) { def msg = doc["message.keyword"].value; if (msg != null) { def m = /\d+(?=ms)/.matcher(msg); if (m.find()) { state.values.add(Integer.parseInt(m.group())); } } }`,
    combine: "return state.values",
    reduce: [
        'List all = new ArrayList();',
        'for (s in states) { if (s instanceof List) { all.addAll(s); } }',
        'Map result = new HashMap();',
        // No samples at all: report zeros instead of indexing into an empty list.
        'if (all.isEmpty()) { result.put("count", 0); result.put("avg", 0); result.put("p50", 0); result.put("p95", 0); result.put("p99", 0); return result; }',
        'Collections.sort(all); long sum = 0; for (def v : all) { sum += v; }',
        'int sz = all.size();',
        'int p50i = sz / 2; int p95i = sz * 95 / 100; if (p95i >= sz) { p95i = sz - 1; }',
        'int p99i = sz * 99 / 100; if (p99i >= sz) { p99i = sz - 1; }',
        'result.put("count", sz); result.put("avg", sum / sz);',
        'result.put("p50", all.get(p50i)); result.put("p95", all.get(p95i)); result.put("p99", all.get(p99i));',
        'return result;',
    ].join(" "),
};
|
|
41
|
+
/**
 * Choose a histogram bucket width for a time window of the given length.
 *
 * @param {number} hours - window length in hours
 * @returns {string} "15m", "30m", "1h", "2h" or "4h"; the first tier whose
 *   upper bound is >= hours wins, and anything above 24 hours gets "4h"
 */
function autoInterval(hours) {
    // [inclusive upper bound in hours, interval], checked in ascending order.
    const tiers = [
        [2, "15m"],
        [6, "30m"],
        [12, "1h"],
        [24, "2h"],
    ];
    const tier = tiers.find(([maxHours]) => hours <= maxHours);
    return tier ? tier[1] : "4h";
}
|
|
52
|
+
export class SearchService {
|
|
53
|
+
repo;
|
|
54
|
+
constructor(repo) {
|
|
55
|
+
this.repo = repo;
|
|
56
|
+
}
|
|
57
|
+
// ── Query Builders ──────────────────────────────────────────────────
|
|
58
|
+
parseStatusFilter(status) {
|
|
59
|
+
if (!status)
|
|
60
|
+
return null;
|
|
61
|
+
const s = status.trim().toLowerCase();
|
|
62
|
+
if (s === "5xx")
|
|
63
|
+
return { range: { status_code: { gte: 500, lt: 600 } } };
|
|
64
|
+
if (s === "4xx")
|
|
65
|
+
return { range: { status_code: { gte: 400, lt: 500 } } };
|
|
66
|
+
if (s === "3xx")
|
|
67
|
+
return { range: { status_code: { gte: 300, lt: 400 } } };
|
|
68
|
+
if (s === "2xx")
|
|
69
|
+
return { range: { status_code: { gte: 200, lt: 300 } } };
|
|
70
|
+
if (s.startsWith(">="))
|
|
71
|
+
return { range: { status_code: { gte: Number(s.slice(2)) } } };
|
|
72
|
+
if (s.startsWith(">"))
|
|
73
|
+
return { range: { status_code: { gt: Number(s.slice(1)) } } };
|
|
74
|
+
if (s.startsWith("<="))
|
|
75
|
+
return { range: { status_code: { lte: Number(s.slice(2)) } } };
|
|
76
|
+
if (s.startsWith("<"))
|
|
77
|
+
return { range: { status_code: { lt: Number(s.slice(1)) } } };
|
|
78
|
+
return { term: { status_code: Number(s) } };
|
|
79
|
+
}
|
|
80
|
+
serviceFilter(svcName) {
|
|
81
|
+
return {
|
|
82
|
+
bool: {
|
|
83
|
+
should: [
|
|
84
|
+
{ term: { "service.keyword": svcName } },
|
|
85
|
+
{ match_phrase: { log: svcName } },
|
|
86
|
+
],
|
|
87
|
+
minimum_should_match: 1,
|
|
88
|
+
},
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
/** Hybrid 5xx filter: works whether status_code field exists or not. */
|
|
92
|
+
hybridErrorFilter() {
|
|
93
|
+
return {
|
|
94
|
+
bool: {
|
|
95
|
+
should: [
|
|
96
|
+
{ range: { status_code: { gte: 500 } } },
|
|
97
|
+
{ term: { "level.keyword": "ERROR" } },
|
|
98
|
+
],
|
|
99
|
+
minimum_should_match: 1,
|
|
100
|
+
},
|
|
101
|
+
};
|
|
102
|
+
}
|
|
103
|
+
/** Hybrid 4xx filter: works whether status_code field exists or not. */
|
|
104
|
+
hybridClientErrorFilter() {
|
|
105
|
+
return {
|
|
106
|
+
bool: {
|
|
107
|
+
should: [
|
|
108
|
+
{ range: { status_code: { gte: 400, lt: 500 } } },
|
|
109
|
+
{ term: { "level.keyword": "WARNING" } },
|
|
110
|
+
],
|
|
111
|
+
minimum_should_match: 1,
|
|
112
|
+
},
|
|
113
|
+
};
|
|
114
|
+
}
|
|
115
|
+
/** Scripted metric aggregation that parses latency from correlation middleware messages. */
|
|
116
|
+
latencyScriptedMetric() {
|
|
117
|
+
return {
|
|
118
|
+
scripted_metric: {
|
|
119
|
+
init_script: PAINLESS_LATENCY.init,
|
|
120
|
+
map_script: PAINLESS_LATENCY.map,
|
|
121
|
+
combine_script: PAINLESS_LATENCY.combine,
|
|
122
|
+
reduce_script: PAINLESS_LATENCY.reduce,
|
|
123
|
+
},
|
|
124
|
+
};
|
|
125
|
+
}
|
|
126
|
+
buildQuery(args, fromDt, toDt) {
|
|
127
|
+
const must = [];
|
|
128
|
+
const filters = [
|
|
129
|
+
{ range: { "@timestamp": { gte: fromDt.toISOString(), lte: toDt.toISOString() } } },
|
|
130
|
+
];
|
|
131
|
+
if (args.query)
|
|
132
|
+
must.push({ query_string: { query: args.query } });
|
|
133
|
+
if (args.service) {
|
|
134
|
+
must.push(this.serviceFilter(resolveServiceName(args.service)));
|
|
135
|
+
}
|
|
136
|
+
if (args.level) {
|
|
137
|
+
const upper = args.level.toUpperCase();
|
|
138
|
+
must.push({
|
|
139
|
+
bool: {
|
|
140
|
+
should: [{ term: { "level.keyword": upper } }, { match_phrase: { log: upper } }],
|
|
141
|
+
minimum_should_match: 1,
|
|
142
|
+
},
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
const statusFilter = this.parseStatusFilter(args.status);
|
|
146
|
+
if (statusFilter)
|
|
147
|
+
filters.push(statusFilter);
|
|
148
|
+
if (args.correlationId) {
|
|
149
|
+
must.push({
|
|
150
|
+
bool: {
|
|
151
|
+
should: [
|
|
152
|
+
{ term: { "correlation_id.keyword": args.correlationId } },
|
|
153
|
+
{ match_phrase: { log: args.correlationId } },
|
|
154
|
+
],
|
|
155
|
+
minimum_should_match: 1,
|
|
156
|
+
},
|
|
157
|
+
});
|
|
158
|
+
}
|
|
159
|
+
return { bool: { must: must.length ? must : [{ match_all: {} }], filter: filters } };
|
|
160
|
+
}
|
|
161
|
+
// ── Mode: search ────────────────────────────────────────────────────
|
|
162
|
+
async modeSearch(index, query, args) {
|
|
163
|
+
const result = await this.repo.search(index, {
|
|
164
|
+
size: args.limit,
|
|
165
|
+
sort: [{ "@timestamp": args.asc ? "asc" : "desc" }],
|
|
166
|
+
query,
|
|
167
|
+
});
|
|
168
|
+
const { total } = result.hits;
|
|
169
|
+
console.log(`Total: ${total.value}${total.relation === "gte" ? "+" : ""} hits\n`);
|
|
170
|
+
for (const hit of result.hits.hits) {
|
|
171
|
+
const src = hit._source;
|
|
172
|
+
const ts = src["@timestamp"] ?? "?";
|
|
173
|
+
const log = String(src.log ?? "");
|
|
174
|
+
const svc = String(src.service ?? "");
|
|
175
|
+
const level = String(src.level ?? "");
|
|
176
|
+
const statusCode = src.status_code ?? "";
|
|
177
|
+
const duration = src.duration_ms ?? "";
|
|
178
|
+
if (args.json) {
|
|
179
|
+
console.log(JSON.stringify({
|
|
180
|
+
id: hit._id, timestamp: ts, service: svc, level,
|
|
181
|
+
status_code: statusCode, duration_ms: duration, log,
|
|
182
|
+
}));
|
|
183
|
+
}
|
|
184
|
+
else {
|
|
185
|
+
const parts = [];
|
|
186
|
+
if (svc)
|
|
187
|
+
parts.push(`[${svc}]`);
|
|
188
|
+
if (level)
|
|
189
|
+
parts.push(String(level));
|
|
190
|
+
if (statusCode)
|
|
191
|
+
parts.push(`HTTP ${statusCode}`);
|
|
192
|
+
if (duration)
|
|
193
|
+
parts.push(`${duration}ms`);
|
|
194
|
+
const prefix = parts.join(" ");
|
|
195
|
+
const display = args.full ? log : log.slice(0, 300);
|
|
196
|
+
if (prefix) {
|
|
197
|
+
console.log(`[${ts}] ${prefix}`);
|
|
198
|
+
console.log(` ${display}`);
|
|
199
|
+
}
|
|
200
|
+
else {
|
|
201
|
+
console.log(`[${ts}] ${display}`);
|
|
202
|
+
}
|
|
203
|
+
if (args.ids)
|
|
204
|
+
console.log(` _id: ${hit._id}`);
|
|
205
|
+
if (src.correlation_id && args.ids)
|
|
206
|
+
console.log(` correlation_id: ${src.correlation_id}`);
|
|
207
|
+
console.log();
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
// ── Mode: count ─────────────────────────────────────────────────────
|
|
212
|
+
async modeCount(index, query) {
|
|
213
|
+
const n = await this.repo.count(index, { query });
|
|
214
|
+
console.log(`Count: ${n}`);
|
|
215
|
+
}
|
|
216
|
+
// ── Mode: histogram ─────────────────────────────────────────────────
|
|
217
|
+
async modeHistogram(index, query, args) {
|
|
218
|
+
const result = await this.repo.search(index, {
|
|
219
|
+
size: 0, query,
|
|
220
|
+
aggs: {
|
|
221
|
+
over_time: {
|
|
222
|
+
date_histogram: { field: "@timestamp", fixed_interval: args.interval ?? "1h" },
|
|
223
|
+
},
|
|
224
|
+
},
|
|
225
|
+
});
|
|
226
|
+
const total = result.hits.total.value;
|
|
227
|
+
const buckets = result.aggregations.over_time.buckets;
|
|
228
|
+
console.log(`Total: ${total} hits\n`);
|
|
229
|
+
const maxCount = Math.max(...buckets.map(b => b.doc_count), 1);
|
|
230
|
+
for (const b of buckets) {
|
|
231
|
+
const barLen = Math.round((b.doc_count / maxCount) * 40);
|
|
232
|
+
console.log(` ${b.key_as_string} ${String(b.doc_count).padStart(6)} ${"█".repeat(barLen)}`);
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
// ── Mode: timeline ──────────────────────────────────────────────────
|
|
236
|
+
async modeTimeline(index, query, args) {
|
|
237
|
+
const result = await this.repo.search(index, {
|
|
238
|
+
size: args.limit,
|
|
239
|
+
sort: [{ "@timestamp": "asc" }],
|
|
240
|
+
query,
|
|
241
|
+
_source: ["@timestamp", "log", "service", "level", "status_code", "duration_ms", "correlation_id", "endpoint"],
|
|
242
|
+
});
|
|
243
|
+
const total = result.hits.total.value;
|
|
244
|
+
console.log(`Timeline (${total} total, showing ${result.hits.hits.length})\n`);
|
|
245
|
+
for (const hit of result.hits.hits) {
|
|
246
|
+
const src = hit._source;
|
|
247
|
+
const ts = src["@timestamp"] ?? "?";
|
|
248
|
+
const log = String(src.log ?? "");
|
|
249
|
+
const svc = String(src.service ?? "");
|
|
250
|
+
const level = String(src.level ?? "");
|
|
251
|
+
const display = args.full ? log : log.slice(0, 250);
|
|
252
|
+
let tag = "";
|
|
253
|
+
if (svc)
|
|
254
|
+
tag += `[${svc}] `;
|
|
255
|
+
if (level)
|
|
256
|
+
tag += `${level} `;
|
|
257
|
+
console.log(` ${ts} ${tag}${display}`);
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
// ── Mode: dashboard ─────────────────────────────────────────────────
|
|
261
|
+
async modeDashboard(index, fromDt, toDt) {
|
|
262
|
+
const timeFilter = { range: { "@timestamp": { gte: fromDt.toISOString(), lte: toDt.toISOString() } } };
|
|
263
|
+
const aggs = {};
|
|
264
|
+
// Global health checks
|
|
265
|
+
const textChecks = {
|
|
266
|
+
"HTTP 500s (text)": '"500 Internal Server Error"',
|
|
267
|
+
"HTTP 503s (text)": 'log:"status=503"',
|
|
268
|
+
"Connection Refused": '"Connection refused"',
|
|
269
|
+
"Worker Shutdowns": '"Shutting down"',
|
|
270
|
+
"Worker Startups": '"Application startup complete"',
|
|
271
|
+
"Max Request Limit": '"Maximum request limit"',
|
|
272
|
+
"Playwright Failures": '"Failed to launch browser"',
|
|
273
|
+
"OOM / Memory": 'OOM OR "Cannot allocate memory"',
|
|
274
|
+
"API Key None": '"ENCORE_G_API_KEY: None"',
|
|
275
|
+
};
|
|
276
|
+
for (const [name, q] of Object.entries(textChecks)) {
|
|
277
|
+
const safe = name.replace(/[^a-zA-Z0-9]/g, "_");
|
|
278
|
+
aggs[safe] = {
|
|
279
|
+
filter: { bool: { must: [{ query_string: { query: q } }], filter: [timeFilter] } },
|
|
280
|
+
};
|
|
281
|
+
}
|
|
282
|
+
// Per-service errors (5xx or ERROR level)
|
|
283
|
+
for (const [alias, info] of SERVICE_REGISTRY) {
|
|
284
|
+
const safe = `svc_5xx_${alias}`;
|
|
285
|
+
aggs[safe] = {
|
|
286
|
+
filter: {
|
|
287
|
+
bool: {
|
|
288
|
+
must: [
|
|
289
|
+
this.serviceFilter(info.logName),
|
|
290
|
+
{
|
|
291
|
+
bool: {
|
|
292
|
+
should: [
|
|
293
|
+
{ range: { status_code: { gte: 500 } } },
|
|
294
|
+
{ term: { "level.keyword": "ERROR" } },
|
|
295
|
+
{ query_string: { query: '"500 Internal Server Error" OR log:"status=503" OR log:"status=502"' } },
|
|
296
|
+
],
|
|
297
|
+
minimum_should_match: 1,
|
|
298
|
+
},
|
|
299
|
+
},
|
|
300
|
+
],
|
|
301
|
+
filter: [timeFilter],
|
|
302
|
+
},
|
|
303
|
+
},
|
|
304
|
+
};
|
|
305
|
+
}
|
|
306
|
+
// Latency from correlation middleware messages
|
|
307
|
+
aggs.latency_stats = {
|
|
308
|
+
filter: { bool: { filter: [timeFilter], must: [{ term: { "name.keyword": "common.middleware.correlation" } }] } },
|
|
309
|
+
aggs: { parsed: this.latencyScriptedMetric() },
|
|
310
|
+
};
|
|
311
|
+
// Total requests
|
|
312
|
+
aggs.total_requests = { filter: { bool: { filter: [timeFilter] } } };
|
|
313
|
+
const result = await this.repo.search(index, { size: 0, aggs });
|
|
314
|
+
const aggData = result.aggregations;
|
|
315
|
+
const window = `${fmtTime(fromDt)} - ${fmtTime(toDt)} UTC`;
|
|
316
|
+
console.log(`${"=".repeat(60)}`);
|
|
317
|
+
console.log(` Production Health Dashboard (${window})`);
|
|
318
|
+
console.log(`${"=".repeat(60)}\n`);
|
|
319
|
+
console.log(` Total log entries: ${aggData.total_requests.doc_count.toLocaleString()}\n`);
|
|
320
|
+
console.log(" ── Global Health Checks ──\n");
|
|
321
|
+
for (const name of Object.keys(textChecks)) {
|
|
322
|
+
const safe = name.replace(/[^a-zA-Z0-9]/g, "_");
|
|
323
|
+
const count = aggData[safe]?.doc_count ?? 0;
|
|
324
|
+
console.log(` [${count === 0 ? "✓" : "⚠"}] ${name.padEnd(25)} ${String(count).padStart(6)}`);
|
|
325
|
+
}
|
|
326
|
+
console.log(`\n ── Per-Service 5xx Errors ──\n`);
|
|
327
|
+
const svcErrors = [];
|
|
328
|
+
for (const [alias, info] of SERVICE_REGISTRY) {
|
|
329
|
+
const safe = `svc_5xx_${alias}`;
|
|
330
|
+
svcErrors.push([alias, aggData[safe]?.doc_count ?? 0, info.description]);
|
|
331
|
+
}
|
|
332
|
+
svcErrors.sort((a, b) => b[1] - a[1]);
|
|
333
|
+
for (const [alias, count, desc] of svcErrors) {
|
|
334
|
+
console.log(` [${count === 0 ? "✓" : "⚠"}] ${alias.padEnd(25)} ${String(count).padStart(6)} (${desc})`);
|
|
335
|
+
}
|
|
336
|
+
console.log(`\n ── Latency (from correlation middleware) ──\n`);
|
|
337
|
+
const lat = aggData.latency_stats;
|
|
338
|
+
if (lat.doc_count > 0) {
|
|
339
|
+
const parsed = lat.parsed.value;
|
|
340
|
+
console.log(` Requests tracked: ${parsed.count.toLocaleString()}`);
|
|
341
|
+
console.log(` Average: ${String(Math.round(parsed.avg)).padStart(8)} ms`);
|
|
342
|
+
console.log(` P50: ${String(Math.round(parsed.p50)).padStart(8)} ms`);
|
|
343
|
+
console.log(` P95: ${String(Math.round(parsed.p95)).padStart(8)} ms`);
|
|
344
|
+
console.log(` P99: ${String(Math.round(parsed.p99)).padStart(8)} ms`);
|
|
345
|
+
}
|
|
346
|
+
else {
|
|
347
|
+
console.log(" No correlation middleware logs in this window.");
|
|
348
|
+
}
|
|
349
|
+
console.log(`\n Index: ${index}`);
|
|
350
|
+
console.log(` Dashboard: ${this.buildDashboardLink(undefined, fromDt, toDt)}`);
|
|
351
|
+
console.log();
|
|
352
|
+
}
|
|
353
|
+
// ── Mode: latency ───────────────────────────────────────────────────
|
|
354
|
+
async modeLatency(index, fromDt, toDt, args) {
|
|
355
|
+
const timeFilter = { range: { "@timestamp": { gte: fromDt.toISOString(), lte: toDt.toISOString() } } };
|
|
356
|
+
const must = [{ term: { "name.keyword": "common.middleware.correlation" } }];
|
|
357
|
+
if (args.service) {
|
|
358
|
+
must.push(this.serviceFilter(resolveServiceName(args.service)));
|
|
359
|
+
}
|
|
360
|
+
const result = await this.repo.search(index, {
|
|
361
|
+
size: 0,
|
|
362
|
+
query: { bool: { must, filter: [timeFilter] } },
|
|
363
|
+
aggs: {
|
|
364
|
+
overall: this.latencyScriptedMetric(),
|
|
365
|
+
by_service: {
|
|
366
|
+
terms: { field: "service.keyword", size: 20 },
|
|
367
|
+
aggs: { parsed: this.latencyScriptedMetric() },
|
|
368
|
+
},
|
|
369
|
+
over_time: {
|
|
370
|
+
date_histogram: {
|
|
371
|
+
field: "@timestamp", fixed_interval: args.interval ?? "1h", min_doc_count: 0,
|
|
372
|
+
extended_bounds: { min: fromDt.getTime(), max: toDt.getTime() },
|
|
373
|
+
},
|
|
374
|
+
aggs: { parsed: this.latencyScriptedMetric() },
|
|
375
|
+
},
|
|
376
|
+
},
|
|
377
|
+
});
|
|
378
|
+
const total = result.hits.total.value;
|
|
379
|
+
const agg = result.aggregations;
|
|
380
|
+
const window = `${fmtTime(fromDt)} - ${fmtTime(toDt)} UTC`;
|
|
381
|
+
console.log(`${"=".repeat(60)}`);
|
|
382
|
+
console.log(` Latency Analysis (${window})`);
|
|
383
|
+
console.log(`${"=".repeat(60)}\n`);
|
|
384
|
+
console.log(` Correlation middleware requests: ${total.toLocaleString()}\n`);
|
|
385
|
+
const overall = agg.overall.value;
|
|
386
|
+
if (overall.count > 0) {
|
|
387
|
+
console.log(" ── Overall ──\n");
|
|
388
|
+
console.log(` Average: ${String(Math.round(overall.avg)).padStart(8)} ms`);
|
|
389
|
+
console.log(` P50: ${String(Math.round(overall.p50)).padStart(8)} ms`);
|
|
390
|
+
console.log(` P95: ${String(Math.round(overall.p95)).padStart(8)} ms`);
|
|
391
|
+
console.log(` P99: ${String(Math.round(overall.p99)).padStart(8)} ms`);
|
|
392
|
+
}
|
|
393
|
+
else {
|
|
394
|
+
console.log(" No correlation middleware logs with parseable duration.\n");
|
|
395
|
+
}
|
|
396
|
+
if (agg.by_service.buckets.length) {
|
|
397
|
+
console.log(`\n ── Per-Service ──\n`);
|
|
398
|
+
console.log(` ${"Service".padEnd(30)} ${"Count".padStart(7)} ${"Avg".padStart(8)} ${"P95".padStart(8)} ${"P99".padStart(8)}`);
|
|
399
|
+
console.log(` ${"-".repeat(30)} ${"-".repeat(7)} ${"-".repeat(8)} ${"-".repeat(8)} ${"-".repeat(8)}`);
|
|
400
|
+
const sorted = [...agg.by_service.buckets].sort((a, b) => b.parsed.value.p95 - a.parsed.value.p95);
|
|
401
|
+
for (const b of sorted) {
|
|
402
|
+
const p = b.parsed.value;
|
|
403
|
+
if (p.count === 0)
|
|
404
|
+
continue;
|
|
405
|
+
console.log(` ${b.key.padEnd(30)} ${String(p.count).padStart(7)} ${String(Math.round(p.avg)).padStart(7)}ms ${String(Math.round(p.p95)).padStart(7)}ms ${String(Math.round(p.p99)).padStart(7)}ms`);
|
|
406
|
+
}
|
|
407
|
+
}
|
|
408
|
+
const nonEmptyBuckets = agg.over_time.buckets.filter(b => b.parsed.value.count > 0);
|
|
409
|
+
if (nonEmptyBuckets.length) {
|
|
410
|
+
console.log(`\n ── P95 Latency Over Time ──\n`);
|
|
411
|
+
const maxP95 = Math.max(...nonEmptyBuckets.map(b => b.parsed.value.p95), 1);
|
|
412
|
+
for (const b of agg.over_time.buckets) {
|
|
413
|
+
const p95 = b.parsed.value.count > 0 ? b.parsed.value.p95 : 0;
|
|
414
|
+
const barLen = Math.round((p95 / maxP95) * 30);
|
|
415
|
+
console.log(` ${b.key_as_string} ${String(Math.round(p95)).padStart(7)}ms ${"█".repeat(barLen)}`);
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
console.log();
|
|
419
|
+
}
|
|
420
|
+
// ── Mode: errors ────────────────────────────────────────────────────
|
|
421
|
+
async modeErrors(index, fromDt, toDt, args) {
|
|
422
|
+
const timeFilter = { range: { "@timestamp": { gte: fromDt.toISOString(), lte: toDt.toISOString() } } };
|
|
423
|
+
const must = [{
|
|
424
|
+
bool: {
|
|
425
|
+
should: [
|
|
426
|
+
{ range: { status_code: { gte: 400 } } },
|
|
427
|
+
{ term: { "level.keyword": "ERROR" } },
|
|
428
|
+
{ query_string: { query: '"ERROR" OR "500 Internal Server Error" OR "status=503" OR "status=502"' } },
|
|
429
|
+
],
|
|
430
|
+
minimum_should_match: 1,
|
|
431
|
+
},
|
|
432
|
+
}];
|
|
433
|
+
if (args.service) {
|
|
434
|
+
must.push(this.serviceFilter(resolveServiceName(args.service)));
|
|
435
|
+
}
|
|
436
|
+
const result = await this.repo.search(index, {
|
|
437
|
+
size: 0,
|
|
438
|
+
query: { bool: { must, filter: [timeFilter] } },
|
|
439
|
+
aggs: {
|
|
440
|
+
by_service: {
|
|
441
|
+
terms: { field: "service.keyword", size: 20 },
|
|
442
|
+
aggs: { by_status: { terms: { field: "status_code", size: 10 } } },
|
|
443
|
+
},
|
|
444
|
+
by_status_code: { terms: { field: "status_code", size: 20 } },
|
|
445
|
+
by_endpoint: { terms: { field: "endpoint.keyword", size: 15, order: { _count: "desc" } } },
|
|
446
|
+
by_error_type: { terms: { field: "error_type.keyword", size: 10 } },
|
|
447
|
+
over_time: {
|
|
448
|
+
date_histogram: { field: "@timestamp", fixed_interval: args.interval ?? "1h" },
|
|
449
|
+
},
|
|
450
|
+
recent_errors: {
|
|
451
|
+
top_hits: {
|
|
452
|
+
size: 5, sort: [{ "@timestamp": "desc" }],
|
|
453
|
+
_source: ["@timestamp", "log", "service", "level", "status_code", "endpoint", "error_type", "correlation_id"],
|
|
454
|
+
},
|
|
455
|
+
},
|
|
456
|
+
},
|
|
457
|
+
});
|
|
458
|
+
const total = result.hits.total.value;
|
|
459
|
+
const agg = result.aggregations;
|
|
460
|
+
const window = `${fmtTime(fromDt)} - ${fmtTime(toDt)} UTC`;
|
|
461
|
+
console.log(`${"=".repeat(60)}`);
|
|
462
|
+
console.log(` Error Analysis (${window})`);
|
|
463
|
+
console.log(`${"=".repeat(60)}\n`);
|
|
464
|
+
console.log(` Total errors/4xx+: ${total.toLocaleString()}\n`);
|
|
465
|
+
if (agg.by_status_code.buckets.length) {
|
|
466
|
+
console.log(" ── By Status Code ──\n");
|
|
467
|
+
for (const b of agg.by_status_code.buckets)
|
|
468
|
+
console.log(` HTTP ${String(b.key).padEnd(6)} ${String(b.doc_count).padStart(6)}`);
|
|
469
|
+
}
|
|
470
|
+
if (agg.by_error_type.buckets.length) {
|
|
471
|
+
console.log(`\n ── By Error Type ──\n`);
|
|
472
|
+
for (const b of agg.by_error_type.buckets)
|
|
473
|
+
console.log(` ${String(b.key).padEnd(20)} ${String(b.doc_count).padStart(6)}`);
|
|
474
|
+
}
|
|
475
|
+
if (agg.by_service.buckets.length) {
|
|
476
|
+
console.log(`\n ── By Service ──\n`);
|
|
477
|
+
const sorted = [...agg.by_service.buckets].sort((a, b) => b.doc_count - a.doc_count);
|
|
478
|
+
for (const b of sorted) {
|
|
479
|
+
const detail = b.by_status.buckets.map(s => `${s.key}:${s.doc_count}`).join(", ");
|
|
480
|
+
console.log(` ${String(b.key).padEnd(30)} ${String(b.doc_count).padStart(6)} (${detail})`);
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
if (agg.by_endpoint.buckets.length) {
|
|
484
|
+
console.log(`\n ── Top Error Endpoints ──\n`);
|
|
485
|
+
for (const b of agg.by_endpoint.buckets)
|
|
486
|
+
console.log(` ${String(b.key).padEnd(50)} ${String(b.doc_count).padStart(6)}`);
|
|
487
|
+
}
|
|
488
|
+
if (agg.over_time.buckets.length) {
|
|
489
|
+
console.log(`\n ── Errors Over Time ──\n`);
|
|
490
|
+
const maxCount = Math.max(...agg.over_time.buckets.map(b => b.doc_count), 1);
|
|
491
|
+
for (const b of agg.over_time.buckets) {
|
|
492
|
+
const barLen = Math.round((b.doc_count / maxCount) * 30);
|
|
493
|
+
console.log(` ${b.key_as_string} ${String(b.doc_count).padStart(6)} ${"█".repeat(barLen)}`);
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
if (agg.recent_errors.hits.hits.length) {
|
|
497
|
+
console.log(`\n ── Recent Error Samples ──\n`);
|
|
498
|
+
for (const hit of agg.recent_errors.hits.hits) {
|
|
499
|
+
const src = hit._source;
|
|
500
|
+
console.log(` [${src["@timestamp"]}] [${src.service ?? "?"}] ${src.level ?? "?"} HTTP ${src.status_code ?? "?"} ${src.endpoint ?? "?"}`);
|
|
501
|
+
if (src.correlation_id)
|
|
502
|
+
console.log(` correlation_id: ${src.correlation_id}`);
|
|
503
|
+
console.log(` ${String(src.log ?? "").slice(0, 200)}`);
|
|
504
|
+
console.log();
|
|
505
|
+
}
|
|
506
|
+
}
|
|
507
|
+
console.log();
|
|
508
|
+
}
|
|
509
|
+
// ── Mode: services ──────────────────────────────────────────────────
|
|
510
|
+
async modeServices(index, fromDt, toDt) {
|
|
511
|
+
const timeFilter = { range: { "@timestamp": { gte: fromDt.toISOString(), lte: toDt.toISOString() } } };
|
|
512
|
+
const result = await this.repo.search(index, {
|
|
513
|
+
size: 0,
|
|
514
|
+
query: { bool: { filter: [timeFilter], must: [{ exists: { field: "service" } }] } },
|
|
515
|
+
aggs: {
|
|
516
|
+
by_service: {
|
|
517
|
+
terms: { field: "service.keyword", size: 30 },
|
|
518
|
+
aggs: {
|
|
519
|
+
errors: { filter: this.hybridErrorFilter() },
|
|
520
|
+
client_errors: { filter: this.hybridClientErrorFilter() },
|
|
521
|
+
latency: {
|
|
522
|
+
filter: { term: { "name.keyword": "common.middleware.correlation" } },
|
|
523
|
+
aggs: { parsed: this.latencyScriptedMetric() },
|
|
524
|
+
},
|
|
525
|
+
},
|
|
526
|
+
},
|
|
527
|
+
},
|
|
528
|
+
});
|
|
529
|
+
const window = `${fmtTime(fromDt)} - ${fmtTime(toDt)} UTC`;
|
|
530
|
+
console.log(`${"=".repeat(60)}`);
|
|
531
|
+
console.log(` Service Health Summary (${window})`);
|
|
532
|
+
console.log(`${"=".repeat(60)}\n`);
|
|
533
|
+
const buckets = result.aggregations.by_service.buckets;
|
|
534
|
+
if (!buckets.length) {
|
|
535
|
+
console.log(" No service-level data found.");
|
|
536
|
+
return;
|
|
537
|
+
}
|
|
538
|
+
console.log(` ${"Service".padEnd(30)} ${"Total".padStart(7)} ${"5xx".padStart(5)} ${"4xx".padStart(5)} ${"Avg ms".padStart(8)} ${"P95 ms".padStart(8)} ${"Err%".padStart(6)}`);
|
|
539
|
+
console.log(` ${"-".repeat(30)} ${"-".repeat(7)} ${"-".repeat(5)} ${"-".repeat(5)} ${"-".repeat(8)} ${"-".repeat(8)} ${"-".repeat(6)}`);
|
|
540
|
+
const sorted = [...buckets].sort((a, b) => b.errors.doc_count - a.errors.doc_count);
|
|
541
|
+
for (const b of sorted) {
|
|
542
|
+
const errs = b.errors.doc_count;
|
|
543
|
+
const clientErrs = b.client_errors.doc_count;
|
|
544
|
+
const parsedLat = b.latency.doc_count > 0 ? b.latency.parsed.value : null;
|
|
545
|
+
const avgMs = parsedLat?.avg ?? 0;
|
|
546
|
+
const p95Ms = parsedLat?.p95 ?? 0;
|
|
547
|
+
const errPct = b.doc_count > 0 ? (errs / b.doc_count * 100) : 0;
|
|
548
|
+
const marker = errs > 0 ? "⚠" : " ";
|
|
549
|
+
console.log(` ${marker}${String(b.key).padEnd(30)} ${String(b.doc_count).padStart(7)} ${String(errs).padStart(5)} ${String(clientErrs).padStart(5)} ${String(Math.round(avgMs)).padStart(7)}ms ${String(Math.round(p95Ms)).padStart(7)}ms ${errPct.toFixed(1).padStart(5)}%`);
|
|
550
|
+
}
|
|
551
|
+
console.log();
|
|
552
|
+
}
|
|
553
|
+
// ── Mode: report ───────────────────────────────────────────────────
|
|
554
|
+
async modeReport(index, fromDt, toDt, args) {
|
|
555
|
+
const timeFilter = { range: { "@timestamp": { gte: fromDt.toISOString(), lte: toDt.toISOString() } } };
|
|
556
|
+
const hours = (toDt.getTime() - fromDt.getTime()) / 3_600_000;
|
|
557
|
+
const interval = args.interval ?? autoInterval(hours);
|
|
558
|
+
const svcFilterClause = args.service
|
|
559
|
+
? this.serviceFilter(resolveServiceName(args.service))
|
|
560
|
+
: null;
|
|
561
|
+
// ── Phase 1: Discovery ──────────────────────────────────────────
|
|
562
|
+
const p1Aggs = {};
|
|
563
|
+
// Per-service breakdown with sub-aggs
|
|
564
|
+
const svcMust = [{ exists: { field: "service" } }];
|
|
565
|
+
if (svcFilterClause)
|
|
566
|
+
svcMust.push(svcFilterClause);
|
|
567
|
+
p1Aggs.by_service = {
|
|
568
|
+
filter: { bool: { must: svcMust, filter: [timeFilter] } },
|
|
569
|
+
aggs: {
|
|
570
|
+
services: {
|
|
571
|
+
terms: { field: "service.keyword", size: 30 },
|
|
572
|
+
aggs: {
|
|
573
|
+
errors_5xx: { filter: this.hybridErrorFilter() },
|
|
574
|
+
errors_4xx: { filter: this.hybridClientErrorFilter() },
|
|
575
|
+
latency: {
|
|
576
|
+
filter: { term: { "name.keyword": "common.middleware.correlation" } },
|
|
577
|
+
aggs: { parsed: this.latencyScriptedMetric() },
|
|
578
|
+
},
|
|
579
|
+
},
|
|
580
|
+
},
|
|
581
|
+
},
|
|
582
|
+
};
|
|
583
|
+
// Incident pattern counts
|
|
584
|
+
for (const [name, q] of Object.entries(REPORT_PATTERNS)) {
|
|
585
|
+
const safe = `pat_${name.replace(/[^a-zA-Z0-9]/g, "_")}`;
|
|
586
|
+
const patMust = [{ query_string: { query: q } }];
|
|
587
|
+
if (svcFilterClause)
|
|
588
|
+
patMust.push(svcFilterClause);
|
|
589
|
+
p1Aggs[safe] = {
|
|
590
|
+
filter: { bool: { must: patMust, filter: [timeFilter] } },
|
|
591
|
+
};
|
|
592
|
+
}
|
|
593
|
+
// Error histogram
|
|
594
|
+
const errHistMust = [{
|
|
595
|
+
bool: {
|
|
596
|
+
should: [
|
|
597
|
+
{ range: { status_code: { gte: 400 } } },
|
|
598
|
+
{ term: { "level.keyword": "ERROR" } },
|
|
599
|
+
],
|
|
600
|
+
minimum_should_match: 1,
|
|
601
|
+
},
|
|
602
|
+
}];
|
|
603
|
+
if (svcFilterClause)
|
|
604
|
+
errHistMust.push(svcFilterClause);
|
|
605
|
+
p1Aggs.error_total = {
|
|
606
|
+
filter: { bool: { must: errHistMust, filter: [timeFilter] } },
|
|
607
|
+
aggs: {
|
|
608
|
+
over_time: { date_histogram: {
|
|
609
|
+
field: "@timestamp", fixed_interval: interval, min_doc_count: 0,
|
|
610
|
+
extended_bounds: { min: fromDt.getTime(), max: toDt.getTime() },
|
|
611
|
+
} },
|
|
612
|
+
},
|
|
613
|
+
};
|
|
614
|
+
// Overall latency from correlation middleware messages
|
|
615
|
+
const latMust = [{ term: { "name.keyword": "common.middleware.correlation" } }];
|
|
616
|
+
if (svcFilterClause)
|
|
617
|
+
latMust.push(svcFilterClause);
|
|
618
|
+
p1Aggs.latency_overview = {
|
|
619
|
+
filter: { bool: { must: latMust, filter: [timeFilter] } },
|
|
620
|
+
aggs: { parsed: this.latencyScriptedMetric() },
|
|
621
|
+
};
|
|
622
|
+
// Total log entries
|
|
623
|
+
const totalMust = svcFilterClause ? [svcFilterClause] : [];
|
|
624
|
+
p1Aggs.total = {
|
|
625
|
+
filter: { bool: { must: totalMust.length ? totalMust : [{ match_all: {} }], filter: [timeFilter] } },
|
|
626
|
+
};
|
|
627
|
+
const p1Result = await this.repo.search(index, { size: 0, aggs: p1Aggs });
|
|
628
|
+
const p1 = p1Result.aggregations;
|
|
629
|
+
// Extract Phase 1 data
|
|
630
|
+
const totalCount = p1.total.doc_count;
|
|
631
|
+
const svcBuckets = p1.by_service.services.buckets;
|
|
632
|
+
const totalErrors = p1.error_total.doc_count;
|
|
633
|
+
const errorTimeBuckets = p1.error_total.over_time.buckets;
|
|
634
|
+
const latOverview = p1.latency_overview;
|
|
635
|
+
const hasLatency = latOverview.doc_count > 0;
|
|
636
|
+
// Collect active patterns
|
|
637
|
+
const activePatterns = [];
|
|
638
|
+
const clearPatterns = [];
|
|
639
|
+
for (const [name] of Object.entries(REPORT_PATTERNS)) {
|
|
640
|
+
const safe = `pat_${name.replace(/[^a-zA-Z0-9]/g, "_")}`;
|
|
641
|
+
const count = p1[safe].doc_count;
|
|
642
|
+
if (count > 0) {
|
|
643
|
+
const rate = (count / hours).toFixed(1);
|
|
644
|
+
activePatterns.push({ name, count, rate: `${rate}/hr` });
|
|
645
|
+
}
|
|
646
|
+
else {
|
|
647
|
+
clearPatterns.push(name);
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
// ── Phase 2: Detail (if errors or latency exist) ────────────────
|
|
651
|
+
let statusBuckets = [];
|
|
652
|
+
let errorTypeBuckets = [];
|
|
653
|
+
let errorEpBuckets = [];
|
|
654
|
+
let recentErrors = [];
|
|
655
|
+
let latP95TimeBuckets = [];
|
|
656
|
+
if (totalErrors > 0 || hasLatency) {
|
|
657
|
+
const p2Aggs = {};
|
|
658
|
+
if (totalErrors > 0) {
|
|
659
|
+
const errMust = [{
|
|
660
|
+
bool: {
|
|
661
|
+
should: [
|
|
662
|
+
{ range: { status_code: { gte: 400 } } },
|
|
663
|
+
{ term: { "level.keyword": "ERROR" } },
|
|
664
|
+
],
|
|
665
|
+
minimum_should_match: 1,
|
|
666
|
+
},
|
|
667
|
+
}];
|
|
668
|
+
if (svcFilterClause)
|
|
669
|
+
errMust.push(svcFilterClause);
|
|
670
|
+
p2Aggs.error_detail = {
|
|
671
|
+
filter: { bool: { must: errMust, filter: [timeFilter] } },
|
|
672
|
+
aggs: {
|
|
673
|
+
by_status_code: { terms: { field: "status_code", size: 20 } },
|
|
674
|
+
by_error_type: { terms: { field: "error_type.keyword", size: 10 } },
|
|
675
|
+
by_endpoint: { terms: { field: "endpoint.keyword", size: 10, order: { _count: "desc" } } },
|
|
676
|
+
recent: {
|
|
677
|
+
top_hits: {
|
|
678
|
+
size: 5, sort: [{ "@timestamp": "desc" }],
|
|
679
|
+
_source: ["@timestamp", "log", "service", "level", "status_code", "endpoint", "correlation_id"],
|
|
680
|
+
},
|
|
681
|
+
},
|
|
682
|
+
},
|
|
683
|
+
};
|
|
684
|
+
}
|
|
685
|
+
if (hasLatency) {
|
|
686
|
+
const latDetailMust = [{ term: { "name.keyword": "common.middleware.correlation" } }];
|
|
687
|
+
if (svcFilterClause)
|
|
688
|
+
latDetailMust.push(svcFilterClause);
|
|
689
|
+
p2Aggs.latency_detail = {
|
|
690
|
+
filter: { bool: { must: latDetailMust, filter: [timeFilter] } },
|
|
691
|
+
aggs: {
|
|
692
|
+
p95_over_time: {
|
|
693
|
+
date_histogram: {
|
|
694
|
+
field: "@timestamp", fixed_interval: interval, min_doc_count: 0,
|
|
695
|
+
extended_bounds: { min: fromDt.getTime(), max: toDt.getTime() },
|
|
696
|
+
},
|
|
697
|
+
aggs: { p95_parsed: this.latencyScriptedMetric() },
|
|
698
|
+
},
|
|
699
|
+
},
|
|
700
|
+
};
|
|
701
|
+
}
|
|
702
|
+
const p2Result = await this.repo.search(index, { size: 0, aggs: p2Aggs });
|
|
703
|
+
const p2 = p2Result.aggregations;
|
|
704
|
+
if (totalErrors > 0 && p2.error_detail) {
|
|
705
|
+
statusBuckets = p2.error_detail.by_status_code.buckets;
|
|
706
|
+
errorTypeBuckets = p2.error_detail.by_error_type.buckets;
|
|
707
|
+
errorEpBuckets = p2.error_detail.by_endpoint.buckets;
|
|
708
|
+
recentErrors = p2.error_detail.recent.hits.hits;
|
|
709
|
+
}
|
|
710
|
+
if (hasLatency && p2.latency_detail) {
|
|
711
|
+
latP95TimeBuckets = p2.latency_detail.p95_over_time.buckets;
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
// ── Phase 3: Pattern samples (if active patterns exist) ─────────
|
|
715
|
+
const patternSamples = {};
|
|
716
|
+
if (activePatterns.length > 0) {
|
|
717
|
+
const p3Aggs = {};
|
|
718
|
+
for (const pat of activePatterns) {
|
|
719
|
+
const safe = `sample_${pat.name.replace(/[^a-zA-Z0-9]/g, "_")}`;
|
|
720
|
+
const q = REPORT_PATTERNS[pat.name];
|
|
721
|
+
const patMust = [{ query_string: { query: q } }];
|
|
722
|
+
if (svcFilterClause)
|
|
723
|
+
patMust.push(svcFilterClause);
|
|
724
|
+
p3Aggs[safe] = {
|
|
725
|
+
filter: { bool: { must: patMust, filter: [timeFilter] } },
|
|
726
|
+
aggs: {
|
|
727
|
+
samples: {
|
|
728
|
+
top_hits: {
|
|
729
|
+
size: 3, sort: [{ "@timestamp": "desc" }],
|
|
730
|
+
_source: ["@timestamp", "log", "service", "level", "status_code"],
|
|
731
|
+
},
|
|
732
|
+
},
|
|
733
|
+
},
|
|
734
|
+
};
|
|
735
|
+
}
|
|
736
|
+
const p3Result = await this.repo.search(index, { size: 0, aggs: p3Aggs });
|
|
737
|
+
const p3 = p3Result.aggregations;
|
|
738
|
+
for (const pat of activePatterns) {
|
|
739
|
+
const safe = `sample_${pat.name.replace(/[^a-zA-Z0-9]/g, "_")}`;
|
|
740
|
+
if (p3[safe]) {
|
|
741
|
+
patternSamples[pat.name] = p3[safe].samples.hits.hits;
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
}
|
|
745
|
+
// ── Render ───────────────────────────────────────────────────────
|
|
746
|
+
const window = `${fromDt.toISOString().slice(0, 16)}Z → ${toDt.toISOString().slice(0, 16)}Z`;
|
|
747
|
+
const svcLabel = args.service ? ` — ${resolveServiceName(args.service)}` : "";
|
|
748
|
+
const activeSvcCount = svcBuckets.length;
|
|
749
|
+
const totalSvcCount = SERVICE_REGISTRY.size;
|
|
750
|
+
console.log(`${"=".repeat(60)}`);
|
|
751
|
+
console.log(` PRODUCTION REPORT${svcLabel} — ${Math.round(hours)}h window`);
|
|
752
|
+
console.log(` ${window}`);
|
|
753
|
+
console.log(`${"=".repeat(60)}\n`);
|
|
754
|
+
console.log(` Total: ${totalCount.toLocaleString()} log entries | ${activeSvcCount} of ${totalSvcCount} services active\n`);
|
|
755
|
+
// Active Services table
|
|
756
|
+
console.log(" ── Active Services ──\n");
|
|
757
|
+
if (svcBuckets.length === 0) {
|
|
758
|
+
console.log(" No service-level data found.\n");
|
|
759
|
+
}
|
|
760
|
+
else {
|
|
761
|
+
console.log(` ${"Service".padEnd(30)} ${"Total".padStart(7)} ${"5xx".padStart(5)} ${"4xx".padStart(5)} ${"Avg ms".padStart(8)} ${"P95 ms".padStart(8)} ${"Err%".padStart(6)}`);
|
|
762
|
+
console.log(` ${"-".repeat(30)} ${"-".repeat(7)} ${"-".repeat(5)} ${"-".repeat(5)} ${"-".repeat(8)} ${"-".repeat(8)} ${"-".repeat(6)}`);
|
|
763
|
+
const sorted = [...svcBuckets].sort((a, b) => b.errors_5xx.doc_count - a.errors_5xx.doc_count);
|
|
764
|
+
for (const b of sorted) {
|
|
765
|
+
const errs5 = b.errors_5xx.doc_count;
|
|
766
|
+
const errs4 = b.errors_4xx.doc_count;
|
|
767
|
+
const parsedLat = b.latency.doc_count > 0 ? b.latency.parsed.value : null;
|
|
768
|
+
const avgMs = parsedLat?.avg ?? 0;
|
|
769
|
+
const p95Ms = parsedLat?.p95 ?? 0;
|
|
770
|
+
const errPct = b.doc_count > 0 ? (errs5 / b.doc_count * 100) : 0;
|
|
771
|
+
const marker = errs5 > 0 ? "⚠" : " ";
|
|
772
|
+
console.log(` ${marker}${String(b.key).padEnd(30)} ${String(b.doc_count).padStart(7)} ${String(errs5).padStart(5)} ${String(errs4).padStart(5)} ${String(Math.round(avgMs)).padStart(7)}ms ${String(Math.round(p95Ms)).padStart(7)}ms ${errPct.toFixed(1).padStart(5)}%`);
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
// Incident Patterns
|
|
776
|
+
console.log(`\n ── Incident Patterns ──\n`);
|
|
777
|
+
if (activePatterns.length > 0) {
|
|
778
|
+
console.log(" ACTIVE:");
|
|
779
|
+
for (const pat of activePatterns.sort((a, b) => b.count - a.count)) {
|
|
780
|
+
console.log(` ⚠ ${pat.name.padEnd(28)} ${String(pat.count).padStart(6)} (${pat.rate})`);
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
if (clearPatterns.length > 0) {
|
|
784
|
+
console.log(` CLEAR: ${clearPatterns.join(", ")}`);
|
|
785
|
+
}
|
|
786
|
+
// Error Analysis
|
|
787
|
+
if (totalErrors > 0) {
|
|
788
|
+
const errRate = (totalErrors / hours).toFixed(1);
|
|
789
|
+
const errPct = totalCount > 0 ? (totalErrors / totalCount * 100).toFixed(1) : "0.0";
|
|
790
|
+
console.log(`\n ── Error Analysis (${totalErrors.toLocaleString()} errors, ${errPct}%, ${errRate}/hr) ──\n`);
|
|
791
|
+
if (statusBuckets.length) {
|
|
792
|
+
console.log(" By Status Code:");
|
|
793
|
+
for (const b of statusBuckets)
|
|
794
|
+
console.log(` HTTP ${String(b.key).padEnd(6)} ${String(b.doc_count).padStart(6)}`);
|
|
795
|
+
}
|
|
796
|
+
if (errorTypeBuckets.length) {
|
|
797
|
+
console.log(" By Error Type:");
|
|
798
|
+
for (const b of errorTypeBuckets)
|
|
799
|
+
console.log(` ${String(b.key).padEnd(25)} ${String(b.doc_count).padStart(6)}`);
|
|
800
|
+
}
|
|
801
|
+
if (errorEpBuckets.length) {
|
|
802
|
+
console.log(" Top Error Endpoints:");
|
|
803
|
+
for (const b of errorEpBuckets)
|
|
804
|
+
console.log(` ${String(b.key).padEnd(45)} ${String(b.doc_count).padStart(6)}`);
|
|
805
|
+
}
|
|
806
|
+
// Error Trend histogram
|
|
807
|
+
if (errorTimeBuckets.length) {
|
|
808
|
+
console.log(`\n ── Error Trend (${interval} buckets) ──\n`);
|
|
809
|
+
const maxCount = Math.max(...errorTimeBuckets.map(b => b.doc_count), 1);
|
|
810
|
+
for (const b of errorTimeBuckets) {
|
|
811
|
+
const barLen = Math.round((b.doc_count / maxCount) * 30);
|
|
812
|
+
console.log(` ${b.key_as_string} ${String(b.doc_count).padStart(6)} ${"█".repeat(barLen)}`);
|
|
813
|
+
}
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
// Latency Overview
|
|
817
|
+
if (hasLatency) {
|
|
818
|
+
const parsed = latOverview.parsed.value;
|
|
819
|
+
console.log(`\n ── Latency Overview (from correlation middleware) ──\n`);
|
|
820
|
+
console.log(` Overall: Avg ${Math.round(parsed.avg).toLocaleString()}ms P50 ${Math.round(parsed.p50).toLocaleString()}ms P95 ${Math.round(parsed.p95).toLocaleString()}ms P99 ${Math.round(parsed.p99).toLocaleString()}ms`);
|
|
821
|
+
console.log(` Tracked requests: ${parsed.count.toLocaleString()}`);
|
|
822
|
+
if (latP95TimeBuckets.length) {
|
|
823
|
+
const nonEmpty = latP95TimeBuckets.filter(b => b.p95_parsed.value.count > 0);
|
|
824
|
+
if (nonEmpty.length > 0) {
|
|
825
|
+
console.log(`\n P95 Over Time (${interval} buckets):`);
|
|
826
|
+
const maxP95 = Math.max(...nonEmpty.map(b => b.p95_parsed.value.p95), 1);
|
|
827
|
+
for (const b of latP95TimeBuckets) {
|
|
828
|
+
const p95 = b.p95_parsed.value.count > 0 ? b.p95_parsed.value.p95 : 0;
|
|
829
|
+
const barLen = Math.round((p95 / maxP95) * 30);
|
|
830
|
+
console.log(` ${b.key_as_string} ${String(Math.round(p95)).padStart(7)}ms ${"█".repeat(barLen)}`);
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
}
|
|
834
|
+
}
|
|
835
|
+
// Recent Error Samples
|
|
836
|
+
if (recentErrors.length) {
|
|
837
|
+
console.log(`\n ── Recent Error Samples (${recentErrors.length}) ──\n`);
|
|
838
|
+
for (const hit of recentErrors) {
|
|
839
|
+
const src = hit._source;
|
|
840
|
+
console.log(` [${src["@timestamp"]}] [${src.service ?? "?"}] ${src.level ?? "?"} HTTP ${src.status_code ?? "?"} ${src.endpoint ?? "?"}`);
|
|
841
|
+
if (src.correlation_id)
|
|
842
|
+
console.log(` correlation_id: ${src.correlation_id}`);
|
|
843
|
+
console.log(` ${String(src.log ?? "").slice(0, 200)}`);
|
|
844
|
+
console.log();
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
// Pattern Samples
|
|
848
|
+
if (Object.keys(patternSamples).length > 0) {
|
|
849
|
+
console.log(" ── Pattern Samples ──\n");
|
|
850
|
+
for (const [name, hits] of Object.entries(patternSamples)) {
|
|
851
|
+
console.log(` ${name}:`);
|
|
852
|
+
for (const hit of hits) {
|
|
853
|
+
const src = hit._source;
|
|
854
|
+
const ts = String(src["@timestamp"] ?? "").slice(11, 19);
|
|
855
|
+
const svc = src.service ? `[${src.service}]` : "";
|
|
856
|
+
console.log(` ${ts} ${svc} ${String(src.log ?? "").slice(0, 150)}`);
|
|
857
|
+
}
|
|
858
|
+
console.log();
|
|
859
|
+
}
|
|
860
|
+
}
|
|
861
|
+
// Summary line
|
|
862
|
+
const status = activePatterns.length > 0 || totalErrors > 100
|
|
863
|
+
? (activePatterns.length > 3 || totalErrors > 1000 ? "DEGRADED" : "WARNING")
|
|
864
|
+
: "HEALTHY";
|
|
865
|
+
const errRate = hours > 0 ? (totalErrors / hours).toFixed(1) : "0";
|
|
866
|
+
console.log(" ── Summary ──\n");
|
|
867
|
+
console.log(` ${status}: ${activePatterns.length} active patterns, ${totalErrors.toLocaleString()} errors (${errRate}/hr), ${activeSvcCount} services active`);
|
|
868
|
+
console.log(` Dashboard: ${this.buildDashboardLink(undefined, fromDt, toDt)}`);
|
|
869
|
+
console.log(`${"=".repeat(60)}\n`);
|
|
870
|
+
}
|
|
871
|
+
// ── Dashboard Link ──────────────────────────────────────────────────
|
|
872
|
+
buildDashboardLink(queryStr, fromDt, toDt) {
|
|
873
|
+
const encoded = encodeURIComponent(queryStr ?? "");
|
|
874
|
+
const timeG = `_g=(time:(from:'${fromDt.toISOString()}',to:'${toDt.toISOString()}'))`;
|
|
875
|
+
const queryA = `_a=(query:(language:kuery,query:'${encoded}'))`;
|
|
876
|
+
const dashboardUrl = `https://${this.repo.config.host}/app/dashboards?security_tenant=global#/view/python-services-production-health-v2?${timeG}&${queryA}`;
|
|
877
|
+
const discoverUrl = `https://${this.repo.config.host}/app/discover?security_tenant=global#/?${timeG}&${queryA}`;
|
|
878
|
+
return `${dashboardUrl}\n Discover: ${discoverUrl}`;
|
|
879
|
+
}
|
|
880
|
+
}
|
|
881
|
+
// ── Helpers ─────────────────────────────────────────────────────────────
|
|
882
|
+
/**
 * Formats a date as its UTC clock time, `HH:MM`.
 *
 * @param {Date} d - Date to format; an invalid date makes toISOString throw a RangeError.
 * @returns {string} Characters 11 through 15 of the ISO-8601 string (hours and minutes).
 */
function fmtTime(d) {
    const HOUR_START = 11;
    const MINUTE_END = 16;
    const iso = d.toISOString();
    return iso.substring(HOUR_START, MINUTE_END);
}
|
|
885
|
+
//# sourceMappingURL=search.service.js.map
|