gtraces 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gtraces/__init__.py
ADDED
|
@@ -0,0 +1,2237 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""gtraces - CLI for GCP Cloud Trace API v1.
|
|
3
|
+
|
|
4
|
+
Also importable as a library:
|
|
5
|
+
|
|
6
|
+
import gtraces
|
|
7
|
+
gtraces.set_token("my-token") # or monkey-patch get_token
|
|
8
|
+
traces = gtraces.trace_list("my-project", start="1h")
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import functools
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import random
|
|
15
|
+
import re
|
|
16
|
+
import subprocess
|
|
17
|
+
import time
|
|
18
|
+
from collections import Counter, defaultdict
|
|
19
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
20
|
+
from datetime import datetime, timedelta, timezone
|
|
21
|
+
from urllib.error import HTTPError
|
|
22
|
+
from urllib.parse import urlencode
|
|
23
|
+
from urllib.request import Request, urlopen
|
|
24
|
+
|
|
25
|
+
import click
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"trace_list",
|
|
29
|
+
"trace_services",
|
|
30
|
+
"trace_spans",
|
|
31
|
+
"trace_get",
|
|
32
|
+
"trace_search",
|
|
33
|
+
"trace_outliers",
|
|
34
|
+
"trace_stats",
|
|
35
|
+
"trace_compare",
|
|
36
|
+
"get_token",
|
|
37
|
+
"set_token",
|
|
38
|
+
"fetch_traces",
|
|
39
|
+
"filter_traces",
|
|
40
|
+
"ApiError",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
# ── Constants ────────────────────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
DEFAULT_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
|
|
46
|
+
API = "https://cloudtrace.googleapis.com/v1/projects"
|
|
47
|
+
|
|
48
|
+
INTERESTING_LABELS = {
|
|
49
|
+
"bids",
|
|
50
|
+
"deadline_ms",
|
|
51
|
+
"actual_deadline_ms",
|
|
52
|
+
"parent_deadline_ms",
|
|
53
|
+
"auctionType",
|
|
54
|
+
"done_by",
|
|
55
|
+
"done_at_ms",
|
|
56
|
+
"ctx_done_at_ms",
|
|
57
|
+
"responses_before_deadline",
|
|
58
|
+
"responses_before_done",
|
|
59
|
+
"drained_count",
|
|
60
|
+
"auction",
|
|
61
|
+
"http.response.status_code",
|
|
62
|
+
"http.ttfb_ms",
|
|
63
|
+
"otel.status_code",
|
|
64
|
+
"otel.status_description",
|
|
65
|
+
"service.name",
|
|
66
|
+
"cloud.region",
|
|
67
|
+
"k8s.cluster.name",
|
|
68
|
+
"placement",
|
|
69
|
+
"publisher_country",
|
|
70
|
+
"abtest",
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
# ── Auth & HTTP ──────────────────────────────────────────────────────────────
|
|
74
|
+
|
|
75
|
+
_token = None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ApiError(Exception):
    """Raised on API failures with actionable hints.

    The message carries a human-readable remediation hint (e.g. which
    gcloud command to run) plus, for HTTP failures, the response body.

    Inherits from Exception (not click.ClickException) so library consumers
    don't need click as a dependency for exception handling.
    """
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def get_token():
    """Return a gcloud access token, cached for the process lifetime.

    Raises:
        ApiError: when the gcloud binary is missing or auth fails.
    """
    global _token
    if _token:
        return _token
    cmd = ["gcloud", "auth", "print-access-token"]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except FileNotFoundError:
        raise ApiError(
            "gcloud not found. Install: https://cloud.google.com/sdk/docs/install"
        )
    except subprocess.CalledProcessError:
        raise ApiError("Auth failed. Run: gcloud auth login")
    _token = proc.stdout.strip()
    return _token
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def set_token(tok):
    """Install *tok* as the cached auth token, skipping the gcloud CLI.

    Intended for library consumers who obtain tokens externally
    (e.g. via google-auth ADC).
    """
    global _token
    _token = tok
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
_MAX_RETRIES = 3
|
|
119
|
+
_RETRY_BASE_SEC = 1.0
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def api_get(project, path, params=None):
    """GET from Cloud Trace API v1 with retry on 429. Returns parsed JSON.

    Args:
        project: GCP project id, interpolated into the URL path.
        path: API path suffix starting with "/" (e.g. "/traces").
        params: optional dict of query parameters.

    Raises:
        ApiError: on any HTTP error, with an actionable hint for the
            common status codes (401/403/404/429) and the response body.
    """
    url = f"{API}/{project}{path}"
    if params:
        url += "?" + urlencode(params)
    last_exc = None
    for attempt in range(_MAX_RETRIES + 1):
        # Token is fetched per attempt; get_token() caches so this is cheap.
        req = Request(url, headers={"Authorization": f"Bearer {get_token()}"})
        try:
            with urlopen(req, timeout=30) as resp:
                return json.loads(resp.read())
        except HTTPError as e:
            body = e.read().decode(errors="replace")
            if e.code == 429 and attempt < _MAX_RETRIES:
                # Exponential backoff with jitter before retrying.
                delay = _RETRY_BASE_SEC * (2**attempt) + random.uniform(0, 0.5)
                time.sleep(delay)
                last_exc = e
                continue
            msgs = {
                401: "Auth expired. Run: gcloud auth login",
                403: f"Permission denied for project '{project}'",
                404: "Not found",
                429: (
                    f"Rate limited after {_MAX_RETRIES} retries. "
                    "Reduce --limit or try again later"
                ),
            }
            raise ApiError(f"{msgs.get(e.code, f'HTTP {e.code}')}\n{body}")
    # Defensive: every loop iteration either returns, continues, or raises,
    # so this line is unreachable today; kept so the function stays total
    # if the retry logic changes.
    raise ApiError(f"Request failed after {_MAX_RETRIES} retries: {last_exc}")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def fetch_traces(project, params, max_results=None):
    """Fetch traces from the API, following nextPageToken pagination.

    Stops early once *max_results* traces are collected (when given) and
    truncates the result to exactly that many.
    """
    collected = []
    page_params = params
    while True:
        page = api_get(project, "/traces", page_params)
        collected.extend(page.get("traces", []))
        if max_results and len(collected) >= max_results:
            return collected[:max_results]
        next_token = page.get("nextPageToken")
        if not next_token:
            return collected
        page_params = {**page_params, "pageToken": next_token}
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# ── Helpers ──────────────────────────────────────────────────────────────────
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _ts(s):
|
|
172
|
+
"""RFC3339 string to datetime."""
|
|
173
|
+
return datetime.fromisoformat(s.replace("Z", "+00:00"))
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _dur(span):
|
|
177
|
+
"""Span duration in milliseconds."""
|
|
178
|
+
return (_ts(span["endTime"]) - _ts(span["startTime"])).total_seconds() * 1000
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _root(spans):
|
|
182
|
+
"""Find the root span (no parentSpanId)."""
|
|
183
|
+
for s in spans:
|
|
184
|
+
if not s.get("parentSpanId"):
|
|
185
|
+
return s
|
|
186
|
+
return spans[0] if spans else None
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _fmt_ms(ms):
|
|
190
|
+
"""Format milliseconds for display."""
|
|
191
|
+
if ms >= 1000:
|
|
192
|
+
return f"{ms / 1000:.2f}s"
|
|
193
|
+
return f"{ms:.1f}ms"
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _now():
|
|
197
|
+
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _parse_time(s):
|
|
201
|
+
"""Parse relative (1h, 30m, 2d, 1w) or RFC3339 to RFC3339 string."""
|
|
202
|
+
m = re.match(r"^(\d+)([mhdw])$", s)
|
|
203
|
+
if m:
|
|
204
|
+
n, u = int(m.group(1)), m.group(2)
|
|
205
|
+
delta = {
|
|
206
|
+
"m": timedelta(minutes=n),
|
|
207
|
+
"h": timedelta(hours=n),
|
|
208
|
+
"d": timedelta(days=n),
|
|
209
|
+
"w": timedelta(weeks=n),
|
|
210
|
+
}[u]
|
|
211
|
+
return (datetime.now(timezone.utc) - delta).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
212
|
+
return s
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _parse_latency(s):
|
|
216
|
+
"""Parse '500ms' or '1.5s' to (api_filter_str, ms_float). Returns (None, None) if empty."""
|
|
217
|
+
if not s:
|
|
218
|
+
return None, None
|
|
219
|
+
m = re.match(r"^(\d+(?:\.\d+)?)(ms|s)$", s)
|
|
220
|
+
if not m:
|
|
221
|
+
raise ValueError(f"Bad latency: {s} (use e.g. 500ms, 1s)")
|
|
222
|
+
val, unit = float(m.group(1)), m.group(2)
|
|
223
|
+
ms = val if unit == "ms" else val * 1000
|
|
224
|
+
return f"{m.group(1)}{m.group(2)}", ms
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _parse_labels(label_tuple):
    """Turn ('key=value', ...) entries into a dict, or None when empty.

    Raises click.BadParameter for any entry lacking an '=' separator.
    """
    parsed = {}
    for entry in label_tuple:
        key, sep, value = entry.partition("=")
        if not sep:
            raise click.BadParameter(f"Expected key=value, got: {entry}")
        parsed[key] = value
    return parsed or None
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _parse_group_by(group_by):
|
|
239
|
+
"""Parse comma-separated grouping keys."""
|
|
240
|
+
if not group_by:
|
|
241
|
+
return []
|
|
242
|
+
return [k.strip() for k in group_by.split(",") if k.strip()]
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _build_params(
    start, end, limit, view="ROOTSPAN", min_latency=None, services=None, labels=None
):
    """Assemble common Cloud Trace list-API query parameters.

    Builds pageSize/startTime/endTime/view, plus an optional server-side
    "filter" string combining min-latency, service, and label terms.
    """
    params = {
        "pageSize": min(limit, 100),  # keep within one API page
        "startTime": _parse_time(start),
        "endTime": end or _now(),
        "view": view,
    }
    filters = []
    if min_latency:
        filt, ms = _parse_latency(min_latency)
        if ms and ms > 0:
            filters.append(f"latency:{filt}")
    for svc in services or ():
        filters.append(f"service.name:{svc}")
    for key, val in (labels or {}).items():
        filters.append(f"{key}:{val}")
    if filters:
        params["filter"] = " ".join(filters)
    return params
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _matches_trace(
    trace,
    *,
    span_name=None,
    services=None,
    labels=None,
    min_ms=None,
    max_ms=None,
):
    """Check whether trace matches root/service/label/latency filters.

    Args:
        trace: trace dict with a "spans" list.
        span_name: substring that must occur in the root span's name.
        services: service name(s); at least one span's "service.name"
            label must be in this set.
        labels: key/value pairs that must all match exactly on the ROOT
            span's labels.
        min_ms / max_ms: inclusive bounds on the root span's duration.

    Returns:
        True when every supplied filter matches; False otherwise,
        including traces with no spans.
    """
    if isinstance(services, str):
        # A lone string would otherwise be iterated character by character.
        services = (services,)
    spans = trace.get("spans", [])
    r = _root(spans)
    if not r:
        return False
    if span_name and span_name not in (r.get("name") or ""):
        return False
    dur = _dur(r)
    if min_ms is not None and dur < min_ms:
        return False
    if max_ms is not None and dur > max_ms:
        return False
    svc_set = set(services) if services else None
    # Service match looks at ALL spans, not just the root.
    if svc_set and not any(
        s.get("labels", {}).get("service.name") in svc_set for s in spans
    ):
        return False
    rl = r.get("labels", {})
    if labels and not all(rl.get(k) == v for k, v in labels.items()):
        return False
    return True
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def filter_traces(
    traces, *, span_name=None, services=None, labels=None, min_ms=None, max_ms=None
):
    """Filter traces by root span name, service(s), labels, and duration range.

    Unified filter — replaces the former match_traces + _filter_by_labels.
    """
    keep = functools.partial(
        _matches_trace,
        span_name=span_name,
        services=services,
        labels=labels,
        min_ms=min_ms,
        max_ms=max_ms,
    )
    return [trace for trace in traces if keep(trace)]
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _to_durations(traces):
    """Return (duration_ms, trace) pairs sorted ascending by duration.

    Traces without a root span are dropped.
    """
    pairs = []
    for trace in traces:
        root = _root(trace.get("spans", []))
        if root:
            pairs.append((_dur(root), trace))
    return sorted(pairs, key=lambda pair: pair[0])
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _resolve_threshold(threshold, pvals):
    """Resolve a threshold spec to milliseconds.

    Accepts either a percentile name present in *pvals* (e.g. "p95") or
    an absolute latency string (e.g. "500ms", "1.5s").

    Raises:
        ValueError: when the spec is neither a known percentile nor a
            parseable latency string.
    """
    if threshold in pvals:
        return pvals[threshold]
    # Bug fix: _parse_latency raises its own ValueError ("Bad latency: ...")
    # for malformed non-empty strings, so the intended "Bad threshold"
    # message mentioning the percentile names was unreachable. Catch and
    # re-raise with the threshold-specific hint.
    try:
        _, ms = _parse_latency(threshold)
    except ValueError:
        ms = None
    if ms is None:
        raise ValueError(
            f"Bad threshold: {threshold} (use p50/p90/p95/p99 or e.g. 500ms)"
        )
    return ms
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _pcts(values):
|
|
350
|
+
"""Compute p50/p90/p95/p99 from a sorted list of numbers."""
|
|
351
|
+
n = len(values)
|
|
352
|
+
if n == 0:
|
|
353
|
+
return {"p50": 0, "p90": 0, "p95": 0, "p99": 0}
|
|
354
|
+
targets = {"p50": 0.50, "p90": 0.90, "p95": 0.95, "p99": 0.99}
|
|
355
|
+
return {k: values[min(int(v * n), n - 1)] for k, v in targets.items()}
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _cli_validate(fn):
    """Decorator translating library exceptions into Click exceptions.

    ValueError becomes BadParameter and ApiError becomes ClickException,
    so CLI commands exit with proper codes and clean messages.
    """

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except ValueError as exc:
            raise click.BadParameter(str(exc))
        except ApiError as exc:
            raise click.ClickException(str(exc))

    return wrapper
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# ── Timeseries helpers ───────────────────────────────────────────────────────
|
|
374
|
+
|
|
375
|
+
_SPARK = "▁▂▃▄▅▆▇█"
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def _sparkline(values):
|
|
379
|
+
"""Render a list of numbers as a Unicode sparkline."""
|
|
380
|
+
if not values:
|
|
381
|
+
return ""
|
|
382
|
+
mn, mx = min(values), max(values)
|
|
383
|
+
if mn == mx:
|
|
384
|
+
return _SPARK[3] * len(values)
|
|
385
|
+
rng = mx - mn
|
|
386
|
+
return "".join(_SPARK[min(int((v - mn) / rng * 7), 7)] for v in values)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _parse_bucket(s):
|
|
390
|
+
"""Parse bucket size (5m, 10m, 1h, 1w) to timedelta."""
|
|
391
|
+
m = re.match(r"^(\d+)([mhdw])$", s)
|
|
392
|
+
if not m:
|
|
393
|
+
raise ValueError(f"Bad bucket: {s} (use e.g. 5m, 1h, 1w)")
|
|
394
|
+
n, u = int(m.group(1)), m.group(2)
|
|
395
|
+
return {
|
|
396
|
+
"m": timedelta(minutes=n),
|
|
397
|
+
"h": timedelta(hours=n),
|
|
398
|
+
"d": timedelta(days=n),
|
|
399
|
+
"w": timedelta(weeks=n),
|
|
400
|
+
}[u]
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def _make_buckets(data, bucket_delta):
    """Split [(datetime, ms)] into aligned time buckets.

    Buckets are aligned to multiples of *bucket_delta* from the epoch, so
    e.g. 5-minute buckets start on :00/:05/:10 boundaries.

    Returns [(label, sorted_durs)] sorted by time, including empty buckets.
    """
    if not data:
        return []
    bsec = bucket_delta.total_seconds()
    min_ts = min(ts for ts, _ in data)
    max_ts = max(ts for ts, _ in data)

    # Align to bucket boundary
    start_epoch = (min_ts.timestamp() // bsec) * bsec
    max_idx = int((max_ts.timestamp() - start_epoch) // bsec)

    bins = defaultdict(list)
    for ts, dur in data:
        idx = int((ts.timestamp() - start_epoch) // bsec)
        bins[idx].append(dur)

    result = []
    for idx in range(max_idx + 1):
        t = datetime.fromtimestamp(start_epoch + idx * bsec, tz=timezone.utc)
        t_end = t + bucket_delta
        # Labels show HH:MM only; ranges longer than a day wrap visually.
        label = f"{t.strftime('%H:%M')}-{t_end.strftime('%H:%M')}"
        durs = sorted(bins.get(idx, []))
        result.append((label, durs))
    return result
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def _trend(data, n_buckets=12):
    """Compute p50 sparkline with range indicator from [(datetime, ms)].

    Splits the time span into *n_buckets* equal slices, takes the median
    duration per slice, renders those as a sparkline, and appends the
    min-max of the non-empty slices. Returns "" for fewer than two
    points or a zero-length time span.
    """
    if len(data) < 2:
        return ""
    min_ts = min(ts for ts, _ in data)
    max_ts = max(ts for ts, _ in data)
    span_secs = (max_ts - min_ts).total_seconds()
    if span_secs <= 0:
        return ""
    bucket_secs = span_secs / n_buckets
    buckets = [[] for _ in range(n_buckets)]
    for ts, dur in data:
        # min() clamps the max timestamp into the final bucket.
        idx = min(int((ts - min_ts).total_seconds() / bucket_secs), n_buckets - 1)
        buckets[idx].append(dur)
    p50s = []
    for b in buckets:
        if b:
            b.sort()
            p50s.append(b[len(b) // 2])
        else:
            # Empty buckets contribute 0 so gaps stay visible in the spark.
            p50s.append(0)
    spark = _sparkline(p50s)
    real = [v for v in p50s if v > 0]
    if real:
        return f"{spark} {_fmt_ms(min(real))}-{_fmt_ms(max(real))}"
    return spark
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
# ── Rendering ────────────────────────────────────────────────────────────────
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
def _parse_fields(field_str):
    """Parse a comma-separated field spec (e.g. 'spans,label:cloud.region').

    Each entry becomes a dict with "type" and "header" keys (labels also
    carry "key"). Raises click.BadParameter for unrecognized entries.
    """
    if not field_str:
        return []
    parsed = []
    for raw in field_str.split(","):
        entry = raw.strip()
        if entry == "spans":
            parsed.append({"type": "spans", "header": "SPANS"})
        elif entry.startswith("label:"):
            key = entry[len("label:"):]
            parsed.append({"type": "label", "key": key, "header": key})
        else:
            raise click.BadParameter(
                f"Unknown field: {entry} (use 'spans' or 'label:KEY')"
            )
    return parsed
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def _extract_field(t, root, field):
|
|
484
|
+
"""Extract a field value from a trace."""
|
|
485
|
+
if field["type"] == "label":
|
|
486
|
+
return (root or {}).get("labels", {}).get(field["key"], "")
|
|
487
|
+
if field["type"] == "spans":
|
|
488
|
+
return str(len(t.get("spans", [])))
|
|
489
|
+
return ""
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def render_list(traces, show_labels=False, fields=None):
    """Render traces as a compact table with dynamic column widths.

    Args:
        traces: list of trace dicts (each with "traceId" and "spans").
        show_labels: append a LABELS column showing the root span's
            "interesting" labels.
        fields: optional list of parsed field specs (see _parse_fields)
            rendered as extra columns.
    """
    if not traces:
        click.echo("No traces found.")
        return

    SEP = " "

    # Pre-compute all cell values per row
    headers = ["TRACE ID", "ROOT SPAN", "DURATION", "TIME"]
    extra_headers = [f["header"] for f in fields] if fields else []
    all_headers = headers + extra_headers

    rows = []
    for t in traces:
        r = _root(t.get("spans", []))
        tid = t.get("traceId", "?")
        name = r.get("name", "?") if r else "?"
        dur = _fmt_ms(_dur(r)) if r else "?"
        # Renamed from `time`: that name shadowed the imported time module.
        start_ts = r.get("startTime", "?")[:19] if r else "?"
        extra = [_extract_field(t, r, f) for f in fields] if fields else []
        lbl_str = ""
        if show_labels and r:
            lbl = r.get("labels", {})
            interesting = {k: v for k, v in lbl.items() if k in INTERESTING_LABELS}
            if interesting:
                lbl_str = " ".join(f"{k}={v}" for k, v in interesting.items())
        rows.append(([tid, name, dur, start_ts] + extra, lbl_str))

    # Compute column widths from header + data
    col_widths = [len(h) for h in all_headers]
    for cells, _ in rows:
        for i, val in enumerate(cells):
            col_widths[i] = max(col_widths[i], len(val))

    # Right-align DURATION column (index 2)
    right_align = {2}

    def fmt_row(cells):
        parts = []
        for i, val in enumerate(cells):
            w = col_widths[i]
            parts.append(f"{val:>{w}}" if i in right_align else f"{val:<{w}}")
        return SEP.join(parts)

    click.echo(fmt_row(all_headers) + (SEP + "LABELS" if show_labels else ""))
    click.echo("\u2500" * sum(col_widths + [len(SEP) * (len(col_widths) - 1)]))

    for cells, lbl_str in rows:
        line = fmt_row(cells)
        if lbl_str:
            line += SEP + lbl_str
        click.echo(line)
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def _bar(offset_ms, dur_ms, total_ms, width):
|
|
548
|
+
"""Render a positioned horizontal bar using box-drawing characters."""
|
|
549
|
+
if total_ms <= 0 or width <= 0:
|
|
550
|
+
return ""
|
|
551
|
+
start = offset_ms / total_ms * width
|
|
552
|
+
length = dur_ms / total_ms * width
|
|
553
|
+
si = int(start)
|
|
554
|
+
lead = " " * si
|
|
555
|
+
full = int(length)
|
|
556
|
+
half = (length - full) >= 0.5
|
|
557
|
+
bar = "\u2501" * full + ("\u2578" if half else "") # ━ and ╸
|
|
558
|
+
if not bar and dur_ms > 0:
|
|
559
|
+
bar = "\u2578" # ╸
|
|
560
|
+
return f"{lead}{bar}"
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def render_tree(trace, bars=False, name_width=35):
    """Render trace as a span tree, optionally with waterfall timing bars.

    Args:
        trace: trace dict with "traceId" and "spans".
        bars: also draw per-span timing bars scaled to the terminal width.
        name_width: column width reserved for the tree/name portion when
            bars are shown; longer names are truncated with "..".
    """
    spans = trace.get("spans", [])
    if not spans:
        click.echo("No spans.")
        return

    r = _root(spans)
    total = _dur(r) if r else 0
    root_start = _ts(r["startTime"]) if r else _ts(spans[0]["startTime"])
    click.echo(
        f"Trace {trace.get('traceId', '?')} | {_fmt_ms(total)} | {len(spans)} spans\n"
    )

    # parentSpanId -> list of child spans
    children = {}
    for s in spans:
        pid = s.get("parentSpanId")
        children.setdefault(pid, []).append(s)

    name_col = name_width
    dur_col = 10
    bar_width = 0
    if bars:
        try:
            term_width = os.get_terminal_size().columns
        except (AttributeError, ValueError, OSError):
            # Not attached to a terminal (piped output) — assume 120 cols.
            term_width = 120
        bar_width = max(20, term_width - name_col - dur_col - 4)

    lines = []

    def walk(span, prefix="", last=True):
        # Depth-first traversal; appends one tuple per span to `lines`.
        d = _dur(span)
        name = span.get("name", "?")
        if span is r:
            tree_str = name
        else:
            conn = "\u2514\u2500 " if last else "\u251c\u2500 "  # └─ / ├─
            tree_str = f"{prefix}{conn}{name}"
        dur_str = _fmt_ms(d)
        if bars:
            offset = (_ts(span["startTime"]) - root_start).total_seconds() * 1000
            if len(tree_str) > name_col:
                tree_str = tree_str[: name_col - 2] + ".."
            lines.append((tree_str, dur_str, _bar(offset, d, total, bar_width)))
        else:
            lines.append((tree_str, dur_str))

        ext = " " if last else "\u2502 "
        # Children ordered by start time for a chronological tree.
        kids = sorted(
            children.get(span.get("spanId"), []),
            key=lambda x: x["startTime"],
        )
        for i, kid in enumerate(kids):
            walk(kid, prefix + ext, i == len(kids) - 1)

    if r:
        walk(r)
    if bars:
        for tree_str, dur_str, bar_str in lines:
            click.echo(f"{tree_str:<{name_col}} {dur_str:>{dur_col - 2}} {bar_str}")
    else:
        for tree_str, dur_str in lines:
            click.echo(f"{tree_str} {dur_str}")
|
|
627
|
+
|
|
628
|
+
|
|
629
|
+
def render_timeline(trace):
    """Render chronological timeline with bottleneck summary.

    Prints one line per span (indented by tree depth) with its offset
    from the root start, duration, share of total time, and any
    "interesting" labels not shared by every span; finishes with the
    five slowest spans.
    """
    spans = trace.get("spans", [])
    if not spans:
        click.echo("No spans.")
        return

    r = _root(spans)
    root_start = _ts(r["startTime"]) if r else _ts(spans[0]["startTime"])
    total = _dur(r) if r else 1

    # Find labels that are identical on every span — show once in header
    all_labels = [s.get("labels", {}) for s in spans]
    common = {}
    if all_labels:
        shared_keys = set(all_labels[0].keys())
        for lbl in all_labels[1:]:
            shared_keys &= set(lbl.keys())
        for k in shared_keys:
            vals = {lbl[k] for lbl in all_labels}
            if len(vals) == 1:
                common[k] = next(iter(vals))

    header = " ".join(
        f"{k}={v}" for k, v in sorted(common.items()) if k in INTERESTING_LABELS
    )
    click.echo(
        f"Trace {trace.get('traceId', '?')} | {_fmt_ms(total)} | {len(spans)} spans"
    )
    if header:
        click.echo(f" {header}")
    click.echo()
    click.echo(f"{'OFFSET':>10} {'SPAN':<55} {'DURATION':>10} {'%':>5} LABELS")
    click.echo("\u2500" * 110)

    by_id = {s.get("spanId"): s for s in spans}

    def depth(span):
        # Walk parent links to compute indentation depth.
        d, pid = 0, span.get("parentSpanId")
        while pid and pid in by_id:
            d += 1
            pid = by_id[pid].get("parentSpanId")
        return d

    ranked = []
    for s in sorted(spans, key=lambda x: x["startTime"]):
        d = _dur(s)
        offset = (_ts(s["startTime"]) - root_start).total_seconds() * 1000
        dep = depth(s)
        pct = d / total * 100 if total > 0 else 0

        labels = s.get("labels", {})
        # Only show labels that differ from the common header set.
        unique = {
            k: v
            for k, v in labels.items()
            if k in INTERESTING_LABELS and (k not in common or common[k] != v)
        }
        lbl = " ".join(f"{k}={v}" for k, v in unique.items())

        indent = " " * dep
        name = f"{indent}{s.get('name', '?')}"
        slow = " *" if d > 100 else ""  # flag spans slower than 100ms
        click.echo(
            f"+{_fmt_ms(offset):>9} {name:<55} {_fmt_ms(d) + slow:>12} {pct:>4.0f}% {lbl}"
        )
        ranked.append((s.get("name", "?"), d, pct))

    ranked.sort(key=lambda x: -x[1])
    click.echo("\nSlowest spans:")
    for name, d, pct in ranked[:5]:
        click.echo(f" {name:<55} {_fmt_ms(d):>10} ({pct:.0f}%)")
|
|
700
|
+
|
|
701
|
+
|
|
702
|
+
def _render_comparison(comparison, primary_pvals, services):
    """Render cross-service comparison text from structured data.

    Args:
        comparison: dict with "service", optional "distribution"
            ({"percentiles", "count"}), and "windows" entries.
        primary_pvals: p50/p90/p95/p99 dict for the primary service(s),
            printed alongside the comparison service's percentiles.
        services: primary service names used for the column label.
    """
    compare_svc = comparison["service"]
    primary_label = ", ".join(services) if services else "primary"
    dist = comparison.get("distribution")

    if not dist:
        click.echo(f" No traces found for {compare_svc}.")
        return

    cmp_pvals = dist["percentiles"]
    cmp_n = dist["count"]

    click.echo(f" {compare_svc} latency ({cmp_n} traces):\n")
    click.echo(f" {'PCTL':<6} {primary_label:<30} {compare_svc}")
    click.echo(f" {'─' * 70}")
    for label in ("p50", "p90", "p95", "p99"):
        click.echo(
            f" {label:<6} {_fmt_ms(primary_pvals[label]):<30} "
            f"{_fmt_ms(cmp_pvals[label])}"
        )

    click.echo("\n During outlier windows:")
    click.echo(
        f" {'#':<3} {'TIME':<26} "
        f"{primary_label + ' latency':<25} {compare_svc + ' latency'}"
    )
    click.echo(f" {'─' * 80}")

    for i, w in enumerate(comparison.get("windows", []), 1):
        # A window without "avgMs" means no comparison traces landed in it.
        if "avgMs" in w:
            cmp_str = (
                f"avg {_fmt_ms(w['avgMs'])}, max {_fmt_ms(w['maxMs'])} "
                f"({w['count']} traces)"
            )
        else:
            cmp_str = "(no traces)"
        click.echo(f" {i:<3} {w['time']:<26} {_fmt_ms(w['primaryMs']):<25} {cmp_str}")
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def _fmt_opt_ms(ms):
    """Format an optional milliseconds value; '-' stands in for None."""
    if ms is None:
        return "-"
    return _fmt_ms(ms)
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
def render_compare(result):
    """Render conditional B|A comparison in text mode.

    Args:
        result: dict with "conditionA"/"sampleB" sampling stats, a "meta"
            dict carrying the groupBy key list, optional "groups"
            percentile rows, and optional "warnings".
    """
    a = result["conditionA"]
    b = result["sampleB"]
    group_by = result["meta"].get("groupBy", [])

    click.echo("Conditional comparison B | A (descriptive, non-causal)")
    click.echo(
        f"A sample: {a['tracesMatched']} traces from {a['tracesFetched']} fetched "
        f"(missing root: {a['skippedNoRoot']})"
    )
    click.echo(
        f"B windows: {b['windowsSucceeded']}/{b['windowsTotal']} succeeded, "
        f"{b['windowsFailed']} failed"
    )
    click.echo(
        f"B traces: seen={b['tracesSeen']} deduped={b['tracesDeduped']} "
        f"(missing root: {b['skippedNoRoot']})"
    )
    click.echo()

    dist = b["distribution"]
    if dist["count"] == 0:
        click.echo("No B traces matched in conditioned windows.")
    else:
        p = dist["percentiles"]
        click.echo(
            f"B latency avg {_fmt_opt_ms(dist['avgMs'])} "
            f"p50 {_fmt_opt_ms(p['p50'])} p90 {_fmt_opt_ms(p['p90'])} "
            f"p95 {_fmt_opt_ms(p['p95'])} p99 {_fmt_opt_ms(p['p99'])}"
        )

    groups = result.get("groups", [])
    if groups:
        click.echo()
        click.echo(
            f"{'GROUP':<42} {'COUNT':>6} {'avg':>10} {'p50':>10} {'p90':>10} {'p95':>10} {'p99':>10}"
        )
        click.echo("─" * 108)
        for g in groups:
            key = g.get("key", {})
            # Missing/empty key values render as "(none)"; no grouping → "(all)".
            label = (
                " ".join(f"{k}={key.get(k, '') or '(none)'}" for k in group_by)
                or "(all)"
            )
            p = g["percentiles"]
            click.echo(
                f"{label:<42} {g['count']:>6} {_fmt_opt_ms(g['avgMs']):>10} "
                f"{_fmt_opt_ms(p['p50']):>10} {_fmt_opt_ms(p['p90']):>10} "
                f"{_fmt_opt_ms(p['p95']):>10} {_fmt_opt_ms(p['p99']):>10}"
            )

    if result.get("warnings"):
        click.echo()
        click.echo("Warnings:")
        for w in result["warnings"]:
            click.echo(f" - [{w['code']}] {w['message']}")
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
# ── Programmatic API ─────────────────────────────────────────────────────────
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
def trace_list(
    project,
    *,
    start="1h",
    end=None,
    limit=20,
    services=(),
    labels=None,
    min_latency=None,
    max_latency=None,
):
    """Fetch and filter recent traces. Returns list of trace dicts.

    Server-side filters are pushed into the API query; services and max
    latency are additionally re-checked client-side (the list API has no
    max-latency filter term).
    """
    _, max_ms = _parse_latency(max_latency)
    params = _build_params(
        start, end, limit, min_latency=min_latency, services=services, labels=labels
    )
    results = fetch_traces(project, params, max_results=limit)
    needs_client_filter = bool(services) or max_ms is not None
    if needs_client_filter:
        results = filter_traces(results, services=services, max_ms=max_ms)
    return results
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
def trace_get(project, trace_id):
    """Fetch a single trace by its ID. Returns the trace dict."""
    path = f"/traces/{trace_id}"
    return api_get(project, path)
|
|
834
|
+
|
|
835
|
+
|
|
836
|
+
def trace_services(project, *, start="3h", end=None, limit=200):
    """Collect service and endpoint counts from recent traces.

    Tallies each span's "service.name" label, and each root span's name
    as an endpoint.

    Returns {services: dict, endpoints: dict, trace_count: int}.
    """
    params = _build_params(start, end, limit)
    traces = fetch_traces(project, params, max_results=limit)
    if not traces:
        return {"services": {}, "endpoints": {}, "trace_count": 0}
    svc_counts = Counter()
    ep_counts = Counter()
    for trace in traces:
        for span in trace.get("spans", []):
            svc = span.get("labels", {}).get("service.name")
            if svc:
                svc_counts[svc] += 1
            if not span.get("parentSpanId"):
                # No parent => root span; its name names the endpoint.
                ep_counts[span.get("name", "?")] += 1
    return {
        "services": dict(svc_counts),
        "endpoints": dict(ep_counts),
        "trace_count": len(traces),
    }
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def trace_spans(
    project,
    *,
    start="1h",
    end=None,
    limit=20,
    services=(),
    min_latency=None,
    max_latency=None,
):
    """Count occurrences of each span name across sampled traces.

    Returns {spans: dict[str, int] (highest count first), trace_count: int}.
    """
    _, ceiling_ms = _parse_latency(max_latency)
    request = _build_params(
        start, end, limit, view="COMPLETE", min_latency=min_latency, services=services
    )
    sampled = filter_traces(
        fetch_traces(project, request, max_results=limit),
        services=services,
        max_ms=ceiling_ms,
    )
    tally = Counter(
        span.get("name", "?") for trace in sampled for span in trace.get("spans", [])
    )
    return {"spans": dict(tally.most_common()), "trace_count": len(sampled)}
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
def trace_search(
    project,
    *,
    start="1h",
    end=None,
    limit=50,
    span_name=None,
    labels=None,
    min_latency=None,
    max_latency=None,
    services=(),
    parent_span_id=None,
    order_asc=None,
    order_desc=None,
):
    """Search traces with client-side filtering.

    Without parent_span_id: returns list of trace dicts.
    With parent_span_id: returns list of {traceId, spans} dicts containing
    only the spans matching that parent.

    Raises:
        ValueError: if both order_asc and order_desc are given.

    NOTE(review): in parent-span mode the span_name comparison is exact
    equality; filter_traces may use different matching semantics for root
    spans — confirm against its implementation before relying on parity.
    """
    if order_asc and order_desc:
        raise ValueError("Cannot use both order_asc and order_desc")
    _, min_ms = _parse_latency(min_latency)
    _, max_ms = _parse_latency(max_latency)
    # Only fetch full span trees when we must inspect inner spans.
    view = "COMPLETE" if parent_span_id else "ROOTSPAN"
    params = _build_params(
        start,
        end,
        limit,
        view=view,
        min_latency=min_latency,
        services=services,
        labels=labels,
    )
    traces = fetch_traces(project, params, max_results=limit)

    if not parent_span_id:
        filtered = filter_traces(
            traces,
            span_name=span_name,
            services=services or None,
            min_ms=min_ms,
            max_ms=max_ms,
        )
        if order_asc or order_desc:
            reverse = order_desc is not None
            # Sort by root-span duration; traces with no root sort as 0.
            # The walrus inside the lambda requires Python 3.8+.
            filtered.sort(
                key=lambda t: _dur(r) if (r := _root(t.get("spans", []))) else 0,
                reverse=reverse,
            )
        return filtered

    # Parent-span mode: collect inner spans whose parentSpanId matches,
    # optionally narrowed to an exact span name.
    matches = []
    for t in traces:
        matched_spans = [
            s
            for s in t.get("spans", [])
            if s.get("parentSpanId") == parent_span_id
            and (not span_name or s.get("name") == span_name)
        ]
        if matched_spans:
            matches.append({"traceId": t["traceId"], "spans": matched_spans})
    return matches
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
def trace_outliers(
    project,
    *,
    start="1h",
    end=None,
    limit=50,
    services=(),
    labels=None,
    min_latency=None,
    max_latency=None,
    threshold="p95",
    top=5,
    compare_svc=None,
):
    """Find outlier traces with per-span breakdown. Always returns a dict.

    Keys: distribution, count, threshold, thresholdMs, outliers.
    Optional: comparison (when compare_svc is given).

    An outlier is any trace whose total duration is >= the resolved
    threshold (a percentile name like "p95" resolved against the sampled
    distribution); only the slowest *top* are returned.
    """
    _, max_ms = _parse_latency(max_latency)
    params = _build_params(
        start,
        end,
        limit,
        view="COMPLETE",
        min_latency=min_latency,
        services=services,
        labels=labels,
    )
    all_traces = fetch_traces(project, params, max_results=limit)
    filtered = filter_traces(all_traces, max_ms=max_ms)
    # durations is a list of (total_ms, trace) pairs.
    durations = _to_durations(filtered)

    if not durations:
        # Empty sample: return a zeroed shell with the same shape.
        return {
            "distribution": {"p50": 0, "p90": 0, "p95": 0, "p99": 0},
            "count": 0,
            "threshold": threshold,
            "thresholdMs": 0,
            "outliers": [],
        }

    pvals = _pcts([ms for ms, _ in durations])
    n = len(durations)
    thresh_ms = _resolve_threshold(threshold, pvals)
    outlier_list = [(ms, t) for ms, t in durations if ms >= thresh_ms]
    # Slowest first, keep only the top N.
    outlier_list.sort(key=lambda x: -x[0])
    outlier_list = outlier_list[:top]

    json_out = []
    for total_ms, t in outlier_list:
        tid = t.get("traceId")
        # span_self is a list of (name, self_time_ms) pairs.
        span_self = _span_breakdown(t.get("spans", []))
        # `or 1` guards the percentage division below; NOTE(review): when a
        # trace has zero self time, totalSelfMs is reported as 1, not 0.
        total_self = sum(ms for _, ms in span_self) or 1
        json_out.append(
            {
                "traceId": tid,
                "totalMs": round(total_ms, 1),
                "totalSelfMs": round(total_self, 1),
                "spans": [
                    {
                        "name": name,
                        "selfMs": round(ms, 1),
                        "pct": round(ms / total_self * 100),
                    }
                    # Cap the breakdown at the 8 largest contributors.
                    for name, ms in span_self[:8]
                ],
            }
        )

    result = {
        "distribution": {k: round(v, 1) for k, v in pvals.items()},
        "count": n,
        "threshold": threshold,
        "thresholdMs": round(thresh_ms, 1),
        "outliers": json_out,
    }

    if compare_svc:
        # Optional secondary fetch comparing outliers against another service.
        result["comparison"] = _compare_services(
            project,
            outlier_list,
            all_traces,
            compare_svc,
        )

    return result
|
|
1041
|
+
|
|
1042
|
+
|
|
1043
|
+
def trace_stats(
    project,
    *,
    start="1h",
    end=None,
    limit=100,
    span_pattern=None,
    group_by=None,
    services=(),
    labels=None,
    min_latency=None,
    max_latency=None,
    bucket=None,
    sparkline=False,
):
    """Compute latency stats, optionally grouped by span labels.

    Parameters:
        span_pattern — substring matched against span names, or None
        group_by — list of label keys to group by, or None
        labels — dict of label filters, or None
        bucket — bucket size string (e.g. '5m', '1h', '1w'), or None
        sparkline — include trend data per group

    Returns:
        {} — when no traces found.
        {"totalSpans": 0, "totalTraces": int} — when traces found but no spans matched.
        Full dict with totalSpans, totalTraces, percentiles, groups, etc. on success.
    """
    _, max_ms = _parse_latency(max_latency)
    group_keys = list(group_by) if group_by else []
    bucket_delta = _parse_bucket(bucket) if bucket else None

    params = _build_params(
        start,
        end,
        limit,
        view="COMPLETE",
        min_latency=min_latency,
        services=services,
        labels=labels,
    )
    all_traces = fetch_traces(project, params, max_results=limit)
    filtered = filter_traces(all_traces, max_ms=max_ms)

    if not filtered:
        return {}

    # Each group maps key-tuple -> list of (timestamp, duration_ms) pairs;
    # the ungrouped case uses the single key ().
    groups = defaultdict(list)
    total_spans = 0

    for t in filtered:
        for s in t.get("spans", []):
            # Substring match on span name; skip non-matching spans entirely.
            if span_pattern and span_pattern not in s.get("name", ""):
                continue
            total_spans += 1
            # NOTE(review): s["startTime"] raises KeyError if absent —
            # assumes the API always populates startTime; confirm.
            span_ts = _ts(s["startTime"])
            dur = _dur(s)
            if group_keys:
                lbl = s.get("labels", {})
                # Missing labels group under the empty string.
                key = tuple(lbl.get(k, "") for k in group_keys)
                groups[key].append((span_ts, dur))
            else:
                groups[()].append((span_ts, dur))

    if not total_spans:
        return {"totalSpans": 0, "totalTraces": len(filtered)}

    # Largest groups first.
    sorted_groups = sorted(groups.items(), key=lambda x: -len(x[1]))

    json_groups = []
    for key, data in sorted_groups:
        durs = sorted(d for _, d in data)
        entry = {
            "count": len(durs),
            "avgMs": round(sum(durs) / len(durs), 1),
            "percentiles": {k: round(v, 1) for k, v in _pcts(durs).items()},
        }
        if group_keys:
            entry["key"] = dict(zip(group_keys, key))
        if bucket_delta:
            # Time-bucketed sub-aggregates; empty buckets yield null stats.
            bkts = _make_buckets(data, bucket_delta)
            entry["buckets"] = [
                {
                    "time": lbl,
                    "count": len(bd),
                    "avgMs": round(sum(bd) / len(bd), 1) if bd else None,
                    "percentiles": (
                        {k: round(v, 1) for k, v in _pcts(bd).items()} if bd else None
                    ),
                }
                for lbl, bd in bkts
            ]
        if sparkline:
            trend_vals = _trend(data)
            if trend_vals:
                entry["trend"] = trend_vals
        json_groups.append(entry)

    out = {"totalSpans": total_spans, "totalTraces": len(filtered)}
    if span_pattern:
        out["span"] = span_pattern
    if group_keys:
        out["groupBy"] = group_keys
        out["groups"] = json_groups
    elif len(json_groups) == 1:
        # Ungrouped: flatten the single group's stats into the top level.
        out.update(json_groups[0])
    if bucket:
        out["bucket"] = bucket
    return out
|
|
1152
|
+
|
|
1153
|
+
|
|
1154
|
+
def _pct_or_none(values):
    """Rounded percentiles for *values*, or all-None placeholders when empty."""
    if values:
        return {k: round(v, 1) for k, v in _pcts(sorted(values)).items()}
    return {"p50": None, "p90": None, "p95": None, "p99": None}
|
|
1159
|
+
|
|
1160
|
+
|
|
1161
|
+
def _avg_or_none(values):
|
|
1162
|
+
"""Compute rounded average or null for empty lists."""
|
|
1163
|
+
if not values:
|
|
1164
|
+
return None
|
|
1165
|
+
return round(sum(values) / len(values), 1)
|
|
1166
|
+
|
|
1167
|
+
|
|
1168
|
+
def trace_compare(
    project,
    *,
    start="1h",
    end=None,
    limit=50,
    a_services=(),
    a_labels=None,
    a_span_name=None,
    a_min_latency=None,
    a_max_latency=None,
    b_service=None,
    b_labels=None,
    b_span_name=None,
    window_sec=30,
    group_by=None,
):
    """Describe B latency conditioned on traces where A matches filters.

    This is descriptive co-occurrence analysis in time windows, not causal
    inference across traces.

    For every A trace that matches the a_* filters, a +/- window_sec window
    is opened around its root-span start time; overlapping windows are
    merged, then B traces are fetched per window (concurrently) and their
    root-span durations summarized — overall and optionally grouped by
    B root labels.

    Raises:
        ValueError: if b_service is missing or window_sec is not positive.
    """
    if not b_service:
        raise ValueError("--b-service is required")
    if window_sec <= 0:
        raise ValueError("--window-sec must be > 0")

    resolved_end = end or _now()
    group_keys = list(group_by or [])
    _, a_min_ms = _parse_latency(a_min_latency)
    _, a_max_ms = _parse_latency(a_max_latency)

    a_params = _build_params(
        start,
        resolved_end,
        limit,
        view="ROOTSPAN",
        min_latency=a_min_latency,
        services=a_services,
        labels=a_labels,
    )
    a_traces = fetch_traces(project, a_params, max_results=limit)
    a_roots = []
    skipped_a_no_root = 0
    for t in a_traces:
        r = _root(t.get("spans", []))
        if not r:
            # Rootless traces cannot anchor a time window; count and skip.
            skipped_a_no_root += 1
            continue
        if not _matches_trace(
            t,
            span_name=a_span_name,
            services=a_services or None,
            labels=a_labels,
            min_ms=a_min_ms,
            max_ms=a_max_ms,
        ):
            continue
        a_roots.append(r)

    warnings = []
    if not a_roots:
        warnings.append(
            {
                "code": "A_EMPTY_SAMPLE",
                "message": "No A traces matched the given condition",
            }
        )

    # Build per-root windows, then merge overlapping ones to reduce API calls
    raw_windows = []
    for r in a_roots:
        t_start = _ts(r["startTime"])
        raw_windows.append(
            (
                t_start - timedelta(seconds=window_sec),
                t_start + timedelta(seconds=window_sec),
            )
        )
    raw_windows.sort()

    windows = []
    for ws, we in raw_windows:
        if windows and ws <= windows[-1][1]:
            # Overlapping — extend the previous window
            prev_s, prev_e = windows[-1]
            windows[-1] = (prev_s, max(prev_e, we))
        else:
            windows.append((ws, we))

    # Format merged windows as RFC3339 strings
    windows = [
        (ws.strftime("%Y-%m-%dT%H:%M:%SZ"), we.strftime("%Y-%m-%dT%H:%M:%SZ"))
        for ws, we in windows
    ]

    b_seen = 0
    # Dedupe B traces across overlapping windows: traceId -> root span.
    b_roots_by_trace = {}
    b_skipped_no_root = 0
    windows_succeeded = 0
    windows_failed = 0

    def _fetch_window(win):
        # Fetch B traces for one (start, end) window; never raises —
        # failures are reported via an {"ok": False, ...} result dict
        # so the collector loop can surface them as warnings.
        win_start, win_end = win
        params = _build_params(
            win_start,
            win_end,
            100,
            view="ROOTSPAN",
            services=(b_service,),
            labels=b_labels,
        )
        try:
            traces = fetch_traces(project, params, max_results=100)
            return {"ok": True, "traces": traces, "start": win_start, "end": win_end}
        except (ApiError, OSError, ValueError) as e:
            return {
                "ok": False,
                "error": str(e),
                "start": win_start,
                "end": win_end,
            }

    if windows:
        # Fan the window fetches out across at most 8 threads.
        with ThreadPoolExecutor(max_workers=min(len(windows), 8)) as pool:
            futures = {pool.submit(_fetch_window, w): w for w in windows}
            for fut in as_completed(futures):
                data = fut.result()
                if not data["ok"]:
                    windows_failed += 1
                    warnings.append(
                        {
                            "code": "B_WINDOW_FETCH_FAILED",
                            "message": (
                                f"window {data['start']}..{data['end']} failed: "
                                f"{data['error'].splitlines()[0]}"
                            ),
                        }
                    )
                    continue

                windows_succeeded += 1
                traces = data["traces"]
                for t in traces:
                    r = _root(t.get("spans", []))
                    if not r:
                        b_skipped_no_root += 1
                        continue
                    if not _matches_trace(
                        t,
                        span_name=b_span_name,
                        services=(b_service,),
                        labels=b_labels,
                    ):
                        continue
                    b_seen += 1
                    tid = t.get("traceId")
                    # First occurrence wins; later windows cannot overwrite.
                    if tid and tid not in b_roots_by_trace:
                        b_roots_by_trace[tid] = r

    if windows and windows_succeeded == 0:
        warnings.append(
            {
                "code": "B_ALL_WINDOWS_FAILED",
                "message": "All B window fetches failed; result is empty",
            }
        )

    b_durations = sorted(_dur(r) for r in b_roots_by_trace.values())
    groups = defaultdict(list)
    if group_keys:
        for r in b_roots_by_trace.values():
            rl = r.get("labels", {})
            key = tuple(rl.get(k, "") for k in group_keys)
            groups[key].append(_dur(r))

    json_groups = []
    for key, durs in groups.items():
        durs = sorted(durs)
        json_groups.append(
            {
                "key": dict(zip(group_keys, key)),
                "count": len(durs),
                "avgMs": _avg_or_none(durs),
                "percentiles": _pct_or_none(durs),
            }
        )

    # Deterministic order: biggest groups first, JSON key as tiebreaker.
    json_groups.sort(
        key=lambda g: (
            -g["count"],
            json.dumps(g.get("key", {}), sort_keys=True, separators=(",", ":")),
        )
    )

    dist = {
        "count": len(b_durations),
        "avgMs": _avg_or_none(b_durations),
        "percentiles": _pct_or_none(b_durations),
    }

    return {
        "meta": {
            "schemaVersion": "compare.v1",
            "project": project,
            "start": _parse_time(start),
            "end": resolved_end,
            "limit": limit,
            "windowSec": window_sec,
            "groupBy": group_keys,
        },
        "conditionA": {
            "filters": {
                "services": list(a_services),
                "labels": a_labels or {},
                "spanName": a_span_name,
                "minLatency": a_min_latency,
                "maxLatency": a_max_latency,
            },
            "tracesFetched": len(a_traces),
            "tracesMatched": len(a_roots),
            "skippedNoRoot": skipped_a_no_root,
        },
        "sampleB": {
            "filters": {
                "service": b_service,
                "labels": b_labels or {},
                "spanName": b_span_name,
            },
            "windowsTotal": len(windows),
            "windowsSucceeded": windows_succeeded,
            "windowsFailed": windows_failed,
            "tracesSeen": b_seen,
            "tracesDeduped": len(b_roots_by_trace),
            "skippedNoRoot": b_skipped_no_root,
            "distribution": dist,
        },
        "groups": json_groups,
        "warnings": warnings,
    }
|
|
1408
|
+
|
|
1409
|
+
|
|
1410
|
+
# ── CLI ──────────────────────────────────────────────────────────────────────
|
|
1411
|
+
|
|
1412
|
+
|
|
1413
|
+
@click.group()
@click.option(
    "--project",
    envvar="GOOGLE_CLOUD_PROJECT",
    required=True,
    help="GCP project ID (or set GOOGLE_CLOUD_PROJECT)",
)
@click.option("--json", "as_json", is_flag=True, help="Raw JSON output")
@click.pass_context
def cli(ctx, project, as_json):
    """gtraces - query and analyze GCP Cloud Traces."""
    ctx.ensure_object(dict)
    ctx.obj["project"] = project
    ctx.obj["json"] = as_json
    # Pre-warm auth token before any concurrency, so worker threads spawned
    # by subcommands never race to mint a token.
    try:
        get_token()
    except ApiError as e:
        # Chain the original ApiError so tracebacks show the root cause.
        raise click.ClickException(str(e)) from e
|
|
1432
|
+
|
|
1433
|
+
|
|
1434
|
+
@cli.command("list")
|
|
1435
|
+
@click.option(
|
|
1436
|
+
"--start",
|
|
1437
|
+
default="1h",
|
|
1438
|
+
show_default=True,
|
|
1439
|
+
help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
|
|
1440
|
+
)
|
|
1441
|
+
@click.option("--end", default=None, help="End time (default: now)")
|
|
1442
|
+
@click.option(
|
|
1443
|
+
"--limit", default=20, show_default=True, type=int, help="Max traces to fetch"
|
|
1444
|
+
)
|
|
1445
|
+
@click.option(
|
|
1446
|
+
"--service", "services", multiple=True, help="Filter by service.name (repeatable)"
|
|
1447
|
+
)
|
|
1448
|
+
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
|
|
1449
|
+
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
|
|
1450
|
+
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
|
|
1451
|
+
@click.pass_context
|
|
1452
|
+
@_cli_validate
|
|
1453
|
+
def list_cmd(ctx, start, end, limit, services, labels, min_latency, max_latency):
|
|
1454
|
+
"""List recent traces."""
|
|
1455
|
+
label_dict = _parse_labels(labels)
|
|
1456
|
+
traces = trace_list(
|
|
1457
|
+
ctx.obj["project"],
|
|
1458
|
+
start=start,
|
|
1459
|
+
end=end,
|
|
1460
|
+
limit=limit,
|
|
1461
|
+
services=services,
|
|
1462
|
+
labels=label_dict,
|
|
1463
|
+
min_latency=min_latency,
|
|
1464
|
+
max_latency=max_latency,
|
|
1465
|
+
)
|
|
1466
|
+
if ctx.obj["json"]:
|
|
1467
|
+
click.echo(json.dumps(traces, indent=2))
|
|
1468
|
+
else:
|
|
1469
|
+
render_list(traces)
|
|
1470
|
+
|
|
1471
|
+
|
|
1472
|
+
@cli.command()
@click.option(
    "--start",
    default="3h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=200, show_default=True, type=int, help="Max traces to fetch"
)
@click.pass_context
@_cli_validate
def services(ctx, start, end, limit):
    """List services and endpoints seen in recent traces."""
    result = trace_services(ctx.obj["project"], start=start, end=end, limit=limit)
    if not (result["services"] or result["endpoints"]):
        click.echo("No traces found.")
        return

    if ctx.obj["json"]:
        out = {k: v for k, v in result.items() if k != "trace_count"}
        click.echo(json.dumps(out, indent=2))
        return

    # Render the two tables with identical layout, blank line after each.
    for title, counts in (
        ("SERVICE", Counter(result["services"])),
        ("ENDPOINT", Counter(result["endpoints"])),
    ):
        click.echo(f"{title:<45} TRACES")
        click.echo("\u2500" * 55)
        for name, n in counts.most_common():
            click.echo(f" {name:<45} {n}")
        click.echo()
    click.echo(f"Scanned {result['trace_count']} traces from last {start}.")
|
|
1509
|
+
|
|
1510
|
+
|
|
1511
|
+
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=20, show_default=True, type=int, help="Max traces to fetch"
)
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.pass_context
@_cli_validate
def spans(ctx, start, end, limit, services, min_latency, max_latency):
    """List distinct span names from sampled traces."""
    result = trace_spans(
        ctx.obj["project"],
        start=start,
        end=end,
        limit=limit,
        services=services,
        min_latency=min_latency,
        max_latency=max_latency,
    )
    if not result["spans"]:
        click.echo("No traces found.")
        return

    if ctx.obj["json"]:
        click.echo(json.dumps(result["spans"], indent=2))
        return

    click.echo(f"{'SPAN NAME':<60} COUNT")
    click.echo("\u2500" * 70)
    for span_name, hits in Counter(result["spans"]).most_common():
        click.echo(f" {span_name:<60} {hits}")
    click.echo(f"\nSampled {result['trace_count']} traces.")
|
|
1554
|
+
|
|
1555
|
+
|
|
1556
|
+
@cli.command()
@click.argument("trace_id")
@click.option("--bars", is_flag=True, help="Show waterfall timing bars")
@click.option(
    "--name-width",
    default=35,
    show_default=True,
    type=int,
    help="Span name column width",
)
@click.pass_context
@_cli_validate
def get(ctx, trace_id, bars, name_width):
    """Show trace as a span tree."""
    trace = trace_get(ctx.obj["project"], trace_id)
    if not ctx.obj["json"]:
        render_tree(trace, bars=bars, name_width=name_width)
        return
    click.echo(json.dumps(trace, indent=2))
|
|
1575
|
+
|
|
1576
|
+
|
|
1577
|
+
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=50, show_default=True, type=int, help="Max traces to fetch"
)
@click.option("--span-name", default=None, help="Root span name (substring match)")
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option(
    "--parent-span-id",
    default=None,
    help="Find spans with this parentSpanId (fetches full traces)",
)
@click.option("--show-labels", is_flag=True, help="Show interesting labels in output")
@click.option(
    "--extra-fields",
    "field_str",
    default=None,
    help="Extra columns (e.g. spans,label:cloud.region,label:placement)",
)
@click.option(
    "--order-asc", default=None, type=click.Choice(["duration"]), help="Sort ascending"
)
@click.option(
    "--order-desc",
    default=None,
    type=click.Choice(["duration"]),
    help="Sort descending",
)
@click.pass_context
@_cli_validate
def search(
    ctx,
    start,
    end,
    limit,
    span_name,
    labels,
    min_latency,
    max_latency,
    services,
    parent_span_id,
    show_labels,
    field_str,
    order_asc,
    order_desc,
):
    """Search traces with client-side filtering.

    When --parent-span-id is used, full trace details are fetched to match
    inner spans (not just root spans). Useful for cross-service correlation.

    \b
    Examples:
      gtraces search --span-name "POST /v1/rtb" --min-latency 500ms
      gtraces search --min-latency 300ms --max-latency 500ms --service my-service
      gtraces search --service my-service --parent-span-id 123456
    """
    p = ctx.obj["project"]
    label_dict = _parse_labels(labels)
    fields = _parse_fields(field_str)

    result = trace_search(
        p,
        start=start,
        end=end,
        limit=limit,
        span_name=span_name,
        labels=label_dict,
        min_latency=min_latency,
        max_latency=max_latency,
        services=services,
        parent_span_id=parent_span_id,
        order_asc=order_asc,
        order_desc=order_desc,
    )

    # Plain mode: result is a list of trace dicts.
    if not parent_span_id:
        if ctx.obj["json"]:
            click.echo(json.dumps(result, indent=2))
        else:
            render_list(result, show_labels=show_labels, fields=fields)
            click.echo(f"\n{len(result)} traces matched.")
        return

    # Parent-span mode: result is a list of {traceId, spans} dicts.
    if not result:
        click.echo("No spans matched.")
        return

    if ctx.obj["json"]:
        click.echo(json.dumps(result, indent=2))
        return

    total_spans = sum(len(m["spans"]) for m in result)
    click.echo(
        f"Found {total_spans} span(s) in {len(result)} trace(s) "
        f"with parentSpanId={parent_span_id}\n"
    )
    click.echo(
        f"{'TRACE ID':<36} {'SPAN NAME':<40} "
        f"{'DURATION':>10} {'SPAN ID':<20} LABELS"
    )
    click.echo("\u2500" * 120)
    for m in result:
        tid = m.get("traceId", "?")
        for s in m["spans"]:
            name = s.get("name", "?")[:40]
            dur = _fmt_ms(_dur(s))
            sid = s.get("spanId", "?")
            lbl_d = s.get("labels", {})
            # Show only the curated label allowlist to keep rows readable.
            interesting = {k: v for k, v in lbl_d.items() if k in INTERESTING_LABELS}
            lbl = " ".join(f"{k}={v}" for k, v in interesting.items())
            click.echo(f"{tid:<36} {name:<40} {dur:>10} {sid:<20} {lbl}")
|
|
1700
|
+
|
|
1701
|
+
|
|
1702
|
+
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=50, show_default=True, type=int, help="Max A traces to fetch"
)
@click.option(
    "--a-service",
    "a_services",
    multiple=True,
    help="A filter: service.name (repeatable)",
)
@click.option("--a-label", "a_labels", multiple=True, help="A filter: label key=value")
@click.option(
    "--a-span-name",
    default=None,
    help="A filter: root span name (substring match)",
)
@click.option(
    "--a-min-latency",
    default=None,
    help="A filter: min root latency (500ms, 1s)",
)
@click.option(
    "--a-max-latency",
    default=None,
    help="A filter: max root latency (500ms, 1s)",
)
@click.option(
    "--b-service",
    required=True,
    help="B target service.name (required)",
)
@click.option("--b-label", "b_labels", multiple=True, help="B filter: label key=value")
@click.option(
    "--b-span-name",
    default=None,
    help="B filter: root span name (substring match)",
)
@click.option(
    "--window-sec",
    default=30,
    show_default=True,
    type=int,
    help="Conditioning window around each A trace (seconds)",
)
@click.option(
    "--group-by",
    default=None,
    help="Comma-separated B root label keys to group by",
)
@click.pass_context
@_cli_validate
def compare(
    ctx,
    start,
    end,
    limit,
    a_services,
    a_labels,
    a_span_name,
    a_min_latency,
    a_max_latency,
    b_service,
    b_labels,
    b_span_name,
    window_sec,
    group_by,
):
    """Describe B latency conditioned on A traces (descriptive, non-causal).

    Thin CLI wrapper around trace_compare: parses the repeatable label
    options and the comma-separated --group-by list, then renders the
    result as JSON or a formatted report.

    \b
    Examples:
      gtraces compare --a-service config-service --b-service ssp-service-go
      gtraces compare --a-service config-service --a-min-latency 600ms --b-service ssp-service-go
      gtraces --json compare --a-service config-service --b-service ssp-service-go --group-by cloud.region
    """
    result = trace_compare(
        ctx.obj["project"],
        start=start,
        end=end,
        limit=limit,
        a_services=a_services,
        a_labels=_parse_labels(a_labels),
        a_span_name=a_span_name,
        a_min_latency=a_min_latency,
        a_max_latency=a_max_latency,
        b_service=b_service,
        b_labels=_parse_labels(b_labels),
        b_span_name=b_span_name,
        window_sec=window_sec,
        group_by=_parse_group_by(group_by),
    )

    if ctx.obj["json"]:
        click.echo(json.dumps(result, indent=2))
    else:
        render_compare(result)
|
|
1805
|
+
|
|
1806
|
+
|
|
1807
|
+
@cli.command()
@click.argument("trace_id")
@click.pass_context
@_cli_validate
def analyze(ctx, trace_id):
    """Timeline analysis with bottleneck detection."""
    # Fetch the full trace once, then render either machine- or
    # human-readable output depending on the group-level --json flag.
    fetched = trace_get(ctx.obj["project"], trace_id)
    if not ctx.obj["json"]:
        render_timeline(fetched)
    else:
        click.echo(json.dumps(fetched, indent=2))
|
|
1818
|
+
|
|
1819
|
+
|
|
1820
|
+
# ── Outliers helpers ─────────────────────────────────────────────────────────
|
|
1821
|
+
|
|
1822
|
+
|
|
1823
|
+
def _span_breakdown(all_spans):
    """Compute per-span exclusive (self) time. Returns sorted [(name, ms)]."""
    # Total duration of each span's direct children, keyed by parent span id.
    child_totals = defaultdict(float)
    for span in all_spans:
        parent = span.get("parentSpanId")
        if parent:
            child_totals[parent] += _dur(span)

    # Exclusive time = own duration minus the time attributed to children;
    # clamp at zero and drop spans that contribute nothing themselves.
    breakdown = []
    for span in all_spans:
        exclusive = max(0, _dur(span) - child_totals.get(span.get("spanId"), 0))
        if exclusive > 0:
            breakdown.append((span.get("name", "?"), exclusive))

    # Descending by self time (stable sort, so ties keep input order —
    # identical to sorting ascending on the negated value).
    breakdown.sort(key=lambda item: item[1], reverse=True)
    return breakdown
|
|
1840
|
+
|
|
1841
|
+
|
|
1842
|
+
def _compare_services(project, outlier_list, all_traces, compare_svc, window_sec=30):
    """Cross-service latency comparison during outlier windows.

    For each outlier trace that has a root span, traces of *compare_svc*
    within a +/- ``window_sec`` second window around the outlier's start
    time are fetched and summarized.

    Args:
        project: GCP project id used for the window fetches.
        outlier_list: list of ``(total_ms, trace_dict)`` outliers.
        all_traces: full trace set used for the overall distribution.
        compare_svc: ``service.name`` value to compare against.
        window_sec: half-width of the conditioning window in seconds
            (new parameter; defaults to the previously hard-coded 30).

    Returns:
        dict with "service", "distribution" (overall percentiles, or None
        when the comparison service has no traces) and "windows"
        (per-outlier time/primaryMs plus avg/max/count when data exists).
    """
    cmp_durations = _to_durations(filter_traces(all_traces, services=compare_svc))

    comparison = {"service": compare_svc, "distribution": None, "windows": []}

    if not cmp_durations:
        return comparison

    cmp_pvals = _pcts([ms for ms, _ in cmp_durations])
    comparison["distribution"] = {
        "count": len(cmp_durations),
        "percentiles": {k: round(v, 1) for k, v in cmp_pvals.items()},
    }

    # Build window params for each outlier; outliers without a root span
    # cannot anchor a time window and are skipped.
    windows = []
    for total_ms, t in outlier_list:
        r = _root(t.get("spans", []))
        if not r:
            continue
        t_start = _ts(r["startTime"])
        half = timedelta(seconds=window_sec)
        win_start = (t_start - half).strftime("%Y-%m-%dT%H:%M:%SZ")
        win_end = (t_start + half).strftime("%Y-%m-%dT%H:%M:%SZ")
        windows.append(
            (
                total_ms,
                r,
                {
                    "pageSize": 20,
                    "startTime": win_start,
                    "endTime": win_end,
                    "view": "ROOTSPAN",
                },
            )
        )

    # BUG FIX: if every outlier lacked a root span, `windows` is empty and
    # ThreadPoolExecutor(max_workers=0) raises ValueError. Bail out early.
    if not windows:
        return comparison

    # Fetch all windows in parallel; results are stored by index so the
    # output order matches `windows` regardless of completion order.
    def _fetch_window(win_params):
        return fetch_traces(project, win_params, max_results=20)

    with ThreadPoolExecutor(max_workers=min(len(windows), 8)) as pool:
        futures = {
            pool.submit(_fetch_window, wp): idx
            for idx, (_, _, wp) in enumerate(windows)
        }
        win_results = [[] for _ in windows]
        for fut in as_completed(futures):
            win_results[futures[fut]] = fut.result()

    for (total_ms, r, _), win_traces in zip(windows, win_results):
        win_durs = _to_durations(filter_traces(win_traces or [], services=compare_svc))

        window_data = {"time": r["startTime"][:19], "primaryMs": round(total_ms, 1)}
        if win_durs:
            cmp_vals = [ms for ms, _ in win_durs]
            window_data["avgMs"] = round(sum(cmp_vals) / len(cmp_vals), 1)
            window_data["maxMs"] = round(max(cmp_vals), 1)
            window_data["count"] = len(cmp_vals)

        comparison["windows"].append(window_data)

    return comparison
|
|
1911
|
+
|
|
1912
|
+
|
|
1913
|
+
# ── Outliers command ─────────────────────────────────────────────────────────
|
|
1914
|
+
|
|
1915
|
+
|
|
1916
|
+
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=50, show_default=True, type=int, help="Max traces to fetch"
)
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.option(
    "--threshold",
    default="p95",
    show_default=True,
    help="Outlier threshold (p50, p90, p95, p99, or raw like 500ms)",
)
@click.option("--top", default=5, show_default=True, type=int, help="Outliers to show")
@click.option(
    "--compare",
    "compare_svc",
    default=None,
    help="Compare with another service in the same time window",
)
@click.pass_context
@_cli_validate
def outliers(
    ctx,
    start,
    end,
    limit,
    services,
    labels,
    min_latency,
    max_latency,
    threshold,
    top,
    compare_svc,
):
    """Find outlier traces and show per-span time breakdown.

    Use --compare to correlate with another service at the same timestamps.

    \b
    Examples:
      gtraces outliers --service my-service
      gtraces outliers --service my-service --label k8s.cluster.name=us-east1-a
      gtraces outliers --service my-service --compare other-service
    """
    p = ctx.obj["project"]
    as_json = ctx.obj["json"]
    label_dict = _parse_labels(labels)

    result = trace_outliers(
        p,
        start=start,
        end=end,
        limit=limit,
        services=services,
        labels=label_dict,
        min_latency=min_latency,
        max_latency=max_latency,
        threshold=threshold,
        top=top,
        compare_svc=compare_svc,
    )

    # JSON mode dumps the raw result and skips all text rendering.
    if as_json:
        click.echo(json.dumps(result, indent=2))
        return

    # No traces
    if result["count"] == 0:
        click.echo("No traces found.")
        return

    # Distribution
    pvals = result["distribution"]
    click.echo(f"Latency distribution ({result['count']} traces):\n")
    for label, ms in pvals.items():
        click.echo(f" {label} {_fmt_ms(ms)}")
    click.echo()

    if not result["outliers"]:
        click.echo(
            f"No outliers above {result['threshold']} "
            f"({_fmt_ms(result['thresholdMs'])})."
        )
        return

    # Outlier table
    click.echo(
        f"Outliers above {result['threshold']} "
        f"({_fmt_ms(result['thresholdMs'])}): "
        f"{len(result['outliers'])} shown\n"
    )
    click.echo(f"{'#':<3} {'TRACE ID':<36} {'TOTAL':>10} TOP SPANS (self time)")
    click.echo("\u2500" * 110)

    for i, o in enumerate(result["outliers"], 1):
        total_ms = o["totalMs"]
        tid = o["traceId"]
        # `or 1` guards the percentage division against a zero self-time sum.
        total_self = o["totalSelfMs"] or 1
        top_spans = o["spans"][:5]
        # BUG FIX: top_spans[0] was previously read unconditionally; an
        # outlier with an empty span breakdown raised IndexError. Print the
        # trace row without span details instead.
        if not top_spans:
            click.echo(f"{i:<3} {tid:<36} {_fmt_ms(total_ms):>10}")
            continue
        first = top_spans[0]
        first_pct = first["selfMs"] / total_self * 100
        click.echo(
            f"{i:<3} {tid:<36} {_fmt_ms(total_ms):>10} "
            f"{first['name']} {_fmt_ms(first['selfMs'])} ({first_pct:.0f}%)"
        )
        # Remaining spans are indented to align under the first span column.
        for s in top_spans[1:]:
            pct = s["selfMs"] / total_self * 100
            click.echo(f"{'':>52}{s['name']} {_fmt_ms(s['selfMs'])} ({pct:.0f}%)")

    # Comparison
    if result.get("comparison"):
        click.echo(f"\n{'=' * 110}")
        click.echo(f"Comparing with: {result['comparison']['service']}\n")
        _render_comparison(result["comparison"], pvals, services)
|
|
2041
|
+
|
|
2042
|
+
|
|
2043
|
+
# ── Stats command ────────────────────────────────────────────────────────────
|
|
2044
|
+
|
|
2045
|
+
|
|
2046
|
+
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=100, show_default=True, type=int, help="Max traces to fetch"
)
@click.option(
    "--span-name",
    "span_pattern",
    default=None,
    help="Span name filter (substring match)",
)
@click.option(
    "--group-by",
    "group_by",
    default=None,
    help="Comma-separated label keys to group by",
)
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.option(
    "--bucket",
    "bucket_str",
    default=None,
    help="Time bucket size (e.g. 5m, 10m, 1h, 1w)",
)
@click.option(
    "--sparkline", "sparkline", is_flag=True, help="Show p50 trend sparkline per group"
)
@click.pass_context
@_cli_validate
def stats(
    ctx,
    start,
    end,
    limit,
    span_pattern,
    group_by,
    services,
    labels,
    min_latency,
    max_latency,
    bucket_str,
    sparkline,
):
    """Latency stats, optionally grouped by span labels.

    Collects matching spans from fetched traces and computes percentile
    distributions. Use --group-by to break down by one or more label keys.

    \b
    Examples:
      gtraces stats --span-name "POST /v1/rtb" --group-by cloud.region
      gtraces stats --span-name HTTP --group-by cloud.region,service.name
      gtraces stats --service my-service --group-by abtest
      gtraces stats --service my-service --bucket 5m
      gtraces stats --service my-service --group-by abtest --sparkline
    """
    p = ctx.obj["project"]
    as_json = ctx.obj["json"]
    label_dict = _parse_labels(labels)
    # "--group-by a,b" becomes ["a", "b"]; empty list when not given.
    group_keys = [k.strip() for k in group_by.split(",")] if group_by else []

    result = trace_stats(
        p,
        start=start,
        end=end,
        limit=limit,
        span_pattern=span_pattern,
        group_by=group_keys or None,
        services=services,
        labels=label_dict,
        min_latency=min_latency,
        max_latency=max_latency,
        bucket=bucket_str,
        sparkline=sparkline,
    )

    # NOTE(review): unlike `outliers`, the empty-result checks here run
    # BEFORE the --json branch, so JSON mode can emit a plain-text message.
    # Confirm whether that asymmetry is intentional.
    if not result:
        click.echo("No traces found.")
        return

    if result.get("totalSpans", 0) == 0:
        click.echo("No spans matched.")
        return

    if as_json:
        click.echo(json.dumps(result, indent=2))
        return

    # ── Text output ──────────────────────────────────────────────────────
    total_spans = result["totalSpans"]
    total_traces = result["totalTraces"]
    desc = f"Latency stats ({total_spans} spans across {total_traces} traces)"
    if span_pattern:
        desc += f' span ~ "{span_pattern}"'
    click.echo(desc)
    click.echo()

    # Grouped results live under "groups"; an ungrouped result carries its
    # percentiles at the top level, so wrap it in a one-element list.
    groups = result.get("groups", [result] if "percentiles" in result else [])

    # ── Bucket mode ──────────────────────────────────────────────────────
    if bucket_str:
        for gi, g in enumerate(groups):
            if group_keys:
                # Per-group header, e.g. "── cloud.region=us-east1 (42 spans) ──".
                key = g.get("key", {})
                label = " ".join(
                    f"{k}={key.get(k, '') or '(none)'}" for k in group_keys
                )
                click.echo(f"\u2500\u2500 {label} ({g['count']} spans) \u2500\u2500")
                click.echo()

            click.echo(
                f"{'TIME':<13} {'COUNT':>6} "
                f"{'avg':>10} {'p50':>10} {'p90':>10} {'p95':>10} {'p99':>10}"
            )
            click.echo("\u2500" * 81)

            for b in g.get("buckets", []):
                if b["count"] > 0:
                    pv = b["percentiles"]
                    click.echo(
                        f"{b['time']:<13} {b['count']:>6} "
                        f"{_fmt_ms(b['avgMs']):>10} {_fmt_ms(pv['p50']):>10} "
                        f"{_fmt_ms(pv['p90']):>10} {_fmt_ms(pv['p95']):>10} "
                        f"{_fmt_ms(pv['p99']):>10}"
                    )
                else:
                    # Empty bucket: keep the row so the time axis stays contiguous.
                    click.echo(
                        f"{b['time']:<13} {'0':>6} "
                        f"{'-':>10} {'-':>10} {'-':>10} {'-':>10} {'-':>10}"
                    )

            if gi < len(groups) - 1:
                click.echo()
                click.echo()
        return

    # ── Summary mode (with optional sparkline) ───────────────────────────
    if group_keys:
        rows = []
        for g in groups:
            key = g.get("key", {})
            label = " ".join(f"{k}={key.get(k, '') or '(none)'}" for k in group_keys)
            rows.append((label, g))

        # Column width tracks the longest group label, minimum 5 ("GROUP").
        max_lbl = max(len(r[0]) for r in rows)
        max_lbl = max(max_lbl, 5)

        hdr = (
            f"{'GROUP':<{max_lbl}} {'COUNT':>6} "
            f"{'avg':>10} {'p50':>10} {'p90':>10} {'p95':>10} {'p99':>10}"
        )
        if sparkline:
            hdr += " TREND"
        click.echo(hdr)
        click.echo("\u2500" * (max_lbl + 68 + (30 if sparkline else 0)))

        for label, g in rows:
            pv = g["percentiles"]
            line = (
                f"{label:<{max_lbl}} {g['count']:>6} "
                f"{_fmt_ms(g['avgMs']):>10} {_fmt_ms(pv['p50']):>10} "
                f"{_fmt_ms(pv['p90']):>10} {_fmt_ms(pv['p95']):>10} "
                f"{_fmt_ms(pv['p99']):>10}"
            )
            if g.get("trend"):
                line += f" {g['trend']}"
            click.echo(line)
    else:
        # Single ungrouped distribution.
        g = groups[0] if groups else result
        pv = g["percentiles"]
        click.echo(f" avg {_fmt_ms(g['avgMs'])}")
        for label, ms in pv.items():
            click.echo(f" {label} {_fmt_ms(ms)}")
        if sparkline and g.get("trend"):
            click.echo(f"\n trend {g['trend']}")

    click.echo()
|
|
2234
|
+
|
|
2235
|
+
|
|
2236
|
+
# Script entry point: run the click group when executed directly.
if __name__ == "__main__":
    cli()
|