gtraces 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gtraces/__init__.py ADDED
@@ -0,0 +1,2237 @@
1
+ #!/usr/bin/env python3
2
+ """gtraces - CLI for GCP Cloud Trace API v1.
3
+
4
+ Also importable as a library:
5
+
6
+ import gtraces
7
+ gtraces.set_token("my-token") # or monkey-patch get_token
8
+ traces = gtraces.trace_list("my-project", start="1h")
9
+ """
10
+
11
+ import functools
12
+ import json
13
+ import os
14
+ import random
15
+ import re
16
+ import subprocess
17
+ import time
18
+ from collections import Counter, defaultdict
19
+ from concurrent.futures import ThreadPoolExecutor, as_completed
20
+ from datetime import datetime, timedelta, timezone
21
+ from urllib.error import HTTPError
22
+ from urllib.parse import urlencode
23
+ from urllib.request import Request, urlopen
24
+
25
+ import click
26
+
27
# Names exported by `from gtraces import *`; mirrors the CLI subcommands
# plus the auth/fetch/filter primitives for library use.
__all__ = [
    "trace_list",
    "trace_services",
    "trace_spans",
    "trace_get",
    "trace_search",
    "trace_outliers",
    "trace_stats",
    "trace_compare",
    "get_token",
    "set_token",
    "fetch_traces",
    "filter_traces",
    "ApiError",
]

# ── Constants ────────────────────────────────────────────────────────────────

# Default project from the standard GCP environment variable; may be None,
# in which case callers must supply a project explicitly.
DEFAULT_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
# Cloud Trace API v1 REST base URL; project and path are appended in api_get.
API = "https://cloudtrace.googleapis.com/v1/projects"

# Span label keys surfaced in table/timeline output; all other labels are
# suppressed for readability. Mixes OpenTelemetry/HTTP conventions with
# app-specific keys (bids/auction/deadline — presumably an ad-auction
# domain; confirm against the producing services).
INTERESTING_LABELS = {
    "bids",
    "deadline_ms",
    "actual_deadline_ms",
    "parent_deadline_ms",
    "auctionType",
    "done_by",
    "done_at_ms",
    "ctx_done_at_ms",
    "responses_before_deadline",
    "responses_before_done",
    "drained_count",
    "auction",
    "http.response.status_code",
    "http.ttfb_ms",
    "otel.status_code",
    "otel.status_description",
    "service.name",
    "cloud.region",
    "k8s.cluster.name",
    "placement",
    "publisher_country",
    "abtest",
}
72
+
73
# ── Auth & HTTP ──────────────────────────────────────────────────────────────

# Process-wide token cache; populated lazily by get_token() or directly
# via set_token().
_token = None


class ApiError(Exception):
    """Raised on API failures with actionable hints.

    Inherits from Exception (not click.ClickException) so library consumers
    don't need click as a dependency for exception handling.
    """
84
+
85
+
86
def get_token():
    """Return a gcloud access token, caching it for the process lifetime."""
    global _token
    if _token:
        return _token
    try:
        proc = subprocess.run(
            ["gcloud", "auth", "print-access-token"],
            capture_output=True,
            text=True,
            check=True,
        )
    except FileNotFoundError:
        raise ApiError(
            "gcloud not found. Install: https://cloud.google.com/sdk/docs/install"
        )
    except subprocess.CalledProcessError:
        raise ApiError("Auth failed. Run: gcloud auth login")
    _token = proc.stdout.strip()
    return _token
106
+
107
+
108
def set_token(tok):
    """Install *tok* as the cached auth token, skipping the gcloud CLI.

    Convenience for library consumers who obtain tokens externally
    (e.g. via google-auth ADC).
    """
    global _token
    _token = tok
116
+
117
+
118
# Retry policy for rate-limited (HTTP 429) responses.
_MAX_RETRIES = 3
_RETRY_BASE_SEC = 1.0


def api_get(project, path, params=None):
    """GET from Cloud Trace API v1 with retry on 429. Returns parsed JSON.

    Parameters:
        project — GCP project id, interpolated into the URL path.
        path — path suffix after the project (e.g. '/traces').
        params — optional dict of query parameters.

    Raises:
        ApiError — on any HTTP failure, with a hint keyed by status code.
    """
    url = f"{API}/{project}{path}"
    if params:
        url += "?" + urlencode(params)
    last_exc = None
    for attempt in range(_MAX_RETRIES + 1):
        # Token is cached by get_token(), so rebuilding the Request per
        # attempt is cheap.
        req = Request(url, headers={"Authorization": f"Bearer {get_token()}"})
        try:
            with urlopen(req, timeout=30) as resp:
                return json.loads(resp.read())
        except HTTPError as e:
            body = e.read().decode(errors="replace")
            if e.code == 429 and attempt < _MAX_RETRIES:
                # Exponential backoff with jitter: ~1s, 2s, 4s (+0..0.5s).
                delay = _RETRY_BASE_SEC * (2**attempt) + random.uniform(0, 0.5)
                time.sleep(delay)
                last_exc = e
                continue
            msgs = {
                401: "Auth expired. Run: gcloud auth login",
                403: f"Permission denied for project '{project}'",
                404: "Not found",
                429: (
                    f"Rate limited after {_MAX_RETRIES} retries. "
                    "Reduce --limit or try again later"
                ),
            }
            raise ApiError(f"{msgs.get(e.code, f'HTTP {e.code}')}\n{body}")
    # Unreachable in practice — the final attempt either returns or raises
    # above; kept as a safety net.
    raise ApiError(f"Request failed after {_MAX_RETRIES} retries: {last_exc}")
151
+
152
+
153
def fetch_traces(project, params, max_results=None):
    """Fetch traces, following nextPageToken until exhausted or capped."""
    collected = []
    page_params = params
    while True:
        page = api_get(project, "/traces", page_params)
        collected.extend(page.get("traces", []))
        # Stop early once the caller's cap is reached.
        if max_results and len(collected) >= max_results:
            return collected[:max_results]
        next_token = page.get("nextPageToken")
        if not next_token:
            return collected
        page_params = {**page_params, "pageToken": next_token}
166
+
167
+
168
+ # ── Helpers ──────────────────────────────────────────────────────────────────
169
+
170
+
171
+ def _ts(s):
172
+ """RFC3339 string to datetime."""
173
+ return datetime.fromisoformat(s.replace("Z", "+00:00"))
174
+
175
+
176
def _dur(span):
    """Span duration in milliseconds."""
    started, ended = _ts(span["startTime"]), _ts(span["endTime"])
    return (ended - started).total_seconds() * 1000
179
+
180
+
181
+ def _root(spans):
182
+ """Find the root span (no parentSpanId)."""
183
+ for s in spans:
184
+ if not s.get("parentSpanId"):
185
+ return s
186
+ return spans[0] if spans else None
187
+
188
+
189
+ def _fmt_ms(ms):
190
+ """Format milliseconds for display."""
191
+ if ms >= 1000:
192
+ return f"{ms / 1000:.2f}s"
193
+ return f"{ms:.1f}ms"
194
+
195
+
196
+ def _now():
197
+ return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
198
+
199
+
200
+ def _parse_time(s):
201
+ """Parse relative (1h, 30m, 2d, 1w) or RFC3339 to RFC3339 string."""
202
+ m = re.match(r"^(\d+)([mhdw])$", s)
203
+ if m:
204
+ n, u = int(m.group(1)), m.group(2)
205
+ delta = {
206
+ "m": timedelta(minutes=n),
207
+ "h": timedelta(hours=n),
208
+ "d": timedelta(days=n),
209
+ "w": timedelta(weeks=n),
210
+ }[u]
211
+ return (datetime.now(timezone.utc) - delta).strftime("%Y-%m-%dT%H:%M:%SZ")
212
+ return s
213
+
214
+
215
+ def _parse_latency(s):
216
+ """Parse '500ms' or '1.5s' to (api_filter_str, ms_float). Returns (None, None) if empty."""
217
+ if not s:
218
+ return None, None
219
+ m = re.match(r"^(\d+(?:\.\d+)?)(ms|s)$", s)
220
+ if not m:
221
+ raise ValueError(f"Bad latency: {s} (use e.g. 500ms, 1s)")
222
+ val, unit = float(m.group(1)), m.group(2)
223
+ ms = val if unit == "ms" else val * 1000
224
+ return f"{m.group(1)}{m.group(2)}", ms
225
+
226
+
227
+ def _parse_labels(label_tuple):
228
+ """Parse ('key=value', ...) tuple into a dict."""
229
+ d = {}
230
+ for label in label_tuple:
231
+ if "=" not in label:
232
+ raise click.BadParameter(f"Expected key=value, got: {label}")
233
+ k, v = label.split("=", 1)
234
+ d[k] = v
235
+ return d or None
236
+
237
+
238
+ def _parse_group_by(group_by):
239
+ """Parse comma-separated grouping keys."""
240
+ if not group_by:
241
+ return []
242
+ return [k.strip() for k in group_by.split(",") if k.strip()]
243
+
244
+
245
def _build_params(
    start, end, limit, view="ROOTSPAN", min_latency=None, services=None, labels=None
):
    """Build common Cloud Trace API query params.

    Parameters:
        start — relative ('1h') or RFC3339 start time.
        end — relative or RFC3339 end time; defaults to now.
        limit — desired trace count; pageSize is capped at the API max of 100.
        view — 'ROOTSPAN' or 'COMPLETE'.
        min_latency — latency filter string (e.g. '500ms'), applied server-side.
        services — iterable of service.name filter terms, or None.
        labels — dict of label filter terms, or None.

    Returns:
        dict of query parameters, including a combined 'filter' when any
        filter terms were given.
    """
    params = {
        "pageSize": min(limit, 100),
        "startTime": _parse_time(start),
        # Fix: run `end` through _parse_time too, so relative offsets
        # ('5m') work symmetrically with `start`. Absolute RFC3339 strings
        # pass through _parse_time unchanged, so this is backward-compatible.
        "endTime": _parse_time(end) if end else _now(),
        "view": view,
    }
    parts = []
    if min_latency:
        filt, ms = _parse_latency(min_latency)
        # Skip a zero-valued latency filter — it would match everything.
        if ms and ms > 0:
            parts.append(f"latency:{filt}")
    if services:
        for svc in services:
            parts.append(f"service.name:{svc}")
    if labels:
        for k, v in labels.items():
            parts.append(f"{k}:{v}")
    if parts:
        params["filter"] = " ".join(parts)
    return params
269
+
270
+
271
def _matches_trace(
    trace,
    *,
    span_name=None,
    services=None,
    labels=None,
    min_ms=None,
    max_ms=None,
):
    """Return True when the trace passes all supplied filters."""
    # Accept a lone service string for convenience.
    if isinstance(services, str):
        services = (services,)
    spans = trace.get("spans", [])
    root = _root(spans)
    if not root:
        return False
    # Substring match on the root span name.
    if span_name and span_name not in (root.get("name") or ""):
        return False
    duration = _dur(root)
    if min_ms is not None and duration < min_ms:
        return False
    if max_ms is not None and duration > max_ms:
        return False
    # Service filter matches any span in the trace, not only the root.
    if services:
        wanted = set(services)
        if not any(
            s.get("labels", {}).get("service.name") in wanted for s in spans
        ):
            return False
    # Label filters must all match on the root span's labels.
    if labels:
        root_labels = root.get("labels", {})
        for key, value in labels.items():
            if root_labels.get(key) != value:
                return False
    return True
303
+
304
+
305
def filter_traces(
    traces, *, span_name=None, services=None, labels=None, min_ms=None, max_ms=None
):
    """Filter traces by root span name, service(s), labels, and duration range.

    Unified filter — replaces the former match_traces + _filter_by_labels.
    """
    kept = []
    for trace in traces:
        if _matches_trace(
            trace,
            span_name=span_name,
            services=services,
            labels=labels,
            min_ms=min_ms,
            max_ms=max_ms,
        ):
            kept.append(trace)
    return kept
324
+
325
+
326
def _to_durations(traces):
    """Return (ms, trace) pairs for traces with a root span, sorted by ms."""
    pairs = [
        (_dur(root), trace)
        for trace in traces
        if (root := _root(trace.get("spans", [])))
    ]
    return sorted(pairs, key=lambda pair: pair[0])
335
+
336
+
337
def _resolve_threshold(threshold, pvals):
    """Map a threshold ('p95' or e.g. '500ms') to milliseconds."""
    try:
        # Percentile names resolve against the computed distribution.
        return pvals[threshold]
    except KeyError:
        pass
    _, ms = _parse_latency(threshold)
    if ms is None:
        raise ValueError(
            f"Bad threshold: {threshold} (use p50/p90/p95/p99 or e.g. 500ms)"
        )
    return ms
347
+
348
+
349
+ def _pcts(values):
350
+ """Compute p50/p90/p95/p99 from a sorted list of numbers."""
351
+ n = len(values)
352
+ if n == 0:
353
+ return {"p50": 0, "p90": 0, "p95": 0, "p99": 0}
354
+ targets = {"p50": 0.50, "p90": 0.90, "p95": 0.95, "p99": 0.99}
355
+ return {k: values[min(int(v * n), n - 1)] for k, v in targets.items()}
356
+
357
+
358
+ def _cli_validate(fn):
359
+ """Decorator: convert library exceptions to Click exceptions in CLI commands."""
360
+
361
+ @functools.wraps(fn)
362
+ def wrapper(*args, **kwargs):
363
+ try:
364
+ return fn(*args, **kwargs)
365
+ except ValueError as e:
366
+ raise click.BadParameter(str(e))
367
+ except ApiError as e:
368
+ raise click.ClickException(str(e))
369
+
370
+ return wrapper
371
+
372
+
373
+ # ── Timeseries helpers ───────────────────────────────────────────────────────
374
+
375
+ _SPARK = "▁▂▃▄▅▆▇█"
376
+
377
+
378
+ def _sparkline(values):
379
+ """Render a list of numbers as a Unicode sparkline."""
380
+ if not values:
381
+ return ""
382
+ mn, mx = min(values), max(values)
383
+ if mn == mx:
384
+ return _SPARK[3] * len(values)
385
+ rng = mx - mn
386
+ return "".join(_SPARK[min(int((v - mn) / rng * 7), 7)] for v in values)
387
+
388
+
389
+ def _parse_bucket(s):
390
+ """Parse bucket size (5m, 10m, 1h, 1w) to timedelta."""
391
+ m = re.match(r"^(\d+)([mhdw])$", s)
392
+ if not m:
393
+ raise ValueError(f"Bad bucket: {s} (use e.g. 5m, 1h, 1w)")
394
+ n, u = int(m.group(1)), m.group(2)
395
+ return {
396
+ "m": timedelta(minutes=n),
397
+ "h": timedelta(hours=n),
398
+ "d": timedelta(days=n),
399
+ "w": timedelta(weeks=n),
400
+ }[u]
401
+
402
+
403
+ def _make_buckets(data, bucket_delta):
404
+ """Split [(datetime, ms)] into aligned time buckets.
405
+
406
+ Returns [(label, sorted_durs)] sorted by time, including empty buckets.
407
+ """
408
+ if not data:
409
+ return []
410
+ bsec = bucket_delta.total_seconds()
411
+ min_ts = min(ts for ts, _ in data)
412
+ max_ts = max(ts for ts, _ in data)
413
+
414
+ # Align to bucket boundary
415
+ start_epoch = (min_ts.timestamp() // bsec) * bsec
416
+ max_idx = int((max_ts.timestamp() - start_epoch) // bsec)
417
+
418
+ bins = defaultdict(list)
419
+ for ts, dur in data:
420
+ idx = int((ts.timestamp() - start_epoch) // bsec)
421
+ bins[idx].append(dur)
422
+
423
+ result = []
424
+ for idx in range(max_idx + 1):
425
+ t = datetime.fromtimestamp(start_epoch + idx * bsec, tz=timezone.utc)
426
+ t_end = t + bucket_delta
427
+ label = f"{t.strftime('%H:%M')}-{t_end.strftime('%H:%M')}"
428
+ durs = sorted(bins.get(idx, []))
429
+ result.append((label, durs))
430
+ return result
431
+
432
+
433
def _trend(data, n_buckets=12):
    """Compute a p50 sparkline with a range indicator from [(datetime, ms)]."""
    if len(data) < 2:
        return ""
    times = [ts for ts, _ in data]
    lo, hi = min(times), max(times)
    window = (hi - lo).total_seconds()
    if window <= 0:
        return ""
    step = window / n_buckets
    grouped = [[] for _ in range(n_buckets)]
    for ts, dur in data:
        # Clamp the final point into the last bucket.
        slot = min(int((ts - lo).total_seconds() / step), n_buckets - 1)
        grouped[slot].append(dur)
    medians = []
    for bucket in grouped:
        if not bucket:
            medians.append(0)
            continue
        bucket.sort()
        medians.append(bucket[len(bucket) // 2])
    spark = _sparkline(medians)
    nonzero = [m for m in medians if m > 0]
    if not nonzero:
        return spark
    return f"{spark} {_fmt_ms(min(nonzero))}-{_fmt_ms(max(nonzero))}"
459
+
460
+
461
+ # ── Rendering ────────────────────────────────────────────────────────────────
462
+
463
+
464
+ def _parse_fields(field_str):
465
+ """Parse comma-separated field spec (e.g. 'spans,label:cloud.region')."""
466
+ if not field_str:
467
+ return []
468
+ fields = []
469
+ for f in field_str.split(","):
470
+ f = f.strip()
471
+ if f.startswith("label:"):
472
+ key = f[6:]
473
+ fields.append({"type": "label", "key": key, "header": key})
474
+ elif f == "spans":
475
+ fields.append({"type": "spans", "header": "SPANS"})
476
+ else:
477
+ raise click.BadParameter(
478
+ f"Unknown field: {f} (use 'spans' or 'label:KEY')"
479
+ )
480
+ return fields
481
+
482
+
483
+ def _extract_field(t, root, field):
484
+ """Extract a field value from a trace."""
485
+ if field["type"] == "label":
486
+ return (root or {}).get("labels", {}).get(field["key"], "")
487
+ if field["type"] == "spans":
488
+ return str(len(t.get("spans", [])))
489
+ return ""
490
+
491
+
492
def render_list(traces, show_labels=False, fields=None):
    """Render traces as a compact table with dynamic column widths.

    Parameters:
        traces — list of trace dicts (root-span data is sufficient).
        show_labels — append a LABELS column with INTERESTING_LABELS entries.
        fields — extra field specs from _parse_fields(), or None.
    """
    if not traces:
        click.echo("No traces found.")
        return

    SEP = " "

    # Pre-compute all cell values per row
    headers = ["TRACE ID", "ROOT SPAN", "DURATION", "TIME"]
    extra_headers = [f["header"] for f in fields] if fields else []
    all_headers = headers + extra_headers

    rows = []
    for t in traces:
        r = _root(t.get("spans", []))
        tid = t.get("traceId", "?")
        name = r.get("name", "?") if r else "?"
        dur = _fmt_ms(_dur(r)) if r else "?"
        # Fix: renamed from `time` — the old local shadowed the `time`
        # module imported at the top of the file.
        started = r.get("startTime", "?")[:19] if r else "?"
        extra = [_extract_field(t, r, f) for f in fields] if fields else []
        lbl_str = ""
        if show_labels and r:
            lbl = r.get("labels", {})
            interesting = {k: v for k, v in lbl.items() if k in INTERESTING_LABELS}
            if interesting:
                lbl_str = " ".join(f"{k}={v}" for k, v in interesting.items())
        rows.append(([tid, name, dur, started] + extra, lbl_str))

    # Compute column widths from header + data
    col_widths = [len(h) for h in all_headers]
    for cells, _ in rows:
        for i, val in enumerate(cells):
            col_widths[i] = max(col_widths[i], len(val))

    # Right-align DURATION column (index 2)
    right_align = {2}

    def fmt_row(cells):
        parts = []
        for i, val in enumerate(cells):
            w = col_widths[i]
            parts.append(f"{val:>{w}}" if i in right_align else f"{val:<{w}}")
        return SEP.join(parts)

    click.echo(fmt_row(all_headers) + (SEP + "LABELS" if show_labels else ""))
    click.echo("\u2500" * sum(col_widths + [len(SEP) * (len(col_widths) - 1)]))

    for cells, lbl_str in rows:
        line = fmt_row(cells)
        if lbl_str:
            line += SEP + lbl_str
        click.echo(line)
545
+
546
+
547
+ def _bar(offset_ms, dur_ms, total_ms, width):
548
+ """Render a positioned horizontal bar using box-drawing characters."""
549
+ if total_ms <= 0 or width <= 0:
550
+ return ""
551
+ start = offset_ms / total_ms * width
552
+ length = dur_ms / total_ms * width
553
+ si = int(start)
554
+ lead = " " * si
555
+ full = int(length)
556
+ half = (length - full) >= 0.5
557
+ bar = "\u2501" * full + ("\u2578" if half else "") # ━ and ╸
558
+ if not bar and dur_ms > 0:
559
+ bar = "\u2578" # ╸
560
+ return f"{lead}{bar}"
561
+
562
+
563
def render_tree(trace, bars=False, name_width=35):
    """Render trace as a span tree, optionally with waterfall timing bars.

    Parameters:
        trace — trace dict with a 'spans' list (parent links are needed to
            build the tree, so a COMPLETE view is presumably expected).
        bars — when True, append a waterfall bar per span sized to the
            terminal width.
        name_width — column width for the tree/name portion in bars mode.
    """
    spans = trace.get("spans", [])
    if not spans:
        click.echo("No spans.")
        return

    r = _root(spans)
    total = _dur(r) if r else 0
    # Offsets are measured from the root's start; fall back to the first
    # span when no root was identified.
    root_start = _ts(r["startTime"]) if r else _ts(spans[0]["startTime"])
    click.echo(
        f"Trace {trace.get('traceId', '?')} | {_fmt_ms(total)} | {len(spans)} spans\n"
    )

    # parentSpanId -> list of child spans
    children = {}
    for s in spans:
        pid = s.get("parentSpanId")
        children.setdefault(pid, []).append(s)

    name_col = name_width
    dur_col = 10
    bar_width = 0
    if bars:
        try:
            term_width = os.get_terminal_size().columns
        except (AttributeError, ValueError, OSError):
            term_width = 120  # default when not attached to a TTY
        bar_width = max(20, term_width - name_col - dur_col - 4)

    lines = []

    def walk(span, prefix="", last=True):
        # Depth-first walk emitting one (tree_str, dur_str[, bar]) per span.
        d = _dur(span)
        name = span.get("name", "?")
        if span is r:
            tree_str = name
        else:
            conn = "\u2514\u2500 " if last else "\u251c\u2500 "
            tree_str = f"{prefix}{conn}{name}"
        dur_str = _fmt_ms(d)
        if bars:
            offset = (_ts(span["startTime"]) - root_start).total_seconds() * 1000
            # Truncate long names so the bar column stays aligned.
            if len(tree_str) > name_col:
                tree_str = tree_str[: name_col - 2] + ".."
            lines.append((tree_str, dur_str, _bar(offset, d, total, bar_width)))
        else:
            lines.append((tree_str, dur_str))

        ext = " " if last else "\u2502 "
        # Children rendered in chronological order.
        kids = sorted(
            children.get(span.get("spanId"), []),
            key=lambda x: x["startTime"],
        )
        for i, kid in enumerate(kids):
            walk(kid, prefix + ext, i == len(kids) - 1)

    if r:
        walk(r)
    if bars:
        for tree_str, dur_str, bar_str in lines:
            click.echo(f"{tree_str:<{name_col}} {dur_str:>{dur_col - 2}} {bar_str}")
    else:
        for tree_str, dur_str in lines:
            click.echo(f"{tree_str} {dur_str}")
627
+
628
+
629
def render_timeline(trace):
    """Render chronological timeline with bottleneck summary.

    Prints one row per span ordered by start time, indented by tree depth,
    followed by the five slowest spans.
    """
    spans = trace.get("spans", [])
    if not spans:
        click.echo("No spans.")
        return

    r = _root(spans)
    root_start = _ts(r["startTime"]) if r else _ts(spans[0]["startTime"])
    # NOTE: falls back to 1 (not 0) so the percentage math below never
    # divides by zero when the root is missing.
    total = _dur(r) if r else 1

    # Find labels that are identical on every span — show once in header
    all_labels = [s.get("labels", {}) for s in spans]
    common = {}
    if all_labels:
        shared_keys = set(all_labels[0].keys())
        for lbl in all_labels[1:]:
            shared_keys &= set(lbl.keys())
        for k in shared_keys:
            vals = {lbl[k] for lbl in all_labels}
            if len(vals) == 1:
                common[k] = next(iter(vals))

    header = " ".join(
        f"{k}={v}" for k, v in sorted(common.items()) if k in INTERESTING_LABELS
    )
    click.echo(
        f"Trace {trace.get('traceId', '?')} | {_fmt_ms(total)} | {len(spans)} spans"
    )
    if header:
        click.echo(f" {header}")
    click.echo()
    click.echo(f"{'OFFSET':>10} {'SPAN':<55} {'DURATION':>10} {'%':>5} LABELS")
    click.echo("\u2500" * 110)

    by_id = {s.get("spanId"): s for s in spans}

    def depth(span):
        # Walk parent links to compute indentation depth.
        d, pid = 0, span.get("parentSpanId")
        while pid and pid in by_id:
            d += 1
            pid = by_id[pid].get("parentSpanId")
        return d

    ranked = []
    for s in sorted(spans, key=lambda x: x["startTime"]):
        d = _dur(s)
        offset = (_ts(s["startTime"]) - root_start).total_seconds() * 1000
        dep = depth(s)
        pct = d / total * 100 if total > 0 else 0

        labels = s.get("labels", {})
        # Only show labels that differ from the common header set.
        unique = {
            k: v
            for k, v in labels.items()
            if k in INTERESTING_LABELS and (k not in common or common[k] != v)
        }
        lbl = " ".join(f"{k}={v}" for k, v in unique.items())

        indent = " " * dep
        name = f"{indent}{s.get('name', '?')}"
        slow = " *" if d > 100 else ""  # flag spans slower than 100ms
        click.echo(
            f"+{_fmt_ms(offset):>9} {name:<55} {_fmt_ms(d) + slow:>12} {pct:>4.0f}% {lbl}"
        )
        ranked.append((s.get("name", "?"), d, pct))

    # Bottleneck summary: top five spans by duration.
    ranked.sort(key=lambda x: -x[1])
    click.echo("\nSlowest spans:")
    for name, d, pct in ranked[:5]:
        click.echo(f" {name:<55} {_fmt_ms(d):>10} ({pct:.0f}%)")
700
+
701
+
702
def _render_comparison(comparison, primary_pvals, services):
    """Render cross-service comparison text from structured data.

    Parameters:
        comparison — dict with 'service', optional 'distribution'
            ({'percentiles', 'count'}) and 'windows' entries (presumably
            produced by _compare_services — confirm; not visible here).
        primary_pvals — percentile dict for the primary service(s).
        services — primary service names, used only for labeling.
    """
    compare_svc = comparison["service"]
    primary_label = ", ".join(services) if services else "primary"
    dist = comparison.get("distribution")

    if not dist:
        click.echo(f" No traces found for {compare_svc}.")
        return

    cmp_pvals = dist["percentiles"]
    cmp_n = dist["count"]

    # Side-by-side percentile table.
    click.echo(f" {compare_svc} latency ({cmp_n} traces):\n")
    click.echo(f" {'PCTL':<6} {primary_label:<30} {compare_svc}")
    click.echo(f" {'─' * 70}")
    for label in ("p50", "p90", "p95", "p99"):
        click.echo(
            f" {label:<6} {_fmt_ms(primary_pvals[label]):<30} "
            f"{_fmt_ms(cmp_pvals[label])}"
        )

    # Per-outlier-window latency of the comparison service.
    click.echo("\n During outlier windows:")
    click.echo(
        f" {'#':<3} {'TIME':<26} "
        f"{primary_label + ' latency':<25} {compare_svc + ' latency'}"
    )
    click.echo(f" {'─' * 80}")

    for i, w in enumerate(comparison.get("windows", []), 1):
        # Windows without traces carry no 'avgMs' key.
        if "avgMs" in w:
            cmp_str = (
                f"avg {_fmt_ms(w['avgMs'])}, max {_fmt_ms(w['maxMs'])} "
                f"({w['count']} traces)"
            )
        else:
            cmp_str = "(no traces)"
        click.echo(f" {i:<3} {w['time']:<26} {_fmt_ms(w['primaryMs']):<25} {cmp_str}")
740
+
741
+
742
+ def _fmt_opt_ms(ms):
743
+ """Format optional milliseconds value."""
744
+ return _fmt_ms(ms) if ms is not None else "-"
745
+
746
+
747
def render_compare(result):
    """Render conditional B|A comparison in text mode.

    Expects a structured dict with keys 'conditionA', 'sampleB', 'meta',
    and optionally 'groups' and 'warnings' (presumably produced by
    trace_compare — confirm; the producer is not visible here).
    """
    a = result["conditionA"]
    b = result["sampleB"]
    group_by = result["meta"].get("groupBy", [])

    click.echo("Conditional comparison B | A (descriptive, non-causal)")
    click.echo(
        f"A sample: {a['tracesMatched']} traces from {a['tracesFetched']} fetched "
        f"(missing root: {a['skippedNoRoot']})"
    )
    click.echo(
        f"B windows: {b['windowsSucceeded']}/{b['windowsTotal']} succeeded, "
        f"{b['windowsFailed']} failed"
    )
    click.echo(
        f"B traces: seen={b['tracesSeen']} deduped={b['tracesDeduped']} "
        f"(missing root: {b['skippedNoRoot']})"
    )
    click.echo()

    # Overall B distribution line.
    dist = b["distribution"]
    if dist["count"] == 0:
        click.echo("No B traces matched in conditioned windows.")
    else:
        p = dist["percentiles"]
        click.echo(
            f"B latency avg {_fmt_opt_ms(dist['avgMs'])} "
            f"p50 {_fmt_opt_ms(p['p50'])} p90 {_fmt_opt_ms(p['p90'])} "
            f"p95 {_fmt_opt_ms(p['p95'])} p99 {_fmt_opt_ms(p['p99'])}"
        )

    # Optional per-group percentile table.
    groups = result.get("groups", [])
    if groups:
        click.echo()
        click.echo(
            f"{'GROUP':<42} {'COUNT':>6} {'avg':>10} {'p50':>10} {'p90':>10} {'p95':>10} {'p99':>10}"
        )
        click.echo("─" * 108)
        for g in groups:
            key = g.get("key", {})
            label = (
                " ".join(f"{k}={key.get(k, '') or '(none)'}" for k in group_by)
                or "(all)"
            )
            p = g["percentiles"]
            click.echo(
                f"{label:<42} {g['count']:>6} {_fmt_opt_ms(g['avgMs']):>10} "
                f"{_fmt_opt_ms(p['p50']):>10} {_fmt_opt_ms(p['p90']):>10} "
                f"{_fmt_opt_ms(p['p95']):>10} {_fmt_opt_ms(p['p99']):>10}"
            )

    if result.get("warnings"):
        click.echo()
        click.echo("Warnings:")
        for w in result["warnings"]:
            click.echo(f" - [{w['code']}] {w['message']}")
804
+
805
+
806
+ # ── Programmatic API ─────────────────────────────────────────────────────────
807
+
808
+
809
def trace_list(
    project,
    *,
    start="1h",
    end=None,
    limit=20,
    services=(),
    labels=None,
    min_latency=None,
    max_latency=None,
):
    """Fetch and filter recent traces. Returns list of trace dicts."""
    _, max_ms = _parse_latency(max_latency)
    params = _build_params(
        start, end, limit, min_latency=min_latency, services=services, labels=labels
    )
    results = fetch_traces(project, params, max_results=limit)
    # Server-side filtering is best-effort; re-check services and the max
    # latency bound client-side when either was requested.
    if not services and max_ms is None:
        return results
    return filter_traces(results, services=services, max_ms=max_ms)
829
+
830
+
831
def trace_get(project, trace_id):
    """Fetch a single trace by ID. Returns the trace dict."""
    # Single-resource endpoint: no pagination needed.
    return api_get(project, f"/traces/{trace_id}")
834
+
835
+
836
def trace_services(project, *, start="3h", end=None, limit=200):
    """Collect service and endpoint counts.

    Returns {services: dict, endpoints: dict, trace_count: int}.
    """
    params = _build_params(start, end, limit)
    traces = fetch_traces(project, params, max_results=limit)
    if not traces:
        return {"services": {}, "endpoints": {}, "trace_count": 0}
    services = Counter()
    endpoints = Counter()
    for trace in traces:
        for span in trace.get("spans", []):
            svc = span.get("labels", {}).get("service.name")
            if svc:
                services[svc] += 1
            # Root spans (no parent) double as endpoint names.
            if not span.get("parentSpanId"):
                endpoints[span.get("name", "?")] += 1
    return {
        "services": dict(services),
        "endpoints": dict(endpoints),
        "trace_count": len(traces),
    }
859
+
860
+
861
def trace_spans(
    project,
    *,
    start="1h",
    end=None,
    limit=20,
    services=(),
    min_latency=None,
    max_latency=None,
):
    """Collect distinct span name counts.

    Returns {spans: dict[str, int], trace_count: int}.
    """
    _, max_ms = _parse_latency(max_latency)
    params = _build_params(
        start, end, limit, view="COMPLETE", min_latency=min_latency, services=services
    )
    matched = filter_traces(
        fetch_traces(project, params, max_results=limit),
        services=services,
        max_ms=max_ms,
    )
    counts = Counter(
        span.get("name", "?")
        for trace in matched
        for span in trace.get("spans", [])
    )
    # most_common() keeps the dict insertion-ordered by frequency.
    return {"spans": dict(counts.most_common()), "trace_count": len(matched)}
886
+
887
+
888
def trace_search(
    project,
    *,
    start="1h",
    end=None,
    limit=50,
    span_name=None,
    labels=None,
    min_latency=None,
    max_latency=None,
    services=(),
    parent_span_id=None,
    order_asc=None,
    order_desc=None,
):
    """Search traces with client-side filtering.

    Without parent_span_id: returns list of trace dicts.
    With parent_span_id: returns list of {traceId, spans} dicts containing
    only the spans matching that parent.

    Raises:
        ValueError — when both order_asc and order_desc are supplied.
    """
    if order_asc and order_desc:
        raise ValueError("Cannot use both order_asc and order_desc")
    _, min_ms = _parse_latency(min_latency)
    _, max_ms = _parse_latency(max_latency)
    # Child-span matching needs the full span list; otherwise the cheaper
    # root-span-only view suffices.
    view = "COMPLETE" if parent_span_id else "ROOTSPAN"
    params = _build_params(
        start,
        end,
        limit,
        view=view,
        min_latency=min_latency,
        services=services,
        labels=labels,
    )
    traces = fetch_traces(project, params, max_results=limit)

    if not parent_span_id:
        filtered = filter_traces(
            traces,
            span_name=span_name,
            services=services or None,
            min_ms=min_ms,
            max_ms=max_ms,
        )
        if order_asc or order_desc:
            # Sort by root-span duration; traces without a root sort as 0.
            reverse = order_desc is not None
            filtered.sort(
                key=lambda t: _dur(r) if (r := _root(t.get("spans", []))) else 0,
                reverse=reverse,
            )
        return filtered

    # parent_span_id mode: keep only spans under the given parent,
    # optionally requiring an exact span-name match (note: exact here,
    # unlike the substring match used by filter_traces above).
    matches = []
    for t in traces:
        matched_spans = [
            s
            for s in t.get("spans", [])
            if s.get("parentSpanId") == parent_span_id
            and (not span_name or s.get("name") == span_name)
        ]
        if matched_spans:
            matches.append({"traceId": t["traceId"], "spans": matched_spans})
    return matches
952
+
953
+
954
def trace_outliers(
    project,
    *,
    start="1h",
    end=None,
    limit=50,
    services=(),
    labels=None,
    min_latency=None,
    max_latency=None,
    threshold="p95",
    top=5,
    compare_svc=None,
):
    """Find outlier traces with per-span breakdown. Always returns a dict.

    Keys: distribution, count, threshold, thresholdMs, outliers.
    Optional: comparison (when compare_svc is given).
    """
    _, max_ms = _parse_latency(max_latency)
    params = _build_params(
        start,
        end,
        limit,
        view="COMPLETE",
        min_latency=min_latency,
        services=services,
        labels=labels,
    )
    all_traces = fetch_traces(project, params, max_results=limit)
    filtered = filter_traces(all_traces, max_ms=max_ms)
    durations = _to_durations(filtered)

    if not durations:
        # Empty result keeps the same shape so callers need no special case.
        return {
            "distribution": {"p50": 0, "p90": 0, "p95": 0, "p99": 0},
            "count": 0,
            "threshold": threshold,
            "thresholdMs": 0,
            "outliers": [],
        }

    pvals = _pcts([ms for ms, _ in durations])
    n = len(durations)
    thresh_ms = _resolve_threshold(threshold, pvals)
    # Keep traces at or above the threshold, slowest first, capped at `top`.
    outlier_list = [(ms, t) for ms, t in durations if ms >= thresh_ms]
    outlier_list.sort(key=lambda x: -x[0])
    outlier_list = outlier_list[:top]

    json_out = []
    for total_ms, t in outlier_list:
        tid = t.get("traceId")
        # _span_breakdown is defined elsewhere in this module; presumably
        # returns (name, self_time_ms) pairs ordered descending — confirm.
        span_self = _span_breakdown(t.get("spans", []))
        total_self = sum(ms for _, ms in span_self) or 1  # avoid div-by-zero
        json_out.append(
            {
                "traceId": tid,
                "totalMs": round(total_ms, 1),
                "totalSelfMs": round(total_self, 1),
                "spans": [
                    {
                        "name": name,
                        "selfMs": round(ms, 1),
                        "pct": round(ms / total_self * 100),
                    }
                    for name, ms in span_self[:8]
                ],
            }
        )

    result = {
        "distribution": {k: round(v, 1) for k, v in pvals.items()},
        "count": n,
        "threshold": threshold,
        "thresholdMs": round(thresh_ms, 1),
        "outliers": json_out,
    }

    if compare_svc:
        # _compare_services is defined elsewhere in this module; builds the
        # cross-service comparison consumed by _render_comparison.
        result["comparison"] = _compare_services(
            project,
            outlier_list,
            all_traces,
            compare_svc,
        )

    return result
1041
+
1042
+
1043
def trace_stats(
    project,
    *,
    start="1h",
    end=None,
    limit=100,
    span_pattern=None,
    group_by=None,
    services=(),
    labels=None,
    min_latency=None,
    max_latency=None,
    bucket=None,
    sparkline=False,
):
    """Compute latency stats, optionally grouped by span labels.

    Parameters:
        group_by — list of label keys to group by, or None
        labels — dict of label filters, or None
        bucket — bucket size string (e.g. '5m', '1h', '1w'), or None
        sparkline — include trend data per group

    Returns:
        {} — when no traces found.
        {"totalSpans": 0, "totalTraces": int} — when traces found but no spans matched.
        Full dict with totalSpans, totalTraces, percentiles, groups, etc. on success.
    """
    # min_latency is pushed to the server via _build_params; max latency can
    # only be applied client-side, so extract its millisecond value here.
    _, max_ms = _parse_latency(max_latency)
    group_keys = list(group_by) if group_by else []
    bucket_delta = _parse_bucket(bucket) if bucket else None

    # view=COMPLETE so every span (not just roots) is available for matching.
    params = _build_params(
        start,
        end,
        limit,
        view="COMPLETE",
        min_latency=min_latency,
        services=services,
        labels=labels,
    )
    all_traces = fetch_traces(project, params, max_results=limit)
    filtered = filter_traces(all_traces, max_ms=max_ms)

    if not filtered:
        return {}

    # Map group key tuple -> [(span start datetime, duration ms)].
    # The ungrouped case uses the single key () so both paths share one shape.
    groups = defaultdict(list)
    total_spans = 0

    for t in filtered:
        for s in t.get("spans", []):
            # Substring match on the span name, mirroring the CLI contract.
            if span_pattern and span_pattern not in s.get("name", ""):
                continue
            total_spans += 1
            span_ts = _ts(s["startTime"])
            dur = _dur(s)
            if group_keys:
                lbl = s.get("labels", {})
                # Missing label values become "" so the tuple length is stable.
                key = tuple(lbl.get(k, "") for k in group_keys)
                groups[key].append((span_ts, dur))
            else:
                groups[()].append((span_ts, dur))

    if not total_spans:
        return {"totalSpans": 0, "totalTraces": len(filtered)}

    # Largest groups first.
    sorted_groups = sorted(groups.items(), key=lambda x: -len(x[1]))

    json_groups = []
    for key, data in sorted_groups:
        durs = sorted(d for _, d in data)
        entry = {
            "count": len(durs),
            "avgMs": round(sum(durs) / len(durs), 1),
            "percentiles": {k: round(v, 1) for k, v in _pcts(durs).items()},
        }
        if group_keys:
            entry["key"] = dict(zip(group_keys, key))
        if bucket_delta:
            # Time-bucketed sub-distributions; empty buckets are kept with
            # null stats so the timeline remains contiguous for rendering.
            bkts = _make_buckets(data, bucket_delta)
            entry["buckets"] = [
                {
                    "time": lbl,
                    "count": len(bd),
                    "avgMs": round(sum(bd) / len(bd), 1) if bd else None,
                    "percentiles": (
                        {k: round(v, 1) for k, v in _pcts(bd).items()} if bd else None
                    ),
                }
                for lbl, bd in bkts
            ]
        if sparkline:
            trend_vals = _trend(data)
            if trend_vals:
                entry["trend"] = trend_vals
        json_groups.append(entry)

    out = {"totalSpans": total_spans, "totalTraces": len(filtered)}
    if span_pattern:
        out["span"] = span_pattern
    if group_keys:
        out["groupBy"] = group_keys
        out["groups"] = json_groups
    elif len(json_groups) == 1:
        # Ungrouped: flatten the single () group's stats into the top level.
        out.update(json_groups[0])
    if bucket:
        out["bucket"] = bucket
    return out
1152
+
1153
+
1154
+ def _pct_or_none(values):
1155
+ """Compute rounded percentiles or nulls for empty lists."""
1156
+ if not values:
1157
+ return {"p50": None, "p90": None, "p95": None, "p99": None}
1158
+ return {k: round(v, 1) for k, v in _pcts(sorted(values)).items()}
1159
+
1160
+
1161
+ def _avg_or_none(values):
1162
+ """Compute rounded average or null for empty lists."""
1163
+ if not values:
1164
+ return None
1165
+ return round(sum(values) / len(values), 1)
1166
+
1167
+
1168
def trace_compare(
    project,
    *,
    start="1h",
    end=None,
    limit=50,
    a_services=(),
    a_labels=None,
    a_span_name=None,
    a_min_latency=None,
    a_max_latency=None,
    b_service=None,
    b_labels=None,
    b_span_name=None,
    window_sec=30,
    group_by=None,
):
    """Describe B latency conditioned on traces where A matches filters.

    This is descriptive co-occurrence analysis in time windows, not causal
    inference across traces.

    Parameters:
        a_services/a_labels/a_span_name/a_min_latency/a_max_latency —
            filters selecting the conditioning sample A (root spans).
        b_service — target service whose latency is described (required).
        b_labels/b_span_name — additional filters on B root spans.
        window_sec — half-width in seconds of the window opened around each
            matching A root span; must be > 0.
        group_by — iterable of B root label keys to group results by, or None.

    Returns:
        "compare.v1" schema dict with meta, conditionA, sampleB, groups and
        warnings sections.

    Raises:
        ValueError — when b_service is missing or window_sec <= 0.
    """
    if not b_service:
        raise ValueError("--b-service is required")
    if window_sec <= 0:
        raise ValueError("--window-sec must be > 0")

    resolved_end = end or _now()
    group_keys = list(group_by or [])
    # A-side latency bounds are applied client-side in _matches_trace below.
    _, a_min_ms = _parse_latency(a_min_latency)
    _, a_max_ms = _parse_latency(a_max_latency)

    a_params = _build_params(
        start,
        resolved_end,
        limit,
        view="ROOTSPAN",
        min_latency=a_min_latency,
        services=a_services,
        labels=a_labels,
    )
    a_traces = fetch_traces(project, a_params, max_results=limit)
    a_roots = []
    skipped_a_no_root = 0
    for t in a_traces:
        r = _root(t.get("spans", []))
        if not r:
            # Trace without an identifiable root span cannot anchor a window.
            skipped_a_no_root += 1
            continue
        # Re-check filters client-side (server-side filtering is best-effort).
        if not _matches_trace(
            t,
            span_name=a_span_name,
            services=a_services or None,
            labels=a_labels,
            min_ms=a_min_ms,
            max_ms=a_max_ms,
        ):
            continue
        a_roots.append(r)

    warnings = []
    if not a_roots:
        warnings.append(
            {
                "code": "A_EMPTY_SAMPLE",
                "message": "No A traces matched the given condition",
            }
        )

    # Build per-root windows, then merge overlapping ones to reduce API calls
    raw_windows = []
    for r in a_roots:
        t_start = _ts(r["startTime"])
        raw_windows.append(
            (
                t_start - timedelta(seconds=window_sec),
                t_start + timedelta(seconds=window_sec),
            )
        )
    raw_windows.sort()

    windows = []
    for ws, we in raw_windows:
        if windows and ws <= windows[-1][1]:
            # Overlapping — extend the previous window
            prev_s, prev_e = windows[-1]
            windows[-1] = (prev_s, max(prev_e, we))
        else:
            windows.append((ws, we))

    # Format merged windows as RFC3339 strings
    windows = [
        (ws.strftime("%Y-%m-%dT%H:%M:%SZ"), we.strftime("%Y-%m-%dT%H:%M:%SZ"))
        for ws, we in windows
    ]

    b_seen = 0
    # traceId -> root span; used to dedupe B traces that appear in
    # multiple (merged) windows.
    b_roots_by_trace = {}
    b_skipped_no_root = 0
    windows_succeeded = 0
    windows_failed = 0

    def _fetch_window(win):
        # Fetch B root spans inside one merged window; returns an ok/err dict
        # rather than raising, so a single failed window does not abort the run.
        win_start, win_end = win
        params = _build_params(
            win_start,
            win_end,
            100,
            view="ROOTSPAN",
            services=(b_service,),
            labels=b_labels,
        )
        try:
            traces = fetch_traces(project, params, max_results=100)
            return {"ok": True, "traces": traces, "start": win_start, "end": win_end}
        except (ApiError, OSError, ValueError) as e:
            return {
                "ok": False,
                "error": str(e),
                "start": win_start,
                "end": win_end,
            }

    if windows:
        # Fan out window fetches; pool size is capped at 8 workers.
        with ThreadPoolExecutor(max_workers=min(len(windows), 8)) as pool:
            futures = {pool.submit(_fetch_window, w): w for w in windows}
            for fut in as_completed(futures):
                data = fut.result()
                if not data["ok"]:
                    windows_failed += 1
                    warnings.append(
                        {
                            "code": "B_WINDOW_FETCH_FAILED",
                            "message": (
                                f"window {data['start']}..{data['end']} failed: "
                                f"{data['error'].splitlines()[0]}"
                            ),
                        }
                    )
                    continue

                windows_succeeded += 1
                traces = data["traces"]
                for t in traces:
                    r = _root(t.get("spans", []))
                    if not r:
                        b_skipped_no_root += 1
                        continue
                    if not _matches_trace(
                        t,
                        span_name=b_span_name,
                        services=(b_service,),
                        labels=b_labels,
                    ):
                        continue
                    b_seen += 1
                    tid = t.get("traceId")
                    # First occurrence wins; later windows may re-return it.
                    if tid and tid not in b_roots_by_trace:
                        b_roots_by_trace[tid] = r

    if windows and windows_succeeded == 0:
        warnings.append(
            {
                "code": "B_ALL_WINDOWS_FAILED",
                "message": "All B window fetches failed; result is empty",
            }
        )

    b_durations = sorted(_dur(r) for r in b_roots_by_trace.values())
    groups = defaultdict(list)
    if group_keys:
        for r in b_roots_by_trace.values():
            rl = r.get("labels", {})
            # Missing label values become "" to keep key tuples uniform.
            key = tuple(rl.get(k, "") for k in group_keys)
            groups[key].append(_dur(r))

    json_groups = []
    for key, durs in groups.items():
        durs = sorted(durs)
        json_groups.append(
            {
                "key": dict(zip(group_keys, key)),
                "count": len(durs),
                "avgMs": _avg_or_none(durs),
                "percentiles": _pct_or_none(durs),
            }
        )

    # Deterministic order: biggest groups first, ties broken by canonical
    # JSON encoding of the group key.
    json_groups.sort(
        key=lambda g: (
            -g["count"],
            json.dumps(g.get("key", {}), sort_keys=True, separators=(",", ":")),
        )
    )

    dist = {
        "count": len(b_durations),
        "avgMs": _avg_or_none(b_durations),
        "percentiles": _pct_or_none(b_durations),
    }

    return {
        "meta": {
            "schemaVersion": "compare.v1",
            "project": project,
            "start": _parse_time(start),
            "end": resolved_end,
            "limit": limit,
            "windowSec": window_sec,
            "groupBy": group_keys,
        },
        "conditionA": {
            "filters": {
                "services": list(a_services),
                "labels": a_labels or {},
                "spanName": a_span_name,
                "minLatency": a_min_latency,
                "maxLatency": a_max_latency,
            },
            "tracesFetched": len(a_traces),
            "tracesMatched": len(a_roots),
            "skippedNoRoot": skipped_a_no_root,
        },
        "sampleB": {
            "filters": {
                "service": b_service,
                "labels": b_labels or {},
                "spanName": b_span_name,
            },
            "windowsTotal": len(windows),
            "windowsSucceeded": windows_succeeded,
            "windowsFailed": windows_failed,
            "tracesSeen": b_seen,
            "tracesDeduped": len(b_roots_by_trace),
            "skippedNoRoot": b_skipped_no_root,
            "distribution": dist,
        },
        "groups": json_groups,
        "warnings": warnings,
    }
1408
+
1409
+
1410
+ # ── CLI ──────────────────────────────────────────────────────────────────────
1411
+
1412
+
1413
@click.group()
@click.option(
    "--project",
    envvar="GOOGLE_CLOUD_PROJECT",
    required=True,
    help="GCP project ID (or set GOOGLE_CLOUD_PROJECT)",
)
@click.option("--json", "as_json", is_flag=True, help="Raw JSON output")
@click.pass_context
def cli(ctx, project, as_json):
    """gtraces - query and analyze GCP Cloud Traces."""
    # Shared state consumed by every subcommand via ctx.obj.
    ctx.ensure_object(dict)
    ctx.obj["project"] = project
    ctx.obj["json"] = as_json
    # Pre-warm auth token before any concurrency
    # (presumably so worker threads don't race to mint one — TODO confirm
    # get_token caches its result).
    try:
        get_token()
    except ApiError as e:
        # Surface auth failures as a clean CLI error instead of a traceback.
        raise click.ClickException(str(e))
1432
+
1433
+
1434
@cli.command("list")
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=20, show_default=True, type=int, help="Max traces to fetch"
)
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.pass_context
@_cli_validate
def list_cmd(ctx, start, end, limit, services, labels, min_latency, max_latency):
    """List recent traces."""
    traces = trace_list(
        ctx.obj["project"],
        start=start,
        end=end,
        limit=limit,
        services=services,
        labels=_parse_labels(labels),
        min_latency=min_latency,
        max_latency=max_latency,
    )
    # Human-readable table by default; raw JSON with the global --json flag.
    if not ctx.obj["json"]:
        render_list(traces)
        return
    click.echo(json.dumps(traces, indent=2))
1470
+
1471
+
1472
@cli.command()
@click.option(
    "--start",
    default="3h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=200, show_default=True, type=int, help="Max traces to fetch"
)
@click.pass_context
@_cli_validate
def services(ctx, start, end, limit):
    """List services and endpoints seen in recent traces."""
    result = trace_services(ctx.obj["project"], start=start, end=end, limit=limit)
    if not result["services"] and not result["endpoints"]:
        click.echo("No traces found.")
        return

    if ctx.obj["json"]:
        # trace_count is a scan statistic, not part of the JSON payload.
        out = {k: v for k, v in result.items() if k != "trace_count"}
        click.echo(json.dumps(out, indent=2))
        return

    # Text mode: two frequency tables (services, endpoints), most common first.
    svc_counts = Counter(result["services"])
    ep_counts = Counter(result["endpoints"])
    click.echo(f"{'SERVICE':<45} TRACES")
    click.echo("\u2500" * 55)  # box-drawing horizontal rule
    for svc, n in svc_counts.most_common():
        click.echo(f" {svc:<45} {n}")
    click.echo()
    click.echo(f"{'ENDPOINT':<45} TRACES")
    click.echo("\u2500" * 55)
    for ep, n in ep_counts.most_common():
        click.echo(f" {ep:<45} {n}")
    click.echo(f"\nScanned {result['trace_count']} traces from last {start}.")
1509
+
1510
+
1511
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=20, show_default=True, type=int, help="Max traces to fetch"
)
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.pass_context
@_cli_validate
def spans(ctx, start, end, limit, services, min_latency, max_latency):
    """List distinct span names from sampled traces."""
    result = trace_spans(
        ctx.obj["project"],
        start=start,
        end=end,
        limit=limit,
        services=services,
        min_latency=min_latency,
        max_latency=max_latency,
    )
    if not result["spans"]:
        click.echo("No traces found.")
        return

    if ctx.obj["json"]:
        click.echo(json.dumps(result["spans"], indent=2))
        return

    # Text mode: frequency table of span names, most common first.
    span_counts = Counter(result["spans"])
    click.echo(f"{'SPAN NAME':<60} COUNT")
    click.echo("\u2500" * 70)  # box-drawing horizontal rule
    for name, n in span_counts.most_common():
        click.echo(f" {name:<60} {n}")
    click.echo(f"\nSampled {result['trace_count']} traces.")
1554
+
1555
+
1556
@cli.command()
@click.argument("trace_id")
@click.option("--bars", is_flag=True, help="Show waterfall timing bars")
@click.option(
    "--name-width",
    default=35,
    show_default=True,
    type=int,
    help="Span name column width",
)
@click.pass_context
@_cli_validate
def get(ctx, trace_id, bars, name_width):
    """Show trace as a span tree."""
    detail = trace_get(ctx.obj["project"], trace_id)
    # Tree rendering by default; raw JSON with the global --json flag.
    if not ctx.obj["json"]:
        render_tree(detail, bars=bars, name_width=name_width)
        return
    click.echo(json.dumps(detail, indent=2))
1575
+
1576
+
1577
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=50, show_default=True, type=int, help="Max traces to fetch"
)
@click.option("--span-name", default=None, help="Root span name (substring match)")
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option(
    "--parent-span-id",
    default=None,
    help="Find spans with this parentSpanId (fetches full traces)",
)
@click.option("--show-labels", is_flag=True, help="Show interesting labels in output")
@click.option(
    "--extra-fields",
    "field_str",
    default=None,
    help="Extra columns (e.g. spans,label:cloud.region,label:placement)",
)
@click.option(
    "--order-asc", default=None, type=click.Choice(["duration"]), help="Sort ascending"
)
@click.option(
    "--order-desc",
    default=None,
    type=click.Choice(["duration"]),
    help="Sort descending",
)
@click.pass_context
@_cli_validate
def search(
    ctx,
    start,
    end,
    limit,
    span_name,
    labels,
    min_latency,
    max_latency,
    services,
    parent_span_id,
    show_labels,
    field_str,
    order_asc,
    order_desc,
):
    """Search traces with client-side filtering.

    When --parent-span-id is used, full trace details are fetched to match
    inner spans (not just root spans). Useful for cross-service correlation.

    \b
    Examples:
      gtraces search --span-name "POST /v1/rtb" --min-latency 500ms
      gtraces search --min-latency 300ms --max-latency 500ms --service my-service
      gtraces search --service my-service --parent-span-id 123456
    """
    p = ctx.obj["project"]
    label_dict = _parse_labels(labels)
    fields = _parse_fields(field_str)

    # trace_search returns a list of traces normally, or a list of
    # {traceId, spans} matches in parent-span-id mode.
    result = trace_search(
        p,
        start=start,
        end=end,
        limit=limit,
        span_name=span_name,
        labels=label_dict,
        min_latency=min_latency,
        max_latency=max_latency,
        services=services,
        parent_span_id=parent_span_id,
        order_asc=order_asc,
        order_desc=order_desc,
    )

    # ── Normal mode: result is a list of traces ──
    if not parent_span_id:
        if ctx.obj["json"]:
            click.echo(json.dumps(result, indent=2))
        else:
            render_list(result, show_labels=show_labels, fields=fields)
            click.echo(f"\n{len(result)} traces matched.")
        return

    # ── Parent-span-id mode: result is a list of span matches ──
    if not result:
        click.echo("No spans matched.")
        return

    if ctx.obj["json"]:
        click.echo(json.dumps(result, indent=2))
        return

    total_spans = sum(len(m["spans"]) for m in result)
    click.echo(
        f"Found {total_spans} span(s) in {len(result)} trace(s) "
        f"with parentSpanId={parent_span_id}\n"
    )
    click.echo(
        f"{'TRACE ID':<36} {'SPAN NAME':<40} "
        f"{'DURATION':>10} {'SPAN ID':<20} LABELS"
    )
    click.echo("\u2500" * 120)  # box-drawing horizontal rule
    for m in result:
        tid = m.get("traceId", "?")
        for s in m["spans"]:
            name = s.get("name", "?")[:40]
            dur = _fmt_ms(_dur(s))
            sid = s.get("spanId", "?")
            lbl_d = s.get("labels", {})
            # Only show the curated label set to keep rows readable.
            interesting = {k: v for k, v in lbl_d.items() if k in INTERESTING_LABELS}
            lbl = " ".join(f"{k}={v}" for k, v in interesting.items())
            click.echo(f"{tid:<36} {name:<40} {dur:>10} {sid:<20} {lbl}")
1700
+
1701
+
1702
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=50, show_default=True, type=int, help="Max A traces to fetch"
)
@click.option(
    "--a-service",
    "a_services",
    multiple=True,
    help="A filter: service.name (repeatable)",
)
@click.option("--a-label", "a_labels", multiple=True, help="A filter: label key=value")
@click.option(
    "--a-span-name",
    default=None,
    help="A filter: root span name (substring match)",
)
@click.option(
    "--a-min-latency",
    default=None,
    help="A filter: min root latency (500ms, 1s)",
)
@click.option(
    "--a-max-latency",
    default=None,
    help="A filter: max root latency (500ms, 1s)",
)
@click.option(
    "--b-service",
    required=True,
    help="B target service.name (required)",
)
@click.option("--b-label", "b_labels", multiple=True, help="B filter: label key=value")
@click.option(
    "--b-span-name",
    default=None,
    help="B filter: root span name (substring match)",
)
@click.option(
    "--window-sec",
    default=30,
    show_default=True,
    type=int,
    help="Conditioning window around each A trace (seconds)",
)
@click.option(
    "--group-by",
    default=None,
    help="Comma-separated B root label keys to group by",
)
@click.pass_context
@_cli_validate
def compare(
    ctx,
    start,
    end,
    limit,
    a_services,
    a_labels,
    a_span_name,
    a_min_latency,
    a_max_latency,
    b_service,
    b_labels,
    b_span_name,
    window_sec,
    group_by,
):
    """Describe B latency conditioned on A traces (descriptive, non-causal).

    \b
    Examples:
      gtraces compare --a-service config-service --b-service ssp-service-go
      gtraces compare --a-service config-service --a-min-latency 600ms --b-service ssp-service-go
      gtraces --json compare --a-service config-service --b-service ssp-service-go --group-by cloud.region
    """
    # Thin CLI wrapper: parse the repeatable/comma-separated options into
    # the dict/list forms trace_compare expects, then dispatch.
    result = trace_compare(
        ctx.obj["project"],
        start=start,
        end=end,
        limit=limit,
        a_services=a_services,
        a_labels=_parse_labels(a_labels),
        a_span_name=a_span_name,
        a_min_latency=a_min_latency,
        a_max_latency=a_max_latency,
        b_service=b_service,
        b_labels=_parse_labels(b_labels),
        b_span_name=b_span_name,
        window_sec=window_sec,
        group_by=_parse_group_by(group_by),
    )

    if ctx.obj["json"]:
        click.echo(json.dumps(result, indent=2))
    else:
        render_compare(result)
1805
+
1806
+
1807
@cli.command()
@click.argument("trace_id")
@click.pass_context
@_cli_validate
def analyze(ctx, trace_id):
    """Timeline analysis with bottleneck detection."""
    detail = trace_get(ctx.obj["project"], trace_id)
    # Timeline rendering by default; raw JSON with the global --json flag.
    if not ctx.obj["json"]:
        render_timeline(detail)
        return
    click.echo(json.dumps(detail, indent=2))
1818
+
1819
+
1820
+ # ── Outliers helpers ─────────────────────────────────────────────────────────
1821
+
1822
+
1823
def _span_breakdown(all_spans):
    """Compute per-span exclusive (self) time. Returns sorted [(name, ms)]."""
    # Sum each parent's direct-child durations, keyed by parentSpanId.
    child_totals = defaultdict(float)
    for span in all_spans:
        parent_id = span.get("parentSpanId")
        if parent_id:
            child_totals[parent_id] += _dur(span)

    # Self time = own duration minus time attributed to direct children;
    # spans with no exclusive time are omitted.
    breakdown = []
    for span in all_spans:
        own_ms = _dur(span) - child_totals.get(span.get("spanId"), 0)
        if own_ms > 0:
            breakdown.append((span.get("name", "?"), own_ms))

    return sorted(breakdown, key=lambda item: item[1], reverse=True)
1840
+
1841
+
1842
def _compare_services(project, outlier_list, all_traces, compare_svc):
    """Cross-service latency comparison during outlier windows.

    Parameters:
        project — GCP project ID.
        outlier_list — [(total_ms, trace)] outliers from the primary service.
        all_traces — the full fetched sample, used for the baseline distribution.
        compare_svc — service.name of the service to compare against.

    Returns comparison data dict with distribution and per-window metrics.
    """
    cmp_durations = _to_durations(filter_traces(all_traces, services=compare_svc))

    comparison = {"service": compare_svc, "distribution": None, "windows": []}

    if not cmp_durations:
        # Comparison service never appeared in the sample — nothing to report.
        return comparison

    cmp_pvals = _pcts([ms for ms, _ in cmp_durations])
    comparison["distribution"] = {
        "count": len(cmp_durations),
        "percentiles": {k: round(v, 1) for k, v in cmp_pvals.items()},
    }

    # Build a ±30s query window anchored on each outlier's root span start.
    windows = []
    for total_ms, t in outlier_list:
        r = _root(t.get("spans", []))
        if not r:
            # Partial trace with no root span — cannot anchor a time window.
            continue
        t_start = _ts(r["startTime"])
        win_start = (t_start - timedelta(seconds=30)).strftime("%Y-%m-%dT%H:%M:%SZ")
        win_end = (t_start + timedelta(seconds=30)).strftime("%Y-%m-%dT%H:%M:%SZ")
        windows.append(
            (
                total_ms,
                r,
                {
                    "pageSize": 20,
                    "startTime": win_start,
                    "endTime": win_end,
                    "view": "ROOTSPAN",
                },
            )
        )

    # BUGFIX: without this guard, ThreadPoolExecutor(max_workers=0) raises
    # ValueError when every outlier trace lacks a root span.
    if not windows:
        return comparison

    # Fetch all windows in parallel, mapping results back by window index.
    def _fetch_window(win_params):
        return fetch_traces(project, win_params, max_results=20)

    win_results = [[] for _ in windows]
    with ThreadPoolExecutor(max_workers=min(len(windows), 8)) as pool:
        futures = {
            pool.submit(_fetch_window, wp): idx
            for idx, (_, _, wp) in enumerate(windows)
        }
        for fut in as_completed(futures):
            win_results[futures[fut]] = fut.result()

    for (total_ms, r, _), win_traces in zip(windows, win_results):
        win_durs = _to_durations(filter_traces(win_traces or [], services=compare_svc))

        window_data = {"time": r["startTime"][:19], "primaryMs": round(total_ms, 1)}
        if win_durs:
            cmp_vals = [ms for ms, _ in win_durs]
            window_data["avgMs"] = round(sum(cmp_vals) / len(cmp_vals), 1)
            window_data["maxMs"] = round(max(cmp_vals), 1)
            window_data["count"] = len(cmp_vals)

        comparison["windows"].append(window_data)

    return comparison
1911
+
1912
+
1913
+ # ── Outliers command ─────────────────────────────────────────────────────────
1914
+
1915
+
1916
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=50, show_default=True, type=int, help="Max traces to fetch"
)
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.option(
    "--threshold",
    default="p95",
    show_default=True,
    help="Outlier threshold (p50, p90, p95, p99, or raw like 500ms)",
)
@click.option("--top", default=5, show_default=True, type=int, help="Outliers to show")
@click.option(
    "--compare",
    "compare_svc",
    default=None,
    help="Compare with another service in the same time window",
)
@click.pass_context
@_cli_validate
def outliers(
    ctx,
    start,
    end,
    limit,
    services,
    labels,
    min_latency,
    max_latency,
    threshold,
    top,
    compare_svc,
):
    """Find outlier traces and show per-span time breakdown.

    Use --compare to correlate with another service at the same timestamps.

    \b
    Examples:
      gtraces outliers --service my-service
      gtraces outliers --service my-service --label k8s.cluster.name=us-east1-a
      gtraces outliers --service my-service --compare other-service
    """
    p = ctx.obj["project"]
    as_json = ctx.obj["json"]
    label_dict = _parse_labels(labels)

    result = trace_outliers(
        p,
        start=start,
        end=end,
        limit=limit,
        services=services,
        labels=label_dict,
        min_latency=min_latency,
        max_latency=max_latency,
        threshold=threshold,
        top=top,
        compare_svc=compare_svc,
    )

    if as_json:
        click.echo(json.dumps(result, indent=2))
        return

    # No traces
    if result["count"] == 0:
        click.echo("No traces found.")
        return

    # Distribution
    pvals = result["distribution"]
    click.echo(f"Latency distribution ({result['count']} traces):\n")
    for label, ms in pvals.items():
        click.echo(f"  {label} {_fmt_ms(ms)}")
    click.echo()

    if not result["outliers"]:
        click.echo(
            f"No outliers above {result['threshold']} "
            f"({_fmt_ms(result['thresholdMs'])})."
        )
        return

    # Outlier table
    click.echo(
        f"Outliers above {result['threshold']} "
        f"({_fmt_ms(result['thresholdMs'])}): "
        f"{len(result['outliers'])} shown\n"
    )
    click.echo(f"{'#':<3} {'TRACE ID':<36} {'TOTAL':>10} TOP SPANS (self time)")
    click.echo("\u2500" * 110)

    for i, o in enumerate(result["outliers"], 1):
        total_ms = o["totalMs"]
        tid = o["traceId"]
        # totalSelfMs is reported as-or'd-to 1 upstream; keep the same guard.
        total_self = o["totalSelfMs"] or 1
        top_spans = o["spans"][:5]
        # BUGFIX: an outlier whose spans all had zero self time yields an
        # empty "spans" list; top_spans[0] would raise IndexError.
        if not top_spans:
            click.echo(f"{i:<3} {tid:<36} {_fmt_ms(total_ms):>10}")
            continue
        first = top_spans[0]
        first_pct = first["selfMs"] / total_self * 100
        click.echo(
            f"{i:<3} {tid:<36} {_fmt_ms(total_ms):>10} "
            f"{first['name']} {_fmt_ms(first['selfMs'])} ({first_pct:.0f}%)"
        )
        for s in top_spans[1:]:
            pct = s["selfMs"] / total_self * 100
            click.echo(f"{'':>52}{s['name']} {_fmt_ms(s['selfMs'])} ({pct:.0f}%)")

    # Comparison
    if result.get("comparison"):
        click.echo(f"\n{'=' * 110}")
        click.echo(f"Comparing with: {result['comparison']['service']}\n")
        _render_comparison(result["comparison"], pvals, services)
2041
+
2042
+
2043
+ # ── Stats command ────────────────────────────────────────────────────────────
2044
+
2045
+
2046
@cli.command()
@click.option(
    "--start",
    default="1h",
    show_default=True,
    help="Start time (1h, 30m, 2d, 1w, or RFC3339)",
)
@click.option("--end", default=None, help="End time (default: now)")
@click.option(
    "--limit", default=100, show_default=True, type=int, help="Max traces to fetch"
)
@click.option(
    "--span-name",
    "span_pattern",
    default=None,
    help="Span name filter (substring match)",
)
@click.option(
    "--group-by",
    "group_by",
    default=None,
    help="Comma-separated label keys to group by",
)
@click.option(
    "--service", "services", multiple=True, help="Filter by service.name (repeatable)"
)
@click.option("--label", "labels", multiple=True, help="Label key=value (repeatable)")
@click.option("--min-latency", default=None, help="Min latency (500ms, 1s)")
@click.option("--max-latency", default=None, help="Max latency (500ms, 1s)")
@click.option(
    "--bucket",
    "bucket_str",
    default=None,
    help="Time bucket size (e.g. 5m, 10m, 1h, 1w)",
)
@click.option(
    "--sparkline", "sparkline", is_flag=True, help="Show p50 trend sparkline per group"
)
@click.pass_context
@_cli_validate
def stats(
    ctx,
    start,
    end,
    limit,
    span_pattern,
    group_by,
    services,
    labels,
    min_latency,
    max_latency,
    bucket_str,
    sparkline,
):
    """Latency stats, optionally grouped by span labels.

    Collects matching spans from fetched traces and computes percentile
    distributions. Use --group-by to break down by one or more label keys.

    \b
    Examples:
      gtraces stats --span-name "POST /v1/rtb" --group-by cloud.region
      gtraces stats --span-name HTTP --group-by cloud.region,service.name
      gtraces stats --service my-service --group-by abtest
      gtraces stats --service my-service --bucket 5m
      gtraces stats --service my-service --group-by abtest --sparkline
    """
    p = ctx.obj["project"]
    as_json = ctx.obj["json"]
    label_dict = _parse_labels(labels)
    group_keys = [k.strip() for k in group_by.split(",")] if group_by else []

    result = trace_stats(
        p,
        start=start,
        end=end,
        limit=limit,
        span_pattern=span_pattern,
        group_by=group_keys or None,
        services=services,
        labels=label_dict,
        min_latency=min_latency,
        max_latency=max_latency,
        bucket=bucket_str,
        sparkline=sparkline,
    )

    # Distinguish "nothing fetched at all" from "traces fetched, no span matched".
    if not result:
        click.echo("No traces found.")
        return

    if result.get("totalSpans", 0) == 0:
        click.echo("No spans matched.")
        return

    if as_json:
        click.echo(json.dumps(result, indent=2))
        return

    # ── Text output ──────────────────────────────────────────────────────
    total_spans = result["totalSpans"]
    total_traces = result["totalTraces"]
    desc = f"Latency stats ({total_spans} spans across {total_traces} traces)"
    if span_pattern:
        desc += f' span ~ "{span_pattern}"'
    click.echo(desc)
    click.echo()

    # Ungrouped results carry "percentiles" at the top level; normalize to a
    # list of group dicts so all renderers share one shape.
    groups = result.get("groups", [result] if "percentiles" in result else [])

    if bucket_str:
        _render_stats_buckets(groups, group_keys)
        return

    if group_keys:
        _render_stats_groups(groups, group_keys, sparkline)
    else:
        g = groups[0] if groups else result
        _render_stats_single(g, sparkline)

    click.echo()


def _stats_group_label(g, group_keys):
    """Build the 'k=v k=v' display label for one group; empty values show as (none)."""
    key = g.get("key", {})
    return " ".join(f"{k}={key.get(k, '') or '(none)'}" for k in group_keys)


def _render_stats_buckets(groups, group_keys):
    """Render the --bucket mode: one time-series table per group."""
    for gi, g in enumerate(groups):
        if group_keys:
            label = _stats_group_label(g, group_keys)
            click.echo(f"\u2500\u2500 {label} ({g['count']} spans) \u2500\u2500")
            click.echo()

        click.echo(
            f"{'TIME':<13} {'COUNT':>6} "
            f"{'avg':>10} {'p50':>10} {'p90':>10} {'p95':>10} {'p99':>10}"
        )
        click.echo("\u2500" * 81)

        for b in g.get("buckets", []):
            if b["count"] > 0:
                pv = b["percentiles"]
                click.echo(
                    f"{b['time']:<13} {b['count']:>6} "
                    f"{_fmt_ms(b['avgMs']):>10} {_fmt_ms(pv['p50']):>10} "
                    f"{_fmt_ms(pv['p90']):>10} {_fmt_ms(pv['p95']):>10} "
                    f"{_fmt_ms(pv['p99']):>10}"
                )
            else:
                # Empty buckets still get a row so traffic gaps stay visible.
                click.echo(
                    f"{b['time']:<13} {'0':>6} "
                    f"{'-':>10} {'-':>10} {'-':>10} {'-':>10} {'-':>10}"
                )

        # Blank spacer lines only between consecutive group tables.
        if gi < len(groups) - 1:
            click.echo()
            click.echo()


def _render_stats_groups(groups, group_keys, sparkline):
    """Render the grouped summary table: one row per --group-by key combination."""
    rows = [(_stats_group_label(g, group_keys), g) for g in groups]

    # GROUP column width: widest label, floored at the header's own width.
    # default=0 keeps an empty group list from raising ValueError.
    max_lbl = max((len(lbl) for lbl, _ in rows), default=0)
    max_lbl = max(max_lbl, 5)

    hdr = (
        f"{'GROUP':<{max_lbl}} {'COUNT':>6} "
        f"{'avg':>10} {'p50':>10} {'p90':>10} {'p95':>10} {'p99':>10}"
    )
    if sparkline:
        hdr += " TREND"
    click.echo(hdr)
    click.echo("\u2500" * (max_lbl + 68 + (30 if sparkline else 0)))

    for label, g in rows:
        pv = g["percentiles"]
        line = (
            f"{label:<{max_lbl}} {g['count']:>6} "
            f"{_fmt_ms(g['avgMs']):>10} {_fmt_ms(pv['p50']):>10} "
            f"{_fmt_ms(pv['p90']):>10} {_fmt_ms(pv['p95']):>10} "
            f"{_fmt_ms(pv['p99']):>10}"
        )
        if g.get("trend"):
            line += f" {g['trend']}"
        click.echo(line)


def _render_stats_single(g, sparkline):
    """Render the ungrouped summary: avg plus one line per percentile."""
    pv = g["percentiles"]
    click.echo(f" avg {_fmt_ms(g['avgMs'])}")
    for label, ms in pv.items():
        click.echo(f" {label} {_fmt_ms(ms)}")
    if sparkline and g.get("trend"):
        click.echo(f"\n trend {g['trend']}")
2234
+
2235
+
2236
if __name__ == "__main__":
    # Script entry point: dispatch to the click command group defined above.
    cli()