guidepost 0.2.18__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {guidepost-0.2.18/guidepost.egg-info → guidepost-0.3.0}/PKG-INFO +4 -2
  2. {guidepost-0.2.18 → guidepost-0.3.0}/guidepost/__init__.py +1 -1
  3. guidepost-0.3.0/guidepost/aggregation.py +505 -0
  4. guidepost-0.3.0/guidepost/guidepost.py +352 -0
  5. guidepost-0.3.0/guidepost/node_layout.py +162 -0
  6. guidepost-0.3.0/guidepost/seriation.py +177 -0
  7. guidepost-0.3.0/guidepost/trailmark.py +65 -0
  8. guidepost-0.3.0/guidepost/utils.py +285 -0
  9. guidepost-0.3.0/guidepost/version.py +2 -0
  10. {guidepost-0.2.18 → guidepost-0.3.0/guidepost.egg-info}/PKG-INFO +4 -2
  11. {guidepost-0.2.18 → guidepost-0.3.0}/guidepost.egg-info/SOURCES.txt +9 -3
  12. {guidepost-0.2.18 → guidepost-0.3.0}/guidepost.egg-info/requires.txt +2 -0
  13. {guidepost-0.2.18 → guidepost-0.3.0}/setup.py +4 -2
  14. guidepost-0.3.0/tests/test_aggregation.py +382 -0
  15. guidepost-0.3.0/tests/test_list_parsing.py +128 -0
  16. guidepost-0.3.0/tests/test_node_layout.py +138 -0
  17. guidepost-0.3.0/tests/test_seriation.py +112 -0
  18. guidepost-0.2.18/MANIFEST.in +0 -2
  19. guidepost-0.2.18/README.md +0 -194
  20. guidepost-0.2.18/guidepost/guidepost.js +0 -2340
  21. guidepost-0.2.18/guidepost/guidepost.py +0 -107
  22. guidepost-0.2.18/guidepost/version.py +0 -2
  23. {guidepost-0.2.18 → guidepost-0.3.0}/LICENSE +0 -0
  24. {guidepost-0.2.18 → guidepost-0.3.0}/guidepost.egg-info/dependency_links.txt +0 -0
  25. {guidepost-0.2.18 → guidepost-0.3.0}/guidepost.egg-info/top_level.txt +0 -0
  26. {guidepost-0.2.18 → guidepost-0.3.0}/pyproject.toml +0 -0
  27. {guidepost-0.2.18 → guidepost-0.3.0}/setup.cfg +0 -0
  28. {guidepost-0.2.18 → guidepost-0.3.0}/tutorials/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: guidepost
3
- Version: 0.2.18
3
+ Version: 0.3.0
4
4
  Summary: Guidepost. An overview visualization for understanding supercomputer queue data.
5
5
  Home-page: https://github.com/cscully-allison/guidepost
6
6
  Author: Connor Scully-Allison
@@ -8,7 +8,7 @@ Author-email: cscullyallison@sci.utah.edu
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.6
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: numpy
@@ -16,6 +16,8 @@ Requires-Dist: pandas
16
16
  Requires-Dist: scikit-learn
17
17
  Requires-Dist: anywidget
18
18
  Requires-Dist: traitlets
19
+ Requires-Dist: pyarrow>=14
20
+ Requires-Dist: duckdb>=0.10
19
21
  Dynamic: author
20
22
  Dynamic: author-email
21
23
  Dynamic: classifier
@@ -1,2 +1,2 @@
1
1
  from .guidepost import Guidepost
2
-
2
+ from .trailmark import Trailmark
@@ -0,0 +1,505 @@
1
+ """
2
+ Server-side aggregation engine for Guidepost.
3
+
4
+ Moves the heaviest interactive path — recomputing per-(x,y)-cell statistics
5
+ when a category filter is applied — off the browser and into DuckDB. At
6
+ 1M rows the JS-side `calculate_box_metrics` rerun was ~1–2s per bar-chart
7
+ click; DuckDB's vectorized groupby completes the same work in ~100ms.
8
+
9
+ The engine is owned by the Guidepost widget. The widget calls
10
+ `aggregate(...)` in response to JS-originated `request_aggregation` messages
11
+ and ships the result back over anywidget's comm channel.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from collections import OrderedDict
17
+ from typing import Optional
18
+
19
+ import duckdb
20
+ import numpy as np
21
+ import pandas as pd
22
+
23
+ # Cap on the per-engine aggregate-result cache. The user typically cycles
24
+ # through ≤10 bar-chart categories plus the unfiltered baseline; 32 leaves
25
+ # headroom for axis/color-agg switching as well.
26
+ _AGG_CACHE_MAX = 32
27
+
28
+ # Sentinel the JS side uses for null categorical values (MISSING_LABEL in
29
+ # js_model.js). When it appears in a category filter, null rows must be matched
30
+ # via `IS NULL` since SQL `IN (...)` never matches NULL. Keep in sync with JS.
31
+ _MISSING_CATEGORY = "(missing)"
32
+
33
+
34
+ # Aggregator name → DuckDB SQL function. AVG and MEDIAN are exact: DuckDB's
35
+ # MEDIAN is an exact quantile, and approximation only happens when
36
+ # APPROX_QUANTILE is called explicitly, which this module never does.
37
+ _COLOR_AGG_SQL = {
38
+ "avg": "AVG",
39
+ "mean": "AVG",
40
+ "average": "AVG",
41
+ "median": "MEDIAN",
42
+ "med": "MEDIAN",
43
+ "min": "MIN",
44
+ "max": "MAX",
45
+ "sum": "SUM",
46
+ "count": "COUNT",
47
+ }
48
+
49
+
50
class AggregationEngine:
    """
    Owns a DuckDB view over the cleaned DataFrame and computes per-facet,
    per-(x,y)-cell stats from explicit threshold arrays produced by the JS
    side. Thresholds are passed in (rather than recomputed here) so the
    cell layout stays aligned with the JS-side bins the heatmap is already
    rendering.
    """

    @staticmethod
    def _qi(name: str) -> str:
        """Quote a SQL identifier, doubling any embedded double-quotes so column
        names containing `"` can't break (or inject into) the generated SQL.
        Values are bound as parameters wherever possible; identifiers cannot
        be bound, so they need this."""
        return '"' + str(name).replace('"', '""') + '"'

    @classmethod
    def _category_clause(cls, category_col, category_filter):
        """Build the optional category-filter SQL fragment and its params.

        Returns ``("", [])`` when either the column or the filter is empty.
        Otherwise returns a parenthesized predicate plus the positional
        parameters for its ``?`` placeholders. The missing-value sentinel is
        honored by OR-ing an ``IS NULL`` test, since ``IN (...)`` never
        matches NULL rows.
        """
        if not category_col or not category_filter:
            return "", []
        col = cls._qi(category_col)
        concrete = [v for v in category_filter if v != _MISSING_CATEGORY]
        want_missing = len(concrete) != len(category_filter)
        terms = []
        params: list = []
        if concrete:
            placeholders = ",".join(["?"] * len(concrete))
            terms.append(f'{col} IN ({placeholders})')
            params.extend(concrete)
        if want_missing:
            terms.append(f'{col} IS NULL')
        # `category_filter` is non-empty here, so at least one term exists.
        return "(" + " OR ".join(terms) + ")", params

    def __init__(self, df: pd.DataFrame) -> None:
        """Open an in-process DuckDB connection and expose `df` as view "df"."""
        # Single in-process connection; DuckDB is thread-safe for reads.
        self._conn = duckdb.connect()
        # Pin the session timezone to UTC. JS sends Date values as naive UTC
        # ISO strings (`Date.toISOString()` minus the trailing `Z`); DuckDB
        # otherwise interprets those naive strings — and TIMESTAMP literals
        # built from them — in the *system* timezone when comparing them
        # against TIMESTAMP WITH TIME ZONE columns, which silently shifts
        # bin boundaries (e.g. by 5–6 h in America/Chicago) and can drop
        # all rows for densely-clustered UTC data.
        self._conn.execute("SET TimeZone='UTC'")
        # `register` exposes the DataFrame as a zero-copy view named "df".
        self._conn.register("df", df)
        self._df = df
        # LRU cache for aggregate() results. The same request signature is
        # dispatched by every mouseleave (always-unfiltered) and by repeat
        # hovers on the same bar; caching turns those into instant returns
        # instead of re-running the SQL per facet.
        self._agg_cache: "OrderedDict[tuple, dict]" = OrderedDict()
        self._agg_cache_hits = 0
        self._agg_cache_misses = 0

    def replace(self, df: pd.DataFrame) -> None:
        """Swap the underlying DataFrame (e.g., on `records=` re-assignment)."""
        self._conn.unregister("df")
        self._conn.register("df", df)
        self._df = df
        # New data → previous grids are stale.
        self._agg_cache.clear()

    def close(self) -> None:
        """Close the DuckDB connection; errors during close are ignored."""
        try:
            self._conn.close()
        except Exception:
            # Deliberate best-effort: closing must never raise to the caller.
            pass

    @staticmethod
    def _freeze_thresholds(d: dict[str, list]) -> tuple:
        """Hashable snapshot of a facet→thresholds dict.

        Facet keys are emitted in sorted order (so two dicts with the same
        content but different insertion order freeze identically); the order
        of the values inside each threshold list is preserved. ``None`` or an
        empty dict freezes to ``()``; a ``None``/empty list freezes to ``()``.
        """
        return tuple((k, tuple(d[k] or ())) for k in sorted(d or {}))

    def aggregate(
        self,
        *,
        facet_by: str,
        x: str,
        y: str,
        color: str,
        color_agg: str,
        x_thresholds_by_facet: dict[str, list],
        y_thresholds_by_facet: dict[str, list],
        category_col: Optional[str] = None,
        category_filter: Optional[list[str]] = None,
    ) -> dict:
        """Cached front-end for `_aggregate_uncached`.

        Takes the same keyword arguments and returns the same grid dict. The
        result is memoized in an LRU cache of at most `_AGG_CACHE_MAX`
        entries, keyed by every argument. A cache hit returns the *same*
        dict object as the earlier call, so callers must not mutate it.
        """
        # Cache lookup. Keyed by every input that affects the SQL output.
        cache_key = (
            facet_by, x, y, color, color_agg,
            self._freeze_thresholds(x_thresholds_by_facet),
            self._freeze_thresholds(y_thresholds_by_facet),
            category_col,
            tuple(category_filter) if category_filter else None,
        )
        cached = self._agg_cache.get(cache_key)
        if cached is not None:
            # Touch for LRU ordering, then return the same dict — JS-side
            # `_apply_python_grid` only reads cells (no mutation), so a
            # shared reference is safe.
            self._agg_cache.move_to_end(cache_key)
            self._agg_cache_hits += 1
            return cached
        self._agg_cache_misses += 1
        result = self._aggregate_uncached(
            facet_by=facet_by, x=x, y=y, color=color, color_agg=color_agg,
            x_thresholds_by_facet=x_thresholds_by_facet,
            y_thresholds_by_facet=y_thresholds_by_facet,
            category_col=category_col,
            category_filter=category_filter,
        )
        self._agg_cache[cache_key] = result
        if len(self._agg_cache) > _AGG_CACHE_MAX:
            # Evict the least-recently-used entry (front of the OrderedDict).
            self._agg_cache.popitem(last=False)
        return result

    @staticmethod
    def _empty_grid(n_x: int, n_y: int) -> list:
        """Return `n_x` zeroed column dicts, each holding `n_y` zeroed cells.

        Every column and every cell is a distinct dict so they can be filled
        independently.
        """
        return [
            {
                "count": 0, "min": 0, "max": 0, "avg": 0,
                "median": 0, "std": 0,
                "bins": [
                    {
                        "count": 0, "min": 0, "max": 0, "avg": 0,
                        "median": 0, "std": 0, "std_ratio": 0,
                    }
                    for _ in range(n_y)
                ],
            }
            for _ in range(n_x)
        ]

    def _aggregate_uncached(
        self,
        *,
        facet_by: str,
        x: str,
        y: str,
        color: str,
        color_agg: str,
        x_thresholds_by_facet: dict[str, list],
        y_thresholds_by_facet: dict[str, list],
        category_col: Optional[str] = None,
        category_filter: Optional[list[str]] = None,
    ) -> dict:
        """
        Computes the heatmap grid for each facet.

        Returns a dict shaped to match what JSModel.calculate_box_metrics
        writes onto `faceted_bins[fac].column[i]` and `.column[i].bins[j]`:

            {
              facet_name: {
                "columns": [
                  {
                    "count": int,
                    "min": ..., "max": ..., "avg": ..., "median": ...,
                    "std": ...,
                    "bins": [
                      {"count": int, "min": ..., "max": ..., "avg": ...,
                       "median": ..., "std": ..., "std_ratio": float},
                      ...one entry per y-bin...
                    ]
                  },
                  ...one entry per x-bin (left-edge per threshold)...
                ]
              },
              ...
            }

        Column-level stats describe the y values of the column; cell-level
        stats describe the color values of the cell. Cells with zero matching
        rows still appear as zero-count slots so the JS renderer can address
        them by index. A facet with fewer than two x or y thresholds yields
        `{"columns": []}`.

        When `color_agg` is not one of the canonical cell fields, the
        requested aggregation is additionally stored under `cell[color_agg]`
        (unknown aggregator names fall back to AVG).
        """
        agg = _COLOR_AGG_SQL.get(color_agg, "AVG")

        # Optional category filter, shared by every facet query.
        cat_sql, params = self._category_clause(category_col, category_filter)
        cat_clause = f" AND {cat_sql}" if cat_sql else ""

        # Identifier quoting does not depend on the facet; do it once.
        facet_q = self._qi(facet_by)
        x_q = self._qi(x)
        y_q = self._qi(y)
        color_q = self._qi(color)

        result: dict = {}

        # One query per facet keeps thresholds tractable (each facet can
        # have its own x/y thresholds because _build_axis re-detects log
        # vs linear per facet).
        for facet, x_thresholds in x_thresholds_by_facet.items():
            y_thresholds = y_thresholds_by_facet.get(facet, [])
            if not x_thresholds or not y_thresholds:
                result[facet] = {"columns": []}
                continue

            n_x = len(x_thresholds) - 1
            n_y = len(y_thresholds) - 1
            if n_x <= 0 or n_y <= 0:
                result[facet] = {"columns": []}
                continue

            x_expr = self._coerce_for_threshold(x_q, x_thresholds)
            y_expr = self._coerce_for_threshold(y_q, y_thresholds)
            x_case = self._threshold_case(x_expr, x_thresholds, 'x_bin')
            y_case = self._threshold_case(y_expr, y_thresholds, 'y_bin')

            # Single query computes all three aggregation levels we need —
            # per-cell stats, per-column rollup, and the facet-level color
            # STDDEV for std_ratio — via GROUPING SETS.
            sql = f"""
                WITH binned AS (
                    SELECT
                        {x_case} AS x_bin,
                        {y_case} AS y_bin,
                        {color_q} AS color_val,
                        {y_q} AS y_val
                    FROM df
                    WHERE {facet_q} = ?
                      AND {x_q} IS NOT NULL
                      AND {y_q} IS NOT NULL
                      {cat_clause}
                ), kept AS (
                    SELECT * FROM binned WHERE x_bin IS NOT NULL AND y_bin IS NOT NULL
                )
                SELECT
                    x_bin, y_bin,
                    COUNT(*) AS row_count,
                    MIN(color_val) AS c_min,
                    MAX(color_val) AS c_max,
                    AVG(color_val) AS c_avg,
                    MEDIAN(color_val) AS c_median,
                    STDDEV(color_val) AS c_std,
                    {agg}(color_val) AS c_agg,
                    MIN(y_val) AS y_min,
                    MAX(y_val) AS y_max,
                    AVG(y_val) AS y_avg,
                    MEDIAN(y_val) AS y_median,
                    STDDEV(y_val) AS y_std,
                    GROUPING(x_bin) AS g_x,
                    GROUPING(y_bin) AS g_y
                FROM kept
                GROUP BY GROUPING SETS ((x_bin, y_bin), (x_bin), ())
                ORDER BY GROUPING(x_bin) + GROUPING(y_bin) DESC
            """
            facet_params = [facet] + params
            rows = self._conn.execute(sql, facet_params).fetchall()

            # Materialize an n_x × n_y empty grid, then fill from the query.
            columns = self._empty_grid(n_x, n_y)

            # Rows arrive in grouping-set order (grand → column → cell), so
            # facet_color_std is known by the time per-cell rows are
            # processed and std_ratio can be filled in-place.
            facet_color_std = 0.0
            for row in rows:
                (xi, yi, count, c_min, c_max, c_avg, c_median, c_std, c_agg,
                 y_min, y_max, y_avg, y_median, y_std, g_x, g_y) = row

                if g_x == 1 and g_y == 1:
                    # Grand total — single row, captures facet-color STDDEV.
                    # _safe_num maps NULL/NaN to 0 so std_ratio stays finite.
                    facet_color_std = self._safe_num(c_std)
                    continue

                if g_y == 1:
                    # Per-column rollup: stats over y values for this x_bin.
                    if xi is not None and 0 <= xi < n_x:
                        col = columns[xi]
                        col["count"] = int(count or 0)
                        col["min"] = self._safe_num(y_min)
                        col["max"] = self._safe_num(y_max)
                        col["avg"] = self._safe_num(y_avg)
                        col["median"] = self._safe_num(y_median)
                        col["std"] = self._safe_num(y_std)
                    continue

                # Per-cell stats.
                if (xi is not None and yi is not None
                        and 0 <= xi < n_x and 0 <= yi < n_y):
                    cell = columns[xi]["bins"][yi]
                    cell["count"] = int(count or 0)
                    cell["min"] = self._safe_num(c_min)
                    cell["max"] = self._safe_num(c_max)
                    cell["avg"] = self._safe_num(c_avg)
                    cell["median"] = self._safe_num(c_median)
                    cell["std"] = self._safe_num(c_std)
                    cell["std_ratio"] = (
                        cell["std"] / facet_color_std if facet_color_std else 0
                    )
                    # The JS heatmap reads `cell[color_agg]` for fill. The
                    # canonical names are already populated above; only a
                    # non-canonical name (e.g. "mean", "sum") gets an extra
                    # key. Writing unconditionally would clobber canonical
                    # fields — "count" with COUNT(color_val), or "std" /
                    # "std_ratio" with the AVG fallback.
                    if color_agg not in cell:
                        cell[color_agg] = self._safe_num(c_agg)

            result[facet] = {"columns": columns}

        return result

    def brush_indices(
        self,
        *,
        facet_by: str,
        x: str,
        y: str,
        facet: str,
        x_range: Optional[list] = None,
        y_range: Optional[list] = None,
        category_col: Optional[str] = None,
        category_filter: Optional[list[str]] = None,
    ) -> np.ndarray:
        """
        Returns the gp_idx values for rows that fall inside the given x/y
        brush ranges within the named facet (and optional category filter).

        A range counts as active only when it has exactly two elements; both
        bounds are inclusive. The result is an int32 array of distinct
        gp_idx values.

        Returns an empty array if no brush range is active — filter alone
        does not yield a selection (matches the legacy JS semantic). A
        cleared brush, even with a category filter still selected on the
        bar chart, should report zero selected records.
        """
        has_x = bool(x_range) and len(x_range) == 2
        has_y = bool(y_range) and len(y_range) == 2
        if not has_x and not has_y:
            return np.empty(0, dtype=np.int32)

        clauses = [f'{self._qi(facet_by)} = ?']
        params: list = [facet]
        if has_x:
            # JS sends ISO-like strings for Date axes. For TIMESTAMP WITH
            # TIME ZONE columns DuckDB parses naive strings in the *session*
            # timezone, not UTC, which can drop rows whose values fall in
            # the offset gap between UTC midnight and the local midnight
            # JS rounded toward. Coerce to UTC-aware datetime first so the
            # bind is unambiguous.
            x_lo = self._to_utc_if_str(x_range[0])
            x_hi = self._to_utc_if_str(x_range[1])
            clauses.append(f'{self._qi(x)} >= ? AND {self._qi(x)} <= ?')
            params.extend([x_lo, x_hi])
        if has_y:
            # JS pre-normalizes y_range to ascending data values when it
            # translates from row-index space, but sort here defensively so
            # any direct caller (e.g., tests) can still pass either order.
            y0 = self._to_utc_if_str(y_range[0])
            y1 = self._to_utc_if_str(y_range[1])
            lo, hi = sorted([y0, y1])
            clauses.append(f'{self._qi(y)} >= ? AND {self._qi(y)} <= ?')
            params.extend([lo, hi])
        cat_sql, cat_params = self._category_clause(category_col, category_filter)
        if cat_sql:
            clauses.append(cat_sql)
            params.extend(cat_params)
        where = " AND ".join(clauses)
        # DISTINCT guards the forthcoming node-scoped selection path: once a
        # list-valued (exploded) column drives the WHERE, a job touching N
        # matching nodes would otherwise return its gp_idx N times.
        sql = f'SELECT DISTINCT {self._qi("gp_idx")} FROM df WHERE {where}'
        arr = self._conn.execute(sql, params).fetchnumpy()
        # `fetchnumpy` returns a dict {col: ndarray}; pick the single column.
        indices = next(iter(arr.values())) if arr else np.empty(0, dtype=np.int64)
        return indices.astype(np.int32, copy=False)

    @staticmethod
    def _to_utc_if_str(v):
        """
        JS Date.toISOString() (after the `T`/`Z` strip) yields naive ISO
        strings even though the value is UTC. Bind them as UTC-aware
        datetimes so DuckDB's session-timezone interpretation can't shift
        the comparison boundary against TIMESTAMP WITH TIME ZONE columns.

        Non-strings pass through unchanged, as do strings that
        `datetime.fromisoformat` cannot parse. A parsed value that already
        carries an offset keeps it.
        """
        if not isinstance(v, str):
            return v
        from datetime import datetime, timezone
        s = v.strip()
        # Handle trailing 'Z' just in case, and the JS-stripped form.
        if s.endswith("Z"):
            s = s[:-1]
        try:
            dt = datetime.fromisoformat(s)
        except ValueError:
            return v  # Let DuckDB try to parse if our format guess is off.
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt

    @staticmethod
    def _safe_num(v):
        """Return `v` as a float, mapping ``None`` and float NaN to ``0``."""
        if v is None:
            return 0
        if isinstance(v, float) and (v != v):  # NaN is the only value != itself
            return 0
        return float(v)

    @staticmethod
    def _coerce_for_threshold(col_sql: str, thresholds: list) -> str:
        """
        Hook for aligning a column expression with its threshold values.

        Currently an identity function: `col_sql` is returned unchanged
        whatever the threshold type, and `thresholds` is not inspected.
        Datetime thresholds are expected to arrive as ISO strings (rendered
        as TIMESTAMP literals by `_sql_literal`), so no epoch-ms coercion of
        the column is performed here.
        """
        return col_sql

    @staticmethod
    def _threshold_case(col_sql: str, thresholds: list, alias: str) -> str:
        """
        Builds a CASE expression that maps `col_sql` into an integer bin
        index against `thresholds`. Uniform width_bucket would be faster
        but doesn't handle non-uniform (log-scale) thresholds.

        `alias` is accepted for call-site readability but is not used; the
        caller appends its own `AS ...`.

        Outer-bin semantics match what JS does:
          - The FIRST bin is an underflow bucket — any value < threshold[1]
            lands here. JS uses log-scale thresholds that start at
            `log_values_floor = 1` and calls `sanitize_data_for_log` to
            replace zeros with 1 *before* binning. Python sees the raw
            DataFrame and would otherwise drop every zero-valued row to
            NULL; making bin 0 an underflow bucket absorbs them.
          - The LAST bin is an overflow bucket — any value >= the second-
            to-last threshold lands here. Matches `binValues`' overflow
            check (`i === thresholds.length - 2`) so values at exactly
            `stats.max` aren't lost.
          - With a single bin (two thresholds or fewer) that bin is both
            the underflow and the overflow bucket, so every non-NULL value
            maps to bin 0.
        """
        n_bins = len(thresholds) - 1
        if n_bins <= 1:
            return f"CASE WHEN {col_sql} IS NOT NULL THEN 0 ELSE NULL END"
        literals = [AggregationEngine._sql_literal(t) for t in thresholds]
        last_i = n_bins - 1
        parts = []
        for i in range(n_bins):
            lo, hi = literals[i], literals[i + 1]
            if i == 0:
                parts.append(f"WHEN {col_sql} < {hi} THEN {i}")
            elif i == last_i:
                parts.append(f"WHEN {col_sql} >= {lo} THEN {i}")
            else:
                parts.append(f"WHEN {col_sql} >= {lo} AND {col_sql} < {hi} THEN {i}")
        return "CASE " + " ".join(parts) + " ELSE NULL END"

    @staticmethod
    def _sql_literal(v) -> str:
        """Render a threshold value as an inline SQL literal.

        ``None`` becomes ``NULL``; strings are treated as datetime thresholds
        and rendered as ``TIMESTAMP '...'`` with embedded single quotes
        doubled so the value cannot terminate the literal; anything else is
        rendered with ``str()``.
        """
        if v is None:
            return "NULL"
        if isinstance(v, str):
            # Datetime threshold sent as ISO string.
            escaped = v.replace("'", "''")
            return f"TIMESTAMP '{escaped}'"
        return str(v)