vedana_backoffice-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1590 @@
1
+ import logging
2
+ import threading
3
+ import time
4
+ from collections import deque
5
+ from dataclasses import dataclass
6
+ from datetime import datetime
7
+ from queue import Empty, Queue
8
+ from typing import Any, Iterable
9
+
10
+ import pandas as pd
11
+ import reflex as rx
12
+ import sqlalchemy as sa
13
+ from datapipe.compute import run_steps
14
+ from datapipe.step.batch_transform import BaseBatchTransformStep
15
+ from vedana_etl.app import app as etl_app
16
+ from vedana_etl.app import pipeline
17
+ from vedana_etl.config import DBCONN_DATAPIPE
18
+
19
+ from vedana_backoffice.graph.build import build_canonical, derive_step_edges, derive_table_edges, refine_layer_orders
20
+ from vedana_backoffice.util import safe_render_value
21
+
22
+
23
+ @dataclass
24
+ class EtlTableStats:
25
+ table_name: str
26
+ process_ts: float # latest process_ts recorded in the meta table
27
+ row_count: int # rows where delete_ts IS NULL (deleted rows excluded)
28
+
29
+ last_update_ts: float # end of the last detected run window (max update_ts)
30
+
31
+
32
+ @dataclass
33
+ class EtlDataTableStats(EtlTableStats):
34
+ last_update_rows: int # rows updated (not merely created) during the last run window
35
+ last_added_rows: int # rows created during the last run window
36
+ last_deleted_rows: int # rows deleted during the last run window
37
+
38
+
39
+ @dataclass
40
+ class EtlStepRunStats:
41
+ """Stats for a pipeline step's last run (grouped batch execution)."""
42
+
43
+ step_name: str
44
+ meta_table_name: str # The transform's meta table name
45
+ last_run_start: float # Start of the last run window (earliest process_ts in the run)
46
+ last_run_end: float # End of the last run window (latest process_ts in the run)
47
+ rows_processed: int # Total rows processed in the last run
48
+ rows_success: int # Rows with is_success=True in last run
49
+ rows_failed: int # Rows with is_success=False in last run
50
+ total_success: int # All rows with is_success=True (all time)
51
+ total_failed: int # All rows with is_success=False (all time)
52
+
53
+
54
+ # Threshold in seconds to group consecutive process_ts values into a single "run"
55
+ # If gap between consecutive process_ts > threshold, it's a new run
56
+ RUN_GAP_THRESHOLD_SECONDS = 300 # 5 minutes
57
+
58
+
59
+ class EtlState(rx.State):
60
+ """ETL control state and actions."""
61
+
62
+ default_pipeline_name = "main"
63
+
64
+ # Selections
65
+ selected_flow: str = "all"
66
+ selected_stage: str = "all"
67
+ selected_pipeline: str = default_pipeline_name
68
+
69
+ # Derived from pipeline
70
+ all_steps: list[dict[str, Any]] = [] # [{name, type, inputs, outputs, labels}]
71
+ filtered_steps: list[dict[str, Any]] = []
72
+ available_tables: list[str] = []
73
+ available_flows: list[str] = []
74
+ available_stages: list[str] = []
75
+ available_pipelines: list[str] = [default_pipeline_name]
76
+ pipeline_steps: dict[str, list[dict[str, Any]]] = {}
77
+ pipeline_flows: dict[str, list[str]] = {}
78
+ pipeline_stages: dict[str, list[str]] = {}
79
+
80
+ # Graph view state
81
+ graph_nodes: list[dict[str, Any]] = [] # [{index, name, x, y, w, h, labels_str}]
82
+ graph_edges: list[dict[str, Any]] = [] # [{source, target, label, path, label_x, label_y}]
83
+ graph_width_px: int = 1200
84
+ graph_height_px: int = 600
85
+ graph_svg: str = ""
86
+ graph_width_css: str = "1200px"
87
+ graph_height_css: str = "600px"
88
+
89
+ # Run status
90
+ is_running: bool = False
91
+ logs: list[str] = []
92
+ max_log_lines: int = 2000
93
+
94
+ # UI toggles
95
+ sidebar_open: bool = True
96
+ logs_open: bool = True
97
+
98
+ # Multi-select of nodes (by step index)
99
+ selected_node_ids: list[int] = []
100
+ selection_source: str = "filter" # "filter" (driven by flow/stage filters) or "manual" (explicit node clicks)
101
+
102
+ # View mode: False = step-centric, True = data(table)-centric
103
+ data_view: bool = False
104
+
105
+ # Table metadata for data-centric view
106
+ table_meta: dict[str, EtlDataTableStats] = {}
107
+ step_meta: dict[int, EtlStepRunStats] = {} # stats for the step-centric view (keyed by step index)
108
+
109
+ # Table preview panel state
110
+ preview_open: bool = False
111
+ preview_anchor_left: str = "0px"
112
+ preview_anchor_top: str = "0px"
113
+
114
+ # Step status cache: name -> {total_idx_count, changed_idx_count}
115
+ step_status_by_name: dict[str, dict[str, int]] = {}
116
+ step_status_loading: bool = False
117
+
118
+ # Table preview
119
+ preview_table_name: str | None = None
120
+ preview_display_name: str = ""
121
+ preview_rows: list[dict[str, Any]] = []
122
+ preview_columns: list[str] = []
123
+ has_preview: bool = False
124
+
125
+ # Server-side pagination for preview
126
+ preview_page: int = 0 # 0-indexed current page
127
+ preview_page_size: int = 100 # rows per page
128
+ preview_total_rows: int = 0 # total count
129
+ preview_is_meta_table: bool = False # whether we're viewing _meta table
130
+
131
+ # Show only changes from last run (with styling)
132
+ preview_changes_only: bool = False
133
+
134
+ # Expandable row tracking for preview
135
+ preview_expanded_rows: list[str] = []
136
+
137
+ def toggle_preview_row_expand(self, row_id: str) -> None:
138
+ """Toggle expansion state for a preview row."""
139
+ row_id = str(row_id or "")
140
+ if not row_id:
141
+ return
142
+ current = set(self.preview_expanded_rows)
143
+ if row_id in current:
144
+ current.remove(row_id)
145
+ else:
146
+ current.add(row_id)
147
+ self.preview_expanded_rows = list(current)
148
+ # Update expanded state in rows to trigger UI refresh
149
+ updated_rows = []
150
+ for row in self.preview_rows:
151
+ new_row = dict(row)
152
+ new_row["expanded"] = row.get("row_id", "") in current
153
+ updated_rows.append(new_row)
154
+ self.preview_rows = updated_rows
155
+
156
+ def _start_log_capture(self) -> tuple[Queue[str], logging.Handler, logging.Logger]:
157
+ q: Queue[str] = Queue()
158
+
159
+ class _QueueHandler(logging.Handler):
160
+ def __init__(self, queue: Queue[str]):
161
+ super().__init__()
162
+ self._q = queue
163
+
164
+ def emit(self, record: logging.LogRecord) -> None: # type: ignore[override]
165
+ try:
166
+ msg = self.format(record)
167
+ self._q.put(msg)
168
+ except Exception:
169
+ pass
170
+
171
+ handler = _QueueHandler(q)
172
+ handler.setLevel(logging.INFO)
173
+ handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
174
+
175
+ logger = logging.getLogger("datapipe")
176
+ logger.setLevel(logging.INFO)
177
+ logger.addHandler(handler)
178
+
179
+ return q, handler, logger
180
+
181
+ def _stop_log_capture(self, handler: logging.Handler, logger: logging.Logger) -> None:
182
+ try:
183
+ logger.removeHandler(handler)
184
+ except Exception:
185
+ pass
186
+
187
+ def _drain_queue_into_logs(self, q: Queue[str]) -> None:
188
+ while True:
189
+ try:
190
+ msg = q.get_nowait()
191
+ except Empty:
192
+ break
193
+ else:
194
+ self.logs.append(msg)
195
+
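# Illustrative sketch (not part of this module) of how the log-capture
# helpers above fit together -- the same pattern run_selected() uses further
# below: attach a queue-backed handler to the "datapipe" logger, run the
# work in a background thread, and drain messages into a list while it runs.
import logging
import threading
import time
from queue import Empty, Queue
from typing import Callable


def stream_logs_while_running(work: Callable[[], None]) -> list[str]:
    q: Queue[str] = Queue()

    class _QueueHandler(logging.Handler):
        def emit(self, record: logging.LogRecord) -> None:
            q.put(self.format(record))

    handler = _QueueHandler()
    logger = logging.getLogger("datapipe")
    logger.setLevel(logging.INFO)
    logger.addHandler(handler)
    captured: list[str] = []
    try:
        t = threading.Thread(target=work, daemon=True)
        t.start()
        while t.is_alive():
            try:
                captured.append(q.get_nowait())
            except Empty:
                time.sleep(0.1)
        while True:  # drain anything left after the thread finishes
            try:
                captured.append(q.get_nowait())
            except Empty:
                break
    finally:
        logger.removeHandler(handler)
    return captured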
196
+ def _append_log(self, msg: str) -> None:
197
+ timestamp = time.strftime("%H:%M:%S")
198
+ self.logs.append(f"[{timestamp}] {msg}")
199
+ self.logs = self.logs[-self.max_log_lines :]
200
+
201
+ def load_pipeline_metadata(self) -> None:
202
+ """Populate metadata by introspecting the ETL pipeline definition."""
203
+
204
+ pipeline_buckets: dict[str, dict[str, Any]] = {}
205
+ pipeline_order: list[str] = []
206
+ tables: set[str] = set()
207
+
208
+ def get_bucket(name: str) -> dict[str, Any]:
209
+ key = str(name).strip() or self.default_pipeline_name
210
+ if key not in pipeline_buckets:
211
+ pipeline_buckets[key] = {"steps": [], "flows": set(), "stages": set()}
212
+ pipeline_order.append(key)
213
+ return pipeline_buckets[key]
214
+
215
+ for idx, step in enumerate(pipeline.steps):
216
+ inputs = [el.name for el in getattr(step, "inputs", [])]
217
+ outputs = [el.name for el in getattr(step, "outputs", [])]
218
+ labels = getattr(step, "labels", []) or []
219
+
220
+ meta = {
221
+ "index": idx,
222
+ "name": step.func.__name__, # type: ignore[attr-defined]
223
+ "step_type": type(step).__name__,
224
+ "inputs": list(inputs),
225
+ "outputs": list(outputs),
226
+ "labels": list(labels),
227
+ "inputs_str": ", ".join([str(x) for x in list(inputs)]),
228
+ "outputs_str": ", ".join([str(x) for x in list(outputs)]),
229
+ "labels_str": ", ".join([f"{k}:{v}" for k, v in list(labels)]),
230
+ }
231
+
232
+ for input_name in inputs:
233
+ tables.add(input_name)
234
+ for output_name in outputs:
235
+ tables.add(output_name)
236
+
237
+ normalized_labels: list[tuple[str, str]] = []
238
+ for key, value in labels:
239
+ k = str(key)
240
+ v = str(value)
241
+ if v == "":
242
+ continue
243
+ normalized_labels.append((k, v))
244
+
245
+ pipeline_names = {v for k, v in normalized_labels if k == "pipeline"}
246
+ flow_labels = {v for k, v in normalized_labels if k == "flow"}
247
+ stage_labels = {v for k, v in normalized_labels if k == "stage"}
248
+
249
+ if not pipeline_names:
250
+ pipeline_names = {self.default_pipeline_name}
251
+
252
+ for pipeline_name in pipeline_names:
253
+ bucket = get_bucket(pipeline_name)
254
+ bucket["steps"].append(meta)
255
+ bucket["flows"].update(flow_labels)
256
+ bucket["stages"].update(stage_labels)
257
+
258
+ # Ensure default pipeline is always available even if no explicit labels exist.
259
+ get_bucket(self.default_pipeline_name)
260
+
261
+ self.pipeline_steps = {name: list(bucket["steps"]) for name, bucket in pipeline_buckets.items()}
262
+ self.pipeline_flows = {name: sorted(bucket["flows"]) for name, bucket in pipeline_buckets.items()}
263
+ self.pipeline_stages = {name: sorted(bucket["stages"]) for name, bucket in pipeline_buckets.items()}
264
+
265
+ ordered_pipelines: list[str] = []
266
+ seen: set[str] = set()
267
+ default_name = self.default_pipeline_name
268
+
269
+ if default_name in pipeline_buckets:
270
+ ordered_pipelines.append(default_name)
271
+ seen.add(default_name)
272
+
273
+ for name in pipeline_order:
274
+ if name in seen:
275
+ continue
276
+ ordered_pipelines.append(name)
277
+ seen.add(name)
278
+
279
+ for name in pipeline_buckets.keys():
280
+ if name in seen:
281
+ continue
282
+ ordered_pipelines.append(name)
283
+ seen.add(name)
284
+
285
+ if not ordered_pipelines:
286
+ ordered_pipelines = [default_name]
287
+
288
+ self.available_pipelines = ordered_pipelines
289
+ self.available_tables = sorted(tables)
290
+
291
+ self._load_table_stats()
292
+ self._load_step_stats()
293
+ self._apply_pipeline_selection(
294
+ target_pipeline=self.selected_pipeline,
295
+ preserve_filters=True,
296
+ preserve_selection=True,
297
+ )
298
+
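# Illustrative sketch of the label bucketing performed in
# load_pipeline_metadata() above, on plain dicts instead of pipeline steps
# (the step data below is made up; steps without a "pipeline" label fall
# back to the default "main" bucket):
steps = [
    {"name": "load_raw", "labels": [("pipeline", "main"), ("flow", "ingest")]},
    {"name": "score", "labels": [("flow", "ml"), ("stage", "enrich")]},  # no pipeline label -> "main"
]
buckets: dict[str, dict[str, set[str]]] = {}
for step in steps:
    labels = [(str(k), str(v)) for k, v in step["labels"] if str(v)]
    pipelines = {v for k, v in labels if k == "pipeline"} or {"main"}
    for name in pipelines:
        bucket = buckets.setdefault(name, {"flows": set(), "stages": set()})
        bucket["flows"].update(v for k, v in labels if k == "flow")
        bucket["stages"].update(v for k, v in labels if k == "stage")
# buckets == {"main": {"flows": {"ingest", "ml"}, "stages": {"enrich"}}}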
299
+ def set_pipeline(self, pipeline_name: str) -> None:
300
+ """Switch active pipeline tab."""
301
+ desired = str(pipeline_name).strip() or self.default_pipeline_name
302
+ if desired not in self.pipeline_steps:
303
+ desired = self.default_pipeline_name
304
+ if desired == self.selected_pipeline:
305
+ return
306
+ self._apply_pipeline_selection(target_pipeline=desired)
307
+
308
+ def _apply_pipeline_selection(
309
+ self,
310
+ target_pipeline: str | None = None,
311
+ preserve_filters: bool = False,
312
+ preserve_selection: bool = False,
313
+ ) -> None:
314
+ previous_pipeline = getattr(self, "selected_pipeline", self.default_pipeline_name)
315
+ prev_flow = self.selected_flow
316
+ prev_stage = self.selected_stage
317
+ prev_selection = list(self.selected_node_ids)
318
+ prev_selection_source = self.selection_source
319
+
320
+ active_name = target_pipeline or previous_pipeline or self.default_pipeline_name
321
+ if active_name not in self.pipeline_steps:
322
+ active_name = self.default_pipeline_name
323
+ if active_name not in self.pipeline_steps and self.pipeline_steps:
324
+ active_name = next(iter(self.pipeline_steps.keys()))
325
+
326
+ self.selected_pipeline = active_name
327
+
328
+ current_steps = self.pipeline_steps.get(active_name, [])
329
+ self.all_steps = list(current_steps)
330
+
331
+ flow_values = self.pipeline_flows.get(active_name, [])
332
+ stage_values = self.pipeline_stages.get(active_name, [])
333
+ self.available_flows = ["all", *flow_values]
334
+ self.available_stages = ["all", *stage_values]
335
+
336
+ if preserve_filters and (not prev_flow or prev_flow in flow_values):
337
+ self.selected_flow = prev_flow
338
+ else:
339
+ self.selected_flow = "all"
340
+
341
+ if preserve_filters and (not prev_stage or prev_stage in stage_values):
342
+ self.selected_stage = prev_stage
343
+ else:
344
+ self.selected_stage = "all"
345
+
346
+ if preserve_selection and previous_pipeline == active_name:
347
+ self.selected_node_ids = prev_selection
348
+ self.selection_source = prev_selection_source
349
+ else:
350
+ self.selected_node_ids = []
351
+ self.selection_source = "filter"
352
+
353
+ self._update_filtered_steps()
354
+
355
+ def toggle_sidebar(self) -> None:
356
+ self.sidebar_open = not self.sidebar_open
357
+
358
+ def toggle_logs(self) -> None:
359
+ self.logs_open = not self.logs_open
360
+
361
+ def set_flow(self, flow: str) -> None:
362
+ self.selected_flow = flow
363
+ self.selection_source = "filter"
364
+ self._update_filtered_steps()
365
+
366
+ def set_stage(self, stage: str) -> None:
367
+ self.selected_stage = stage
368
+ self.selection_source = "filter"
369
+ self._update_filtered_steps()
370
+
371
+ def reset_filters(self) -> None:
372
+ """Reset flow and stage selections and rebuild the graph."""
373
+ self.selected_flow = "all"
374
+ self.selected_stage = "all"
375
+ self.selected_node_ids = []
376
+ self.selection_source = "filter"
377
+ self._update_filtered_steps()
378
+
379
+ def set_data_view(self, checked: bool) -> None:
380
+ """Toggle between step-centric and data-centric graph."""
381
+ try:
382
+ self.data_view = bool(checked)
383
+ except Exception:
384
+ self.data_view = False
385
+ # Do not alter filters or explicit selections automatically
386
+ self._rebuild_graph()
387
+
388
+ def toggle_node_selection(self, index: int) -> None:
389
+ """Toggle a node's selection: a click in filter mode switches to a single manual selection; in manual mode it toggles the node, and an empty manual set falls back to the filter."""
390
+ try:
391
+ sid = int(index)
392
+ except Exception:
393
+ return
394
+
395
+ # Compute current filter-driven set
396
+ filter_ids: set[int] = set()
397
+ for m in self.filtered_steps or []:
398
+ try:
399
+ midx = int(m.get("index", -1))
400
+ if midx >= 0:
401
+ filter_ids.add(midx)
402
+ except Exception:
403
+ continue
404
+
405
+ manual_set: set[int] = set(self.selected_node_ids or [])
406
+
407
+ # Special case: when current selection is filter-driven, on click switch to manual with a single selection
408
+ if self.selection_source == "filter":
409
+ # If current filter selects all or many, treat as filter-driven
410
+ # On click, switch to manual with single selection
411
+ self.selected_node_ids = [sid]
412
+ self.selection_source = "manual"
413
+ self._rebuild_graph()
414
+ return
415
+
416
+ # Otherwise we are in manual mode: toggle within manual set
417
+ if sid in manual_set:
418
+ manual_set.remove(sid)
419
+ else:
420
+ manual_set.add(sid)
421
+ # If manual set becomes empty, fall back to filter selection
422
+ if not manual_set:
423
+ self.selection_source = "filter"
424
+ self.selected_node_ids = []
425
+ else:
426
+ self.selection_source = "manual"
427
+ self.selected_node_ids = sorted(list(manual_set))
428
+ self._rebuild_graph()
429
+
430
+ def clear_node_selection(self) -> None:
431
+ self.selected_node_ids = []
432
+ self._rebuild_graph()
433
+
434
+ def _get_current_pipeline_step_indices(self) -> set[int]:
435
+ """Get the indices of steps that belong to the currently selected pipeline."""
436
+ return {int(m.get("index", -1)) for m in self.all_steps if m.get("index") is not None}
437
+
438
+ def _filter_steps_by_labels(self, steps: Iterable[Any], restrict_to_pipeline: bool = True) -> list[Any]:
439
+ """Filter steps by flow/stage labels.
440
+
441
+ Args:
442
+ steps: The steps to filter (from etl_app.steps)
443
+ restrict_to_pipeline: If True, only include steps from the current pipeline
444
+ """
445
+ # Get valid step indices for the current pipeline
446
+ if restrict_to_pipeline:
447
+ valid_indices = self._get_current_pipeline_step_indices()
448
+ else:
449
+ valid_indices = None
450
+
451
+ def matches(step: Any, idx: int) -> bool:
452
+ # Check if step is in the current pipeline
453
+ if valid_indices is not None and idx not in valid_indices:
454
+ return False
455
+
456
+ # Check flow/stage labels
457
+ if self.selected_flow == "all" and self.selected_stage == "all":
458
+ return True
459
+
460
+ labels = getattr(step, "labels", []) or []
461
+ label_map: dict[str, set[str]] = {}
462
+ for key, value in labels:
463
+ label_map.setdefault(str(key), set()).add(str(value))
464
+
465
+ if self.selected_flow != "all" and self.selected_flow not in label_map.get("flow", set()):
466
+ return False
467
+ if self.selected_stage != "all" and self.selected_stage not in label_map.get("stage", set()):
468
+ return False
469
+ return True
470
+
471
+ return [s for idx, s in enumerate(steps) if matches(s, idx)]
472
+
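# Minimal sketch of the flow/stage matching rule used above (and in
# _update_filtered_steps below): "all" acts as a wildcard and the two
# filters combine with AND. The label data is made up.
def matches_labels(labels: list[tuple[str, str]], flow: str, stage: str) -> bool:
    label_map: dict[str, set[str]] = {}
    for key, value in labels:
        label_map.setdefault(str(key), set()).add(str(value))
    flow_ok = flow == "all" or flow in label_map.get("flow", set())
    stage_ok = stage == "all" or stage in label_map.get("stage", set())
    return flow_ok and stage_ok


assert matches_labels([("flow", "ingest"), ("stage", "raw")], "ingest", "all")
assert not matches_labels([("flow", "ingest")], "ingest", "enrich")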
473
+ def _update_filtered_steps(self) -> None:
474
+ """Update filtered_steps used for UI from all_steps based on current filters."""
475
+
476
+ def matches_meta(meta: dict[str, Any]) -> bool:
477
+ labels = meta.get("labels", []) or []
478
+ label_map: dict[str, set[str]] = {}
479
+ for key, value in labels:
480
+ label_map.setdefault(str(key), set()).add(str(value))
481
+ # Both conditions must match (flow and stage filters combine with AND)
482
+ flow_match = self.selected_flow == "all" or self.selected_flow in label_map.get("flow", set())
483
+ stage_match = self.selected_stage == "all" or self.selected_stage in label_map.get("stage", set())
484
+ return flow_match and stage_match
485
+
486
+ if self.selected_flow == "all" and self.selected_stage == "all":
487
+ self.filtered_steps = list(self.all_steps)
488
+ else:
489
+ self.filtered_steps = [m for m in self.all_steps if matches_meta(m)]
490
+ # Keep graph in sync with filter changes
491
+ self._rebuild_graph()
492
+
493
+ # --- Graph building and layout ---
494
+
495
+ def _rebuild_graph(self) -> None:
496
+ """Build graph nodes/edges and compute a simple layered layout.
497
+
498
+ Nodes are steps; edges are derived when an output table of one step
499
+ appears as an input table of another step. A basic DAG layering is
500
+ computed using indegrees (Kahn) and used to place nodes left-to-right.
501
+ """
502
+ try:
503
+ metas = self.all_steps # Build graph based on current view mode
504
+ cg = build_canonical(metas) # Build canonical graph indexes once
505
+
506
+ # Styling: basic node size and spacing constants (px)
507
+ MIN_NODE_W = 220
508
+ MAX_NODE_W = 420
509
+ NODE_H = 90
510
+ H_SPACING = 120
511
+ V_SPACING = 40
512
+ MARGIN = 24
513
+
514
+ if not self.data_view: # -------- STEP-CENTRIC VIEW --------
515
+ step_ids: list[int] = sorted([s.index for s in cg.steps])
516
+ name_by: dict[int, str] = dict(cg.step_name_by_index)
517
+ step_type_by: dict[int, str] = dict(cg.step_type_by_index)
518
+ # Optional labels_str for node text
519
+ labels_str_by: dict[int, str] = {}
520
+ for m in metas:
521
+ try:
522
+ idx = int(m.get("index", -1))
523
+ labels_str_by[idx] = str(m.get("labels_str", ""))
524
+ except Exception:
525
+ pass
526
+
527
+ unique_ids = list(step_ids)
528
+
529
+ # Derive edges by table indexes (linear)
530
+ edges = derive_step_edges(cg)
531
+
532
+ # Compute indegrees for Kahn layering
533
+ indeg: dict[int, int] = {sid: 0 for sid in unique_ids}
534
+ children: dict[int, list[int]] = {sid: [] for sid in unique_ids}
535
+ parents: dict[int, list[int]] = {sid: [] for sid in unique_ids}
536
+ for s, t, _ in edges:
537
+ indeg[t] = indeg.get(t, 0) + 1
538
+ children.setdefault(s, []).append(t)
539
+ parents.setdefault(t, []).append(s)
540
+
541
+ # Initialize layers
542
+ layer_by: dict[int, int] = {sid: 0 for sid in unique_ids}
543
+
544
+ q: deque[int] = deque([sid for sid in unique_ids if indeg.get(sid, 0) == 0])
545
+ visited: set[int] = set()
546
+ while q:
547
+ sid = q.popleft()
548
+ visited.add(sid)
549
+ for ch in children.get(sid, []):
550
+ # longest path layering
551
+ layer_by[ch] = max(layer_by.get(ch, 0), layer_by.get(sid, 0) + 1)
552
+ indeg[ch] -= 1
553
+ if indeg[ch] == 0:
554
+ q.append(ch)
555
+
556
+ # Any nodes not visited (cycle/isolated) keep default layer 0
557
+ # Group nodes by layer
558
+ layers: dict[int, list[int]] = {}
559
+ max_layer = 0
560
+ for sid in unique_ids:
561
+ layer = layer_by.get(sid, 0)
562
+ max_layer = max(max_layer, layer)
563
+ layers.setdefault(layer, []).append(sid)
564
+
565
+ # Stable order in each layer by barycenter of incoming neighbors (reduces edge length)
566
+ for layer, arr in list(layers.items()):
567
+
568
+ def _barycenter(node_id: int) -> float:
569
+ parent_ids = [s for s, t, _ in edges if t == node_id]
570
+ if not parent_ids:
571
+ return float(layer)
572
+ return sum([layer_by.get(p, 0) for p in parent_ids]) / float(len(parent_ids))
573
+
574
+ layers[layer] = sorted(arr, key=lambda i: (_barycenter(i), name_by.get(i, "")))
575
+
576
+ refine_layer_orders(layers, parents, children, max_layer)
577
+
578
+ # Pre-compute content-based widths/heights per node
579
+ w_by: dict[int, int] = {}
580
+ h_by: dict[int, int] = {}
581
+ for sid in unique_ids:
582
+ nlen = len(name_by.get(sid, "")) + 15 # +15 chars of headroom for the step type label (BatchTransform etc.)
583
+ llen = len(labels_str_by.get(sid, ""))
584
+ w = min(max(nlen * 8 + 40, llen * 6 + 10, MIN_NODE_W), MAX_NODE_W)
585
+ w_by[sid] = w
586
+ chars_per_line = max(10, int((w - 40) / 7))
587
+ lines = 1
588
+ try:
589
+ lines = max(1, -(-llen // chars_per_line)) # -(-a // b) is ceiling division
590
+ except Exception:
591
+ lines = 1
592
+ base_h = NODE_H
593
+ extra_lines = max(0, lines - 2)
594
+ h_by[sid] = base_h + extra_lines * 14
595
+
596
+ else: # -------- DATA-CENTRIC VIEW --------
597
+ # Build table-centric graph inputs from canonical
598
+ table_list = sorted(list(set([t for s in cg.steps for t in s.inputs + s.outputs])))
599
+ table_id_by_name: dict[str, int] = {name: i for i, name in enumerate(table_list)}
600
+ # Create pseudo ids for layout
601
+ unique_ids = list(range(len(table_list)))
602
+ name_by = {i: n for i, n in enumerate(table_list)}
603
+ labels_str_by = {i: "" for i in unique_ids}
604
+ # Edges between tables from canonical
605
+ table_edges = derive_table_edges(cg)
606
+ # Build adjacency for layering based on table graph (DV-specific names)
607
+ edges_dv: list[tuple[int, int, list[str]]] = []
608
+ for s_id, t_id, _ in table_edges:
609
+ if s_id < 0:
610
+ continue
611
+ edges_dv.append((s_id, t_id, []))
612
+ indeg2: dict[int, int] = {sid: 0 for sid in unique_ids}
613
+ children2: dict[int, list[int]] = {sid: [] for sid in unique_ids}
614
+ parents2: dict[int, list[int]] = {sid: [] for sid in unique_ids}
615
+ for s2, t2, _ in edges_dv:
616
+ indeg2[t2] = indeg2.get(t2, 0) + 1
617
+ children2.setdefault(s2, []).append(t2)
618
+ parents2.setdefault(t2, []).append(s2)
619
+ layer_by2: dict[int, int] = {sid: 0 for sid in unique_ids}
620
+
621
+ q2: deque[int] = deque([sid for sid in unique_ids if indeg2.get(sid, 0) == 0])
622
+ visited2: set[int] = set()
623
+ while q2:
624
+ sid = q2.popleft()
625
+ visited2.add(sid)
626
+ for ch in children2.get(sid, []):
627
+ layer_by2[ch] = max(layer_by2.get(ch, 0), layer_by2.get(sid, 0) + 1)
628
+ indeg2[ch] -= 1
629
+ if indeg2[ch] == 0:
630
+ q2.append(ch)
631
+ layers2: dict[int, list[int]] = {}
632
+ max_layer = 0
633
+ for sid2 in unique_ids:
634
+ layer = layer_by2.get(sid2, 0)
635
+ max_layer = max(max_layer, layer)
636
+ layers2.setdefault(layer, []).append(sid2)
637
+ # Data view: order by barycenter of incoming neighbors to reduce crossings/length
638
+ for layer, arr in list(layers2.items()):
639
+
640
+ def _barycenter(node_id: int) -> float:
641
+ parent_ids = [s for s, t, _ in edges_dv if t == node_id]
642
+ if not parent_ids:
643
+ return float(layer)
644
+ return sum([layer_by2.get(p, 0) for p in parent_ids]) / float(len(parent_ids))
645
+
646
+ layers2[layer] = sorted(arr, key=lambda i: (_barycenter(i), name_by.get(i, "")))
647
+
648
+ refine_layer_orders(layers2, parents2, children2, max_layer)
649
+ # Sizes for tables
650
+ w_by = {}
651
+ h_by = {}
652
+ for sid in unique_ids:
653
+ nlen = len(name_by.get(sid, ""))
654
+ est = max(nlen * 8 + 60, MIN_NODE_W)
655
+ w = min(max(int(est), MIN_NODE_W), MAX_NODE_W)
656
+ w_by[sid] = w
657
+ h_by[sid] = NODE_H
658
+
659
+ # Selected node ids based on selection source
660
+ selected_ids: set[int] = set()
661
+ try:
662
+ if not self.data_view:
663
+ if self.selection_source == "manual" and self.selected_node_ids:
664
+ for midx in self.selected_node_ids:
665
+ selected_ids.add(int(midx))
666
+ else:
667
+ for m in self.filtered_steps or []:
668
+ midx = int(m.get("index", -1))
669
+ if midx >= 0:
670
+ selected_ids.add(midx)
671
+ except Exception:
672
+ selected_ids = set()
673
+
674
+ # Column widths (max of nodes in that layer)
675
+ col_w: dict[int, int] = (
676
+ {
677
+ _l: (max([w_by[sid] for sid in layers.get(_l, [])]) if layers.get(_l) else MIN_NODE_W)
678
+ for _l in layers
679
+ }
680
+ if not self.data_view
681
+ else {
682
+ _l: (max([w_by[sid] for sid in layers2.get(_l, [])]) if layers2.get(_l) else MIN_NODE_W)
683
+ for _l in (layers2 if "layers2" in locals() else {})
684
+ }
685
+ )
686
+
687
+ # Prefix sums for x offsets per layer
688
+ def layer_x(layer: int) -> int:
689
+ x = MARGIN
690
+ for i in range(0, layer):
691
+ x += col_w.get(i, MIN_NODE_W) + H_SPACING
692
+ return x
693
+
694
+ # Compute positions
695
+ nodes: list[dict[str, Any]] = []
696
+ pos_by: dict[int, tuple[int, int]] = {}
697
+ max_rows = max(
698
+ [len((layers2 if self.data_view else layers).get(layer, [])) for layer in range(0, max_layer + 1)]
699
+ or [1]
700
+ )
701
+ # compute row heights as max node height across layers for each row index
702
+ row_heights: list[int] = [NODE_H for _ in range(max_rows)]
703
+ for layer in range(0, max_layer + 1):
704
+ cols = (layers2 if self.data_view else layers).get(layer, [])
705
+ for r_idx, sid in enumerate(cols):
706
+ row_heights[r_idx] = max(row_heights[r_idx], h_by.get(sid, NODE_H))
707
+
708
+ # precompute y offsets
709
+ y_offsets: list[int] = []
710
+ acc = MARGIN
711
+ for r in range(max_rows):
712
+ y_offsets.append(acc)
713
+ acc += row_heights[r] + V_SPACING
714
+
715
+ for _l in range(0, max_layer + 1):
716
+ cols = (layers2 if self.data_view else layers).get(_l, [])
717
+ for r_idx, sid in enumerate(cols):
718
+ x = layer_x(_l)
719
+ y = y_offsets[r_idx]
720
+ pos_by[sid] = (x, y)
721
+ if not self.data_view: # transformation view
722
+ step_name = name_by.get(sid, f"step_{sid}")
723
+
724
+ # Get step stats from step_meta
725
+ step_stats = self.step_meta.get(sid)
726
+ if step_stats and step_stats.last_run_end > 0:
727
+ dt = datetime.fromtimestamp(step_stats.last_run_end)
728
+ last_run_str = dt.strftime("%Y-%m-%d %H:%M")
729
+ rows_processed = step_stats.rows_processed
730
+ rows_success = step_stats.rows_success
731
+ rows_failed = step_stats.rows_failed
732
+ total_success = step_stats.total_success
733
+ total_failed = step_stats.total_failed
734
+ else:
735
+ last_run_str = "—"
736
+ rows_processed = 0
737
+ rows_success = 0
738
+ rows_failed = 0
739
+ total_success = 0
740
+ total_failed = 0
741
+
742
+ nodes.append(
743
+ {
744
+ "index": sid,
745
+ "index_value": str(sid),
746
+ "name": step_name,
747
+ "labels_str": labels_str_by.get(sid, ""),
748
+ "step_type": step_type_by.get(sid, ""),
749
+ "node_type": "step",
750
+ # numeric position/size (might be useful elsewhere)
751
+ "x": x,
752
+ "y": y,
753
+ "w": w_by.get(sid, MIN_NODE_W),
754
+ "h": h_by.get(sid, NODE_H),
755
+ # css strings for Reflex styles (avoid Python ops on Vars)
756
+ "left": f"{x}px",
757
+ "top": f"{y}px",
758
+ "width": f"{w_by.get(sid, MIN_NODE_W)}px",
759
+ "height": f"{h_by.get(sid, NODE_H)}px",
760
+ # Step run stats
761
+ "last_run": last_run_str,
762
+ "rows_processed": rows_processed,
763
+ "rows_success": rows_success,
764
+ "rows_failed": rows_failed,
765
+ "has_failed": rows_failed > 0, # Pre-computed for Reflex rx.cond
766
+ "rows_failed_str": f"({rows_failed} ✗)" if rows_failed > 0 else "",
767
+ # All-time run stats
768
+ "total_success": total_success,
769
+ "total_failed": total_failed,
770
+ "has_total_failed": total_failed > 0, # Pre-computed for Reflex rx.cond
771
+ "total_failed_str": f"({total_failed} ✗)" if total_failed > 0 else "",
772
+ "selected": sid in selected_ids,
773
+ "border_css": "2px solid #3b82f6" if sid in selected_ids else "1px solid #e5e7eb",
774
+ }
775
+ )
776
+ else: # data view: nodes are tables
777
+ table_name = name_by.get(sid, f"table_{sid}")
778
+ rc = self.table_meta[table_name]
779
+ if rc.process_ts:
780
+ dt = datetime.fromtimestamp(rc.process_ts)
781
+ last_run_str = dt.strftime("%Y-%m-%d %H:%M")
782
+ else:
783
+ last_run_str = "—"
784
+
785
+ nodes.append(
786
+ {
787
+ "index": sid,
788
+ "index_value": str(sid),
789
+ "name": table_name,
790
+ "labels_str": "",
791
+ "node_type": "table",
792
+ "x": x,
793
+ "y": y,
794
+ "w": w_by.get(sid, MIN_NODE_W),
795
+ "h": h_by.get(sid, NODE_H),
796
+ "left": f"{x}px",
797
+ "top": f"{y}px",
798
+ "width": f"{w_by.get(sid, MIN_NODE_W)}px",
799
+ "height": f"{h_by.get(sid, NODE_H)}px",
800
+ "last_run": last_run_str,
801
+ "row_count": rc.row_count,
802
+ "last_add": f"+{rc.last_added_rows}",
803
+ "last_upd": f"{rc.last_update_rows}",
804
+ "last_rm": f"-{rc.last_deleted_rows}",
805
+ "selected": (self.preview_table_name == table_name),
806
+ "border_css": (
807
+ "2px solid #3b82f6"
808
+ if (self.preview_table_name == table_name)
809
+ else "1px solid #e5e7eb"
810
+ ),
811
+ }
812
+ )
813
+
814
+ # Compute canvas size
815
+ layers_count = max_layer + 1 if unique_ids else 1
816
+ width_px = (
817
+ MARGIN * 2
818
+ + sum([col_w.get(i, MIN_NODE_W) for i in range(0, layers_count)])
819
+ + max(0, layers_count - 1) * H_SPACING
820
+ )
821
+ height_px = MARGIN * 2 + sum(row_heights) + max(0, max_rows - 1) * V_SPACING
822
+
823
+ # Build edge visuals
824
+ edge_objs: list[dict[str, Any]] = []
825
+ if not self.data_view:
826
+ for s, t, shared in edges:
827
+ sx, sy = pos_by.get(s, (MARGIN, MARGIN))
828
+ tx, ty = pos_by.get(t, (MARGIN, MARGIN))
829
+ x1 = sx + w_by.get(s, MIN_NODE_W)
830
+ y1 = sy + h_by.get(s, NODE_H) // 2
831
+ x2 = tx
832
+ y2 = ty + h_by.get(t, NODE_H) // 2
833
+ cx1 = x1 + 40
834
+ cx2 = x2 - 40
835
+ path = f"M{x1},{y1} C{cx1},{y1} {cx2},{y2} {x2},{y2}"
836
+ label = ", ".join(shared)
837
+ label_x = (x1 + x2) / 2
838
+ label_y = (y1 + y2) / 2 - 6
839
+ edge_objs.append(
840
+ {
841
+ "source": s,
842
+ "target": t,
843
+ "label": label,
844
+ "path": path,
845
+ "label_x": label_x,
846
+ "label_y": label_y,
847
+ # highlight only if both endpoints are selected
848
+ "selected": (s in selected_ids) and (t in selected_ids),
849
+ }
850
+ )
851
+ else:
852
+ for s_id, t_id, step_label in table_edges:
853
+ tx, ty = pos_by.get(t_id, (MARGIN, MARGIN))
854
+ if s_id < 0:
855
+ x1 = max(0, MARGIN - 120)
856
+ y1 = ty + h_by.get(t_id, NODE_H) // 2
857
+ else:
858
+ sx, sy = pos_by.get(s_id, (MARGIN, MARGIN))
859
+ x1 = sx + w_by.get(s_id, MIN_NODE_W)
860
+ y1 = sy + h_by.get(s_id, NODE_H) // 2
861
+ x2 = tx
862
+ y2 = ty + h_by.get(t_id, NODE_H) // 2
863
+ cx1 = x1 + 40
864
+ cx2 = x2 - 40
865
+ path = f"M{x1},{y1} C{cx1},{y1} {cx2},{y2} {x2},{y2}"
866
+ label = step_label
867
+ label_x = (x1 + x2) / 2
868
+ label_y = (y1 + y2) / 2 - 6
869
+ # Highlight edges incident to the previewed table
870
+ sel = False
871
+ try:
872
+ if s_id < 0: # For generator edges, highlight when the target table is highlighted
873
+ sel = self.preview_table_name == name_by.get(t_id, "")
874
+ else: # For regular edges, require both endpoints highlighted (not applicable for now)
875
+ sel = False
876
+ except Exception:
877
+ sel = False
878
+ edge_objs.append(
879
+ {
880
+ "source": s_id,
881
+ "target": t_id,
882
+ "label": label,
883
+ "path": path,
884
+ "label_x": label_x,
885
+ "label_y": label_y,
886
+ "selected": sel,
887
+ }
888
+ )
889
+
890
+ # Build SVG string for edges
891
+ svg_parts: list[str] = []
892
+ svg_parts.append(
893
+ f'<svg width="{width_px}" height="{height_px}" viewBox="0 0 {width_px} {height_px}" preserveAspectRatio="xMinYMin meet" xmlns="http://www.w3.org/2000/svg">'
894
+ )
895
+ for e in edge_objs:
896
+ stroke = "#3b82f6" if e.get("selected") else "#9ca3af"
897
+ width = "2.5" if e.get("selected") else "2"
898
+ opacity = "1.0" if e.get("selected") else "0.6"
899
+ svg_parts.append(
900
+ f'<path d="{e["path"]}" stroke="{stroke}" stroke-width="{width}" opacity="{opacity}" fill="none" />'
901
+ )
902
+ if e.get("label"):
903
+ svg_parts.append(
904
+ f'<text x="{e["label_x"]}" y="{e["label_y"]}" font-size="10" fill="#6b7280" text-anchor="middle">{e["label"]}</text>'
905
+ )
906
+ svg_parts.append("</svg>")
907
+
908
+ self.graph_nodes = nodes
909
+ self.graph_edges = edge_objs
910
+ self.graph_width_px = int(width_px)
911
+ self.graph_height_px = int(height_px)
912
+ self.graph_svg = "".join(svg_parts)
913
+ self.graph_width_css = f"{int(width_px)}px"
914
+ self.graph_height_css = f"{int(height_px)}px"
915
+
916
+ # Position the popover anchor centrally in the viewport to avoid overflow
917
+ # Using fixed positioning (set in UI), so we use viewport-relative values
918
+ # Center both horizontally and vertically to give popover room to expand in any direction
919
+ self.preview_anchor_left = "50vw"
920
+ self.preview_anchor_top = "50vh"
921
+
922
+ except Exception:
923
+ self.graph_nodes = []
924
+ self.graph_edges = []
925
+ self.graph_width_px = 800
926
+ self.graph_height_px = 400
927
+ self.graph_svg = ""
928
+ self.graph_width_css = "800px"
929
+ self.graph_height_css = "400px"
930
+
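# Toy sketch of the layering used in _rebuild_graph() above: Kahn's
# algorithm over indegrees, assigning each node the longest-path layer so
# it is drawn to the right of all of its parents. The edges below are made up.
from collections import deque

edges = [(0, 1), (0, 2), (1, 3), (2, 3)]  # step 3 depends on both 1 and 2
nodes = {0, 1, 2, 3}
indeg = {n: 0 for n in nodes}
children: dict[int, list[int]] = {n: [] for n in nodes}
for s, t in edges:
    indeg[t] += 1
    children[s].append(t)

layer = {n: 0 for n in nodes}
q = deque(n for n in nodes if indeg[n] == 0)
while q:
    n = q.popleft()
    for ch in children[n]:
        layer[ch] = max(layer[ch], layer[n] + 1)  # longest-path layering
        indeg[ch] -= 1
        if indeg[ch] == 0:
            q.append(ch)
# layer == {0: 0, 1: 1, 2: 1, 3: 2}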
931
+ def _load_table_stats(self) -> None:
932
+ """
933
+ Load stats for each data table using run-based grouping.
934
+
935
+ A "run" is a group of consecutive timestamps where gaps are < RUN_GAP_THRESHOLD_SECONDS.
936
+ This accounts for batch processing where each batch has different timestamps.
937
+ """
938
+ con = DBCONN_DATAPIPE.con # type: ignore[attr-defined]
939
+ self.table_meta = {}
940
+
941
+ for tname in list(self.available_tables):
942
+ try:
943
+ meta_table_name = f"{tname}_meta"
944
+
945
+ # Get all distinct update_ts values to detect runs
946
+ ts_query = sa.text(
947
+ f"""
948
+ SELECT DISTINCT update_ts FROM "{meta_table_name}"
949
+ WHERE update_ts IS NOT NULL
950
+ ORDER BY update_ts DESC
951
+ LIMIT 1000
952
+ """
953
+ )
954
+
955
+ with con.begin() as conn:
956
+ result = conn.execute(ts_query)
957
+ timestamps = [float(row[0]) for row in result.fetchall() if row[0] is not None]
958
+
959
+ run_start, run_end = self._detect_last_run_window(timestamps)
960
+
961
+ # Now query stats using the run window
962
+ if run_start > 0.0:
963
+ query = sa.text(
964
+ f"""
965
+ SELECT
966
+ MAX(process_ts) AS process_ts,
967
+ COUNT(*) FILTER (WHERE delete_ts IS NULL) AS row_count,
968
+ {run_end} AS last_update_ts,
969
+ COUNT(*) FILTER (WHERE update_ts IS NOT NULL AND update_ts != create_ts AND update_ts >= {run_start} AND update_ts <= {run_end}) AS last_update_rows,
970
+ COUNT(*) FILTER (WHERE create_ts IS NOT NULL AND create_ts >= {run_start} AND create_ts <= {run_end}) AS last_added_rows,
971
+ COUNT(*) FILTER (WHERE delete_ts IS NOT NULL AND delete_ts >= {run_start} AND delete_ts <= {run_end}) AS last_deleted_rows
972
+ FROM "{meta_table_name}";
973
+ """
974
+ )
975
+ else:
976
+ # Fallback to original query if no timestamps found
977
+ query = sa.text(
978
+ f"""
979
+ WITH max_update_ts AS (
980
+ SELECT MAX(update_ts) AS max_ts FROM "{meta_table_name}"
981
+ )
982
+ SELECT
983
+ MAX(process_ts) AS process_ts,
984
+ COUNT(*) FILTER (WHERE delete_ts IS NULL) AS row_count,
985
+ (SELECT max_ts FROM max_update_ts) AS last_update_ts,
986
+ COUNT(*) FILTER (WHERE update_ts IS NOT NULL AND update_ts != create_ts AND update_ts = (SELECT max_ts FROM max_update_ts)) AS last_update_rows,
987
+ COUNT(*) FILTER (WHERE create_ts IS NOT NULL AND create_ts = (SELECT max_ts FROM max_update_ts)) AS last_added_rows,
988
+ COUNT(*) FILTER (WHERE delete_ts IS NOT NULL AND delete_ts = (SELECT max_ts FROM max_update_ts)) AS last_deleted_rows
989
+ FROM "{meta_table_name}", max_update_ts;
990
+ """
991
+ )
992
+
993
+ with con.begin() as conn:
994
+ result = conn.execute(query)
995
+ row = result.fetchone()
996
+
997
+ self.table_meta[tname] = EtlDataTableStats(
998
+ table_name=tname,
999
+ process_ts=float(row[0]) if row[0] is not None else 0.0,
1000
+ row_count=int(row[1]) if row[1] is not None else 0,
1001
+ last_update_ts=float(row[2]) if row[2] is not None else 0.0,
1002
+ last_update_rows=int(row[3]) if row[3] is not None else 0,
1003
+ last_added_rows=int(row[4]) if row[4] is not None else 0,
1004
+ last_deleted_rows=int(row[5]) if row[5] is not None else 0,
1005
+ )
1006
+
1007
+ except Exception as e:
1008
+ logging.warning(f"Failed to load stats for table {tname}: {e}", exc_info=True)
1009
+ self.table_meta[tname] = EtlDataTableStats(
1010
+ table_name=tname,
1011
+ process_ts=0.0,
1012
+ row_count=0,
1013
+ last_update_ts=0.0,
1014
+ last_update_rows=0,
1015
+ last_added_rows=0,
1016
+ last_deleted_rows=0,
1017
+ )
1018
+
1019
+ def _detect_last_run_window(self, timestamps: list[float]) -> tuple[float, float]:
1020
+ """Detect the last "run" window from a list of timestamps.
1021
+
1022
+ A run is a group of consecutive timestamps where gaps are < RUN_GAP_THRESHOLD_SECONDS.
1023
+ Returns (run_start, run_end) for the most recent run, or (0.0, 0.0) if no timestamps.
1024
+ """
1025
+ if not timestamps:
1026
+ return 0.0, 0.0
1027
+
1028
+ # Sort descending (most recent first)
1029
+ sorted_ts = sorted(timestamps, reverse=True)
1030
+
1031
+ # Find the last run by walking backwards until gap exceeds threshold
1032
+ run_end = sorted_ts[0]
1033
+ run_start = sorted_ts[0]
1034
+
1035
+ for i in range(1, len(sorted_ts)):
1036
+ gap = run_start - sorted_ts[i]
1037
+ if gap > RUN_GAP_THRESHOLD_SECONDS:
1038
+ # Gap too large - we've found the boundary of the last run
1039
+ break
1040
+ run_start = sorted_ts[i]
1041
+
1042
+ return run_start, run_end
1043
+
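# Worked example of the run-window detection above, as a standalone helper
# (an illustrative re-statement of _detect_last_run_window, not part of this
# module). With a 300 s gap threshold, the two timestamps near t=5000 form
# the last run and the older cluster around t=1000-1250 is ignored.
def last_run_window(timestamps: list[float], gap: float = 300.0) -> tuple[float, float]:
    if not timestamps:
        return 0.0, 0.0
    ts = sorted(timestamps, reverse=True)
    run_end = run_start = ts[0]
    for t in ts[1:]:
        if run_start - t > gap:
            break
        run_start = t
    return run_start, run_end


assert last_run_window([1000.0, 1100.0, 1250.0, 5000.0, 5020.0]) == (5000.0, 5020.0)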
1044
+ def _load_step_stats(self) -> None:
1045
+ """Load stats for each pipeline step by querying their transform meta tables."""
1046
+ con = DBCONN_DATAPIPE.con # type: ignore[attr-defined]
1047
+ self.step_meta = {}
1048
+
1049
+ for idx, step in enumerate(etl_app.steps):
1050
+ try:
1051
+ # Only BaseBatchTransformStep has meta_table
1052
+ if not isinstance(step, BaseBatchTransformStep):
1053
+ continue
1054
+
1055
+ # Get the meta table name from the step
1056
+ meta_table = step.meta_table
1057
+ meta_table_name = str(meta_table.sql_table.name)
1058
+
1059
+ # Query all distinct process_ts values to detect runs
1060
+ ts_query = sa.text(
1061
+ f'SELECT DISTINCT process_ts FROM "{meta_table_name}" WHERE process_ts IS NOT NULL ORDER BY process_ts DESC LIMIT 1000'
1062
+ )
1063
+
1064
+ with con.begin() as conn:
1065
+ result = conn.execute(ts_query)
1066
+ timestamps = [float(row[0]) for row in result.fetchall() if row[0] is not None]
1067
+
1068
+ run_start, run_end = self._detect_last_run_window(timestamps)
1069
+
1070
+ # Query all-time totals first
1071
+ totals_query = sa.text(
1072
+ f"""
1073
+ SELECT
1074
+ COUNT(*) FILTER (WHERE is_success = TRUE) AS total_success,
1075
+ COUNT(*) FILTER (WHERE is_success = FALSE) AS total_failed
1076
+ FROM "{meta_table_name}"
1077
+ """
1078
+ )
1079
+
1080
+ with con.begin() as conn:
1081
+ totals_result = conn.execute(totals_query)
1082
+ totals_row = totals_result.fetchone()
1083
+
1084
+ total_success = int(totals_row[0]) if totals_row and totals_row[0] else 0
1085
+ total_failed = int(totals_row[1]) if totals_row and totals_row[1] else 0
1086
+
1087
+ if run_start == 0.0 and run_end == 0.0:
1088
+ # No data
1089
+ self.step_meta[idx] = EtlStepRunStats(
1090
+ step_name=step.name,
1091
+ meta_table_name=meta_table_name,
1092
+ last_run_start=0.0,
1093
+ last_run_end=0.0,
1094
+ rows_processed=0,
1095
+ rows_success=0,
1096
+ rows_failed=0,
1097
+ total_success=total_success,
1098
+ total_failed=total_failed,
1099
+ )
1100
+ continue
1101
+
1102
+ # Query rows in the last run window
1103
+ stats_query = sa.text(
1104
+ f"""
1105
+ SELECT
1106
+ COUNT(*) AS total,
1107
+ COUNT(*) FILTER (WHERE is_success = TRUE) AS success,
1108
+ COUNT(*) FILTER (WHERE is_success = FALSE) AS failed
1109
+ FROM "{meta_table_name}"
1110
+ WHERE process_ts >= {run_start} AND process_ts <= {run_end}
1111
+ """
1112
+ )
1113
+
1114
+ with con.begin() as conn:
1115
+ result = conn.execute(stats_query)
1116
+ row = result.fetchone()
1117
+
1118
+ self.step_meta[idx] = EtlStepRunStats(
1119
+ step_name=step.name,
1120
+ meta_table_name=meta_table_name,
1121
+ last_run_start=run_start,
1122
+ last_run_end=run_end,
1123
+ rows_processed=int(row[0]) if row and row[0] else 0,
1124
+ rows_success=int(row[1]) if row and row[1] else 0,
1125
+ rows_failed=int(row[2]) if row and row[2] else 0,
1126
+ total_success=total_success,
1127
+ total_failed=total_failed,
1128
+ )
1129
+
1130
+ except Exception as e:
1131
+ logging.warning(f"Failed to load step stats for step {idx}: {e}", exc_info=True)
1132
+ step_name = getattr(step, "name", f"step_{idx}")
1133
+ self.step_meta[idx] = EtlStepRunStats(
1134
+ step_name=step_name,
1135
+ meta_table_name="",
1136
+ last_run_start=0.0,
1137
+ last_run_end=0.0,
1138
+ rows_processed=0,
1139
+ rows_success=0,
1140
+ rows_failed=0,
1141
+ total_success=0,
1142
+ total_failed=0,
1143
+ )
1144
+
1145
+ def run_selected(self): # type: ignore[override]
1146
+ """Run the ETL for selected labels in background, streaming logs."""
1147
+ if self.is_running:
1148
+ return None
1149
+
1150
+ self.is_running = True
1151
+ self.logs = []
1152
+ self._append_log("Starting ETL run …")
1153
+ yield
1154
+
1155
+ try:
1156
+ valid_pipeline_indices = self._get_current_pipeline_step_indices()
1157
+
1158
+ if self.selection_source == "manual" and self.selected_node_ids:
1159
+ selected = [
1160
+ i for i in self.selected_node_ids or [] if isinstance(i, int) and i in valid_pipeline_indices
1161
+ ]
1162
+ steps_to_run = [etl_app.steps[i] for i in sorted(selected) if 0 <= i < len(etl_app.steps)]
1163
+ else:
1164
+ steps_to_run = self._filter_steps_by_labels(etl_app.steps)
1165
+ if not steps_to_run:
1166
+ self._append_log("No steps match selected filters")
1167
+ return
1168
+
1169
+ self._append_log(f"Steps to execute: {[getattr(s, 'name', type(s).__name__) for s in steps_to_run]}")
1170
+
1171
+ # stream datapipe logs into UI while each step runs
1172
+ q, handler, logger = self._start_log_capture()
1173
+ try:
1174
+ for step in steps_to_run:
1175
+ step_name = getattr(step, "name", type(step).__name__)
1176
+ self._append_log(f"Running step: {step_name}")
1177
+
1178
+ def _runner(s=step):
1179
+ run_steps(etl_app.ds, [s]) # type: ignore[arg-type]
1180
+
1181
+ t = threading.Thread(target=_runner, daemon=True)
1182
+ t.start()
1183
+ while t.is_alive():
1184
+ self._drain_queue_into_logs(q)
1185
+ yield
1186
+ time.sleep(0.1)
1187
+ t.join(timeout=0)
1188
+ self._drain_queue_into_logs(q)
1189
+ self._append_log(f"Completed step: {step_name}")
1190
+ yield
1191
+ finally:
1192
+ self._stop_log_capture(handler, logger)
1193
+ finally:
1194
+ self.is_running = False
1195
+ self.load_pipeline_metadata()
1196
+ self._append_log("ETL run finished")
1197
+
1198
+ def run_one_step(self, index: int | None = None): # type: ignore[override]
1199
+ if self.is_running:
1200
+ return None
1201
+
1202
+ if index is None:
1203
+ return None
1204
+
1205
+ try:
1206
+ idx = int(index)
1207
+ except Exception:
1208
+ return None
1209
+
1210
+ if idx < 0 or idx >= len(etl_app.steps):
1211
+ self._append_log(f"Invalid step index: {index}")
1212
+ return None
1213
+
1214
+ step = etl_app.steps[idx]
1215
+ self.is_running = True
1216
+ self.logs = []
1217
+ self._append_log(f"Starting single step {getattr(step, 'name', type(step).__name__)}")
1218
+ yield
1219
+
1220
+ try:
1221
+ q, handler, logger = self._start_log_capture()
1222
+ try:
1223
+
1224
+ def _runner():
1225
+ run_steps(etl_app.ds, [step]) # type: ignore[arg-type]
1226
+
1227
+ t = threading.Thread(target=_runner, daemon=True)
1228
+ t.start()
1229
+ while t.is_alive():
1230
+ self._drain_queue_into_logs(q)
1231
+ yield
1232
+ time.sleep(0.1)
1233
+ t.join(timeout=0)
1234
+ self._drain_queue_into_logs(q)
1235
+ finally:
1236
+ self._stop_log_capture(handler, logger)
1237
+ finally:
1238
+ self.is_running = False
1239
+ self._append_log("Single step finished")
1240
+ return None
1241
+
1242
+ def preview_table(self, table_name: str) -> None:
1243
+ """Load a paginated preview from the datapipe DB for a selected table."""
1244
+
1245
+ # Toggle preview: if same table clicked, close; otherwise open new table
1246
+ if self.preview_table_name == table_name and self.preview_open:
1247
+ self.close_preview()
1248
+ return
1249
+
1250
+ self.preview_table_name = table_name
1251
+ self.preview_display_name = ""
1252
+ self.preview_open = True
1253
+ self.has_preview = False
1254
+ self.preview_page = 0 # Reset to first page
1255
+ self.preview_is_meta_table = False
1256
+ self.preview_expanded_rows = [] # Reset expanded rows
1257
+ # Rebuild immediately to reflect selection highlight in data view
1258
+ self._rebuild_graph()
1259
+
1260
+ self._load_preview_page()
1261
+
1262
+ def _load_preview_page(self) -> None:
1263
+ """Load the current page of preview data from the database."""
1264
+ if not self.preview_table_name:
1265
+ return
1266
+
1267
+ if self.preview_changes_only:
1268
+ self._load_preview_changes_page()
1269
+ else:
1270
+ self._load_preview_all_page()
1271
+
1272
+ def _load_preview_all_page(self) -> None:
1273
+ """Load all records (standard preview mode)."""
1274
+ engine = DBCONN_DATAPIPE.con # type: ignore[attr-defined]
1275
+ table_name = self.preview_table_name
1276
+ if not table_name:
1277
+ return
1278
+
1279
+ actual_table = table_name if not self.preview_is_meta_table else f"{table_name}_meta"
1280
+ offset = self.preview_page * self.preview_page_size
1281
+
1282
+ try:
1283
+ # Try main table first (only on first load)
1284
+ if self.preview_page == 0 and not self.preview_is_meta_table:
1285
+ try:
1286
+ # Get total count
1287
+ count_result = pd.read_sql(f'SELECT COUNT(*) as cnt FROM "{table_name}"', con=engine)
1288
+ self.preview_total_rows = int(count_result["cnt"].iloc[0])
1289
+ actual_table = table_name
1290
+ except Exception:
1291
+ # Fall back to _meta table
1292
+ count_result = pd.read_sql(f'SELECT COUNT(*) as cnt FROM "{table_name}_meta"', con=engine)
1293
+ self.preview_total_rows = int(count_result["cnt"].iloc[0])
1294
+ self.preview_display_name = f"{table_name}_meta"
1295
+ self.preview_is_meta_table = True
1296
+ actual_table = f"{table_name}_meta"
1297
+ elif self.preview_is_meta_table:
1298
+ actual_table = f"{table_name}_meta"
1299
+
1300
+ # Load page data
1301
+ df = pd.read_sql(
1302
+ f'SELECT * FROM "{actual_table}" LIMIT {self.preview_page_size} OFFSET {offset}',
1303
+ con=engine,
1304
+ )
1305
+
1306
+ except Exception as e:
1307
+ self._append_log(f"Failed to load table {table_name}: {e}")
1308
+ return
1309
+
1310
+ self.preview_columns = [str(c) for c in df.columns]
1311
+ records_any: list[dict[Any, Any]] = df.astype(object).where(pd.notna(df), None).to_dict(orient="records")
1312
+
1313
+ expanded_set = set(self.preview_expanded_rows)
1314
+ coerced: list[dict[str, Any]] = []
1315
+ for idx, r in enumerate(records_any):
1316
+ try:
1317
+ row_id = f"preview-{self.preview_page}-{idx}"
1318
+ row_data: dict[str, Any] = {str(k): safe_render_value(v) for k, v in dict(r).items()}
1319
+ row_data["row_id"] = row_id
1320
+ row_data["expanded"] = bool(row_id in expanded_set)
1321
+ coerced.append(row_data)
1322
+ except Exception:
1323
+ coerced.append({})
1324
+
1325
+ self.preview_rows = coerced
1326
+ self.has_preview = len(self.preview_rows) > 0
1327
+ try:
1328
+ self._rebuild_graph()
1329
+ except Exception:
1330
+ pass
1331
+
1332
+ def _load_preview_changes_page(self) -> None:
1333
+ """Load only records changed in the last run (with change type styling)."""
1334
+ table_name = self.preview_table_name
1335
+ if not table_name:
1336
+ return
1337
+
1338
+ engine = DBCONN_DATAPIPE.con # type: ignore[attr-defined]
1339
+ meta_table = f"{table_name}_meta"
1340
+ offset = self.preview_page * self.preview_page_size
1341
+
1342
+ # Get the last run window using the same logic as _load_table_stats
1343
+ try:
1344
+ ts_query = sa.text(
1345
+ f"""
1346
+ SELECT DISTINCT update_ts FROM "{meta_table}"
1347
+ WHERE update_ts IS NOT NULL
1348
+ ORDER BY update_ts DESC
1349
+ LIMIT 1000
1350
+ """
1351
+ )
1352
+ with engine.begin() as conn:
1353
+ result = conn.execute(ts_query)
1354
+ timestamps = [float(row[0]) for row in result.fetchall() if row[0] is not None]
1355
+ run_start, run_end = self._detect_last_run_window(timestamps)
1356
+ except Exception as e:
1357
+ self._append_log(f"Failed to detect last run window for {table_name}: {e}")
1358
+ run_start, run_end = 0.0, 0.0
1359
+
1360
+ if run_start == 0.0:
1361
+ # No run detected, show empty
1362
+ self.preview_columns = []
1363
+ self.preview_rows = []
1364
+ self.preview_total_rows = 0
1365
+ self.has_preview = False
1366
+ return
1367
+
1368
+ # Build query for changed records in the last run window
1369
+ meta_exclude = {"hash", "create_ts", "update_ts", "process_ts", "delete_ts"}
1370
+
1371
+ try:
1372
+ inspector = sa.inspect(engine)
1373
+ try:
1374
+ base_cols = [c.get("name", "") for c in inspector.get_columns(table_name)]
1375
+ base_cols = [str(c) for c in base_cols if c]
1376
+ except Exception:
1377
+ base_cols = []
1378
+ meta_cols = [c.get("name", "") for c in inspector.get_columns(meta_table)]
1379
+ meta_cols = [str(c) for c in meta_cols if c]
1380
+ except Exception as e:
1381
+ self._append_log(f"Failed to inspect columns for {table_name}: {e}")
1382
+ return
1383
+
1384
+ data_cols: list[str] = [c for c in meta_cols if c not in meta_exclude]
1385
+ display_cols: list[str] = [c for c in base_cols if c] if base_cols else list(data_cols)
1386
+
1387
+ # Get total count of changed records on first page
1388
+ if self.preview_page == 0:
1389
+ q_count = sa.text(
1390
+ f"""
1391
+ SELECT COUNT(*)
1392
+ FROM "{meta_table}" AS m
1393
+ WHERE
1394
+ (m.delete_ts IS NOT NULL AND m.delete_ts >= {run_start} AND m.delete_ts <= {run_end})
1395
+ OR
1396
+ (m.update_ts IS NOT NULL AND m.update_ts >= {run_start} AND m.update_ts <= {run_end}
1397
+ AND m.update_ts > m.create_ts)
1398
+ OR
1399
+ (m.create_ts IS NOT NULL AND m.create_ts >= {run_start} AND m.create_ts <= {run_end}
1400
+ AND m.delete_ts IS NULL)
1401
+ """
1402
+ )
1403
+ try:
1404
+ with engine.begin() as conn:
1405
+ self.preview_total_rows = int(conn.execute(q_count).scalar() or 0)
1406
+ except Exception:
1407
+ self.preview_total_rows = 0
1408
+
1409
+ # Build SELECT with join to base table (if exists) to get full data
1410
+ if base_cols:
1411
+ select_exprs: list[str] = []
1412
+ for c in display_cols:
1413
+ if c in meta_cols:
1414
+ select_exprs.append(f'COALESCE(b."{c}", m."{c}") AS "{c}"')
1415
+ else:
1416
+ select_exprs.append(f'b."{c}" AS "{c}"')
1417
+ select_cols = ", ".join(select_exprs)
1418
+ on_cond = " AND ".join([f'b."{c}" = m."{c}"' for c in data_cols])
1419
+
1420
+ q_data = sa.text(
1421
+ f"""
1422
+ SELECT
1423
+ {select_cols},
1424
+ CASE
1425
+ WHEN m.delete_ts IS NOT NULL AND m.delete_ts >= {run_start} AND m.delete_ts <= {run_end} THEN 'deleted'
1426
+ WHEN m.update_ts IS NOT NULL AND m.update_ts >= {run_start} AND m.update_ts <= {run_end}
1427
+ AND m.update_ts > m.create_ts THEN 'updated'
1428
+ WHEN m.create_ts IS NOT NULL AND m.create_ts >= {run_start} AND m.create_ts <= {run_end}
1429
+ AND m.delete_ts IS NULL THEN 'added'
1430
+ ELSE NULL
1431
+ END AS change_type
1432
+ FROM "{meta_table}" AS m
1433
+ LEFT JOIN "{table_name}" AS b ON {on_cond}
1434
+ WHERE
1435
+ (m.delete_ts IS NOT NULL AND m.delete_ts >= {run_start} AND m.delete_ts <= {run_end})
1436
+ OR
1437
+ (m.update_ts IS NOT NULL AND m.update_ts >= {run_start} AND m.update_ts <= {run_end}
1438
+ AND m.update_ts > m.create_ts)
1439
+ OR
1440
+ (m.create_ts IS NOT NULL AND m.create_ts >= {run_start} AND m.create_ts <= {run_end}
1441
+ AND m.delete_ts IS NULL)
1442
+ ORDER BY COALESCE(m.update_ts, m.create_ts, m.delete_ts) DESC
1443
+ LIMIT {self.preview_page_size} OFFSET {offset}
1444
+ """
1445
+ )
1446
+ else:
1447
+ # No base table, query meta only
1448
+ select_cols = ", ".join([f'm."{c}"' for c in display_cols])
1449
+ q_data = sa.text(
1450
+ f"""
1451
+ SELECT
1452
+ {select_cols},
1453
+ CASE
1454
+ WHEN m.delete_ts IS NOT NULL AND m.delete_ts >= {run_start} AND m.delete_ts <= {run_end} THEN 'deleted'
1455
+ WHEN m.update_ts IS NOT NULL AND m.update_ts >= {run_start} AND m.update_ts <= {run_end}
1456
+ AND m.update_ts > m.create_ts THEN 'updated'
1457
+ WHEN m.create_ts IS NOT NULL AND m.create_ts >= {run_start} AND m.create_ts <= {run_end}
1458
+ AND m.delete_ts IS NULL THEN 'added'
1459
+ ELSE NULL
1460
+ END AS change_type
1461
+ FROM "{meta_table}" AS m
1462
+ WHERE
1463
+ (m.delete_ts IS NOT NULL AND m.delete_ts >= {run_start} AND m.delete_ts <= {run_end})
1464
+ OR
1465
+ (m.update_ts IS NOT NULL AND m.update_ts >= {run_start} AND m.update_ts <= {run_end}
1466
+ AND m.update_ts > m.create_ts)
1467
+ OR
1468
+ (m.create_ts IS NOT NULL AND m.create_ts >= {run_start} AND m.create_ts <= {run_end}
1469
+ AND m.delete_ts IS NULL)
1470
+ ORDER BY COALESCE(m.update_ts, m.create_ts, m.delete_ts) DESC
1471
+ LIMIT {self.preview_page_size} OFFSET {offset}
1472
+ """
1473
+ )
1474
+
1475
+ try:
1476
+ df = pd.read_sql(q_data, con=engine)
1477
+ except Exception as e:
1478
+ self._append_log(f"Failed to load changes for {table_name}: {e}")
1479
+ return
1480
+
1481
+ # Set columns (exclude change_type from display columns)
1482
+ self.preview_columns = [str(c) for c in display_cols]
1483
+ records_any: list[dict[Any, Any]] = df.astype(object).where(pd.notna(df), None).to_dict(orient="records")
1484
+
1485
+ # Apply row styling based on change_type
1486
+ row_styling = {
1487
+ "added": {"backgroundColor": "rgba(34,197,94,0.12)"},
1488
+ "updated": {"backgroundColor": "rgba(245,158,11,0.12)"},
1489
+ "deleted": {"backgroundColor": "rgba(239,68,68,0.12)"},
1490
+ }
1491
+
1492
+ expanded_set = set(self.preview_expanded_rows)
1493
+ styled: list[dict[str, Any]] = []
1494
+ for idx, r in enumerate(records_any):
1495
+ try:
1496
+ row_id = f"preview-{self.preview_page}-{idx}"
1497
+ row_disp: dict[str, Any] = {str(k): safe_render_value(r.get(k)) for k in self.preview_columns}
1498
+ row_disp["row_style"] = row_styling.get(r.get("change_type", ""), {})
1499
+ row_disp["row_id"] = row_id
1500
+ row_disp["expanded"] = row_id in expanded_set
1501
+ styled.append(row_disp)
1502
+ except Exception:
1503
+ styled.append({})
1504
+
1505
+ self.preview_rows = styled
1506
+ self.has_preview = len(self.preview_rows) > 0
1507
+ try:
1508
+ self._rebuild_graph()
1509
+ except Exception:
1510
+ pass
1511
+
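# Minimal sketch of the change_type classification encoded in the SQL CASE
# expressions above, applied to one meta row in Python (field names mirror
# the datapipe meta columns; the row values in the assert are made up):
def classify_change(row: dict, run_start: float, run_end: float) -> str | None:
    def in_window(ts: float | None) -> bool:
        return ts is not None and run_start <= ts <= run_end

    if in_window(row.get("delete_ts")):
        return "deleted"
    if (
        in_window(row.get("update_ts"))
        and row.get("create_ts") is not None
        and row["update_ts"] > row["create_ts"]
    ):
        return "updated"
    if in_window(row.get("create_ts")) and row.get("delete_ts") is None:
        return "added"
    return None


assert classify_change({"create_ts": 50.0, "update_ts": 120.0, "delete_ts": None}, 100.0, 200.0) == "updated"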
1512
+ def toggle_preview_changes_only(self, checked: bool) -> None:
1513
+ """Toggle between showing all records or only changes from last run."""
1514
+ self.preview_changes_only = bool(checked)
1515
+ self.preview_page = 0 # Reset to first page
1516
+ self._load_preview_page()
1517
+
1518
+ def preview_next_page(self) -> None:
1519
+ """Load the next page of preview data."""
1520
+ max_page = (self.preview_total_rows - 1) // self.preview_page_size if self.preview_total_rows > 0 else 0
1521
+ if self.preview_page < max_page:
1522
+ self.preview_page += 1
1523
+ self._load_preview_page()
1524
+
1525
+ def preview_prev_page(self) -> None:
1526
+ """Load the previous page of preview data."""
1527
+ if self.preview_page > 0:
1528
+ self.preview_page -= 1
1529
+ self._load_preview_page()
1530
+
1531
+ def preview_first_page(self) -> None:
1532
+ """Jump to the first page."""
1533
+ if self.preview_page != 0:
1534
+ self.preview_page = 0
1535
+ self._load_preview_page()
1536
+
1537
+ def preview_last_page(self) -> None:
1538
+ """Jump to the last page."""
1539
+ max_page = (self.preview_total_rows - 1) // self.preview_page_size if self.preview_total_rows > 0 else 0
1540
+ if self.preview_page != max_page:
1541
+ self.preview_page = max_page
1542
+ self._load_preview_page()
1543
+
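# Quick check of the page arithmetic used by the pagination handlers above:
# pages are 0-indexed, so max_page = (total_rows - 1) // page_size, and the
# user-facing display below adds 1. For example, 250 rows at 100 rows per
# page give pages 0, 1 and 2.
assert (250 - 1) // 100 == 2
assert (100 - 1) // 100 == 0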
1544
+ @rx.var
1545
+ def preview_page_display(self) -> str:
1546
+ """Current page display (1-indexed for users)."""
1547
+ total_pages = (self.preview_total_rows - 1) // self.preview_page_size + 1 if self.preview_total_rows > 0 else 1
1548
+ return f"Page {self.preview_page + 1} of {total_pages}"
1549
+
1550
+ @rx.var
1551
+ def preview_rows_display(self) -> str:
1552
+ """Display range of rows being shown."""
1553
+ if self.preview_total_rows == 0:
1554
+ return "No rows"
1555
+ start = self.preview_page * self.preview_page_size + 1
1556
+ end = min(start + self.preview_page_size - 1, self.preview_total_rows)
1557
+ return f"Rows {start}-{end} of {self.preview_total_rows}"
1558
+
1559
+ @rx.var
1560
+ def preview_has_next(self) -> bool:
1561
+ """Whether there's a next page."""
1562
+ max_page = (self.preview_total_rows - 1) // self.preview_page_size if self.preview_total_rows > 0 else 0
1563
+ return self.preview_page < max_page
1564
+
1565
+ @rx.var
1566
+ def preview_has_prev(self) -> bool:
1567
+ """Whether there's a previous page."""
1568
+ return self.preview_page > 0
1569
+
1570
+ def close_preview(self) -> None:
1571
+ self.preview_open = False
1572
+ self.preview_table_name = None
1573
+ self.preview_display_name = ""
1574
+ self.preview_page = 0
1575
+ self.preview_total_rows = 0
1576
+ self.preview_is_meta_table = False
1577
+ self.preview_changes_only = False
1578
+ self.preview_rows = []
1579
+ self.preview_columns = []
1580
+ self.has_preview = False
1581
+ self.preview_expanded_rows = []
1582
+ try:
1583
+ self._rebuild_graph()
1584
+ except Exception:
1585
+ pass
1586
+
1587
+ def set_preview_open(self, open: bool) -> None:
1588
+ """Handle popover open/close state changes."""
1589
+ if not open:
1590
+ self.close_preview()