odibi 2.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. odibi/__init__.py +32 -0
  2. odibi/__main__.py +8 -0
  3. odibi/catalog.py +3011 -0
  4. odibi/cli/__init__.py +11 -0
  5. odibi/cli/__main__.py +6 -0
  6. odibi/cli/catalog.py +553 -0
  7. odibi/cli/deploy.py +69 -0
  8. odibi/cli/doctor.py +161 -0
  9. odibi/cli/export.py +66 -0
  10. odibi/cli/graph.py +150 -0
  11. odibi/cli/init_pipeline.py +242 -0
  12. odibi/cli/lineage.py +259 -0
  13. odibi/cli/main.py +215 -0
  14. odibi/cli/run.py +98 -0
  15. odibi/cli/schema.py +208 -0
  16. odibi/cli/secrets.py +232 -0
  17. odibi/cli/story.py +379 -0
  18. odibi/cli/system.py +132 -0
  19. odibi/cli/test.py +286 -0
  20. odibi/cli/ui.py +31 -0
  21. odibi/cli/validate.py +39 -0
  22. odibi/config.py +3541 -0
  23. odibi/connections/__init__.py +9 -0
  24. odibi/connections/azure_adls.py +499 -0
  25. odibi/connections/azure_sql.py +709 -0
  26. odibi/connections/base.py +28 -0
  27. odibi/connections/factory.py +322 -0
  28. odibi/connections/http.py +78 -0
  29. odibi/connections/local.py +119 -0
  30. odibi/connections/local_dbfs.py +61 -0
  31. odibi/constants.py +17 -0
  32. odibi/context.py +528 -0
  33. odibi/diagnostics/__init__.py +12 -0
  34. odibi/diagnostics/delta.py +520 -0
  35. odibi/diagnostics/diff.py +169 -0
  36. odibi/diagnostics/manager.py +171 -0
  37. odibi/engine/__init__.py +20 -0
  38. odibi/engine/base.py +334 -0
  39. odibi/engine/pandas_engine.py +2178 -0
  40. odibi/engine/polars_engine.py +1114 -0
  41. odibi/engine/registry.py +54 -0
  42. odibi/engine/spark_engine.py +2362 -0
  43. odibi/enums.py +7 -0
  44. odibi/exceptions.py +297 -0
  45. odibi/graph.py +426 -0
  46. odibi/introspect.py +1214 -0
  47. odibi/lineage.py +511 -0
  48. odibi/node.py +3341 -0
  49. odibi/orchestration/__init__.py +0 -0
  50. odibi/orchestration/airflow.py +90 -0
  51. odibi/orchestration/dagster.py +77 -0
  52. odibi/patterns/__init__.py +24 -0
  53. odibi/patterns/aggregation.py +599 -0
  54. odibi/patterns/base.py +94 -0
  55. odibi/patterns/date_dimension.py +423 -0
  56. odibi/patterns/dimension.py +696 -0
  57. odibi/patterns/fact.py +748 -0
  58. odibi/patterns/merge.py +128 -0
  59. odibi/patterns/scd2.py +148 -0
  60. odibi/pipeline.py +2382 -0
  61. odibi/plugins.py +80 -0
  62. odibi/project.py +581 -0
  63. odibi/references.py +151 -0
  64. odibi/registry.py +246 -0
  65. odibi/semantics/__init__.py +71 -0
  66. odibi/semantics/materialize.py +392 -0
  67. odibi/semantics/metrics.py +361 -0
  68. odibi/semantics/query.py +743 -0
  69. odibi/semantics/runner.py +430 -0
  70. odibi/semantics/story.py +507 -0
  71. odibi/semantics/views.py +432 -0
  72. odibi/state/__init__.py +1203 -0
  73. odibi/story/__init__.py +55 -0
  74. odibi/story/doc_story.py +554 -0
  75. odibi/story/generator.py +1431 -0
  76. odibi/story/lineage.py +1043 -0
  77. odibi/story/lineage_utils.py +324 -0
  78. odibi/story/metadata.py +608 -0
  79. odibi/story/renderers.py +453 -0
  80. odibi/story/templates/run_story.html +2520 -0
  81. odibi/story/themes.py +216 -0
  82. odibi/testing/__init__.py +13 -0
  83. odibi/testing/assertions.py +75 -0
  84. odibi/testing/fixtures.py +85 -0
  85. odibi/testing/source_pool.py +277 -0
  86. odibi/transformers/__init__.py +122 -0
  87. odibi/transformers/advanced.py +1472 -0
  88. odibi/transformers/delete_detection.py +610 -0
  89. odibi/transformers/manufacturing.py +1029 -0
  90. odibi/transformers/merge_transformer.py +778 -0
  91. odibi/transformers/relational.py +675 -0
  92. odibi/transformers/scd.py +579 -0
  93. odibi/transformers/sql_core.py +1356 -0
  94. odibi/transformers/validation.py +165 -0
  95. odibi/ui/__init__.py +0 -0
  96. odibi/ui/app.py +195 -0
  97. odibi/utils/__init__.py +66 -0
  98. odibi/utils/alerting.py +667 -0
  99. odibi/utils/config_loader.py +343 -0
  100. odibi/utils/console.py +231 -0
  101. odibi/utils/content_hash.py +202 -0
  102. odibi/utils/duration.py +43 -0
  103. odibi/utils/encoding.py +102 -0
  104. odibi/utils/extensions.py +28 -0
  105. odibi/utils/hashing.py +61 -0
  106. odibi/utils/logging.py +203 -0
  107. odibi/utils/logging_context.py +740 -0
  108. odibi/utils/progress.py +429 -0
  109. odibi/utils/setup_helpers.py +302 -0
  110. odibi/utils/telemetry.py +140 -0
  111. odibi/validation/__init__.py +62 -0
  112. odibi/validation/engine.py +765 -0
  113. odibi/validation/explanation_linter.py +155 -0
  114. odibi/validation/fk.py +547 -0
  115. odibi/validation/gate.py +252 -0
  116. odibi/validation/quarantine.py +605 -0
  117. odibi/writers/__init__.py +15 -0
  118. odibi/writers/sql_server_writer.py +2081 -0
  119. odibi-2.5.0.dist-info/METADATA +255 -0
  120. odibi-2.5.0.dist-info/RECORD +124 -0
  121. odibi-2.5.0.dist-info/WHEEL +5 -0
  122. odibi-2.5.0.dist-info/entry_points.txt +2 -0
  123. odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
  124. odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/manufacturing.py
@@ -0,0 +1,1029 @@
1
+ """
2
+ Manufacturing Transformers
3
+
4
+ Specialized transformers for manufacturing/process data analysis.
5
+ Handles common patterns like cycle detection, phase analysis, and time-in-state calculations.
6
+ """
7
+
8
+ import time
9
+ from typing import Any, Dict, List, Optional, Union
10
+
11
+ import pandas as pd
12
+ from pydantic import BaseModel, Field
13
+
14
+ from odibi.context import EngineContext
15
+ from odibi.enums import EngineType
16
+ from odibi.utils.logging_context import get_logging_context
17
+
18
+
19
+ # =============================================================================
20
+ # DETECT SEQUENTIAL PHASES
21
+ # =============================================================================
22
+
23
+
24
+ class PhaseConfig(BaseModel):
25
+ """Configuration for a single phase."""
26
+
27
+ timer_col: str = Field(..., description="Timer column name for this phase")
28
+ start_threshold: Optional[int] = Field(
29
+ None, description="Override default start threshold for this phase (seconds)"
30
+ )
31
+
32
+
33
+ class DetectSequentialPhasesParams(BaseModel):
34
+ """
35
+ Detect and analyze sequential manufacturing phases from timer columns.
36
+
37
+ This transformer processes raw sensor/PLC data where timer columns increment
38
+ during each phase. It detects phase boundaries, calculates durations, and
39
+ tracks time spent in each equipment status.
40
+
41
+ Common use cases:
42
+ - Batch reactor cycle analysis
43
+ - CIP (Clean-in-Place) phase timing
44
+ - Food processing (cook, cool, package cycles)
45
+ - Any multi-step batch process with PLC timers
46
+
47
+ Scenario: Analyze FBR cycle times
48
+ ```yaml
49
+ detect_sequential_phases:
50
+ group_by: BatchID
51
+ timestamp_col: ts
52
+ phases:
53
+ - timer_col: LoadTime
54
+ - timer_col: AcidTime
55
+ - timer_col: DryTime
56
+ - timer_col: CookTime
57
+ - timer_col: CoolTime
58
+ - timer_col: UnloadTime
59
+ start_threshold: 240
60
+ status_col: Status
61
+ status_mapping:
62
+ 1: idle
63
+ 2: active
64
+ 3: hold
65
+ 4: faulted
66
+ phase_metrics:
67
+ Level: max
68
+ metadata:
69
+ ProductCode: first_after_start
70
+ Weight: max
71
+ ```
72
+
73
+ Scenario: Group by multiple columns
74
+ ```yaml
75
+ detect_sequential_phases:
76
+ group_by:
77
+ - BatchID
78
+ - AssetID
79
+ phases: [LoadTime, CookTime]
80
+ ```
81
+ """
82
+
83
+ group_by: Union[str, List[str]] = Field(
84
+ ...,
85
+ description="Column(s) to group by. Can be a single column name or list of columns. "
86
+ "E.g., 'BatchID' or ['BatchID', 'AssetID']",
87
+ )
88
+ timestamp_col: str = Field(default="ts", description="Timestamp column for ordering events")
89
+ phases: List[Union[str, PhaseConfig]] = Field(
90
+ ...,
91
+ description="List of phase timer columns (strings) or PhaseConfig objects. "
92
+ "Phases are processed sequentially - each phase starts after the previous ends.",
93
+ )
94
+ start_threshold: int = Field(
95
+ default=240,
96
+ description="Default max timer value (seconds) to consider as valid phase start. "
97
+ "Filters out late readings where timer already shows large elapsed time.",
98
+ )
99
+ status_col: Optional[str] = Field(None, description="Column containing equipment status codes")
100
+ status_mapping: Optional[Dict[int, str]] = Field(
101
+ None,
102
+ description="Mapping of status codes to names. "
103
+ "E.g., {1: 'idle', 2: 'active', 3: 'hold', 4: 'faulted'}",
104
+ )
105
+ phase_metrics: Optional[Dict[str, str]] = Field(
106
+ None,
107
+ description="Columns to aggregate within each phase window. "
108
+ "E.g., {Level: max, Pressure: max}. Outputs {Phase}_{Column} columns.",
109
+ )
110
+ metadata: Optional[Dict[str, str]] = Field(
111
+ None,
112
+ description="Columns to include in output with aggregation method. "
113
+ "Options: 'first', 'last', 'first_after_start', 'max', 'min', 'mean', 'sum'. "
114
+ "E.g., {ProductCode: first_after_start, Weight: max}",
115
+ )
116
+ output_time_format: str = Field(
117
+ default="%Y-%m-%d %H:%M:%S",
118
+ description="Format for output timestamp columns",
119
+ )
120
+ fill_null_minutes: bool = Field(
121
+ default=False,
122
+ description="If True, fill null numeric columns (_max_minutes, _status_minutes, _metrics) "
123
+ "with 0. Timestamp columns remain null for skipped phases.",
124
+ )
125
+ spark_native: bool = Field(
126
+ default=False,
127
+ description="If True, use native Spark window functions. If False (default), use "
128
+ "applyInPandas which is often faster for datasets with many batches.",
129
+ )
130
+
131
+
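
For readers who prefer Python to the YAML scenarios above, a minimal sketch of the equivalent construction; the column names (BatchID, ts, LoadTime, CookTime, Status, Level, ProductCode) are placeholders, not part of the library.

```python
# Illustrative only: building the configuration from the YAML scenarios
# above directly in Python. Column names are placeholders for your data.
params = DetectSequentialPhasesParams(
    group_by="BatchID",
    timestamp_col="ts",
    phases=[
        "LoadTime",  # plain string form uses the default start_threshold
        PhaseConfig(timer_col="CookTime", start_threshold=120),  # per-phase override
    ],
    start_threshold=240,
    status_col="Status",
    status_mapping={1: "idle", 2: "active", 3: "hold", 4: "faulted"},
    phase_metrics={"Level": "max"},
    metadata={"ProductCode": "first_after_start"},
)
```
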
132
+ def _normalize_group_by(group_by: Union[str, List[str]]) -> List[str]:
133
+ """Convert group_by to list format."""
134
+ if isinstance(group_by, str):
135
+ return [group_by]
136
+ return list(group_by)
137
+
138
+
139
+ def _get_expected_columns(params: "DetectSequentialPhasesParams") -> Dict[str, None]:
140
+ """
141
+ Build a dict of ALL expected output columns with None/NaT values.
142
+
143
+ This ensures the pandas UDF used with applyInPandas always returns a DataFrame
144
+ containing all columns defined in the output schema, even when phases are skipped.
145
+ Uses pd.NaT for timestamp columns to match TimestampType schema.
146
+ """
147
+ columns = {}
148
+
149
+ phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
150
+ for phase in phase_names:
151
+ columns[f"{phase}_start"] = pd.NaT
152
+ columns[f"{phase}_end"] = pd.NaT
153
+ columns[f"{phase}_max_minutes"] = None
154
+
155
+ if params.status_mapping:
156
+ for status_name in params.status_mapping.values():
157
+ columns[f"{phase}_{status_name}_minutes"] = None
158
+
159
+ if params.phase_metrics:
160
+ for metric_col in params.phase_metrics.keys():
161
+ columns[f"{phase}_{metric_col}"] = None
162
+
163
+ if params.metadata:
164
+ for col in params.metadata.keys():
165
+ columns[col] = None
166
+
167
+ return columns
168
+
169
+
170
+ def _get_numeric_columns(params: "DetectSequentialPhasesParams") -> List[str]:
171
+ """Get list of all numeric output column names (for fill_null_minutes)."""
172
+ columns = []
173
+
174
+ phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
175
+ for phase in phase_names:
176
+ columns.append(f"{phase}_max_minutes")
177
+
178
+ if params.status_mapping:
179
+ for status_name in params.status_mapping.values():
180
+ columns.append(f"{phase}_{status_name}_minutes")
181
+
182
+ if params.phase_metrics:
183
+ for metric_col in params.phase_metrics.keys():
184
+ columns.append(f"{phase}_{metric_col}")
185
+
186
+ return columns
187
+
188
+
189
+ def _fill_null_numeric_columns(
190
+ df: pd.DataFrame, params: "DetectSequentialPhasesParams"
191
+ ) -> pd.DataFrame:
192
+ """Fill null values in numeric columns with 0."""
193
+ numeric_cols = _get_numeric_columns(params)
194
+ for col in numeric_cols:
195
+ if col in df.columns:
196
+ df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)
197
+ return df
198
+
199
+
200
+ def detect_sequential_phases(
201
+ context: EngineContext, params: DetectSequentialPhasesParams
202
+ ) -> EngineContext:
203
+ """
204
+ Detect and analyze sequential manufacturing phases.
205
+
206
+ For each group (e.g., batch), this transformer:
207
+ 1. Processes phases sequentially (each starts after previous ends)
208
+ 2. Detects phase start by finding first valid timer reading and back-calculating
209
+ 3. Detects phase end by finding first repeated (plateaued) timer value
210
+ 4. Calculates time spent in each status during each phase
211
+ 5. Aggregates specified metrics within each phase window
212
+ 6. Outputs one summary row per group
213
+
214
+ Output columns per phase:
215
+ - {phase}_start: Phase start timestamp
216
+ - {phase}_end: Phase end timestamp
217
+ - {phase}_max_minutes: Maximum timer value converted to minutes
218
+ - {phase}_{status}_minutes: Time in each status (if status_col provided)
219
+ - {phase}_{metric}: Aggregated metrics (if phase_metrics provided)
220
+ """
221
+ ctx = get_logging_context()
222
+ start_time = time.time()
223
+
224
+ phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
225
+ group_by_cols = _normalize_group_by(params.group_by)
226
+
227
+ ctx.debug(
228
+ "DetectSequentialPhases starting",
229
+ group_by=group_by_cols,
230
+ phases=phase_names,
231
+ )
232
+
233
+ if context.engine_type == EngineType.PANDAS:
234
+ result_df = _detect_phases_pandas(context.df, params)
235
+ elif context.engine_type == EngineType.SPARK:
236
+ if params.spark_native:
237
+ result_df = _detect_phases_spark_native(context.df, params)
238
+ else:
239
+ result_df = _detect_phases_spark(context.df, params)
240
+ elif context.engine_type == EngineType.POLARS:
241
+ result_df = _detect_phases_polars(context.df, params)
242
+ else:
243
+ raise ValueError(f"Unsupported engine: {context.engine_type}")
244
+
245
+ elapsed_ms = (time.time() - start_time) * 1000
246
+ ctx.debug(
247
+ "DetectSequentialPhases completed",
248
+ output_rows=len(result_df) if hasattr(result_df, "__len__") else "unknown",
249
+ elapsed_ms=round(elapsed_ms, 2),
250
+ )
251
+ return context.with_df(result_df)
252
+
253
+
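
A small illustrative run of the boundary rules described in the docstring above, using the pandas helper defined just below; the data and column names are invented for the example.

```python
# Toy example (not part of the package) showing how phase boundaries are
# derived: the start is back-calculated from the first timer reading, and
# the end is the last timestamp before the timer value repeats (plateaus).
import pandas as pd

toy = pd.DataFrame(
    {
        "BatchID": ["B1"] * 5,
        "ts": pd.to_datetime(
            [
                "2024-01-01 08:00:30",
                "2024-01-01 08:01:30",
                "2024-01-01 08:02:30",
                "2024-01-01 08:03:30",
                "2024-01-01 08:04:30",
            ]
        ),
        "LoadTime": [30, 90, 150, 180, 180],  # seconds; plateaus at 180
    }
)
params = DetectSequentialPhasesParams(group_by="BatchID", timestamp_col="ts", phases=["LoadTime"])
summary = _detect_phases_pandas(toy, params)
# One row per batch:
#   LoadTime_start       = 08:00:00  (08:00:30 minus the 30 s already on the timer)
#   LoadTime_end         = 08:03:30  (last reading before the value repeats)
#   LoadTime_max_minutes = 3.0       (180 s / 60)
```
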
254
+ def _detect_phases_pandas(df: pd.DataFrame, params: DetectSequentialPhasesParams) -> pd.DataFrame:
255
+ """Pandas implementation of sequential phase detection."""
256
+
257
+ group_by_cols = _normalize_group_by(params.group_by)
258
+
259
+ df = df.copy()
260
+ df[params.timestamp_col] = pd.to_datetime(df[params.timestamp_col])
261
+ df = df.sort_values(by=params.timestamp_col, ascending=True).reset_index(drop=True)
262
+ df = df.drop_duplicates()
263
+
264
+ summary_rows = []
265
+ grouped = df.groupby(group_by_cols)
266
+
267
+ for group_id, group in grouped:
268
+ group = group.sort_values(params.timestamp_col).reset_index(drop=True)
269
+
270
+ if len(group_by_cols) == 1:
271
+ row = {group_by_cols[0]: group_id if not isinstance(group_id, tuple) else group_id[0]}
272
+ else:
273
+ row = {col: val for col, val in zip(group_by_cols, group_id)}
274
+
275
+ row.update(_get_expected_columns(params))
276
+
277
+ previous_phase_end = None
278
+ first_phase_start = None
279
+
280
+ for phase in params.phases:
281
+ if isinstance(phase, PhaseConfig):
282
+ timer_col = phase.timer_col
283
+ threshold = phase.start_threshold or params.start_threshold
284
+ else:
285
+ timer_col = phase
286
+ threshold = params.start_threshold
287
+
288
+ if timer_col not in group.columns:
289
+ continue
290
+
291
+ phase_result = _detect_single_phase(
292
+ group=group,
293
+ timer_col=timer_col,
294
+ timestamp_col=params.timestamp_col,
295
+ threshold=threshold,
296
+ previous_phase_end=previous_phase_end,
297
+ status_col=params.status_col,
298
+ status_mapping=params.status_mapping,
299
+ phase_metrics=params.phase_metrics,
300
+ time_format=params.output_time_format,
301
+ )
302
+
303
+ if phase_result:
304
+ row.update(phase_result["columns"])
305
+ previous_phase_end = phase_result["end_time"]
306
+
307
+ if first_phase_start is None:
308
+ first_phase_start = phase_result["start_time"]
309
+
310
+ if params.metadata and first_phase_start is not None:
311
+ metadata_values = _extract_metadata(
312
+ group=group,
313
+ metadata_config=params.metadata,
314
+ timestamp_col=params.timestamp_col,
315
+ first_phase_start=first_phase_start,
316
+ )
317
+ row.update(metadata_values)
318
+
319
+ summary_rows.append(row)
320
+
321
+ result_df = pd.DataFrame(summary_rows)
322
+
323
+ if result_df.empty:
324
+ return result_df
325
+
326
+ first_phase_name = (
327
+ params.phases[0].timer_col
328
+ if isinstance(params.phases[0], PhaseConfig)
329
+ else params.phases[0]
330
+ )
331
+ start_col = f"{first_phase_name}_start"
332
+ if start_col in result_df.columns:
333
+ result_df = result_df.sort_values(by=start_col, ascending=True)
334
+
335
+ if params.fill_null_minutes:
336
+ result_df = _fill_null_numeric_columns(result_df, params)
337
+
338
+ return result_df.reset_index(drop=True)
339
+
340
+
341
+ def _detect_single_phase(
342
+ group: pd.DataFrame,
343
+ timer_col: str,
344
+ timestamp_col: str,
345
+ threshold: int,
346
+ previous_phase_end: Optional[pd.Timestamp],
347
+ status_col: Optional[str],
348
+ status_mapping: Optional[Dict[int, str]],
349
+ phase_metrics: Optional[Dict[str, str]],
350
+ time_format: str,
351
+ ) -> Optional[dict]:
352
+ """
353
+ Detect a single phase's boundaries and calculate metrics.
354
+
355
+ Returns dict with:
356
+ - columns: dict of output column names to values
357
+ - start_time: phase start timestamp (for chaining)
358
+ - end_time: phase end timestamp (for chaining)
359
+ """
360
+
361
+ if previous_phase_end is not None:
362
+ phase_data = group[group[timestamp_col] > previous_phase_end]
363
+ else:
364
+ phase_data = group
365
+
366
+ if phase_data.empty:
367
+ return None
368
+
369
+ non_zero = phase_data[phase_data[timer_col] > 0]
370
+ if non_zero.empty:
371
+ return None
372
+
373
+ potential_starts = non_zero[non_zero[timer_col] <= threshold].sort_values(
374
+ by=timestamp_col, ascending=True
375
+ )
376
+ if potential_starts.empty:
377
+ return None
378
+
379
+ first_idx = potential_starts.index[0]
380
+ first_ts = potential_starts.loc[first_idx, timestamp_col]
381
+ first_val = potential_starts.loc[first_idx, timer_col]
382
+
383
+ true_start = first_ts - pd.Timedelta(seconds=first_val)
384
+
385
+ after_start = phase_data[phase_data[timestamp_col] > true_start].reset_index(drop=True)
386
+
387
+ end_time = None
388
+ max_timer = 0
389
+
390
+ unique_times = after_start.drop_duplicates(subset=[timestamp_col]).reset_index(drop=True)
391
+
392
+ for i in range(1, len(unique_times)):
393
+ curr_val = unique_times[timer_col].iloc[i]
394
+ prev_val = unique_times[timer_col].iloc[i - 1]
395
+ if curr_val == prev_val:
396
+ end_time = unique_times[timestamp_col].iloc[i - 1]
397
+ max_timer = curr_val
398
+ break
399
+
400
+ if end_time is None and len(unique_times) > 0:
401
+ end_time = unique_times[timestamp_col].iloc[-1]
402
+ max_timer = unique_times[timer_col].iloc[-1]
403
+
404
+ if end_time is None:
405
+ return None
406
+
407
+ columns = {
408
+ f"{timer_col}_start": true_start,
409
+ f"{timer_col}_end": end_time,
410
+ f"{timer_col}_max_minutes": round(max_timer / 60, 6) if max_timer else 0,
411
+ }
412
+
413
+ if status_col and status_mapping and status_col in group.columns:
414
+ status_times = _calculate_status_times(
415
+ group=group,
416
+ start_time=true_start,
417
+ end_time=end_time,
418
+ timestamp_col=timestamp_col,
419
+ status_col=status_col,
420
+ status_mapping=status_mapping,
421
+ )
422
+ for status_name, duration in status_times.items():
423
+ columns[f"{timer_col}_{status_name}_minutes"] = round(duration, 6)
424
+
425
+ if phase_metrics:
426
+ phase_window = phase_data[
427
+ (phase_data[timestamp_col] >= true_start) & (phase_data[timestamp_col] <= end_time)
428
+ ]
429
+ for metric_col, agg_func in phase_metrics.items():
430
+ if metric_col in phase_window.columns:
431
+ try:
432
+ value = phase_window[metric_col].agg(agg_func)
433
+ columns[f"{timer_col}_{metric_col}"] = value
434
+ except Exception:
435
+ columns[f"{timer_col}_{metric_col}"] = None
436
+
437
+ return {
438
+ "columns": columns,
439
+ "start_time": true_start,
440
+ "end_time": end_time,
441
+ }
442
+
443
+
444
+ def _calculate_status_times(
445
+ group: pd.DataFrame,
446
+ start_time: pd.Timestamp,
447
+ end_time: pd.Timestamp,
448
+ timestamp_col: str,
449
+ status_col: str,
450
+ status_mapping: Dict[int, str],
451
+ ) -> Dict[str, float]:
452
+ """
453
+ Calculate time spent in each status within a phase window.
454
+
455
+ Tracks status transitions and accumulates duration per status.
456
+ Handles NaN and unknown status codes gracefully.
457
+ """
458
+ status_times = {status_name: 0.0 for status_name in status_mapping.values()}
459
+
460
+ within_phase = group[(group[timestamp_col] >= start_time) & (group[timestamp_col] <= end_time)]
461
+
462
+ if within_phase.empty:
463
+ return status_times
464
+
465
+ valid_rows = within_phase[
466
+ within_phase[status_col].notna() & within_phase[status_col].isin(status_mapping.keys())
467
+ ]
468
+
469
+ if valid_rows.empty:
470
+ return status_times
471
+
472
+ current_status = valid_rows.iloc[0][status_col]
473
+ last_change_ts = valid_rows.iloc[0][timestamp_col]
474
+
475
+ for _, record in within_phase.iterrows():
476
+ ts = record[timestamp_col]
477
+ status = record[status_col]
478
+
479
+ if pd.isna(status) or status not in status_mapping:
480
+ continue
481
+
482
+ if status != current_status:
483
+ time_diff = (ts - last_change_ts).total_seconds() / 60
484
+ status_times[status_mapping[current_status]] += time_diff
485
+ last_change_ts = ts
486
+ current_status = status
487
+
488
+ final_diff = (end_time - last_change_ts).total_seconds() / 60
489
+ status_times[status_mapping[current_status]] += final_diff
490
+
491
+ return status_times
492
+
493
+
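
A quick worked check of the accounting above; the data and column names are invented for the example.

```python
# Illustrative check (not from the package): with status_mapping
# {2: "active", 3: "hold"} and a phase spanning 08:00-08:15 where the
# status flips to hold at 08:10, the time splits as 10 min + 5 min.
import pandas as pd

demo = pd.DataFrame(
    {
        "ts": pd.to_datetime(["2024-01-01 08:00", "2024-01-01 08:10"]),
        "Status": [2, 3],
    }
)
minutes = _calculate_status_times(
    group=demo,
    start_time=pd.Timestamp("2024-01-01 08:00"),
    end_time=pd.Timestamp("2024-01-01 08:15"),
    timestamp_col="ts",
    status_col="Status",
    status_mapping={2: "active", 3: "hold"},
)
# minutes == {"active": 10.0, "hold": 5.0}; statuses never observed stay 0.0,
# and rows with NaN or unmapped codes are skipped without breaking the tally.
```
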
494
+ def _extract_metadata(
495
+ group: pd.DataFrame,
496
+ metadata_config: Dict[str, str],
497
+ timestamp_col: str,
498
+ first_phase_start: pd.Timestamp,
499
+ ) -> Dict[str, Any]:
500
+ """
501
+ Extract metadata columns with specified aggregation methods.
502
+
503
+ Supported methods:
504
+ - first: First value in group
505
+ - last: Last value in group
506
+ - first_after_start: First value after first phase starts
507
+ - max, min, mean, sum: Standard aggregations
508
+ """
509
+ result = {}
510
+
511
+ for col, method in metadata_config.items():
512
+ if col not in group.columns:
513
+ result[col] = None
514
+ continue
515
+
516
+ try:
517
+ if method == "first":
518
+ result[col] = group[col].iloc[0]
519
+ elif method == "last":
520
+ result[col] = group[col].iloc[-1]
521
+ elif method == "first_after_start":
522
+ after_start = group[group[timestamp_col] >= first_phase_start]
523
+ if not after_start.empty:
524
+ valid = after_start[after_start[col].notna()]
525
+ result[col] = valid[col].iloc[0] if not valid.empty else None
526
+ else:
527
+ result[col] = None
528
+ elif method in ("max", "min", "mean", "sum"):
529
+ result[col] = group[col].agg(method)
530
+ else:
531
+ result[col] = group[col].agg(method)
532
+ except Exception:
533
+ result[col] = None
534
+
535
+ return result
536
+
537
+
538
+ # =============================================================================
539
+ # SPARK IMPLEMENTATION
540
+ # =============================================================================
541
+
542
+
543
+ def _detect_phases_spark(spark_df, params: DetectSequentialPhasesParams):
544
+ """
545
+ Spark implementation using applyInPandas for parallel group processing.
546
+
547
+ Each group (batch) is processed independently using the Pandas logic,
548
+ enabling parallel execution across the cluster.
549
+ """
550
+ from pyspark.sql.types import (
551
+ DoubleType,
552
+ StringType,
553
+ StructField,
554
+ StructType,
555
+ TimestampType,
556
+ )
557
+
558
+ group_by_cols = _normalize_group_by(params.group_by)
559
+
560
+ output_fields = []
561
+ for col in group_by_cols:
562
+ output_fields.append(StructField(col, StringType(), True))
563
+
564
+ phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
565
+ for phase in phase_names:
566
+ output_fields.append(StructField(f"{phase}_start", TimestampType(), True))
567
+ output_fields.append(StructField(f"{phase}_end", TimestampType(), True))
568
+ output_fields.append(StructField(f"{phase}_max_minutes", DoubleType(), True))
569
+
570
+ if params.status_mapping:
571
+ for status_name in params.status_mapping.values():
572
+ output_fields.append(
573
+ StructField(f"{phase}_{status_name}_minutes", DoubleType(), True)
574
+ )
575
+
576
+ if params.phase_metrics:
577
+ for metric_col in params.phase_metrics.keys():
578
+ output_fields.append(StructField(f"{phase}_{metric_col}", DoubleType(), True))
579
+
580
+ if params.metadata:
581
+ numeric_aggs = {"max", "min", "mean", "sum"}
582
+ for col, method in params.metadata.items():
583
+ if method in numeric_aggs:
584
+ output_fields.append(StructField(col, DoubleType(), True))
585
+ else:
586
+ output_fields.append(StructField(col, StringType(), True))
587
+
588
+ output_schema = StructType(output_fields)
589
+
590
+ def process_group(pdf: pd.DataFrame) -> pd.DataFrame:
591
+ """Process a single group using Pandas logic."""
592
+ result = _process_single_group_pandas(pdf, params)
593
+ return pd.DataFrame([result]) if result else pd.DataFrame()
594
+
595
+ result_df = spark_df.groupby(group_by_cols).applyInPandas(process_group, schema=output_schema)
596
+
597
+ return result_df
598
+
599
+
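
One practical note, not from the package docs: the applyInPandas output schema above declares every group_by column as StringType. A hedged sketch, assuming a Spark DataFrame `spark_df` with a numeric grouping key; `BatchID` is hypothetical.

```python
# Editorial sketch: since the output schema types group_by columns as
# StringType, casting a numeric grouping key to string up front may avoid
# type-coercion surprises when results are assembled against that schema.
from pyspark.sql import functions as F

spark_df = spark_df.withColumn("BatchID", F.col("BatchID").cast("string"))
```
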
600
+ def _detect_phases_spark_native(spark_df, params: DetectSequentialPhasesParams):
601
+ """
602
+ Native Spark implementation using window functions.
603
+
604
+ This implementation avoids applyInPandas serialization overhead by using
605
+ pure Spark operations: window functions, joins, and aggregations.
606
+
607
+ Performance: 5-20x faster than applyInPandas for large datasets.
608
+ """
609
+ from pyspark.sql import functions as F
610
+ from pyspark.sql import Window
611
+
612
+ ctx = get_logging_context()
613
+ group_by_cols = _normalize_group_by(params.group_by)
614
+ ts = params.timestamp_col
615
+ threshold = params.start_threshold
616
+
617
+ df = spark_df.withColumn(ts, F.col(ts).cast("timestamp"))
618
+
619
+ summary_df = df.select(*group_by_cols).distinct()
620
+
621
+ prev_phase_end_df = None
622
+
623
+ phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
624
+
625
+ for phase_cfg in params.phases:
626
+ if isinstance(phase_cfg, PhaseConfig):
627
+ timer_col = phase_cfg.timer_col
628
+ phase_threshold = phase_cfg.start_threshold or threshold
629
+ else:
630
+ timer_col = phase_cfg
631
+ phase_threshold = threshold
632
+
633
+ if timer_col not in spark_df.columns:
634
+ ctx.debug(f"Skipping phase {timer_col}: column not found")
635
+ continue
636
+
637
+ phase_df = df
638
+
639
+ if prev_phase_end_df is not None:
640
+ phase_df = (
641
+ phase_df.join(prev_phase_end_df, on=group_by_cols, how="inner")
642
+ .filter(F.col(ts) > F.col("prev_end_ts"))
643
+ .drop("prev_end_ts")
644
+ )
645
+
646
+ w_order = Window.partitionBy(*group_by_cols).orderBy(ts)
647
+
648
+ phase_df = phase_df.withColumn("lag_timer", F.lag(timer_col).over(w_order))
649
+ phase_df = phase_df.withColumn("lag_ts", F.lag(ts).over(w_order))
650
+
651
+ start_candidates = phase_df.filter(
652
+ (F.col(timer_col) > 0)
653
+ & (F.col(timer_col) <= F.lit(phase_threshold))
654
+ & F.col(timer_col).isNotNull()
655
+ )
656
+
657
+ w_start_rank = Window.partitionBy(*group_by_cols).orderBy(ts)
658
+ start_rows = (
659
+ start_candidates.withColumn("start_rn", F.row_number().over(w_start_rank))
660
+ .filter(F.col("start_rn") == 1)
661
+ .select(
662
+ *group_by_cols,
663
+ F.col(ts).alias("start_obs_ts"),
664
+ F.col(timer_col).alias("start_obs_timer"),
665
+ )
666
+ )
667
+
668
+ start_rows = start_rows.withColumn(
669
+ "true_start_ts",
670
+ (F.col("start_obs_ts").cast("long") - F.col("start_obs_timer").cast("long")).cast(
671
+ "timestamp"
672
+ ),
673
+ )
674
+
675
+ phase_with_start = phase_df.join(
676
+ start_rows.select(*group_by_cols, "start_obs_ts", "true_start_ts"),
677
+ on=group_by_cols,
678
+ how="inner",
679
+ )
680
+
681
+ phase_with_start = phase_with_start.withColumn(
682
+ "is_plateau",
683
+ (F.col(timer_col).isNotNull())
684
+ & (F.col("lag_timer").isNotNull())
685
+ & (F.col(timer_col) == F.col("lag_timer"))
686
+ & (F.col(ts) != F.col("lag_ts"))
687
+ & (F.col(ts) >= F.col("start_obs_ts"))
688
+ & (F.col("lag_ts") >= F.col("start_obs_ts")),
689
+ )
690
+
691
+ plateau_candidates = phase_with_start.filter("is_plateau")
692
+
693
+ w_plateau_rank = Window.partitionBy(*group_by_cols).orderBy(ts)
694
+ plateau_rows = (
695
+ plateau_candidates.withColumn("plateau_rn", F.row_number().over(w_plateau_rank))
696
+ .filter(F.col("plateau_rn") == 1)
697
+ .select(
698
+ *group_by_cols,
699
+ F.col("lag_ts").alias("end_ts"),
700
+ F.col(timer_col).alias("plateau_timer"),
701
+ )
702
+ )
703
+
704
+ phase_bounds = start_rows.join(plateau_rows, on=group_by_cols, how="left")
705
+
706
+ no_plateau = (
707
+ phase_with_start.filter(~F.col("is_plateau"))
708
+ .groupBy(*group_by_cols)
709
+ .agg(
710
+ F.max(ts).alias("fallback_end_ts"),
711
+ F.max(timer_col).alias("fallback_timer"),
712
+ )
713
+ )
714
+
715
+ phase_bounds = phase_bounds.join(no_plateau, on=group_by_cols, how="left")
716
+
717
+ phase_bounds = phase_bounds.withColumn(
718
+ "final_end_ts", F.coalesce(F.col("end_ts"), F.col("fallback_end_ts"))
719
+ ).withColumn("max_timer", F.coalesce(F.col("plateau_timer"), F.col("fallback_timer")))
720
+
721
+ phase_summary = phase_bounds.select(
722
+ *group_by_cols,
723
+ F.col("true_start_ts").alias(f"{timer_col}_start"),
724
+ F.col("final_end_ts").alias(f"{timer_col}_end"),
725
+ (F.col("max_timer") / 60.0).alias(f"{timer_col}_max_minutes"),
726
+ F.col("true_start_ts").alias("_phase_true_start"),
727
+ F.col("final_end_ts").alias("_phase_end"),
728
+ )
729
+
730
+ if params.status_mapping and params.status_col:
731
+ status_durations = _compute_status_durations_spark(
732
+ df=df,
733
+ phase_bounds=phase_bounds.select(*group_by_cols, "true_start_ts", "final_end_ts"),
734
+ params=params,
735
+ timer_col=timer_col,
736
+ group_by_cols=group_by_cols,
737
+ )
738
+ if status_durations is not None:
739
+ phase_summary = phase_summary.join(status_durations, on=group_by_cols, how="left")
740
+
741
+ if params.phase_metrics:
742
+ metrics_df = _compute_phase_metrics_spark(
743
+ df=df,
744
+ phase_bounds=phase_bounds.select(*group_by_cols, "true_start_ts", "final_end_ts"),
745
+ params=params,
746
+ timer_col=timer_col,
747
+ group_by_cols=group_by_cols,
748
+ )
749
+ if metrics_df is not None:
750
+ phase_summary = phase_summary.join(metrics_df, on=group_by_cols, how="left")
751
+
752
+ summary_df = summary_df.join(
753
+ phase_summary.drop("_phase_true_start", "_phase_end"),
754
+ on=group_by_cols,
755
+ how="left",
756
+ )
757
+
758
+ prev_phase_end_df = phase_bounds.select(
759
+ *group_by_cols, F.col("final_end_ts").alias("prev_end_ts")
760
+ ).filter(F.col("prev_end_ts").isNotNull())
761
+
762
+ if params.metadata:
763
+ phase_start_cols = [F.col(f"{p}_start") for p in phase_names]
764
+ summary_df = summary_df.withColumn("_first_phase_start", F.coalesce(*phase_start_cols))
765
+
766
+ metadata_df = _compute_metadata_spark(
767
+ df=df,
768
+ summary_df=summary_df.select(*group_by_cols, "_first_phase_start"),
769
+ params=params,
770
+ group_by_cols=group_by_cols,
771
+ )
772
+ if metadata_df is not None:
773
+ summary_df = summary_df.join(metadata_df, on=group_by_cols, how="left")
774
+
775
+ summary_df = summary_df.drop("_first_phase_start")
776
+
777
+ if params.fill_null_minutes:
778
+ numeric_cols = _get_numeric_columns(params)
779
+ for col in numeric_cols:
780
+ if col in summary_df.columns:
781
+ summary_df = summary_df.withColumn(col, F.coalesce(F.col(col), F.lit(0.0)))
782
+
783
+ first_phase_start_col = f"{phase_names[0]}_start" if phase_names else None
784
+ if first_phase_start_col and first_phase_start_col in summary_df.columns:
785
+ summary_df = summary_df.orderBy(first_phase_start_col)
786
+
787
+ return summary_df
788
+
789
+
790
+ def _compute_status_durations_spark(
791
+ df, phase_bounds, params: DetectSequentialPhasesParams, timer_col: str, group_by_cols: List[str]
792
+ ):
793
+ """Compute time spent in each status within a phase window using Spark."""
794
+ from pyspark.sql import functions as F
795
+ from pyspark.sql import Window
796
+
797
+ ts = params.timestamp_col
798
+ status_col = params.status_col
799
+ status_mapping = params.status_mapping
800
+ valid_codes = list(status_mapping.keys())
801
+
802
+ status_df = df.join(
803
+ phase_bounds.withColumnRenamed("true_start_ts", "_start").withColumnRenamed(
804
+ "final_end_ts", "_end"
805
+ ),
806
+ on=group_by_cols,
807
+ how="inner",
808
+ ).filter((F.col(ts) >= F.col("_start")) & (F.col(ts) <= F.col("_end")))
809
+
810
+ status_df = status_df.withColumn(
811
+ "valid_status",
812
+ F.when(F.col(status_col).isin([F.lit(c) for c in valid_codes]), F.col(status_col)),
813
+ )
814
+
815
+ w_status = (
816
+ Window.partitionBy(*group_by_cols).orderBy(ts).rowsBetween(Window.unboundedPreceding, 0)
817
+ )
818
+
819
+ status_df = status_df.withColumn(
820
+ "ffill_status", F.last("valid_status", ignorenulls=True).over(w_status)
821
+ )
822
+
823
+ w_lead = Window.partitionBy(*group_by_cols).orderBy(ts)
824
+ status_df = status_df.withColumn("next_ts", F.lead(ts).over(w_lead))
825
+
826
+ status_df = status_df.withColumn(
827
+ "interval_end_ts",
828
+ F.when(
829
+ F.col("next_ts").isNull() | (F.col("next_ts") > F.col("_end")),
830
+ F.col("_end"),
831
+ ).otherwise(F.col("next_ts")),
832
+ )
833
+
834
+ status_df = status_df.withColumn(
835
+ "interval_sec",
836
+ F.greatest(F.lit(0), F.unix_timestamp("interval_end_ts") - F.unix_timestamp(ts)),
837
+ )
838
+
839
+ status_df = status_df.filter((F.col("ffill_status").isNotNull()) & (F.col("interval_sec") > 0))
840
+
841
+ status_df = status_df.withColumn("interval_min", F.col("interval_sec") / 60.0)
842
+
843
+ durations = status_df.groupBy(*group_by_cols, "ffill_status").agg(
844
+ F.sum("interval_min").alias("minutes")
845
+ )
846
+
847
+ durations_pivot = (
848
+ durations.groupBy(*group_by_cols).pivot("ffill_status", valid_codes).agg(F.first("minutes"))
849
+ )
850
+
851
+ for code, status_name in status_mapping.items():
852
+ old_col = str(code)
853
+ new_col = f"{timer_col}_{status_name}_minutes"
854
+ if old_col in durations_pivot.columns:
855
+ durations_pivot = durations_pivot.withColumnRenamed(old_col, new_col)
856
+
857
+ return durations_pivot
858
+
859
+
860
+ def _compute_phase_metrics_spark(
861
+ df, phase_bounds, params: DetectSequentialPhasesParams, timer_col: str, group_by_cols: List[str]
862
+ ):
863
+ """Compute aggregated metrics within a phase window using Spark."""
864
+ from pyspark.sql import functions as F
865
+
866
+ ts = params.timestamp_col
867
+
868
+ metrics_df = df.join(
869
+ phase_bounds.withColumnRenamed("true_start_ts", "_start").withColumnRenamed(
870
+ "final_end_ts", "_end"
871
+ ),
872
+ on=group_by_cols,
873
+ how="inner",
874
+ ).filter((F.col(ts) >= F.col("_start")) & (F.col(ts) <= F.col("_end")))
875
+
876
+ agg_exprs = []
877
+ for metric_col, agg_name in params.phase_metrics.items():
878
+ if metric_col in df.columns:
879
+ func = getattr(F, agg_name)
880
+ agg_exprs.append(func(metric_col).alias(f"{timer_col}_{metric_col}"))
881
+
882
+ if not agg_exprs:
883
+ return None
884
+
885
+ return metrics_df.groupBy(*group_by_cols).agg(*agg_exprs)
886
+
887
+
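
A note on the aggregation names accepted here, observed from the code rather than stated in the package docs: the Spark paths resolve each name via getattr on pyspark.sql.functions, while the pandas path passes it to DataFrame.agg, so names valid in both are the safe choice.

```python
# Editorial note: aggregation names are resolved with getattr(F, name) on
# the Spark paths and passed to pandas DataFrame.agg on the pandas path,
# so names valid in both (max, min, mean, sum) are the safe choice.
phase_metrics = {"Level": "max", "Pressure": "mean"}  # hypothetical columns
```
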
888
+ def _compute_metadata_spark(
889
+ df, summary_df, params: DetectSequentialPhasesParams, group_by_cols: List[str]
890
+ ):
891
+ """Compute metadata columns using Spark."""
892
+ from pyspark.sql import functions as F
893
+
894
+ ts = params.timestamp_col
895
+
896
+ meta_base = df.join(summary_df, on=group_by_cols, how="inner")
897
+
898
+ agg_exprs = []
899
+ struct_cols = []
900
+
901
+ for col_name, method in params.metadata.items():
902
+ if col_name not in df.columns:
903
+ continue
904
+
905
+ if method == "first":
906
+ agg_exprs.append(F.first(col_name, ignorenulls=True).alias(col_name))
907
+ elif method == "last":
908
+ struct_cols.append(col_name)
909
+ agg_exprs.append(
910
+ F.max(F.struct(F.col(ts), F.col(col_name))).alias(f"__{col_name}_struct")
911
+ )
912
+ elif method == "first_after_start":
913
+ agg_exprs.append(
914
+ F.first(
915
+ F.when(F.col(ts) >= F.col("_first_phase_start"), F.col(col_name)),
916
+ ignorenulls=True,
917
+ ).alias(col_name)
918
+ )
919
+ elif method in ("max", "min", "mean", "sum"):
920
+ func = getattr(F, method)
921
+ agg_exprs.append(func(col_name).alias(col_name))
922
+ else:
923
+ try:
924
+ func = getattr(F, method)
925
+ agg_exprs.append(func(col_name).alias(col_name))
926
+ except AttributeError:
927
+ agg_exprs.append(F.first(col_name, ignorenulls=True).alias(col_name))
928
+
929
+ if not agg_exprs:
930
+ return None
931
+
932
+ metadata_df = meta_base.groupBy(*group_by_cols).agg(*agg_exprs)
933
+
934
+ for col_name in struct_cols:
935
+ metadata_df = metadata_df.withColumn(
936
+ col_name, F.col(f"__{col_name}_struct").getField(col_name)
937
+ ).drop(f"__{col_name}_struct")
938
+
939
+ return metadata_df
940
+
941
+
942
+ def _process_single_group_pandas(
943
+ group: pd.DataFrame, params: DetectSequentialPhasesParams
944
+ ) -> Optional[Dict]:
945
+ """Process a single group and return the summary row dict."""
946
+ group_by_cols = _normalize_group_by(params.group_by)
947
+
948
+ group = group.copy()
949
+ group[params.timestamp_col] = pd.to_datetime(group[params.timestamp_col])
950
+ group = group.sort_values(params.timestamp_col).reset_index(drop=True)
951
+
952
+ if len(group_by_cols) == 1:
953
+ row = {group_by_cols[0]: group[group_by_cols[0]].iloc[0]}
954
+ else:
955
+ row = {col: group[col].iloc[0] for col in group_by_cols}
956
+
957
+ row.update(_get_expected_columns(params))
958
+
959
+ previous_phase_end = None
960
+ first_phase_start = None
961
+
962
+ for phase in params.phases:
963
+ if isinstance(phase, PhaseConfig):
964
+ timer_col = phase.timer_col
965
+ threshold = phase.start_threshold or params.start_threshold
966
+ else:
967
+ timer_col = phase
968
+ threshold = params.start_threshold
969
+
970
+ if timer_col not in group.columns:
971
+ continue
972
+
973
+ phase_result = _detect_single_phase(
974
+ group=group,
975
+ timer_col=timer_col,
976
+ timestamp_col=params.timestamp_col,
977
+ threshold=threshold,
978
+ previous_phase_end=previous_phase_end,
979
+ status_col=params.status_col,
980
+ status_mapping=params.status_mapping,
981
+ phase_metrics=params.phase_metrics,
982
+ time_format=params.output_time_format,
983
+ )
984
+
985
+ if phase_result:
986
+ row.update(phase_result["columns"])
987
+ previous_phase_end = phase_result["end_time"]
988
+
989
+ if first_phase_start is None:
990
+ first_phase_start = phase_result["start_time"]
991
+
992
+ if params.metadata and first_phase_start is not None:
993
+ metadata_values = _extract_metadata(
994
+ group=group,
995
+ metadata_config=params.metadata,
996
+ timestamp_col=params.timestamp_col,
997
+ first_phase_start=first_phase_start,
998
+ )
999
+ row.update(metadata_values)
1000
+
1001
+ if params.fill_null_minutes:
1002
+ numeric_cols = _get_numeric_columns(params)
1003
+ for col in numeric_cols:
1004
+ if col in row and row[col] is None:
1005
+ row[col] = 0.0
1006
+
1007
+ return row
1008
+
1009
+
1010
+ # =============================================================================
1011
+ # POLARS IMPLEMENTATION
1012
+ # =============================================================================
1013
+
1014
+
1015
+ def _detect_phases_polars(polars_df, params: DetectSequentialPhasesParams):
1016
+ """
1017
+ Polars implementation - converts to Pandas for processing.
1018
+
1019
+ TODO: Native Polars implementation for better performance.
1020
+ """
1021
+ pdf = polars_df.to_pandas()
1022
+ result_pdf = _detect_phases_pandas(pdf, params)
1023
+
1024
+ try:
1025
+ import polars as pl
1026
+
1027
+ return pl.from_pandas(result_pdf)
1028
+ except ImportError:
1029
+ raise ValueError("Polars is not installed")