odibi 2.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odibi/__init__.py +32 -0
- odibi/__main__.py +8 -0
- odibi/catalog.py +3011 -0
- odibi/cli/__init__.py +11 -0
- odibi/cli/__main__.py +6 -0
- odibi/cli/catalog.py +553 -0
- odibi/cli/deploy.py +69 -0
- odibi/cli/doctor.py +161 -0
- odibi/cli/export.py +66 -0
- odibi/cli/graph.py +150 -0
- odibi/cli/init_pipeline.py +242 -0
- odibi/cli/lineage.py +259 -0
- odibi/cli/main.py +215 -0
- odibi/cli/run.py +98 -0
- odibi/cli/schema.py +208 -0
- odibi/cli/secrets.py +232 -0
- odibi/cli/story.py +379 -0
- odibi/cli/system.py +132 -0
- odibi/cli/test.py +286 -0
- odibi/cli/ui.py +31 -0
- odibi/cli/validate.py +39 -0
- odibi/config.py +3541 -0
- odibi/connections/__init__.py +9 -0
- odibi/connections/azure_adls.py +499 -0
- odibi/connections/azure_sql.py +709 -0
- odibi/connections/base.py +28 -0
- odibi/connections/factory.py +322 -0
- odibi/connections/http.py +78 -0
- odibi/connections/local.py +119 -0
- odibi/connections/local_dbfs.py +61 -0
- odibi/constants.py +17 -0
- odibi/context.py +528 -0
- odibi/diagnostics/__init__.py +12 -0
- odibi/diagnostics/delta.py +520 -0
- odibi/diagnostics/diff.py +169 -0
- odibi/diagnostics/manager.py +171 -0
- odibi/engine/__init__.py +20 -0
- odibi/engine/base.py +334 -0
- odibi/engine/pandas_engine.py +2178 -0
- odibi/engine/polars_engine.py +1114 -0
- odibi/engine/registry.py +54 -0
- odibi/engine/spark_engine.py +2362 -0
- odibi/enums.py +7 -0
- odibi/exceptions.py +297 -0
- odibi/graph.py +426 -0
- odibi/introspect.py +1214 -0
- odibi/lineage.py +511 -0
- odibi/node.py +3341 -0
- odibi/orchestration/__init__.py +0 -0
- odibi/orchestration/airflow.py +90 -0
- odibi/orchestration/dagster.py +77 -0
- odibi/patterns/__init__.py +24 -0
- odibi/patterns/aggregation.py +599 -0
- odibi/patterns/base.py +94 -0
- odibi/patterns/date_dimension.py +423 -0
- odibi/patterns/dimension.py +696 -0
- odibi/patterns/fact.py +748 -0
- odibi/patterns/merge.py +128 -0
- odibi/patterns/scd2.py +148 -0
- odibi/pipeline.py +2382 -0
- odibi/plugins.py +80 -0
- odibi/project.py +581 -0
- odibi/references.py +151 -0
- odibi/registry.py +246 -0
- odibi/semantics/__init__.py +71 -0
- odibi/semantics/materialize.py +392 -0
- odibi/semantics/metrics.py +361 -0
- odibi/semantics/query.py +743 -0
- odibi/semantics/runner.py +430 -0
- odibi/semantics/story.py +507 -0
- odibi/semantics/views.py +432 -0
- odibi/state/__init__.py +1203 -0
- odibi/story/__init__.py +55 -0
- odibi/story/doc_story.py +554 -0
- odibi/story/generator.py +1431 -0
- odibi/story/lineage.py +1043 -0
- odibi/story/lineage_utils.py +324 -0
- odibi/story/metadata.py +608 -0
- odibi/story/renderers.py +453 -0
- odibi/story/templates/run_story.html +2520 -0
- odibi/story/themes.py +216 -0
- odibi/testing/__init__.py +13 -0
- odibi/testing/assertions.py +75 -0
- odibi/testing/fixtures.py +85 -0
- odibi/testing/source_pool.py +277 -0
- odibi/transformers/__init__.py +122 -0
- odibi/transformers/advanced.py +1472 -0
- odibi/transformers/delete_detection.py +610 -0
- odibi/transformers/manufacturing.py +1029 -0
- odibi/transformers/merge_transformer.py +778 -0
- odibi/transformers/relational.py +675 -0
- odibi/transformers/scd.py +579 -0
- odibi/transformers/sql_core.py +1356 -0
- odibi/transformers/validation.py +165 -0
- odibi/ui/__init__.py +0 -0
- odibi/ui/app.py +195 -0
- odibi/utils/__init__.py +66 -0
- odibi/utils/alerting.py +667 -0
- odibi/utils/config_loader.py +343 -0
- odibi/utils/console.py +231 -0
- odibi/utils/content_hash.py +202 -0
- odibi/utils/duration.py +43 -0
- odibi/utils/encoding.py +102 -0
- odibi/utils/extensions.py +28 -0
- odibi/utils/hashing.py +61 -0
- odibi/utils/logging.py +203 -0
- odibi/utils/logging_context.py +740 -0
- odibi/utils/progress.py +429 -0
- odibi/utils/setup_helpers.py +302 -0
- odibi/utils/telemetry.py +140 -0
- odibi/validation/__init__.py +62 -0
- odibi/validation/engine.py +765 -0
- odibi/validation/explanation_linter.py +155 -0
- odibi/validation/fk.py +547 -0
- odibi/validation/gate.py +252 -0
- odibi/validation/quarantine.py +605 -0
- odibi/writers/__init__.py +15 -0
- odibi/writers/sql_server_writer.py +2081 -0
- odibi-2.5.0.dist-info/METADATA +255 -0
- odibi-2.5.0.dist-info/RECORD +124 -0
- odibi-2.5.0.dist-info/WHEEL +5 -0
- odibi-2.5.0.dist-info/entry_points.txt +2 -0
- odibi-2.5.0.dist-info/licenses/LICENSE +190 -0
- odibi-2.5.0.dist-info/top_level.txt +1 -0
odibi/transformers/manufacturing.py
@@ -0,0 +1,1029 @@
"""
Manufacturing Transformers

Specialized transformers for manufacturing/process data analysis.
Handles common patterns like cycle detection, phase analysis, and time-in-state calculations.
"""

import time
from typing import Dict, List, Optional, Union

import pandas as pd
from pydantic import BaseModel, Field

from odibi.context import EngineContext
from odibi.enums import EngineType
from odibi.utils.logging_context import get_logging_context


# =============================================================================
# DETECT SEQUENTIAL PHASES
# =============================================================================


class PhaseConfig(BaseModel):
    """Configuration for a single phase."""

    timer_col: str = Field(..., description="Timer column name for this phase")
    start_threshold: Optional[int] = Field(
        None, description="Override default start threshold for this phase (seconds)"
    )


class DetectSequentialPhasesParams(BaseModel):
    """
    Detect and analyze sequential manufacturing phases from timer columns.

    This transformer processes raw sensor/PLC data where timer columns increment
    during each phase. It detects phase boundaries, calculates durations, and
    tracks time spent in each equipment status.

    Common use cases:
    - Batch reactor cycle analysis
    - CIP (Clean-in-Place) phase timing
    - Food processing (cook, cool, package cycles)
    - Any multi-step batch process with PLC timers

    Scenario: Analyze FBR cycle times
    ```yaml
    detect_sequential_phases:
      group_by: BatchID
      timestamp_col: ts
      phases:
        - timer_col: LoadTime
        - timer_col: AcidTime
        - timer_col: DryTime
        - timer_col: CookTime
        - timer_col: CoolTime
        - timer_col: UnloadTime
      start_threshold: 240
      status_col: Status
      status_mapping:
        1: idle
        2: active
        3: hold
        4: faulted
      phase_metrics:
        Level: max
      metadata:
        ProductCode: first_after_start
        Weight: max
    ```

    Scenario: Group by multiple columns
    ```yaml
    detect_sequential_phases:
      group_by:
        - BatchID
        - AssetID
      phases: [LoadTime, CookTime]
    ```
    """

    group_by: Union[str, List[str]] = Field(
        ...,
        description="Column(s) to group by. Can be a single column name or list of columns. "
        "E.g., 'BatchID' or ['BatchID', 'AssetID']",
    )
    timestamp_col: str = Field(default="ts", description="Timestamp column for ordering events")
    phases: List[Union[str, PhaseConfig]] = Field(
        ...,
        description="List of phase timer columns (strings) or PhaseConfig objects. "
        "Phases are processed sequentially - each phase starts after the previous ends.",
    )
    start_threshold: int = Field(
        default=240,
        description="Default max timer value (seconds) to consider as valid phase start. "
        "Filters out late readings where timer already shows large elapsed time.",
    )
    status_col: Optional[str] = Field(None, description="Column containing equipment status codes")
    status_mapping: Optional[Dict[int, str]] = Field(
        None,
        description="Mapping of status codes to names. "
        "E.g., {1: 'idle', 2: 'active', 3: 'hold', 4: 'faulted'}",
    )
    phase_metrics: Optional[Dict[str, str]] = Field(
        None,
        description="Columns to aggregate within each phase window. "
        "E.g., {Level: max, Pressure: max}. Outputs {Phase}_{Column} columns.",
    )
    metadata: Optional[Dict[str, str]] = Field(
        None,
        description="Columns to include in output with aggregation method. "
        "Options: 'first', 'last', 'first_after_start', 'max', 'min', 'mean', 'sum'. "
        "E.g., {ProductCode: first_after_start, Weight: max}",
    )
    output_time_format: str = Field(
        default="%Y-%m-%d %H:%M:%S",
        description="Format for output timestamp columns",
    )
    fill_null_minutes: bool = Field(
        default=False,
        description="If True, fill null numeric columns (_max_minutes, _status_minutes, _metrics) "
        "with 0. Timestamp columns remain null for skipped phases.",
    )
    spark_native: bool = Field(
        default=False,
        description="If True, use native Spark window functions. If False (default), use "
        "applyInPandas which is often faster for datasets with many batches.",
    )
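
For reference, the first YAML scenario in the docstring corresponds roughly to the following parameter construction. This is an illustrative sketch, not part of the packaged module; the column names (BatchID, LoadTime, Status, and so on) are taken from the example above.

```python
# Illustrative sketch: the params object the first YAML scenario would map to.
params = DetectSequentialPhasesParams(
    group_by="BatchID",
    timestamp_col="ts",
    phases=["LoadTime", "AcidTime", "DryTime", "CookTime", "CoolTime", "UnloadTime"],
    start_threshold=240,
    status_col="Status",
    status_mapping={1: "idle", 2: "active", 3: "hold", 4: "faulted"},
    phase_metrics={"Level": "max"},
    metadata={"ProductCode": "first_after_start", "Weight": "max"},
)
```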


def _normalize_group_by(group_by: Union[str, List[str]]) -> List[str]:
    """Convert group_by to list format."""
    if isinstance(group_by, str):
        return [group_by]
    return list(group_by)


def _get_expected_columns(params: "DetectSequentialPhasesParams") -> Dict[str, None]:
    """
    Build a dict of ALL expected output columns with None/NaT values.

    This ensures Spark applyInPandas always receives a DataFrame with
    all columns defined in the schema, even when phases are skipped.
    Uses pd.NaT for timestamp columns to match TimestampType schema.
    """
    columns = {}

    phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
    for phase in phase_names:
        columns[f"{phase}_start"] = pd.NaT
        columns[f"{phase}_end"] = pd.NaT
        columns[f"{phase}_max_minutes"] = None

        if params.status_mapping:
            for status_name in params.status_mapping.values():
                columns[f"{phase}_{status_name}_minutes"] = None

        if params.phase_metrics:
            for metric_col in params.phase_metrics.keys():
                columns[f"{phase}_{metric_col}"] = None

    if params.metadata:
        for col in params.metadata.keys():
            columns[col] = None

    return columns


def _get_numeric_columns(params: "DetectSequentialPhasesParams") -> List[str]:
    """Get list of all numeric output column names (for fill_null_minutes)."""
    columns = []

    phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
    for phase in phase_names:
        columns.append(f"{phase}_max_minutes")

        if params.status_mapping:
            for status_name in params.status_mapping.values():
                columns.append(f"{phase}_{status_name}_minutes")

        if params.phase_metrics:
            for metric_col in params.phase_metrics.keys():
                columns.append(f"{phase}_{metric_col}")

    return columns


def _fill_null_numeric_columns(
    df: pd.DataFrame, params: "DetectSequentialPhasesParams"
) -> pd.DataFrame:
    """Fill null values in numeric columns with 0."""
    numeric_cols = _get_numeric_columns(params)
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)
    return df


def detect_sequential_phases(
    context: EngineContext, params: DetectSequentialPhasesParams
) -> EngineContext:
    """
    Detect and analyze sequential manufacturing phases.

    For each group (e.g., batch), this transformer:
    1. Processes phases sequentially (each starts after previous ends)
    2. Detects phase start by finding first valid timer reading and back-calculating
    3. Detects phase end by finding first repeated (plateaued) timer value
    4. Calculates time spent in each status during each phase
    5. Aggregates specified metrics within each phase window
    6. Outputs one summary row per group

    Output columns per phase:
    - {phase}_start: Phase start timestamp
    - {phase}_end: Phase end timestamp
    - {phase}_max_minutes: Maximum timer value converted to minutes
    - {phase}_{status}_minutes: Time in each status (if status_col provided)
    - {phase}_{metric}: Aggregated metrics (if phase_metrics provided)
    """
    ctx = get_logging_context()
    start_time = time.time()

    phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
    group_by_cols = _normalize_group_by(params.group_by)

    ctx.debug(
        "DetectSequentialPhases starting",
        group_by=group_by_cols,
        phases=phase_names,
    )

    if context.engine_type == EngineType.PANDAS:
        result_df = _detect_phases_pandas(context.df, params)
    elif context.engine_type == EngineType.SPARK:
        if params.spark_native:
            result_df = _detect_phases_spark_native(context.df, params)
        else:
            result_df = _detect_phases_spark(context.df, params)
    elif context.engine_type == EngineType.POLARS:
        result_df = _detect_phases_polars(context.df, params)
    else:
        raise ValueError(f"Unsupported engine: {context.engine_type}")

    elapsed_ms = (time.time() - start_time) * 1000
    ctx.debug(
        "DetectSequentialPhases completed",
        output_rows=len(result_df) if hasattr(result_df, "__len__") else "unknown",
        elapsed_ms=round(elapsed_ms, 2),
    )
    return context.with_df(result_df)


def _detect_phases_pandas(df: pd.DataFrame, params: DetectSequentialPhasesParams) -> pd.DataFrame:
    """Pandas implementation of sequential phase detection."""

    group_by_cols = _normalize_group_by(params.group_by)

    df = df.copy()
    df[params.timestamp_col] = pd.to_datetime(df[params.timestamp_col])
    df = df.sort_values(by=params.timestamp_col, ascending=True).reset_index(drop=True)
    df = df.drop_duplicates()

    summary_rows = []
    grouped = df.groupby(group_by_cols)

    for group_id, group in grouped:
        group = group.sort_values(params.timestamp_col).reset_index(drop=True)

        if len(group_by_cols) == 1:
            row = {group_by_cols[0]: group_id if not isinstance(group_id, tuple) else group_id[0]}
        else:
            row = {col: val for col, val in zip(group_by_cols, group_id)}

        row.update(_get_expected_columns(params))

        previous_phase_end = None
        first_phase_start = None

        for phase in params.phases:
            if isinstance(phase, PhaseConfig):
                timer_col = phase.timer_col
                threshold = phase.start_threshold or params.start_threshold
            else:
                timer_col = phase
                threshold = params.start_threshold

            if timer_col not in group.columns:
                continue

            phase_result = _detect_single_phase(
                group=group,
                timer_col=timer_col,
                timestamp_col=params.timestamp_col,
                threshold=threshold,
                previous_phase_end=previous_phase_end,
                status_col=params.status_col,
                status_mapping=params.status_mapping,
                phase_metrics=params.phase_metrics,
                time_format=params.output_time_format,
            )

            if phase_result:
                row.update(phase_result["columns"])
                previous_phase_end = phase_result["end_time"]

                if first_phase_start is None:
                    first_phase_start = phase_result["start_time"]

        if params.metadata and first_phase_start is not None:
            metadata_values = _extract_metadata(
                group=group,
                metadata_config=params.metadata,
                timestamp_col=params.timestamp_col,
                first_phase_start=first_phase_start,
            )
            row.update(metadata_values)

        summary_rows.append(row)

    result_df = pd.DataFrame(summary_rows)

    if result_df.empty:
        return result_df

    first_phase_name = (
        params.phases[0].timer_col
        if isinstance(params.phases[0], PhaseConfig)
        else params.phases[0]
    )
    start_col = f"{first_phase_name}_start"
    if start_col in result_df.columns:
        result_df = result_df.sort_values(by=start_col, ascending=True)

    if params.fill_null_minutes:
        result_df = _fill_null_numeric_columns(result_df, params)

    return result_df.reset_index(drop=True)


def _detect_single_phase(
    group: pd.DataFrame,
    timer_col: str,
    timestamp_col: str,
    threshold: int,
    previous_phase_end: Optional[pd.Timestamp],
    status_col: Optional[str],
    status_mapping: Optional[Dict[int, str]],
    phase_metrics: Optional[Dict[str, str]],
    time_format: str,
) -> Optional[dict]:
    """
    Detect a single phase's boundaries and calculate metrics.

    Returns dict with:
    - columns: dict of output column names to values
    - start_time: phase start timestamp (for chaining)
    - end_time: phase end timestamp (for chaining)
    """

    if previous_phase_end is not None:
        phase_data = group[group[timestamp_col] > previous_phase_end]
    else:
        phase_data = group

    if phase_data.empty:
        return None

    non_zero = phase_data[phase_data[timer_col] > 0]
    if non_zero.empty:
        return None

    potential_starts = non_zero[non_zero[timer_col] <= threshold].sort_values(
        by=timestamp_col, ascending=True
    )
    if potential_starts.empty:
        return None

    first_idx = potential_starts.index[0]
    first_ts = potential_starts.loc[first_idx, timestamp_col]
    first_val = potential_starts.loc[first_idx, timer_col]

    true_start = first_ts - pd.Timedelta(seconds=first_val)

    after_start = phase_data[phase_data[timestamp_col] > true_start].reset_index(drop=True)

    end_time = None
    max_timer = 0

    unique_times = after_start.drop_duplicates(subset=[timestamp_col]).reset_index(drop=True)

    for i in range(1, len(unique_times)):
        curr_val = unique_times[timer_col].iloc[i]
        prev_val = unique_times[timer_col].iloc[i - 1]
        if curr_val == prev_val:
            end_time = unique_times[timestamp_col].iloc[i - 1]
            max_timer = curr_val
            break

    if end_time is None and len(unique_times) > 0:
        end_time = unique_times[timestamp_col].iloc[-1]
        max_timer = unique_times[timer_col].iloc[-1]

    if end_time is None:
        return None

    columns = {
        f"{timer_col}_start": true_start,
        f"{timer_col}_end": end_time,
        f"{timer_col}_max_minutes": round(max_timer / 60, 6) if max_timer else 0,
    }

    if status_col and status_mapping and status_col in group.columns:
        status_times = _calculate_status_times(
            group=group,
            start_time=true_start,
            end_time=end_time,
            timestamp_col=timestamp_col,
            status_col=status_col,
            status_mapping=status_mapping,
        )
        for status_name, duration in status_times.items():
            columns[f"{timer_col}_{status_name}_minutes"] = round(duration, 6)

    if phase_metrics:
        phase_window = phase_data[
            (phase_data[timestamp_col] >= true_start) & (phase_data[timestamp_col] <= end_time)
        ]
        for metric_col, agg_func in phase_metrics.items():
            if metric_col in phase_window.columns:
                try:
                    value = phase_window[metric_col].agg(agg_func)
                    columns[f"{timer_col}_{metric_col}"] = value
                except Exception:
                    columns[f"{timer_col}_{metric_col}"] = None

    return {
        "columns": columns,
        "start_time": true_start,
        "end_time": end_time,
    }
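
To make the boundary rules in `_detect_single_phase` concrete, here is a small worked example with invented numbers (a sketch, not taken from the package):

```python
# Toy trace sampled every 60 s: the phase timer reads 30, 90, 150, 210, 210.
# The first reading that is > 0 and <= start_threshold is 30 s at t0, so the
# back-calculated start is t0 - 30 s. The first repeated value (210 at t3 and
# t4) marks the plateau, so the phase end is t3 and max_minutes = 210 / 60.
import pandas as pd

ts = pd.date_range("2024-01-01 08:00:00", periods=5, freq="60s")
timer = [30, 90, 150, 210, 210]

true_start = ts[0] - pd.Timedelta(seconds=timer[0])  # 07:59:30
end_time = ts[3]                                     # first of the repeated pair
max_minutes = timer[3] / 60                          # 3.5
```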


def _calculate_status_times(
    group: pd.DataFrame,
    start_time: pd.Timestamp,
    end_time: pd.Timestamp,
    timestamp_col: str,
    status_col: str,
    status_mapping: Dict[int, str],
) -> Dict[str, float]:
    """
    Calculate time spent in each status within a phase window.

    Tracks status transitions and accumulates duration per status.
    Handles NaN and unknown status codes gracefully.
    """
    status_times = {status_name: 0.0 for status_name in status_mapping.values()}

    within_phase = group[(group[timestamp_col] >= start_time) & (group[timestamp_col] <= end_time)]

    if within_phase.empty:
        return status_times

    valid_rows = within_phase[
        within_phase[status_col].notna() & within_phase[status_col].isin(status_mapping.keys())
    ]

    if valid_rows.empty:
        return status_times

    current_status = valid_rows.iloc[0][status_col]
    last_change_ts = valid_rows.iloc[0][timestamp_col]

    for _, record in within_phase.iterrows():
        ts = record[timestamp_col]
        status = record[status_col]

        if pd.isna(status) or status not in status_mapping:
            continue

        if status != current_status:
            time_diff = (ts - last_change_ts).total_seconds() / 60
            status_times[status_mapping[current_status]] += time_diff
            last_change_ts = ts
            current_status = status

    final_diff = (end_time - last_change_ts).total_seconds() / 60
    status_times[status_mapping[current_status]] += final_diff

    return status_times


def _extract_metadata(
    group: pd.DataFrame,
    metadata_config: Dict[str, str],
    timestamp_col: str,
    first_phase_start: pd.Timestamp,
) -> Dict[str, any]:
    """
    Extract metadata columns with specified aggregation methods.

    Supported methods:
    - first: First value in group
    - last: Last value in group
    - first_after_start: First value after first phase starts
    - max, min, mean, sum: Standard aggregations
    """
    result = {}

    for col, method in metadata_config.items():
        if col not in group.columns:
            result[col] = None
            continue

        try:
            if method == "first":
                result[col] = group[col].iloc[0]
            elif method == "last":
                result[col] = group[col].iloc[-1]
            elif method == "first_after_start":
                after_start = group[group[timestamp_col] >= first_phase_start]
                if not after_start.empty:
                    valid = after_start[after_start[col].notna()]
                    result[col] = valid[col].iloc[0] if not valid.empty else None
                else:
                    result[col] = None
            elif method in ("max", "min", "mean", "sum"):
                result[col] = group[col].agg(method)
            else:
                result[col] = group[col].agg(method)
        except Exception:
            result[col] = None

    return result


# =============================================================================
# SPARK IMPLEMENTATION
# =============================================================================


def _detect_phases_spark(spark_df, params: DetectSequentialPhasesParams):
    """
    Spark implementation using applyInPandas for parallel group processing.

    Each group (batch) is processed independently using the Pandas logic,
    enabling parallel execution across the cluster.
    """
    from pyspark.sql.types import (
        DoubleType,
        StringType,
        StructField,
        StructType,
        TimestampType,
    )

    group_by_cols = _normalize_group_by(params.group_by)

    output_fields = []
    for col in group_by_cols:
        output_fields.append(StructField(col, StringType(), True))

    phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]
    for phase in phase_names:
        output_fields.append(StructField(f"{phase}_start", TimestampType(), True))
        output_fields.append(StructField(f"{phase}_end", TimestampType(), True))
        output_fields.append(StructField(f"{phase}_max_minutes", DoubleType(), True))

        if params.status_mapping:
            for status_name in params.status_mapping.values():
                output_fields.append(
                    StructField(f"{phase}_{status_name}_minutes", DoubleType(), True)
                )

        if params.phase_metrics:
            for metric_col in params.phase_metrics.keys():
                output_fields.append(StructField(f"{phase}_{metric_col}", DoubleType(), True))

    if params.metadata:
        numeric_aggs = {"max", "min", "mean", "sum"}
        for col, method in params.metadata.items():
            if method in numeric_aggs:
                output_fields.append(StructField(col, DoubleType(), True))
            else:
                output_fields.append(StructField(col, StringType(), True))

    output_schema = StructType(output_fields)

    def process_group(pdf: pd.DataFrame) -> pd.DataFrame:
        """Process a single group using Pandas logic."""
        result = _process_single_group_pandas(pdf, params)
        return pd.DataFrame([result]) if result else pd.DataFrame()

    result_df = spark_df.groupby(group_by_cols).applyInPandas(process_group, schema=output_schema)

    return result_df


def _detect_phases_spark_native(spark_df, params: DetectSequentialPhasesParams):
    """
    Native Spark implementation using window functions.

    This implementation avoids applyInPandas serialization overhead by using
    pure Spark operations: window functions, joins, and aggregations.

    Performance: 5-20x faster than applyInPandas for large datasets.
    """
    from pyspark.sql import functions as F
    from pyspark.sql import Window

    ctx = get_logging_context()
    group_by_cols = _normalize_group_by(params.group_by)
    ts = params.timestamp_col
    threshold = params.start_threshold

    df = spark_df.withColumn(ts, F.col(ts).cast("timestamp"))

    summary_df = df.select(*group_by_cols).distinct()

    prev_phase_end_df = None

    phase_names = [p.timer_col if isinstance(p, PhaseConfig) else p for p in params.phases]

    for phase_cfg in params.phases:
        if isinstance(phase_cfg, PhaseConfig):
            timer_col = phase_cfg.timer_col
            phase_threshold = phase_cfg.start_threshold or threshold
        else:
            timer_col = phase_cfg
            phase_threshold = threshold

        if timer_col not in spark_df.columns:
            ctx.debug(f"Skipping phase {timer_col}: column not found")
            continue

        phase_df = df

        if prev_phase_end_df is not None:
            phase_df = (
                phase_df.join(prev_phase_end_df, on=group_by_cols, how="inner")
                .filter(F.col(ts) > F.col("prev_end_ts"))
                .drop("prev_end_ts")
            )

        w_order = Window.partitionBy(*group_by_cols).orderBy(ts)

        phase_df = phase_df.withColumn("lag_timer", F.lag(timer_col).over(w_order))
        phase_df = phase_df.withColumn("lag_ts", F.lag(ts).over(w_order))

        start_candidates = phase_df.filter(
            (F.col(timer_col) > 0)
            & (F.col(timer_col) <= F.lit(phase_threshold))
            & F.col(timer_col).isNotNull()
        )

        w_start_rank = Window.partitionBy(*group_by_cols).orderBy(ts)
        start_rows = (
            start_candidates.withColumn("start_rn", F.row_number().over(w_start_rank))
            .filter(F.col("start_rn") == 1)
            .select(
                *group_by_cols,
                F.col(ts).alias("start_obs_ts"),
                F.col(timer_col).alias("start_obs_timer"),
            )
        )

        start_rows = start_rows.withColumn(
            "true_start_ts",
            (F.col("start_obs_ts").cast("long") - F.col("start_obs_timer").cast("long")).cast(
                "timestamp"
            ),
        )

        phase_with_start = phase_df.join(
            start_rows.select(*group_by_cols, "start_obs_ts", "true_start_ts"),
            on=group_by_cols,
            how="inner",
        )

        phase_with_start = phase_with_start.withColumn(
            "is_plateau",
            (F.col(timer_col).isNotNull())
            & (F.col("lag_timer").isNotNull())
            & (F.col(timer_col) == F.col("lag_timer"))
            & (F.col(ts) != F.col("lag_ts"))
            & (F.col(ts) >= F.col("start_obs_ts"))
            & (F.col("lag_ts") >= F.col("start_obs_ts")),
        )

        plateau_candidates = phase_with_start.filter("is_plateau")

        w_plateau_rank = Window.partitionBy(*group_by_cols).orderBy(ts)
        plateau_rows = (
            plateau_candidates.withColumn("plateau_rn", F.row_number().over(w_plateau_rank))
            .filter(F.col("plateau_rn") == 1)
            .select(
                *group_by_cols,
                F.col("lag_ts").alias("end_ts"),
                F.col(timer_col).alias("plateau_timer"),
            )
        )

        phase_bounds = start_rows.join(plateau_rows, on=group_by_cols, how="left")

        no_plateau = (
            phase_with_start.filter(~F.col("is_plateau"))
            .groupBy(*group_by_cols)
            .agg(
                F.max(ts).alias("fallback_end_ts"),
                F.max(timer_col).alias("fallback_timer"),
            )
        )

        phase_bounds = phase_bounds.join(no_plateau, on=group_by_cols, how="left")

        phase_bounds = phase_bounds.withColumn(
            "final_end_ts", F.coalesce(F.col("end_ts"), F.col("fallback_end_ts"))
        ).withColumn("max_timer", F.coalesce(F.col("plateau_timer"), F.col("fallback_timer")))

        phase_summary = phase_bounds.select(
            *group_by_cols,
            F.col("true_start_ts").alias(f"{timer_col}_start"),
            F.col("final_end_ts").alias(f"{timer_col}_end"),
            (F.col("max_timer") / 60.0).alias(f"{timer_col}_max_minutes"),
            F.col("true_start_ts").alias("_phase_true_start"),
            F.col("final_end_ts").alias("_phase_end"),
        )

        if params.status_mapping and params.status_col:
            status_durations = _compute_status_durations_spark(
                df=df,
                phase_bounds=phase_bounds.select(*group_by_cols, "true_start_ts", "final_end_ts"),
                params=params,
                timer_col=timer_col,
                group_by_cols=group_by_cols,
            )
            if status_durations is not None:
                phase_summary = phase_summary.join(status_durations, on=group_by_cols, how="left")

        if params.phase_metrics:
            metrics_df = _compute_phase_metrics_spark(
                df=df,
                phase_bounds=phase_bounds.select(*group_by_cols, "true_start_ts", "final_end_ts"),
                params=params,
                timer_col=timer_col,
                group_by_cols=group_by_cols,
            )
            if metrics_df is not None:
                phase_summary = phase_summary.join(metrics_df, on=group_by_cols, how="left")

        summary_df = summary_df.join(
            phase_summary.drop("_phase_true_start", "_phase_end"),
            on=group_by_cols,
            how="left",
        )

        prev_phase_end_df = phase_bounds.select(
            *group_by_cols, F.col("final_end_ts").alias("prev_end_ts")
        ).filter(F.col("prev_end_ts").isNotNull())

    if params.metadata:
        phase_start_cols = [F.col(f"{p}_start") for p in phase_names]
        summary_df = summary_df.withColumn("_first_phase_start", F.coalesce(*phase_start_cols))

        metadata_df = _compute_metadata_spark(
            df=df,
            summary_df=summary_df.select(*group_by_cols, "_first_phase_start"),
            params=params,
            group_by_cols=group_by_cols,
        )
        if metadata_df is not None:
            summary_df = summary_df.join(metadata_df, on=group_by_cols, how="left")

        summary_df = summary_df.drop("_first_phase_start")

    if params.fill_null_minutes:
        numeric_cols = _get_numeric_columns(params)
        for col in numeric_cols:
            if col in summary_df.columns:
                summary_df = summary_df.withColumn(col, F.coalesce(F.col(col), F.lit(0.0)))

    first_phase_start_col = f"{phase_names[0]}_start" if phase_names else None
    if first_phase_start_col and first_phase_start_col in summary_df.columns:
        summary_df = summary_df.orderBy(first_phase_start_col)

    return summary_df


def _compute_status_durations_spark(
    df, phase_bounds, params: DetectSequentialPhasesParams, timer_col: str, group_by_cols: List[str]
):
    """Compute time spent in each status within a phase window using Spark."""
    from pyspark.sql import functions as F
    from pyspark.sql import Window

    ts = params.timestamp_col
    status_col = params.status_col
    status_mapping = params.status_mapping
    valid_codes = list(status_mapping.keys())

    status_df = df.join(
        phase_bounds.withColumnRenamed("true_start_ts", "_start").withColumnRenamed(
            "final_end_ts", "_end"
        ),
        on=group_by_cols,
        how="inner",
    ).filter((F.col(ts) >= F.col("_start")) & (F.col(ts) <= F.col("_end")))

    status_df = status_df.withColumn(
        "valid_status",
        F.when(F.col(status_col).isin([F.lit(c) for c in valid_codes]), F.col(status_col)),
    )

    w_status = (
        Window.partitionBy(*group_by_cols).orderBy(ts).rowsBetween(Window.unboundedPreceding, 0)
    )

    status_df = status_df.withColumn(
        "ffill_status", F.last("valid_status", ignorenulls=True).over(w_status)
    )

    w_lead = Window.partitionBy(*group_by_cols).orderBy(ts)
    status_df = status_df.withColumn("next_ts", F.lead(ts).over(w_lead))

    status_df = status_df.withColumn(
        "interval_end_ts",
        F.when(
            F.col("next_ts").isNull() | (F.col("next_ts") > F.col("_end")),
            F.col("_end"),
        ).otherwise(F.col("next_ts")),
    )

    status_df = status_df.withColumn(
        "interval_sec",
        F.greatest(F.lit(0), F.unix_timestamp("interval_end_ts") - F.unix_timestamp(ts)),
    )

    status_df = status_df.filter((F.col("ffill_status").isNotNull()) & (F.col("interval_sec") > 0))

    status_df = status_df.withColumn("interval_min", F.col("interval_sec") / 60.0)

    durations = status_df.groupBy(*group_by_cols, "ffill_status").agg(
        F.sum("interval_min").alias("minutes")
    )

    durations_pivot = (
        durations.groupBy(*group_by_cols).pivot("ffill_status", valid_codes).agg(F.first("minutes"))
    )

    for code, status_name in status_mapping.items():
        old_col = str(code)
        new_col = f"{timer_col}_{status_name}_minutes"
        if old_col in durations_pivot.columns:
            durations_pivot = durations_pivot.withColumnRenamed(old_col, new_col)

    return durations_pivot


def _compute_phase_metrics_spark(
    df, phase_bounds, params: DetectSequentialPhasesParams, timer_col: str, group_by_cols: List[str]
):
    """Compute aggregated metrics within a phase window using Spark."""
    from pyspark.sql import functions as F

    ts = params.timestamp_col

    metrics_df = df.join(
        phase_bounds.withColumnRenamed("true_start_ts", "_start").withColumnRenamed(
            "final_end_ts", "_end"
        ),
        on=group_by_cols,
        how="inner",
    ).filter((F.col(ts) >= F.col("_start")) & (F.col(ts) <= F.col("_end")))

    agg_exprs = []
    for metric_col, agg_name in params.phase_metrics.items():
        if metric_col in df.columns:
            func = getattr(F, agg_name)
            agg_exprs.append(func(metric_col).alias(f"{timer_col}_{metric_col}"))

    if not agg_exprs:
        return None

    return metrics_df.groupBy(*group_by_cols).agg(*agg_exprs)


def _compute_metadata_spark(
    df, summary_df, params: DetectSequentialPhasesParams, group_by_cols: List[str]
):
    """Compute metadata columns using Spark."""
    from pyspark.sql import functions as F

    ts = params.timestamp_col

    meta_base = df.join(summary_df, on=group_by_cols, how="inner")

    agg_exprs = []
    struct_cols = []

    for col_name, method in params.metadata.items():
        if col_name not in df.columns:
            continue

        if method == "first":
            agg_exprs.append(F.first(col_name, ignorenulls=True).alias(col_name))
        elif method == "last":
            struct_cols.append(col_name)
            agg_exprs.append(
                F.max(F.struct(F.col(ts), F.col(col_name))).alias(f"__{col_name}_struct")
            )
        elif method == "first_after_start":
            agg_exprs.append(
                F.first(
                    F.when(F.col(ts) >= F.col("_first_phase_start"), F.col(col_name)),
                    ignorenulls=True,
                ).alias(col_name)
            )
        elif method in ("max", "min", "mean", "sum"):
            func = getattr(F, method)
            agg_exprs.append(func(col_name).alias(col_name))
        else:
            try:
                func = getattr(F, method)
                agg_exprs.append(func(col_name).alias(col_name))
            except AttributeError:
                agg_exprs.append(F.first(col_name, ignorenulls=True).alias(col_name))

    if not agg_exprs:
        return None

    metadata_df = meta_base.groupBy(*group_by_cols).agg(*agg_exprs)

    for col_name in struct_cols:
        metadata_df = metadata_df.withColumn(
            col_name, F.col(f"__{col_name}_struct").getField(col_name)
        ).drop(f"__{col_name}_struct")

    return metadata_df


def _process_single_group_pandas(
    group: pd.DataFrame, params: DetectSequentialPhasesParams
) -> Optional[Dict]:
    """Process a single group and return the summary row dict."""
    group_by_cols = _normalize_group_by(params.group_by)

    group = group.copy()
    group[params.timestamp_col] = pd.to_datetime(group[params.timestamp_col])
    group = group.sort_values(params.timestamp_col).reset_index(drop=True)

    if len(group_by_cols) == 1:
        row = {group_by_cols[0]: group[group_by_cols[0]].iloc[0]}
    else:
        row = {col: group[col].iloc[0] for col in group_by_cols}

    row.update(_get_expected_columns(params))

    previous_phase_end = None
    first_phase_start = None

    for phase in params.phases:
        if isinstance(phase, PhaseConfig):
            timer_col = phase.timer_col
            threshold = phase.start_threshold or params.start_threshold
        else:
            timer_col = phase
            threshold = params.start_threshold

        if timer_col not in group.columns:
            continue

        phase_result = _detect_single_phase(
            group=group,
            timer_col=timer_col,
            timestamp_col=params.timestamp_col,
            threshold=threshold,
            previous_phase_end=previous_phase_end,
            status_col=params.status_col,
            status_mapping=params.status_mapping,
            phase_metrics=params.phase_metrics,
            time_format=params.output_time_format,
        )

        if phase_result:
            row.update(phase_result["columns"])
            previous_phase_end = phase_result["end_time"]

            if first_phase_start is None:
                first_phase_start = phase_result["start_time"]

    if params.metadata and first_phase_start is not None:
        metadata_values = _extract_metadata(
            group=group,
            metadata_config=params.metadata,
            timestamp_col=params.timestamp_col,
            first_phase_start=first_phase_start,
        )
        row.update(metadata_values)

    if params.fill_null_minutes:
        numeric_cols = _get_numeric_columns(params)
        for col in numeric_cols:
            if col in row and row[col] is None:
                row[col] = 0.0

    return row


# =============================================================================
# POLARS IMPLEMENTATION
# =============================================================================


def _detect_phases_polars(polars_df, params: DetectSequentialPhasesParams):
    """
    Polars implementation - converts to Pandas for processing.

    TODO: Native Polars implementation for better performance.
    """
    pdf = polars_df.to_pandas()
    result_pdf = _detect_phases_pandas(pdf, params)

    try:
        import polars as pl

        return pl.from_pandas(result_pdf)
    except ImportError:
        raise ValueError("Polars is not installed")
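
As a quick sanity check of the Pandas path, a minimal sketch like the following exercises `_detect_phases_pandas` directly with a toy DataFrame. The column names are invented for illustration, and the private helper is used here only for brevity; the packaged entry point is `detect_sequential_phases`, which additionally expects an `EngineContext`.

```python
import pandas as pd

from odibi.transformers.manufacturing import (
    DetectSequentialPhasesParams,
    _detect_phases_pandas,
)

# Toy data: one batch, one phase timer that counts up and then plateaus.
df = pd.DataFrame(
    {
        "BatchID": ["B1"] * 5,
        "ts": pd.date_range("2024-01-01 08:00:00", periods=5, freq="60s"),
        "LoadTime": [30, 90, 150, 210, 210],
    }
)

params = DetectSequentialPhasesParams(group_by="BatchID", phases=["LoadTime"])
summary = _detect_phases_pandas(df, params)

# Expect one row per batch with LoadTime_start, LoadTime_end and
# LoadTime_max_minutes == 3.5 (210 seconds at the plateau).
print(summary)
```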