Sensor2EventLog 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- S2E_v2.py +1025 -0
- abstraction/mt_loop.py +141 -0
- config.py +46 -0
- contextualization/event_log.py +339 -0
- core/__init__.py +3 -0
- core/pipeline.py +189 -0
- evaluation/rule_analyzer.py +246 -0
- features/feature_library.py +270 -0
- main.py +394 -0
- models/base_model.py +68 -0
- models/hmm_model.py +174 -0
- sensor2eventlog-2.0.0.dist-info/METADATA +53 -0
- sensor2eventlog-2.0.0.dist-info/RECORD +18 -0
- sensor2eventlog-2.0.0.dist-info/WHEEL +5 -0
- sensor2eventlog-2.0.0.dist-info/licenses/LICENSE +21 -0
- sensor2eventlog-2.0.0.dist-info/top_level.txt +10 -0
- utils/__init__.py +30 -0
- utils/hmm_utils.py +277 -0
S2E_v2.py
ADDED
|
@@ -0,0 +1,1025 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from scipy.stats import skew, kurtosis, entropy
|
|
4
|
+
from scipy.signal import find_peaks
|
|
5
|
+
from typing import List, Dict, Union, Optional, Tuple
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
from hmmlearn.hmm import GaussianHMM
|
|
8
|
+
from sklearn.preprocessing import StandardScaler
|
|
9
|
+
from sklearn.metrics import confusion_matrix, classification_report
|
|
10
|
+
from scipy.optimize import linear_sum_assignment
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
class RuleDiagnosticAnalyzer:
    """
    Analyzes rule performance using coverage, precision, and explainability metrics.

    A "rule" is a binary column (name starting with ``event_``) of an event
    feature frame. For every rule/state pair the analyzer computes:

    * coverage  - share of the state's rows on which the rule fires,
    * precision - share of the rule's activations that fall inside the state,
    * effectiveness - geometric mean of coverage and precision.

    The thresholds given to the constructor decide which recommendations are
    generated and which states are flagged as unexplainable.
    """

    def __init__(self, coverage_threshold: float = 0.6, precision_threshold: float = 0.7,
                 explainability_threshold: float = 0.3):
        # Coverage / precision below these values count as "low".
        self.c_low = coverage_threshold
        self.p_low = precision_threshold
        # A state whose best rule coverage is below this value is "unexplainable".
        self.epsilon_unex = explainability_threshold

    def compute_rule_metrics(self, df: pd.DataFrame, event_features: pd.DataFrame,
                             state_column: str = 'state') -> Dict:
        """
        Compute coverage, precision, and explainability metrics for all event features

        Parameters:
        -----------
        df : pd.DataFrame
            Original dataframe with state labels
        event_features : pd.DataFrame
            DataFrame containing event rule features (binary columns); it is
            addressed with the index labels of ``df``
        state_column : str
            Column name containing state labels

        Returns:
        --------
        Dict with the keys 'rule_metrics', 'state_metrics', 'recommendations'
        and 'unexplainable_states'
        """
        results = {
            'rule_metrics': {},
            'state_metrics': {},
            'recommendations': [],
            'unexplainable_states': []
        }

        # Index labels of the rows belonging to each state, computed once
        # instead of once per rule. States without rows (e.g. NaN labels,
        # which never compare equal) are dropped here.
        state_rows = {}
        for state in df[state_column].unique():
            state_mask = df[state_column] == state
            row_labels = state_mask[state_mask].index
            if len(row_labels) > 0:
                state_rows[state] = row_labels

        # Compute metrics for each rule-state combination
        rule_metrics = {}
        for rule_col in event_features.columns:
            if not (isinstance(rule_col, str) and rule_col.startswith('event_')):
                continue

            # Total activations do not depend on the state: hoisted out of the loop.
            total_rule_activations = event_features[rule_col].sum()

            per_state = {}
            for state, row_labels in state_rows.items():
                hits = event_features.loc[row_labels, rule_col].sum()

                coverage = hits / len(row_labels)
                precision = hits / total_rule_activations if total_rule_activations > 0 else 0

                per_state[state] = {
                    'coverage': coverage,
                    'precision': precision,
                    'effectiveness': np.sqrt(coverage * precision) if coverage > 0 and precision > 0 else 0
                }
            rule_metrics[rule_col] = per_state

        results['rule_metrics'] = rule_metrics

        # Compute state-level metrics: explainability is the best coverage any
        # single rule achieves for the state.
        state_metrics = {}
        for state, row_labels in state_rows.items():
            best_coverage = 0
            best_rule = None

            for rule_col, per_state in rule_metrics.items():
                if state in per_state:
                    coverage = per_state[state]['coverage']
                    if coverage > best_coverage:
                        best_coverage = coverage
                        best_rule = rule_col

            explainability = best_coverage
            unexplainable = explainability < self.epsilon_unex

            state_metrics[state] = {
                'explainability': explainability,
                'gap': 1 - explainability,
                'best_rule': best_rule,
                'best_coverage': best_coverage,
                'state_frequency': len(row_labels) / len(df),
                'unexplainable': unexplainable
            }

            if unexplainable:
                results['unexplainable_states'].append(state)

        results['state_metrics'] = state_metrics

        # Generate recommendations
        results['recommendations'] = self._generate_recommendations(rule_metrics, state_metrics)

        return results

    def _generate_recommendations(self, rule_metrics: Dict, state_metrics: Dict) -> List[Dict]:
        """Generate actionable recommendations based on diagnostic metrics.

        Every rule/state pair is put into one of four quadrants defined by the
        coverage and precision thresholds; all but the "both high" quadrant
        produce a recommendation. Unexplainable states add one more entry
        each. The result is sorted by priority (CRITICAL first); the sort is
        stable, so entries of equal priority keep their generation order.
        """
        recommendations = []

        # Analyze each rule-state combination
        for rule_col, state_dict in rule_metrics.items():
            for state, metrics in state_dict.items():
                coverage = metrics['coverage']
                precision = metrics['precision']

                # Rule categorization and recommendations
                if coverage >= self.c_low and precision >= self.p_low:
                    # Optimal rule - no action needed
                    continue

                elif coverage >= self.c_low and precision < self.p_low:
                    # Overly sensitive rule
                    recommendations.append({
                        'type': 'OVERLY_SENSITIVE_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' has good coverage ({coverage:.1%}) but low precision ({precision:.1%}). Add temporal stability constraints or interaction features to reduce false positives.",
                        'priority': 'HIGH',
                        'suggested_families': ['stability', 'interaction']
                    })

                elif coverage < self.c_low and precision >= self.p_low:
                    # Overly specific rule
                    recommendations.append({
                        'type': 'OVERLY_SPECIFIC_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' has high precision ({precision:.1%}) but low coverage ({coverage:.1%}). Relax thresholds or remove restrictive conditions.",
                        'priority': 'MEDIUM',
                        'suggested_families': ['temporal', 'statistical']
                    })

                elif coverage < self.c_low and precision < self.p_low:
                    # Ineffective rule
                    recommendations.append({
                        'type': 'INEFFECTIVE_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' performs poorly (coverage: {coverage:.1%}, precision: {precision:.1%}). Consider complete redesign with alternative sensor combinations.",
                        'priority': 'HIGH',
                        'suggested_families': ['all']
                    })

        # Analyze unexplainable states
        for state, metrics in state_metrics.items():
            if metrics['unexplainable']:
                recommendations.append({
                    'type': 'UNEXPLAINABLE_STATE',
                    'state': state,
                    'explainability': metrics['explainability'],
                    'frequency': metrics['state_frequency'],
                    'action': f"State '{state}' is largely unexplained (explainability: {metrics['explainability']:.1%}). Consider state decomposition, feature space expansion, or probabilistic approaches.",
                    # Frequent unexplained states matter more than rare ones.
                    'priority': 'CRITICAL' if metrics['state_frequency'] > 0.1 else 'HIGH',
                    'suggested_approaches': ['state_decomposition', 'feature_expansion', 'probabilistic_modeling']
                })

        # Sort by priority
        priority_order = {'CRITICAL': 0, 'HIGH': 1, 'MEDIUM': 2, 'LOW': 3}
        recommendations.sort(key=lambda x: priority_order[x['priority']])

        return recommendations

    def print_diagnostic_report(self, diagnostic_results: Dict):
        """Print comprehensive diagnostic report.

        ``diagnostic_results`` is the dictionary returned by
        ``compute_rule_metrics``. A result without any state (e.g. computed
        from an empty frame) is reported with zero percentages instead of
        failing on a division by zero.
        """
        print("=" * 80)
        print("RULE DIAGNOSTIC REPORT")
        print("=" * 80)

        # Rule performance summary
        print("\n1. RULE PERFORMANCE SUMMARY:")
        print("-" * 40)

        rule_metrics = diagnostic_results['rule_metrics']
        for rule_col, state_dict in rule_metrics.items():
            print(f"\nRule: {rule_col}")
            for state, metrics in state_dict.items():
                # str() so that non-string labels can take a width specifier.
                print(f" State: {str(state):15} | Coverage: {metrics['coverage']:6.1%} | "
                      f"Precision: {metrics['precision']:6.1%} | "
                      f"Effectiveness: {metrics['effectiveness']:6.1%}")

        # State explainability
        print("\n2. STATE EXPLAINABILITY ANALYSIS:")
        print("-" * 40)

        state_metrics = diagnostic_results['state_metrics']
        for state, metrics in state_metrics.items():
            unexplainable_flag = " ⚠ UNEXPLAINABLE" if metrics['unexplainable'] else ""
            print(f"State: {str(state):15} | Explainability: {metrics['explainability']:6.1%} | "
                  f"Best Rule: {metrics['best_rule'] or 'None'}{unexplainable_flag}")

        # Recommendations
        print("\n3. ACTIONABLE RECOMMENDATIONS:")
        print("-" * 40)

        for i, rec in enumerate(diagnostic_results['recommendations'], 1):
            print(f"\n{i}. [{rec['priority']}] {rec['type']}")
            print(f" {rec['action']}")

            if 'suggested_families' in rec:
                print(f" Suggested feature families: {', '.join(rec['suggested_families'])}")
            if 'suggested_approaches' in rec:
                print(f" Suggested approaches: {', '.join(rec['suggested_approaches'])}")

        # Summary statistics
        print("\n4. SUMMARY STATISTICS:")
        print("-" * 40)

        total_states = len(state_metrics)
        unexplainable_states = len(diagnostic_results['unexplainable_states'])
        if total_states:
            unexplainable_share = unexplainable_states / total_states
            avg_explainability = np.mean([m['explainability'] for m in state_metrics.values()])
        else:
            # No states analysed: avoid ZeroDivisionError / mean of an empty list.
            unexplainable_share = 0.0
            avg_explainability = 0.0

        print(f"Total states analyzed: {total_states}")
        print(f"Unexplainable states: {unexplainable_states} ({unexplainable_share:.1%})")
        print(f"Average explainability: {avg_explainability:.1%}")
        print(f"Recommendations generated: {len(diagnostic_results['recommendations'])}")
|
|
252
|
+
|
|
253
|
+
# Enhanced ModularFeatureLibrary with diagnostic capabilities
|
|
254
|
+
class ModularFeatureLibrary:
    """Feature library that derives feature families from sensor signals.

    Each family (statistical, temporal, stability, interaction, event,
    contextual) is a method taking ``(df, signals, **kwargs)`` and returning a
    DataFrame that shares the index of ``df``. ``compute_features`` runs the
    families named in a feature plan; ``analyze_rule_performance`` additionally
    scores the generated ``event_*`` rule columns against the state labels.
    """

    def __init__(self, window_sizes=None, stability_eps=1, peak_threshold=0.1):
        # None instead of a list literal: a mutable default would be shared by
        # every instance created without an explicit value.
        self.window_sizes = [5] if window_sizes is None else list(window_sizes)
        self.stability_eps = stability_eps
        self.peak_threshold = peak_threshold
        self.feature_families = {
            'statistical': self._compute_statistical_features,
            'temporal': self._compute_temporal_features,
            'stability': self._compute_stability_features,
            'interaction': self._compute_interaction_features,
            'event': self._compute_event_features,
            'contextual': self._compute_contextual_features
        }
        self.diagnostic_analyzer = RuleDiagnosticAnalyzer()
        self._feature_cache = {}

    def _normalize_rule_expr(self, expr: str) -> str:
        """Convert human-friendly logical ops to pandas-style bitwise ops and normalize spacing."""
        s = expr.strip()
        # Case-insensitive, so 'and' / 'AND' / 'And' are all covered.
        s = re.sub(r'\bAND\b', '&', s, flags=re.I)
        s = re.sub(r'\bOR\b', '|', s, flags=re.I)
        s = re.sub(r'\bNOT\b', '~', s, flags=re.I)
        # Exactly one space around every run of operator characters.
        s = re.sub(r'\s*([&|~><=!]+)\s*', r' \1 ', s)
        s = re.sub(r'\s+', ' ', s)
        return s.strip()

    def _is_rule_expression(self, text: str) -> bool:
        """Return True when *text* contains a comparison or logical operator.

        The check runs on the normalized expression so that word operators
        (``AND``/``OR``/``NOT``) and ``!=`` are recognised as well.
        """
        normalized = self._normalize_rule_expr(text)
        return any(op in normalized for op in ('>', '<', '==', '!=', '&', '|', '~'))

    def _evaluate_rule(self, rule_expr: str, available_features: pd.DataFrame) -> pd.Series:
        """Evaluate a rule expression using the available features.

        Column names of ``available_features`` are exposed as variables,
        together with ``np``, ``pd``, ``abs``, ``min`` and ``max``. Returns a
        boolean Series on the index of ``available_features``; any evaluation
        error is printed and yields an all-False Series.
        """
        normalized_expr = self._normalize_rule_expr(rule_expr)

        try:
            eval_env = {col: available_features[col] for col in available_features.columns}
            eval_env.update({
                'np': np, 'pd': pd, 'abs': np.abs, 'min': np.minimum, 'max': np.maximum
            })

            # NOTE(security): eval() with emptied builtins is NOT a sandbox.
            # Rule expressions must come from a trusted feature plan only.
            result = eval(normalized_expr, {"__builtins__": {}}, eval_env)

            if isinstance(result, pd.Series):
                return result.astype(bool)
            # Scalar outcome (e.g. a constant expression): broadcast to all rows.
            return pd.Series([bool(result)] * len(available_features),
                             index=available_features.index)

        except Exception as e:
            print(f"Error evaluating rule '{rule_expr}': {e}")
            return pd.Series(False, index=available_features.index)

    def _safe_norm(self, series):
        """Z-score normalization (epsilon in the denominator avoids division by zero)."""
        return (series - series.mean()) / (series.std() + 1e-8)

    def _safe_ratio(self, a, b):
        """Signed log-ratio of |a| and |b|.

        Computes ``log1p(|a| + 1e-6) - log1p(|b| + 1e-6)`` and multiplies it by
        ``sign(a * b)``, so the result is 0 whenever either input is 0.
        """
        a_safe = np.abs(a) + 1e-6
        b_safe = np.abs(b) + 1e-6
        ratio = np.log1p(a_safe) - np.log1p(b_safe)
        sign = np.sign(a * b)
        return ratio * sign

    def _compute_statistical_features(self, df, signals, **kwargs):
        """Statistical Features Family: rolling mean per signal and window size."""
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            s = df[signal]
            for win in self.window_sizes:
                # min_periods=1 keeps the first rows defined.
                roll = s.rolling(win, min_periods=1)
                features[f"{signal}_roll_mean_{win}"] = roll.mean()

        return features

    def _compute_temporal_features(self, df, signals, **kwargs):
        """Temporal Dynamics Family: first difference, its sign, EWM smoothing and magnitude."""
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            s = df[signal]
            diff = s.diff().fillna(0)
            features[f"{signal}_diff"] = diff
            features[f"{signal}_diff_sign"] = np.sign(diff)
            features[f"{signal}_diff_smooth"] = diff.ewm(span=5).mean()
            features[f"{signal}_abs_diff"] = np.abs(diff)

        return features

    def _compute_stability_features(self, df, signals, **kwargs):
        """Stability Features Family: stability score, stable flag and stable-run length."""
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            s = df[signal]
            diff = s.diff().fillna(0)
            stable_periods = (np.abs(diff) < self.stability_eps)

            features[f"{signal}_stability"] = 1.0 / (1.0 + np.abs(diff))
            features[f"{signal}_stable_flag"] = stable_periods.astype(int)

            # Every unstable row starts a new group; the cumulative sum inside
            # a group counts the consecutive stable rows seen so far.
            consecutive_stable = stable_periods.groupby((~stable_periods).cumsum()).cumsum()
            features[f"{signal}_consecutive_stable"] = consecutive_stable

        return features

    def _compute_interaction_features(self, df, signals, **kwargs):
        """Interaction Features Family: product and signed log-ratio for every signal pair."""
        features = pd.DataFrame(index=df.index)

        if len(signals) < 2:
            return features

        for i in range(len(signals)):
            for j in range(i + 1, len(signals)):
                sig1, sig2 = signals[i], signals[j]
                features[f"{sig1}_x_{sig2}"] = df[sig1] * df[sig2]
                features[f"{sig1}_ratio_{sig2}"] = self._safe_ratio(df[sig1], df[sig2])

        return features

    def _compute_event_features(self, df, signals, **kwargs):
        """Event/Regime Features Family with rule-based features.

        ``signals`` holds rule definitions: either expression strings (the
        column name is derived from the first 20 characters) or dicts mapping
        a rule name to an expression. Rules may reference the columns of
        ``df`` and the ``*_diff``, ``*_diff_smooth``, ``*_abs_diff``,
        ``*_stability`` and ``*_stable_flag`` columns derived below. Strings
        without any operator are ignored.
        """
        features = pd.DataFrame(index=df.index)

        # Create a comprehensive set of available features for rule evaluation
        available_features = df.copy()

        # Pre-compute all necessary derived features for ALL numeric columns
        for signal in df.columns:
            if pd.api.types.is_numeric_dtype(df[signal]):
                try:
                    # Compute temporal features
                    diff = df[signal].diff().fillna(0)
                    available_features[f"{signal}_diff"] = diff
                    available_features[f"{signal}_diff_smooth"] = diff.ewm(span=5).mean()
                    available_features[f"{signal}_abs_diff"] = np.abs(diff)

                    # Compute stability features
                    available_features[f"{signal}_stability"] = 1.0 / (1.0 + np.abs(diff))
                    available_features[f"{signal}_stable_flag"] = (np.abs(diff) < self.stability_eps).astype(int)

                except (TypeError, ValueError) as e:
                    print(f"Warning: Could not compute derived features for {signal}: {e}")

        # Process event definitions
        rule_counter = 0
        for signal_def in signals:
            if isinstance(signal_def, str) and self._is_rule_expression(signal_def):
                rule_counter += 1
                try:
                    fixed_expr = self._fix_rule_parentheses(signal_def)
                    rule_result = self._evaluate_rule(fixed_expr, available_features)
                    clean_name = re.sub(r'[^a-zA-Z0-9_]', '_', signal_def[:20])
                    feature_name = f"event_{clean_name}"
                    # Truncated names can collide; disambiguate with the counter.
                    if feature_name in features.columns:
                        feature_name = f"event_{clean_name}_{rule_counter}"
                    features[feature_name] = rule_result.astype(int)
                    print(f"Created event feature: {feature_name} from rule: {signal_def}")
                except Exception as e:
                    print(f"Error processing rule '{signal_def}': {e}")
                    features[f"event_rule_error_{rule_counter}"] = 0

            elif isinstance(signal_def, dict):
                for rule_name, rule_expr in signal_def.items():
                    try:
                        fixed_expr = self._fix_rule_parentheses(rule_expr)
                        rule_result = self._evaluate_rule(fixed_expr, available_features)
                        features[f"event_{rule_name}"] = rule_result.astype(int)
                        print(f"Created named event feature: event_{rule_name}")
                    except Exception as e:
                        print(f"Error processing named rule '{rule_name}': {e}")
                        features[f"event_{rule_name}_error"] = 0

        return features

    def _compute_contextual_features(self, df, signals, **kwargs):
        """Contextual Features Family: position of each row inside its batch.

        The batch column name is taken from ``kwargs['batch_id']`` (default
        ``'batch_id'``); without that column an empty frame is returned.
        """
        features = pd.DataFrame(index=df.index)
        batch_id = kwargs.get('batch_id', 'batch_id')

        if batch_id in df.columns:
            batch_pos = df.groupby(batch_id).cumcount()
            # Largest position per batch, computed once and reused.
            batch_last = batch_pos.groupby(df[batch_id]).transform('max')
            # Single-row batches divide 0 by 0 and yield NaN here.
            features["batch_position"] = batch_pos / batch_last
            features["is_batch_start"] = (batch_pos == 0).astype(int)
            features["is_batch_end"] = (batch_pos == batch_last).astype(int)

        return features

    def _fix_rule_parentheses(self, expr: str) -> str:
        """Add parentheses around comparison operations to avoid ambiguous truth values.

        ``&`` and ``|`` bind tighter than comparisons in Python, so every
        operand of ``&``/``|`` that contains a comparison is wrapped in its
        own pair of parentheses. Expressions without ``&``/``|`` are returned
        normalized but otherwise unchanged.
        """
        normalized = self._normalize_rule_expr(expr)
        # The capturing group keeps the separators in the result list.
        parts = re.split(r'(\s*[&|]\s*)', normalized)

        if len(parts) == 1:
            return normalized

        result_parts = []
        for part in parts:
            is_operator = part.strip() in ('&', '|')
            has_comparison = any(op in part for op in ('>', '<', '==', '!='))
            if not is_operator and has_comparison:
                result_parts.append(f'({part})')
            else:
                result_parts.append(part)

        return ''.join(result_parts)

    def compute_features(self, df, feature_plan: Dict[str, List[str]]):
        """Compute features based on a feature plan.

        ``feature_plan`` maps a family name to its signal list. For the
        'interaction' family the value is a list of signal groups; every group
        with at least two signals produces features for all of its pairs. A
        flat list of signal names is treated as one group. Unknown families
        are reported and skipped. Missing values in the result are filled
        with 0.
        """
        frames = [pd.DataFrame(index=df.index)]

        for family, signals in feature_plan.items():
            if family not in self.feature_families:
                print(f"Warning: Unknown feature family '{family}'")
                continue

            compute = self.feature_families[family]

            if family == 'interaction':
                if signals and all(isinstance(s, str) for s in signals):
                    groups = [signals]
                else:
                    groups = signals
                for group in groups:
                    # Groups of three or more signals are valid too: the
                    # family method enumerates all pairs itself.
                    if not isinstance(group, str) and len(group) >= 2:
                        frames.append(compute(df, list(group)))
            else:
                frames.append(compute(df, signals))

        return pd.concat(frames, axis=1).fillna(0)

    def analyze_rule_performance(self, df: pd.DataFrame, feature_plan: Dict[str, List[str]]) -> Dict:
        """
        Compute features and analyze rule performance

        Parameters:
        -----------
        df : pd.DataFrame
            Input data with sensor signals and state labels
        feature_plan : dict
            Feature plan including event rules

        Returns:
        --------
        Dict with 'features' (all computed features) and 'diagnostics' (the
        rule metrics, or None when the plan produced no ``event_*`` column)
        """
        # Compute features
        features = self.compute_features(df, feature_plan)

        # Extract event features for analysis
        event_features = features[[col for col in features.columns if col.startswith('event_')]]

        if event_features.empty:
            print("No event features found for analysis")
            return {'features': features, 'diagnostics': None}

        # Run diagnostic analysis
        diagnostic_results = self.diagnostic_analyzer.compute_rule_metrics(df, event_features)

        return {
            'features': features,
            'diagnostics': diagnostic_results
        }
|
|
514
|
+
|
|
515
|
+
# HMM helper functions used by the training script below:
|
|
516
|
+
# empirical_start_trans, emissions_from_labels, viterbi_decode, print_evaluation
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def empirical_start_trans(labels, lengths, n_states):
    """Estimate startprob_ and transmat_ from labeled sequences.

    Parameters
    ----------
    labels : array-like of int
        State indices of all sequences, concatenated.
    lengths : iterable of int
        Length of each sequence, in the order they appear in ``labels``.
        Zero-length sequences are skipped.
    n_states : int
        Number of states; every label must be in ``range(n_states)``.

    Returns
    -------
    (start, trans) : start probabilities of shape ``(n_states,)`` and a
    row-normalised transition matrix of shape ``(n_states, n_states)``.

    Raises
    ------
    ValueError
        If ``lengths`` asks for more samples than ``labels`` contains.
    """
    labels = np.asarray(labels)
    lengths = list(lengths)
    if sum(lengths) > len(labels):
        raise ValueError(
            f"lengths sum to {sum(lengths)} but only {len(labels)} labels were given"
        )

    start = np.zeros(n_states)
    trans = np.zeros((n_states, n_states))

    offset = 0
    for seq_len in lengths:
        seq = labels[offset:offset + seq_len]
        offset += seq_len
        if len(seq) == 0:
            # An empty sequence has neither a start state nor transitions.
            continue
        start[seq[0]] += 1
        # Unbuffered add so repeated (from, to) pairs are all counted.
        np.add.at(trans, (seq[:-1], seq[1:]), 1)

    # normalize with small epsilon to avoid zeros
    start = (start + 1e-6) / (start.sum() + 1e-6 * n_states)
    trans = trans + 1e-6
    trans /= trans.sum(axis=1, keepdims=True)
    return start, trans
|
|
535
|
+
|
|
536
|
+
def emissions_from_labels(X_np, labels_np, n_states, min_covar=1e-6, fallback_var=1e-2):
    """Compute means and covariances per labeled state.

    Parameters
    ----------
    X_np : array-like, shape (n_samples, D)
        Feature matrix.
    labels_np : array-like, shape (n_samples,)
        State index of every sample.
    n_states : int
        Number of states.
    min_covar : float, default 1e-6
        Value added to the diagonal of every estimated covariance.
    fallback_var : float, default 1e-2
        Diagonal variance used for states with fewer than two samples.

    Returns
    -------
    (means, covars) with shapes ``(n_states, D)`` and ``(n_states, D, D)``.
    States with fewer than two samples get a zero mean and
    ``fallback_var * I`` as covariance.
    """
    X_np = np.asarray(X_np)
    labels_np = np.asarray(labels_np)

    D = X_np.shape[1]
    means = np.zeros((n_states, D))
    covars = np.zeros((n_states, D, D))
    identity = np.eye(D)

    for s in range(n_states):
        Xi = X_np[labels_np == s]
        if len(Xi) < 2:
            # fallback tiny variance: a covariance cannot be estimated
            means[s] = 0.0
            covars[s] = identity * fallback_var
        else:
            means[s] = Xi.mean(axis=0)
            # np.cov returns a 0-d array when D == 1; force the (D, D) shape.
            covars[s] = np.cov(Xi, rowvar=False).reshape(D, D) + identity * min_covar
    return means, covars
|
|
552
|
+
|
|
553
|
+
def viterbi_decode(model, X_np, lengths=None):
    """Return the state sequence ``model.predict`` assigns to ``X_np``.

    ``lengths`` holds the lengths of the individual sequences stacked in
    ``X_np``; it is forwarded unchanged and may be omitted (``None``) when
    ``X_np`` is a single sequence and the model accepts that.
    """
    return model.predict(X_np, lengths)
|
|
555
|
+
|
|
556
|
+
def print_evaluation(y_true_idx, y_pred_idx, title="", idx_to_state=None, state_list=None):
    """Print a classification report and confusion matrix for index-encoded states.

    Parameters
    ----------
    y_true_idx, y_pred_idx : iterable of int
        Ground-truth and predicted state indices.
    title : str
        Heading printed above the report.
    idx_to_state : dict, optional
        Maps a state index to its label. When omitted, the module-level
        ``idx_to_state`` is used (the script section defines it).
    state_list : list, optional
        Labels, in the order used for the report rows/columns. When omitted,
        the module-level ``state_list`` is used, or else the labels of
        ``idx_to_state`` ordered by index.

    Raises
    ------
    ValueError
        If no index-to-label mapping is available.
    KeyError
        If a ground-truth index is missing from the mapping.
    """
    # Fall back to the names the script section defines, so existing
    # three-argument calls keep working.
    if idx_to_state is None:
        idx_to_state = globals().get("idx_to_state")
    if idx_to_state is None:
        raise ValueError("print_evaluation needs an idx_to_state mapping")
    if state_list is None:
        state_list = globals().get("state_list")
    if state_list is None:
        state_list = [idx_to_state[i] for i in sorted(idx_to_state)]

    labs_true = [idx_to_state[i] for i in y_true_idx]
    # Predicted indices without a label are reported as "UNK<idx>".
    labs_pred = [idx_to_state.get(i, f"UNK{i}") for i in y_pred_idx]

    print(f"\n== {title} ==")
    print(classification_report(labs_true, labs_pred, labels=state_list, zero_division=0))
    cm = confusion_matrix(labs_true, labs_pred, labels=state_list)
    print("Confusion matrix (rows=true, cols=pred):")
    print(pd.DataFrame(cm, index=state_list, columns=state_list))
|
|
564
|
+
|
|
565
|
+
# Enhanced main section with diagnostic analysis
|
|
566
|
+
if __name__ == "__main__":
|
|
567
|
+
CSV_PATH = "synthetic_pasteurization_with_cip_signals.csv"
|
|
568
|
+
#CSV_PATH="SWat.csv"
|
|
569
|
+
USE_CIP = False
|
|
570
|
+
MODE = "unsupervised"
|
|
571
|
+
ADD_DERIVS = True
|
|
572
|
+
N_UNSUP = None
|
|
573
|
+
RANDOM_SEED = 42
|
|
574
|
+
|
|
575
|
+
states_prod = ["Idle","Fill","HeatUp","Hold","Cool","Discharge"]
|
|
576
|
+
#states_prod = ["Filling", "Draining", "Hold"]
|
|
577
|
+
states_cip = ["PreRinse","Caustic","InterRinse","Acid","FinalRinse","Sanitize","Verification","Standby"]
|
|
578
|
+
|
|
579
|
+
state_list = states_prod + (states_cip if USE_CIP else [])
|
|
580
|
+
n_states = len(state_list) if N_UNSUP is None else (N_UNSUP if MODE=="unsupervised" else len(state_list))
|
|
581
|
+
|
|
582
|
+
# Load data
|
|
583
|
+
df = pd.read_csv(CSV_PATH)
|
|
584
|
+
df = df[df["state"].isin(state_list)].copy()
|
|
585
|
+
df.sort_values(["batch_id","timestamp"], inplace=True)
|
|
586
|
+
|
|
587
|
+
# Initialize feature library
|
|
588
|
+
feature_lib = ModularFeatureLibrary()
|
|
589
|
+
'''
|
|
590
|
+
feature_plan = {
|
|
591
|
+
'statistical' : ['FIT101', 'LIT101'],
|
|
592
|
+
'temporal': ['FIT101', 'LIT101'],
|
|
593
|
+
'stability': ['FIT101', 'LIT101'],
|
|
594
|
+
'interaction': [['FIT101', 'LIT101']],
|
|
595
|
+
'event': [ ],
|
|
596
|
+
'contextual': []
|
|
597
|
+
}
|
|
598
|
+
'''
|
|
599
|
+
# Define feature plan with event rules
|
|
600
|
+
|
|
601
|
+
feature_plan = {
|
|
602
|
+
'statistical': ['T', 'Q_in','Q_out'],
|
|
603
|
+
'temporal': ['T', 'Q_in','Q_out'],
|
|
604
|
+
'stability': ['T', 'Q_in','Q_out'],
|
|
605
|
+
'interaction': [['T', 'Q_in','Q_out']],
|
|
606
|
+
'event': [
|
|
607
|
+
'(T_diff_smooth > 1)', '(T_diff_smooth < -1)',
|
|
608
|
+
'(Q_out > 0.3)',
|
|
609
|
+
|
|
610
|
+
#'(T < 20)',
|
|
611
|
+
'(T > 70) & (T_stable_flag == 1)', #
|
|
612
|
+
'(Q_in > 0.3) AND (T_diff < 0.2)'
|
|
613
|
+
#'T > 75', # Additional test rule
|
|
614
|
+
#'Q_in < 0.2' # Additional test rule
|
|
615
|
+
],
|
|
616
|
+
'contextual': []
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
# Compute features and analyze rule performance
|
|
620
|
+
result = feature_lib.analyze_rule_performance(df, feature_plan)
|
|
621
|
+
all_features = result['features']
|
|
622
|
+
diagnostics = result['diagnostics']
|
|
623
|
+
|
|
624
|
+
print(f"Original data shape: {df.shape}")
|
|
625
|
+
print(f"Computed features shape: {all_features.shape}")
|
|
626
|
+
|
|
627
|
+
# Print diagnostic report
|
|
628
|
+
if diagnostics:
|
|
629
|
+
feature_lib.diagnostic_analyzer.print_diagnostic_report(diagnostics)
|
|
630
|
+
|
|
631
|
+
# Continue with HMM training as before...
|
|
632
|
+
all_features.to_csv("data.csv", index=False)
|
|
633
|
+
#event_features = all_features[[col for col in all_features.columns if col.startswith('event_')]]
|
|
634
|
+
#features = event_features
|
|
635
|
+
|
|
636
|
+
important_raw_features = ['T_roll_mean_5', 'Q_in_roll_mean_5', 'Q_out_roll_mean_5', 'T_diff']
|
|
637
|
+
event_features = all_features[[col for col in all_features.columns if col.startswith('event_')]]
|
|
638
|
+
|
|
639
|
+
# Combine them
|
|
640
|
+
features = pd.concat([all_features[important_raw_features], event_features], axis=1)
|
|
641
|
+
# [Rest of your existing HMM code...]
|
|
642
|
+
bids = df["batch_id"].unique()
|
|
643
|
+
bids_train = set(bids[:max(1, int(0.6*len(bids)))])
|
|
644
|
+
bids_test = set(bids) - bids_train
|
|
645
|
+
|
|
646
|
+
def pack_sequences(df_subset, X_subset):
    """Return the feature matrix of ``X_subset`` and the per-batch lengths.

    ``lengths`` lists the number of rows of every ``batch_id`` in
    ``df_subset`` in order of first appearance, so it lines up with the row
    order of ``X_subset`` as long as each batch occupies consecutive rows
    (``sort=False``; the default would order the lengths by batch id instead).
    """
    lengths = df_subset.groupby("batch_id", sort=False).size().tolist()
    return X_subset.values, lengths
|
|
649
|
+
|
|
650
|
+
scaler = StandardScaler()
|
|
651
|
+
X_train = features[df["batch_id"].isin(bids_train)]
|
|
652
|
+
X_test = features[df["batch_id"].isin(bids_test)]
|
|
653
|
+
X_train_scaled = scaler.fit_transform(X_train)
|
|
654
|
+
X_test_scaled = scaler.transform(X_test)
|
|
655
|
+
|
|
656
|
+
df_train = df[df["batch_id"].isin(bids_train)]
|
|
657
|
+
df_test = df[df["batch_id"].isin(bids_test)]
|
|
658
|
+
|
|
659
|
+
X_train_np, lengths_train = pack_sequences(df_train, pd.DataFrame(X_train_scaled, index=X_train.index))
|
|
660
|
+
X_test_np, lengths_test = pack_sequences(df_test, pd.DataFrame(X_test_scaled, index=X_test.index))
|
|
661
|
+
|
|
662
|
+
state_to_idx = {s:i for i,s in enumerate(state_list)}
|
|
663
|
+
idx_to_state = {i:s for s,i in state_to_idx.items()}
|
|
664
|
+
|
|
665
|
+
# Your existing HMM code continues here...
|
|
666
|
+
# ========= SUPERVISED HMM =========
|
|
667
|
+
if MODE.lower() == "supervised":
|
|
668
|
+
# Labeled indices (train/test)
|
|
669
|
+
y_train_idx = df_train["state"].map(state_to_idx).values
|
|
670
|
+
y_test_idx = df_test["state"].map(state_to_idx).values
|
|
671
|
+
|
|
672
|
+
# Initialize startprob/transmat from labels
|
|
673
|
+
startprob_, transmat_ = empirical_start_trans(y_train_idx, lengths_train, n_states)
|
|
674
|
+
|
|
675
|
+
# Initialize emissions from labels
|
|
676
|
+
means_, covars_ = emissions_from_labels(X_train_np, y_train_idx, n_states)
|
|
677
|
+
|
|
678
|
+
# Build and fit HMM (few EM iters to refine)
|
|
679
|
+
hmm = GaussianHMM(
|
|
680
|
+
n_components=n_states,
|
|
681
|
+
covariance_type="full",
|
|
682
|
+
n_iter=30,
|
|
683
|
+
init_params="", # do not overwrite our inits
|
|
684
|
+
random_state=RANDOM_SEED,
|
|
685
|
+
tol=1e-3,
|
|
686
|
+
verbose=False
|
|
687
|
+
)
|
|
688
|
+
hmm.startprob_ = startprob_
|
|
689
|
+
hmm.transmat_ = transmat_
|
|
690
|
+
hmm.means_ = means_
|
|
691
|
+
hmm.covars_ = covars_
|
|
692
|
+
|
|
693
|
+
hmm.fit(X_train_np, lengths_train)
|
|
694
|
+
|
|
695
|
+
# Decode test and evaluate
|
|
696
|
+
y_pred_test = viterbi_decode(hmm, X_test_np, lengths_test)
|
|
697
|
+
print_evaluation(y_test_idx, y_pred_test, title="Supervised HMM (Test)")
|
|
698
|
+
|
|
699
|
+
# ========= UNSUPERVISED HMM + label mapping =========
|
|
700
|
+
else:
|
|
701
|
+
# Unsupervised fit on TRAIN, then map discovered states to labels (Hungarian)
|
|
702
|
+
hmm = GaussianHMM(
|
|
703
|
+
n_components=n_states,
|
|
704
|
+
covariance_type="diag", # BETTER FOR STATE SEPARATION
|
|
705
|
+
n_iter=100,
|
|
706
|
+
random_state=RANDOM_SEED,
|
|
707
|
+
tol=1e-6,
|
|
708
|
+
init_params="stmc",
|
|
709
|
+
params="stmc"
|
|
710
|
+
)
|
|
711
|
+
hmm.fit(X_train_np, lengths_train)
|
|
712
|
+
|
|
713
|
+
# Predict hidden labels on TRAIN to build contingency with ground truth
|
|
714
|
+
y_train_true = df_train["state"].map(state_to_idx).values
|
|
715
|
+
y_train_hat = viterbi_decode(hmm, X_train_np, lengths_train)
|
|
716
|
+
|
|
717
|
+
# Build contingency (true x pred)
|
|
718
|
+
K = len(state_list)
|
|
719
|
+
cont = np.zeros((K, K), dtype=int)
|
|
720
|
+
for t, p in zip(y_train_true, y_train_hat):
|
|
721
|
+
if t < K and p < K:
|
|
722
|
+
cont[t, p] += 1
|
|
723
|
+
|
|
724
|
+
# Optimal mapping: rows(true)->cols(pred)
|
|
725
|
+
row_ind, col_ind = linear_sum_assignment(cont.max() - cont)
|
|
726
|
+
mapping = {pred: true for true, pred in zip(row_ind, col_ind)}
|
|
727
|
+
|
|
728
|
+
# Decode TEST, remap discovered states to labels
|
|
729
|
+
y_test_hat = viterbi_decode(hmm, X_test_np, lengths_test)
|
|
730
|
+
y_test_mapped = np.array([mapping.get(s, 0) for s in y_test_hat], dtype=int)
|
|
731
|
+
|
|
732
|
+
y_test_true = df_test["state"].map(state_to_idx).values
|
|
733
|
+
print_evaluation(y_test_true, y_test_mapped, title="Unsupervised HMM (mapped) — Test")
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
#######################################################################################
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
# ========= TIMESTAMP NORMALIZATION =========
|
|
740
|
+
print("\n=== Normalizing Timestamps ===")
|
|
741
|
+
|
|
742
|
+
def normalize_timestamps(df, timestamp_col="timestamp", case_id_col="batch_id", base_date="2023-01-01"):
    """
    Re-base the timestamps of every case so that each case starts at ``base_date``.

    Numeric (and timedelta) timestamp columns are interpreted as offsets in
    seconds; any other column is parsed with ``pd.to_datetime``. Within each
    case the earliest timestamp is mapped to ``base_date`` and all other rows
    keep their offset from that earliest timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    timestamp_col : str
        Name of the column holding the timestamps.
    case_id_col : str
        Name of the column identifying the case each row belongs to.
    base_date : str
        Date (anything ``pd.to_datetime`` accepts) every case is shifted to.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with normalized timestamps. If the timestamps cannot
        be parsed, a message is printed and the original ``df`` is returned
        unchanged.
    """
    df_normalized = df.copy()
    raw_timestamps = df[timestamp_col]
    # Parsed once instead of once per case.
    base_datetime = pd.to_datetime(base_date)

    # First, ensure we understand the timestamp format
    print(f"Original timestamp sample: {raw_timestamps.iloc[:5].tolist()}")

    # pandas' dtype helpers also understand extension dtypes (nullable ints,
    # string dtype), on which np.issubdtype raises TypeError. Booleans are
    # excluded and timedeltas included so the classification of plain NumPy
    # dtypes stays exactly what np.issubdtype(dtype, np.number) gave.
    is_numeric = (
        (pd.api.types.is_numeric_dtype(raw_timestamps)
         and not pd.api.types.is_bool_dtype(raw_timestamps))
        or pd.api.types.is_timedelta64_dtype(raw_timestamps)
    )

    if is_numeric:
        print("Timestamps are numeric - assuming they represent seconds")
        # Convert numeric seconds to datetime
        df_normalized[timestamp_col] = base_datetime + pd.to_timedelta(raw_timestamps, unit='s')
    else:
        # Try to parse as datetime; only parsing failures are handled here so
        # that unrelated errors (and Ctrl-C) are not silently swallowed.
        try:
            df_normalized[timestamp_col] = pd.to_datetime(raw_timestamps)
        except (ValueError, TypeError, OverflowError):
            print("Could not parse timestamps. Please check the format.")
            return df
        print("Timestamps successfully parsed as datetime")

    # Shift every case so that its earliest timestamp lands on base_datetime.
    for _, case_data in df_normalized.groupby(case_id_col):
        case_start = case_data[timestamp_col].min()
        time_deltas = case_data[timestamp_col] - case_start
        df_normalized.loc[case_data.index, timestamp_col] = base_datetime + time_deltas

    return df_normalized
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
# Re-base every case's timestamps so each case starts at the shared base date.
df_normalized = normalize_timestamps(df, base_date="2023-01-01")
|
|
779
|
+
|
|
780
|
+
|
|
781
|
+
def create_interval_event_log_normalized(df, y_pred, state_mapping, case_id_col="batch_id", timestamp_col="timestamp"):
    """
    Create interval-based event log using normalized timestamps.

    Every row of ``df`` is labelled with the state predicted for it, and
    within each case consecutive rows (ordered by timestamp) sharing the same
    state are collapsed into one interval.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation; must contain ``case_id_col`` and
        ``timestamp_col``.
    y_pred : sequence
        Predicted state index for each row of ``df``, in row order.
    state_mapping : dict
        Maps a predicted index to an activity name; indices missing from the
        mapping are labelled ``"Unknown_<index>"``.
    case_id_col : str
        Name of the case identifier column.
    timestamp_col : str
        Name of the timestamp column.

    Returns
    -------
    pandas.DataFrame
        Columns ``case_id``, ``activity_sequence`` (1-based position within
        the case), ``activity``, ``start_timestamp``, ``end_timestamp``,
        ``duration_seconds`` and ``event_count``. An empty frame with these
        columns is returned when there is nothing to segment.
    """
    columns = ['case_id', 'activity_sequence', 'activity',
               'start_timestamp', 'end_timestamp',
               'duration_seconds', 'event_count']

    df_with_pred = df.copy()
    df_with_pred['predicted_state'] = [state_mapping.get(i, f"Unknown_{i}") for i in y_pred]

    event_log_segments = []

    # sort=False keeps cases in order of first appearance.
    for case_id, case_data in df_with_pred.groupby(case_id_col, sort=False):
        case_data = case_data.sort_values(timestamp_col)

        # Work on positional lists rather than index labels, so repeated
        # index labels in ``df`` cannot turn a timestamp lookup into a Series.
        states = case_data['predicted_state'].tolist()
        stamps = case_data[timestamp_col].tolist()
        n_rows = len(states)

        run_start = 0
        for pos in range(1, n_rows + 1):
            # Extend the current run while the state is unchanged.
            if pos < n_rows and states[pos] == states[run_start]:
                continue

            segment_start = stamps[run_start]
            segment_end = stamps[pos - 1]
            duration = (pd.to_datetime(segment_end) - pd.to_datetime(segment_start)).total_seconds()
            event_log_segments.append({
                'case_id': case_id,
                'activity': states[run_start],
                'start_timestamp': segment_start,
                'end_timestamp': segment_end,
                'duration_seconds': duration,
                'event_count': pos - run_start
            })
            run_start = pos

    if not event_log_segments:
        # Nothing to segment: return a well-formed empty log instead of
        # failing on the missing 'case_id' column below.
        return pd.DataFrame(columns=columns)

    event_log = pd.DataFrame(event_log_segments)
    event_log['activity_sequence'] = event_log.groupby('case_id').cumcount() + 1

    return event_log[columns]
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
# ---- Decode the full test set and turn the predictions into an event log ----
X_test_full_np, lengths_test_full = pack_sequences(df_test, pd.DataFrame(X_test_scaled, index=X_test.index))
y_test_full_hat = viterbi_decode(hmm, X_test_full_np, lengths_test_full)

if MODE.lower() == "unsupervised":
    # Discovered states were matched to label indices earlier; translate
    # them to label names, skipping indices without a known name.
    state_mapping = {pred: idx_to_state[true] for pred, true in mapping.items() if true in idx_to_state}
else:
    state_mapping = idx_to_state

df_test_normalized = df_normalized[df_normalized["batch_id"].isin(df_test["batch_id"].unique())]

# Align lengths. An explicit check is used because ``assert`` is stripped
# when Python runs with -O.
if len(df_test_normalized) != len(y_test_full_hat):
    raise ValueError("Mismatch between test rows and predictions!")

interval_event_log_normalized = create_interval_event_log_normalized(
    df_test_normalized, y_test_full_hat, state_mapping
)

# Save to CSV
normalized_log_path = "pasteurization_normalized_event_log_MT.csv"
interval_event_log_normalized.to_csv(normalized_log_path, index=False)
print(f"Normalized event log saved to: {normalized_log_path}")

# Show a sample of the result
print("\nSample of normalized event log:")
print(interval_event_log_normalized.head(10))

# ========= COMPARE BEFORE/AFTER =========
print("\n=== Timestamp Normalization Comparison ===")

sample_case = interval_event_log_normalized['case_id'].iloc[0]
original_case_data = df[df['batch_id'] == sample_case].copy()
normalized_case_data = df_normalized[df_normalized['batch_id'] == sample_case].copy()

print(f"Sample Case: {sample_case}")
print(f"Original start: {original_case_data['timestamp'].min()}")
print(f"Normalized start: {normalized_case_data['timestamp'].min()}")
# NOTE(review): pd.to_datetime treats bare numbers as nanoseconds, so the
# "Original duration" line is only meaningful when the raw timestamps are
# datetime-like - confirm against the input data.
print(f"Original duration: {(pd.to_datetime(original_case_data['timestamp'].max()) - pd.to_datetime(original_case_data['timestamp'].min())).total_seconds():.0f} seconds")
print(f"Normalized duration: {(pd.to_datetime(normalized_case_data['timestamp'].max()) - pd.to_datetime(normalized_case_data['timestamp'].min())).total_seconds():.0f} seconds (same!)")

print("\n=== Creating Enhanced Visualizations ===")

plt.figure(figsize=(14, 8))

# One colour per activity
activities = interval_event_log_normalized['activity'].unique()
colors = plt.cm.Set3(np.linspace(0, 1, len(activities)))
color_map = dict(zip(activities, colors))

# At most the first 10 cases are drawn; there may be fewer.
gantt_case_ids = interval_event_log_normalized['case_id'].unique()[:10]

for i, case_id in enumerate(gantt_case_ids):
    case_data = interval_event_log_normalized[interval_event_log_normalized['case_id'] == case_id]

    for _, activity_row in case_data.iterrows():
        start = pd.to_datetime(activity_row['start_timestamp'])
        end = pd.to_datetime(activity_row['end_timestamp'])
        duration = (end - start).total_seconds() / 3600  # hours, used for the label threshold

        # A matplotlib date axis is measured in days, so the bar width must
        # be given in days for the bar to end at ``end``.
        plt.barh(y=i, width=duration / 24.0, left=start,
                 color=color_map[activity_row['activity']],
                 edgecolor='black', alpha=0.7)

        # Add activity label for longer segments
        if duration > 0.1:  # Only label segments longer than 6 minutes
            plt.text(start + pd.Timedelta(seconds=duration*3600/2), i,
                     activity_row['activity'], ha='center', va='center',
                     fontsize=8, fontweight='bold')

# Tick positions must match the number of cases actually drawn.
plt.yticks(range(len(gantt_case_ids)), gantt_case_ids)
plt.xlabel('Time (from normalized start)')
plt.ylabel('Case ID')
plt.title('Process Execution Gantt Chart (First 10 Cases)')
plt.legend([plt.Rectangle((0,0),1,1, color=color_map[act]) for act in activities],
           activities, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('process_gantt_chart.png', dpi=300, bbox_inches='tight')
plt.show()
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
def filter_brief_states(event_log, min_duration_seconds=5.0):
    """
    Remove state segments that are too brief by merging them with adjacent states.

    A segment is brief when its ``duration_seconds`` is below
    ``min_duration_seconds``. Cases with a single segment are left untouched.
    A brief first segment is absorbed by the segment that follows it (which
    keeps its own activity); any other brief segment is absorbed by the
    segment emitted just before it. Merging always targets the already
    emitted (possibly already merged) predecessor, so runs of consecutive
    brief segments accumulate into one interval instead of dropping earlier
    ones.

    A merged segment spans from the start of the earlier segment to the end
    of the later one; its ``duration_seconds`` and ``event_count`` are the
    sums of the two parts.

    Parameters
    ----------
    event_log : pandas.DataFrame
        Interval event log with columns ``case_id``, ``activity``,
        ``start_timestamp``, ``end_timestamp``, ``duration_seconds`` and
        ``event_count``.
    min_duration_seconds : float
        Segments shorter than this are merged away.

    Returns
    -------
    pandas.DataFrame
        The filtered log with ``activity_sequence`` recomputed per case.
    """
    if len(event_log) == 0:
        # Nothing to merge; keep the input's columns and make sure the
        # sequence column exists.
        empty_log = event_log.copy()
        empty_log['activity_sequence'] = pd.Series(dtype='int64')
        return empty_log

    filtered_segments = []

    # sort=False keeps cases in order of first appearance.
    for _, case_data in event_log.groupby('case_id', sort=False):
        segments = case_data.to_dict('records')
        mergeable = len(segments) > 1

        case_output = []
        leading_brief = None  # brief first segment waiting for its successor

        for segment in segments:
            if leading_brief is not None:
                # The successor absorbs the brief first segment and keeps
                # its own activity.
                case_output.append(_merge_segments(leading_brief, segment, keeper=segment))
                leading_brief = None
                continue

            is_brief = mergeable and segment['duration_seconds'] < min_duration_seconds
            if not is_brief:
                # Keep segments that are long enough
                case_output.append(segment)
            elif not case_output:
                leading_brief = segment
            else:
                previous = case_output[-1]
                case_output[-1] = _merge_segments(previous, segment, keeper=previous)

        filtered_segments.extend(case_output)

    # Create new event log
    filtered_log = pd.DataFrame(filtered_segments)

    # Recalculate activity sequence
    filtered_log['activity_sequence'] = filtered_log.groupby('case_id').cumcount() + 1

    return filtered_log


def _merge_segments(earlier, later, keeper):
    """Combine two adjacent segments; fields other than the interval come from ``keeper``."""
    merged = dict(keeper)
    merged['start_timestamp'] = earlier['start_timestamp']
    merged['end_timestamp'] = later['end_timestamp']
    merged['duration_seconds'] = earlier['duration_seconds'] + later['duration_seconds']
    merged['event_count'] = earlier['event_count'] + later['event_count']
    return merged
|
|
1010
|
+
|
|
1011
|
+
# Apply the filter to the event log: merge away segments shorter than
# ``min_duration`` and write the cleaned log next to the normalized one.
min_duration = 2.0  # Minimum duration in seconds (adjust as needed)
filtered_event_log = filter_brief_states(interval_event_log_normalized, min_duration_seconds=min_duration)

print(f"Original events: {len(interval_event_log_normalized)}")
# Report the size of the filtered log (previously the original size was
# printed twice).
print(f"Filtered events: {len(filtered_event_log)}")
print("Removed", len(interval_event_log_normalized) - len(filtered_event_log), "brief state segments")

filtered_log_path = "pasteurization_cleaned_event_log_MT.csv"
filtered_event_log.to_csv(filtered_log_path, index=False)
print(f"\nCleaned event log saved to: {filtered_log_path}")
|
|
1022
|
+
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
|