Sensor2EventLog 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
S2E_v2.py ADDED
@@ -0,0 +1,1025 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from scipy.stats import skew, kurtosis, entropy
4
+ from scipy.signal import find_peaks
5
+ from typing import List, Dict, Union, Optional, Tuple
6
+ import matplotlib.pyplot as plt
7
+ from hmmlearn.hmm import GaussianHMM
8
+ from sklearn.preprocessing import StandardScaler
9
+ from sklearn.metrics import confusion_matrix, classification_report
10
+ from scipy.optimize import linear_sum_assignment
11
+ import re
12
+
13
class RuleDiagnosticAnalyzer:
    """
    Analyzes rule performance using coverage, precision, and explainability metrics

    For every binary ``event_*`` rule column and every state label:

    * coverage      - share of the state's rows on which the rule fires
    * precision     - share of all the rule's activations that fall in the state
    * effectiveness - geometric mean of coverage and precision

    The explainability of a state is the best coverage any rule reaches for it.
    """

    def __init__(self, coverage_threshold: float = 0.6, precision_threshold: float = 0.7,
                 explainability_threshold: float = 0.3):
        # Thresholds below which coverage / precision / explainability count as "low".
        self.c_low = coverage_threshold
        self.p_low = precision_threshold
        self.epsilon_unex = explainability_threshold

    def compute_rule_metrics(self, df: pd.DataFrame, event_features: pd.DataFrame,
                             state_column: str = 'state') -> Dict:
        """
        Compute coverage, precision, and explainability metrics for all event features

        Parameters:
        -----------
        df : pd.DataFrame
            Original dataframe with state labels
        event_features : pd.DataFrame
            DataFrame containing event rule features (binary columns); rows are
            matched to ``df`` by index label. Only columns whose name starts
            with ``event_`` are analysed.
        state_column : str
            Column name containing state labels

        Returns:
        --------
        Dict with keys ``rule_metrics`` ({rule: {state: metrics}}),
        ``state_metrics`` ({state: metrics}), ``recommendations`` (list of
        dicts sorted by priority) and ``unexplainable_states`` (list of states)
        """
        results = {
            'rule_metrics': {},
            'state_metrics': {},
            'recommendations': [],
            'unexplainable_states': []
        }

        # Resolve the row labels of every state once, instead of once per rule.
        state_rows = {}
        for state in df[state_column].unique():
            mask = df[state_column] == state
            row_labels = mask[mask].index
            # A NaN label never equals itself, so it selects no rows and is skipped.
            if len(row_labels) > 0:
                state_rows[state] = row_labels

        # Compute metrics for each rule-state combination
        rule_metrics = {}
        for rule_col in event_features.columns:
            if not rule_col.startswith('event_'):
                continue

            # Total activations depend only on the rule, not on the state.
            total_rule_activations = event_features[rule_col].sum()

            per_state = {}
            for state, row_labels in state_rows.items():
                hits = event_features.loc[row_labels, rule_col].sum()

                coverage = hits / len(row_labels)
                precision = (hits / total_rule_activations
                             if total_rule_activations > 0 else 0)

                per_state[state] = {
                    'coverage': coverage,
                    'precision': precision,
                    'effectiveness': np.sqrt(coverage * precision) if coverage > 0 and precision > 0 else 0
                }
            rule_metrics[rule_col] = per_state

        results['rule_metrics'] = rule_metrics

        # Compute state-level metrics
        state_metrics = {}
        for state, row_labels in state_rows.items():
            # Find best coverage across all rules for this state
            best_coverage = 0
            best_rule = None
            for rule_col, per_state in rule_metrics.items():
                if state in per_state and per_state[state]['coverage'] > best_coverage:
                    best_coverage = per_state[state]['coverage']
                    best_rule = rule_col

            explainability = best_coverage
            is_unexplainable = explainability < self.epsilon_unex

            state_metrics[state] = {
                'explainability': explainability,
                'gap': 1 - explainability,
                'best_rule': best_rule,
                'best_coverage': best_coverage,
                'state_frequency': len(row_labels) / len(df),
                'unexplainable': is_unexplainable
            }

            if is_unexplainable:
                results['unexplainable_states'].append(state)

        results['state_metrics'] = state_metrics

        # Generate recommendations
        results['recommendations'] = self._generate_recommendations(rule_metrics, state_metrics)

        return results

    def _generate_recommendations(self, rule_metrics: Dict, state_metrics: Dict) -> List[Dict]:
        """Generate actionable recommendations based on diagnostic metrics"""
        recommendations = []

        # Analyze each rule-state combination
        for rule_col, state_dict in rule_metrics.items():
            for state, metrics in state_dict.items():
                coverage = metrics['coverage']
                precision = metrics['precision']

                # Rule categorization and recommendations
                if coverage >= self.c_low and precision >= self.p_low:
                    # Optimal rule - no action needed
                    continue

                elif coverage >= self.c_low and precision < self.p_low:
                    # Overly sensitive rule
                    recommendations.append({
                        'type': 'OVERLY_SENSITIVE_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' has good coverage ({coverage:.1%}) but low precision ({precision:.1%}). Add temporal stability constraints or interaction features to reduce false positives.",
                        'priority': 'HIGH',
                        'suggested_families': ['stability', 'interaction']
                    })

                elif coverage < self.c_low and precision >= self.p_low:
                    # Overly specific rule
                    recommendations.append({
                        'type': 'OVERLY_SPECIFIC_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' has high precision ({precision:.1%}) but low coverage ({coverage:.1%}). Relax thresholds or remove restrictive conditions.",
                        'priority': 'MEDIUM',
                        'suggested_families': ['temporal', 'statistical']
                    })

                elif coverage < self.c_low and precision < self.p_low:
                    # Ineffective rule
                    recommendations.append({
                        'type': 'INEFFECTIVE_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' performs poorly (coverage: {coverage:.1%}, precision: {precision:.1%}). Consider complete redesign with alternative sensor combinations.",
                        'priority': 'HIGH',
                        'suggested_families': ['all']
                    })

        # Analyze unexplainable states
        for state, metrics in state_metrics.items():
            if metrics['unexplainable']:
                recommendations.append({
                    'type': 'UNEXPLAINABLE_STATE',
                    'state': state,
                    'explainability': metrics['explainability'],
                    'frequency': metrics['state_frequency'],
                    'action': f"State '{state}' is largely unexplained (explainability: {metrics['explainability']:.1%}). Consider state decomposition, feature space expansion, or probabilistic approaches.",
                    'priority': 'CRITICAL' if metrics['state_frequency'] > 0.1 else 'HIGH',
                    'suggested_approaches': ['state_decomposition', 'feature_expansion', 'probabilistic_modeling']
                })

        # Sort by priority (stable, so entries of equal priority keep their order)
        priority_order = {'CRITICAL': 0, 'HIGH': 1, 'MEDIUM': 2, 'LOW': 3}
        recommendations.sort(key=lambda x: priority_order[x['priority']])

        return recommendations

    def print_diagnostic_report(self, diagnostic_results: Dict):
        """Print comprehensive diagnostic report

        ``diagnostic_results`` is the dict returned by ``compute_rule_metrics``.
        A result without any analysed state prints zeros in the summary instead
        of failing on a division by zero.
        """
        print("=" * 80)
        print("RULE DIAGNOSTIC REPORT")
        print("=" * 80)

        # Rule performance summary
        print("\n1. RULE PERFORMANCE SUMMARY:")
        print("-" * 40)

        rule_metrics = diagnostic_results['rule_metrics']
        for rule_col, state_dict in rule_metrics.items():
            print(f"\nRule: {rule_col}")
            for state, metrics in state_dict.items():
                print(f"  State: {state:15} | Coverage: {metrics['coverage']:6.1%} | "
                      f"Precision: {metrics['precision']:6.1%} | "
                      f"Effectiveness: {metrics['effectiveness']:6.1%}")

        # State explainability
        print("\n2. STATE EXPLAINABILITY ANALYSIS:")
        print("-" * 40)

        state_metrics = diagnostic_results['state_metrics']
        for state, metrics in state_metrics.items():
            unexplainable_flag = " ⚠ UNEXPLAINABLE" if metrics['unexplainable'] else ""
            print(f"State: {state:15} | Explainability: {metrics['explainability']:6.1%} | "
                  f"Best Rule: {metrics['best_rule'] or 'None'}{unexplainable_flag}")

        # Recommendations
        print("\n3. ACTIONABLE RECOMMENDATIONS:")
        print("-" * 40)

        for i, rec in enumerate(diagnostic_results['recommendations'], 1):
            print(f"\n{i}. [{rec['priority']}] {rec['type']}")
            print(f"   {rec['action']}")

            if 'suggested_families' in rec:
                print(f"   Suggested feature families: {', '.join(rec['suggested_families'])}")
            if 'suggested_approaches' in rec:
                print(f"   Suggested approaches: {', '.join(rec['suggested_approaches'])}")

        # Summary statistics
        print("\n4. SUMMARY STATISTICS:")
        print("-" * 40)

        total_states = len(state_metrics)
        unexplainable_states = len(diagnostic_results['unexplainable_states'])
        # Guard the ratios: with no analysed state both would divide by zero.
        if total_states:
            unexplainable_share = unexplainable_states / total_states
            avg_explainability = np.mean([m['explainability'] for m in state_metrics.values()])
        else:
            unexplainable_share = 0.0
            avg_explainability = 0.0

        print(f"Total states analyzed: {total_states}")
        print(f"Unexplainable states: {unexplainable_states} ({unexplainable_share:.1%})")
        print(f"Average explainability: {avg_explainability:.1%}")
        print(f"Recommendations generated: {len(diagnostic_results['recommendations'])}")
252
+
253
+ # Enhanced ModularFeatureLibrary with diagnostic capabilities
254
class ModularFeatureLibrary:
    """
    Computes engineered feature families from sensor signals and diagnoses event rules.

    Supported families (keys of a feature plan): ``statistical``, ``temporal``,
    ``stability``, ``interaction``, ``event`` and ``contextual``. Event rules are
    boolean expressions over the input columns and a set of derived columns
    (``<col>_diff``, ``<col>_diff_smooth``, ``<col>_abs_diff``,
    ``<col>_stability``, ``<col>_stable_flag``); each rule yields a 0/1 column
    whose name starts with ``event_``.
    """

    def __init__(self, window_sizes=None, stability_eps=1, peak_threshold=0.1):
        # ``None`` stands for the default ``[5]``; a list literal in the
        # signature would be one object shared by every instance.
        self.window_sizes = [5] if window_sizes is None else window_sizes
        self.stability_eps = stability_eps
        self.peak_threshold = peak_threshold
        self.feature_families = {
            'statistical': self._compute_statistical_features,
            'temporal': self._compute_temporal_features,
            'stability': self._compute_stability_features,
            'interaction': self._compute_interaction_features,
            'event': self._compute_event_features,
            'contextual': self._compute_contextual_features
        }
        self.diagnostic_analyzer = RuleDiagnosticAnalyzer()
        self._feature_cache = {}

    def _normalize_rule_expr(self, expr: str) -> str:
        """Convert human-friendly logical ops to pandas-style bitwise ops and normalize spacing."""
        s = expr.strip()
        # Case-insensitive, so 'AND', 'and' and 'And' are all handled here.
        s = re.sub(r'\bAND\b', '&', s, flags=re.I)
        s = re.sub(r'\bOR\b', '|', s, flags=re.I)
        s = re.sub(r'\bNOT\b', '~', s, flags=re.I)
        # Exactly one space around every operator run, then collapse whitespace.
        s = re.sub(r'\s*([&|~><=!]+)\s*', r' \1 ', s)
        s = re.sub(r'\s+', ' ', s)
        return s.strip()

    def _evaluate_rule(self, rule_expr: str, available_features: pd.DataFrame) -> pd.Series:
        """Evaluate a rule expression using the available features.

        Returns a boolean Series on ``available_features.index``. Any failure
        is printed and reported as an all-False Series.
        """
        normalized_expr = self._normalize_rule_expr(rule_expr)

        try:
            eval_env = {col: available_features[col] for col in available_features.columns}
            eval_env.update({
                'np': np, 'pd': pd, 'abs': np.abs, 'min': np.minimum, 'max': np.maximum
            })

            # SECURITY NOTE(review): eval() runs arbitrary expressions. Emptying
            # __builtins__ is not a sandbox (np/pd are reachable), so rule
            # strings must only come from trusted configuration.
            result = eval(normalized_expr, {"__builtins__": {}}, eval_env)

            if isinstance(result, pd.Series):
                return result.astype(bool)
            if isinstance(result, np.ndarray) and result.shape == (len(available_features),):
                # Row-wise array result (e.g. from np.where): keep it per row.
                return pd.Series(result.astype(bool), index=available_features.index)
            # Scalar result: broadcast to every row.
            return pd.Series([bool(result)] * len(available_features),
                             index=available_features.index)

        except Exception as e:
            print(f"Error evaluating rule '{rule_expr}': {e}")
            return pd.Series(False, index=available_features.index)

    def _safe_norm(self, series):
        """Z-score normalization"""
        # The small constant keeps a constant series from dividing by zero.
        return (series - series.mean()) / (series.std() + 1e-8)

    def _safe_ratio(self, a, b):
        """Safe ratio calculation"""
        # Signed difference of log1p magnitudes: defined for zeros and negatives.
        a_safe = np.abs(a) + 1e-6
        b_safe = np.abs(b) + 1e-6
        ratio = np.log1p(a_safe) - np.log1p(b_safe)
        sign = np.sign(a * b)
        return ratio * sign

    def _compute_statistical_features(self, df, signals, **kwargs):
        """Statistical Features Family"""
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            s = df[signal]
            for win in self.window_sizes:
                # min_periods=1: the first rows average over the rows seen so far.
                features[f"{signal}_roll_mean_{win}"] = s.rolling(win, min_periods=1).mean()

        return features

    def _compute_temporal_features(self, df, signals, **kwargs):
        """Temporal Dynamics Family"""
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            diff = df[signal].diff().fillna(0)
            features[f"{signal}_diff"] = diff
            features[f"{signal}_diff_sign"] = np.sign(diff)
            features[f"{signal}_diff_smooth"] = diff.ewm(span=5).mean()
            features[f"{signal}_abs_diff"] = np.abs(diff)

        return features

    def _compute_stability_features(self, df, signals, **kwargs):
        """Stability Features Family"""
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            abs_diff = np.abs(df[signal].diff().fillna(0))
            stable_periods = abs_diff < self.stability_eps

            features[f"{signal}_stability"] = 1.0 / (1.0 + abs_diff)
            features[f"{signal}_stable_flag"] = stable_periods.astype(int)
            # Running length of the current stable stretch; an unstable row
            # starts a new group and resets the count to zero.
            features[f"{signal}_consecutive_stable"] = (
                stable_periods.groupby((~stable_periods).cumsum()).cumsum()
            )

        return features

    def _compute_interaction_features(self, df, signals, **kwargs):
        """Interaction Features Family"""
        features = pd.DataFrame(index=df.index)

        if len(signals) < 2:
            return features

        # Every unordered pair of the given signals.
        for i in range(len(signals)):
            for j in range(i + 1, len(signals)):
                sig1, sig2 = signals[i], signals[j]
                features[f"{sig1}_x_{sig2}"] = df[sig1] * df[sig2]
                features[f"{sig1}_ratio_{sig2}"] = self._safe_ratio(df[sig1], df[sig2])

        return features

    def _compute_event_features(self, df, signals, **kwargs):
        """Event/Regime Features Family with rule-based features

        ``signals`` holds rule definitions: either an expression string or a
        dict mapping a rule name to an expression. Strings that contain no
        comparison or logical operator are ignored.
        """
        features = pd.DataFrame(index=df.index)

        # Create a comprehensive set of available features for rule evaluation
        available_features = df.copy()

        # Pre-compute all necessary derived features for ALL numeric columns
        for signal in df.columns:
            if pd.api.types.is_numeric_dtype(df[signal]):
                try:
                    # Compute temporal features
                    diff = df[signal].diff().fillna(0)
                    available_features[f"{signal}_diff"] = diff
                    available_features[f"{signal}_diff_smooth"] = diff.ewm(span=5).mean()
                    available_features[f"{signal}_abs_diff"] = np.abs(diff)

                    # Compute stability features
                    available_features[f"{signal}_stability"] = 1.0 / (1.0 + np.abs(diff))
                    available_features[f"{signal}_stable_flag"] = (np.abs(diff) < self.stability_eps).astype(int)

                except (TypeError, ValueError) as e:
                    print(f"Warning: Could not compute derived features for {signal}: {e}")

        # A string is a rule when, after normalization (AND/OR -> &/|), it
        # contains one of these operators. '!=' is listed explicitly because it
        # contains none of the other markers.
        rule_markers = ('>', '<', '==', '!=', '&', '|')

        # Process event definitions
        rule_counter = 0
        for signal_def in signals:
            if isinstance(signal_def, str):
                normalized_def = self._normalize_rule_expr(signal_def)
                if not any(op in normalized_def for op in rule_markers):
                    continue

                rule_counter += 1
                try:
                    fixed_expr = self._fix_rule_parentheses(signal_def)
                    rule_result = self._evaluate_rule(fixed_expr, available_features)
                    # Column name: first 20 characters with non-identifier
                    # characters replaced; the counter resolves collisions.
                    clean_name = re.sub(r'[^a-zA-Z0-9_]', '_', signal_def[:20])
                    feature_name = f"event_{clean_name}"
                    if feature_name in features.columns:
                        feature_name = f"event_{clean_name}_{rule_counter}"
                    features[feature_name] = rule_result.astype(int)
                    print(f"Created event feature: {feature_name} from rule: {signal_def}")
                except Exception as e:
                    print(f"Error processing rule '{signal_def}': {e}")
                    features[f"event_rule_error_{rule_counter}"] = 0

            elif isinstance(signal_def, dict):
                for rule_name, rule_expr in signal_def.items():
                    try:
                        fixed_expr = self._fix_rule_parentheses(rule_expr)
                        rule_result = self._evaluate_rule(fixed_expr, available_features)
                        features[f"event_{rule_name}"] = rule_result.astype(int)
                        print(f"Created named event feature: event_{rule_name}")
                    except Exception as e:
                        print(f"Error processing named rule '{rule_name}': {e}")
                        features[f"event_{rule_name}_error"] = 0

        return features

    def _compute_contextual_features(self, df, signals, **kwargs):
        """Contextual Features Family

        Uses the column named by the ``batch_id`` keyword (default
        ``'batch_id'``); returns an empty frame when that column is absent.
        """
        features = pd.DataFrame(index=df.index)
        batch_id = kwargs.get('batch_id', 'batch_id')

        if batch_id in df.columns:
            batch_pos = df.groupby(batch_id).cumcount()
            last_pos = batch_pos.groupby(df[batch_id]).transform('max')
            # A single-row batch gives 0/0 = NaN here; compute_features later
            # replaces NaN with 0.
            features["batch_position"] = batch_pos / last_pos
            features["is_batch_start"] = (batch_pos == 0).astype(int)
            features["is_batch_end"] = (batch_pos == last_pos).astype(int)

        return features

    def _fix_rule_parentheses(self, expr: str) -> str:
        """Add parentheses around comparison operations to avoid ambiguous truth values."""
        normalized = self._normalize_rule_expr(expr)
        # The capturing group keeps the '&' / '|' separators in the result.
        parts = re.split(r'(\s*[&|]\s*)', normalized)

        if len(parts) == 1:
            return normalized

        # '>=' and '<=' are already covered by '>' and '<'.
        comparison_markers = ('>', '<', '==', '!=')
        result_parts = []
        for part in parts:
            is_separator = part.strip() in ('&', '|')
            if not is_separator and any(op in part for op in comparison_markers):
                # & and | bind tighter than comparisons in Python.
                result_parts.append(f'({part})')
            else:
                result_parts.append(part)

        return ''.join(result_parts)

    def compute_features(self, df, feature_plan: Dict[str, List[str]]):
        """Compute features based on a feature plan

        Parameters:
        -----------
        df : pd.DataFrame
            Input data
        feature_plan : dict
            Maps a family name to its signals. For ``interaction`` the value is
            a list of signal groups (lists/tuples of two or more column names);
            every pair inside a group is combined.

        Returns:
        --------
        pd.DataFrame with all computed features, NaN replaced by 0
        """
        frames = [pd.DataFrame(index=df.index)]

        for family, signals in feature_plan.items():
            if family not in self.feature_families:
                print(f"Warning: Unknown feature family '{family}'")
                continue

            compute = self.feature_families[family]
            if family == 'interaction':
                for signal_group in signals:
                    # Groups of any size >= 2 are supported by the helper;
                    # anything else is reported instead of dropped silently.
                    if isinstance(signal_group, (list, tuple)) and len(signal_group) >= 2:
                        frames.append(compute(df, signal_group))
                    else:
                        print(f"Warning: Skipping interaction group {signal_group!r}; "
                              f"expected a list of at least two signals")
            else:
                frames.append(compute(df, signals))

        # One concat instead of one per family.
        return pd.concat(frames, axis=1).fillna(0)

    def analyze_rule_performance(self, df: pd.DataFrame, feature_plan: Dict[str, List[str]]) -> Dict:
        """
        Compute features and analyze rule performance

        Parameters:
        -----------
        df : pd.DataFrame
            Input data with sensor signals and state labels (the analyzer is
            called with its default state column)
        feature_plan : dict
            Feature plan including event rules

        Returns:
        --------
        Dict with ``features`` (all computed features) and ``diagnostics``
        (analyzer results, or None when the plan produced no event feature)
        """
        # Compute features
        features = self.compute_features(df, feature_plan)

        # Extract event features for analysis
        event_features = features[[col for col in features.columns if col.startswith('event_')]]

        if event_features.empty:
            print("No event features found for analysis")
            return {'features': features, 'diagnostics': None}

        # Run diagnostic analysis
        diagnostic_results = self.diagnostic_analyzer.compute_rule_metrics(df, event_features)

        return {
            'features': features,
            'diagnostics': diagnostic_results
        }
514
+
515
+ # [Keep all your existing HMM functions here...]
516
+ # empirical_start_trans, emissions_from_labels, viterbi_decode, print_evaluation
517
+
518
+
519
def empirical_start_trans(labels, lengths, n_states):
    """Estimate startprob_ and transmat_ from labeled sequences.

    Parameters:
    -----------
    labels : sequence of int
        State indices of all sequences, concatenated in order
    lengths : sequence of int
        Length of each sequence; zero-length sequences are skipped
    n_states : int
        Number of states (size of the returned arrays)

    Returns:
    --------
    (start, trans) : start probabilities of shape (n_states,) and a
    row-normalized transition matrix of shape (n_states, n_states)
    """
    labels = np.asarray(labels)
    start = np.zeros(n_states)
    trans = np.zeros((n_states, n_states))

    offset = 0
    for seq_len in lengths:
        seq = labels[offset:offset + seq_len]
        offset += seq_len
        if len(seq) != seq_len:
            raise IndexError("labels is shorter than the sum of lengths")
        if seq_len == 0:
            # Nothing to count; indexing seq[0] would fail.
            continue
        start[seq[0]] += 1
        # Unbuffered add, so a transition repeated within the sequence is
        # counted every time it occurs.
        np.add.at(trans, (seq[:-1], seq[1:]), 1)

    # normalize with small epsilon to avoid zeros
    start = (start + 1e-6) / (start.sum() + 1e-6 * n_states)
    trans = trans + 1e-6
    trans /= trans.sum(axis=1, keepdims=True)
    return start, trans
535
+
536
def emissions_from_labels(X_np, labels_np, n_states):
    """Estimate one Gaussian (mean vector, full covariance) per labeled state.

    A state with fewer than two samples gets a zero mean and a diagonal
    covariance of 1e-2; otherwise the sample mean and the sample covariance
    plus a 1e-6 ridge on the diagonal are used.
    """
    n_features = X_np.shape[1]
    identity = np.eye(n_features)
    means = np.zeros((n_states, n_features))
    covars = np.zeros((n_states, n_features, n_features))

    for state in range(n_states):
        samples = X_np[labels_np == state]
        if len(samples) >= 2:
            means[state] = samples.mean(axis=0)
            covars[state] = np.cov(samples.T) + identity * 1e-6
            continue
        # Too few samples for a covariance estimate: use the fallback values.
        means[state] = 0.0
        covars[state] = identity * 1e-2

    return means, covars
552
+
553
def viterbi_decode(model, X_np, lengths):
    """Return ``model.predict(X_np, lengths)`` for the given observations."""
    decoded_states = model.predict(X_np, lengths)
    return decoded_states
555
+
556
def print_evaluation(y_true_idx, y_pred_idx, title=""):
    """Print a classification report and a confusion matrix for index labels.

    Reads the module-level ``idx_to_state`` and ``state_list``. Predicted
    indices missing from ``idx_to_state`` are shown as ``UNK<index>``; true
    indices must all be present.
    """
    true_names = [idx_to_state[k] for k in y_true_idx]
    pred_names = [idx_to_state.get(k, f"UNK{k}") for k in y_pred_idx]

    print(f"\n== {title} ==")
    report = classification_report(true_names, pred_names, labels=state_list, zero_division=0)
    print(report)

    matrix = confusion_matrix(true_names, pred_names, labels=state_list)
    print("Confusion matrix (rows=true, cols=pred):")
    matrix_frame = pd.DataFrame(matrix, index=state_list, columns=state_list)
    print(matrix_frame)
564
+
565
+ # Enhanced main section with diagnostic analysis
566
+ if __name__ == "__main__":
567
+ CSV_PATH = "synthetic_pasteurization_with_cip_signals.csv"
568
+ #CSV_PATH="SWat.csv"
569
+ USE_CIP = False
570
+ MODE = "unsupervised"
571
+ ADD_DERIVS = True
572
+ N_UNSUP = None
573
+ RANDOM_SEED = 42
574
+
575
+ states_prod = ["Idle","Fill","HeatUp","Hold","Cool","Discharge"]
576
+ #states_prod = ["Filling", "Draining", "Hold"]
577
+ states_cip = ["PreRinse","Caustic","InterRinse","Acid","FinalRinse","Sanitize","Verification","Standby"]
578
+
579
+ state_list = states_prod + (states_cip if USE_CIP else [])
580
+ n_states = len(state_list) if N_UNSUP is None else (N_UNSUP if MODE=="unsupervised" else len(state_list))
581
+
582
+ # Load data
583
+ df = pd.read_csv(CSV_PATH)
584
+ df = df[df["state"].isin(state_list)].copy()
585
+ df.sort_values(["batch_id","timestamp"], inplace=True)
586
+
587
+ # Initialize feature library
588
+ feature_lib = ModularFeatureLibrary()
589
+ '''
590
+ feature_plan = {
591
+ 'statistical' : ['FIT101', 'LIT101'],
592
+ 'temporal': ['FIT101', 'LIT101'],
593
+ 'stability': ['FIT101', 'LIT101'],
594
+ 'interaction': [['FIT101', 'LIT101']],
595
+ 'event': [ ],
596
+ 'contextual': []
597
+ }
598
+ '''
599
+ # Define feature plan with event rules
600
+
601
+ feature_plan = {
602
+ 'statistical': ['T', 'Q_in','Q_out'],
603
+ 'temporal': ['T', 'Q_in','Q_out'],
604
+ 'stability': ['T', 'Q_in','Q_out'],
605
+ 'interaction': [['T', 'Q_in','Q_out']],
606
+ 'event': [
607
+ '(T_diff_smooth > 1)', '(T_diff_smooth < -1)',
608
+ '(Q_out > 0.3)',
609
+
610
+ #'(T < 20)',
611
+ '(T > 70) & (T_stable_flag == 1)', #
612
+ '(Q_in > 0.3) AND (T_diff < 0.2)'
613
+ #'T > 75', # Additional test rule
614
+ #'Q_in < 0.2' # Additional test rule
615
+ ],
616
+ 'contextual': []
617
+ }
618
+
619
+ # Compute features and analyze rule performance
620
+ result = feature_lib.analyze_rule_performance(df, feature_plan)
621
+ all_features = result['features']
622
+ diagnostics = result['diagnostics']
623
+
624
+ print(f"Original data shape: {df.shape}")
625
+ print(f"Computed features shape: {all_features.shape}")
626
+
627
+ # Print diagnostic report
628
+ if diagnostics:
629
+ feature_lib.diagnostic_analyzer.print_diagnostic_report(diagnostics)
630
+
631
+ # Continue with HMM training as before...
632
+ all_features.to_csv("data.csv", index=False)
633
+ #event_features = all_features[[col for col in all_features.columns if col.startswith('event_')]]
634
+ #features = event_features
635
+
636
+ important_raw_features = ['T_roll_mean_5', 'Q_in_roll_mean_5', 'Q_out_roll_mean_5', 'T_diff']
637
+ event_features = all_features[[col for col in all_features.columns if col.startswith('event_')]]
638
+
639
+ # Combine them
640
+ features = pd.concat([all_features[important_raw_features], event_features], axis=1)
641
+ # [Rest of your existing HMM code...]
642
+ bids = df["batch_id"].unique()
643
+ bids_train = set(bids[:max(1, int(0.6*len(bids)))])
644
+ bids_test = set(bids) - bids_train
645
+
646
def pack_sequences(df_subset, X_subset):
    """Return the feature matrix of ``X_subset`` and the row count of each batch.

    The counts come from grouping ``df_subset`` by ``batch_id`` and are listed
    in sorted ``batch_id`` order.
    """
    rows_per_batch = df_subset.groupby("batch_id").size()
    return X_subset.values, rows_per_batch.tolist()
649
+
650
+ scaler = StandardScaler()
651
+ X_train = features[df["batch_id"].isin(bids_train)]
652
+ X_test = features[df["batch_id"].isin(bids_test)]
653
+ X_train_scaled = scaler.fit_transform(X_train)
654
+ X_test_scaled = scaler.transform(X_test)
655
+
656
+ df_train = df[df["batch_id"].isin(bids_train)]
657
+ df_test = df[df["batch_id"].isin(bids_test)]
658
+
659
+ X_train_np, lengths_train = pack_sequences(df_train, pd.DataFrame(X_train_scaled, index=X_train.index))
660
+ X_test_np, lengths_test = pack_sequences(df_test, pd.DataFrame(X_test_scaled, index=X_test.index))
661
+
662
+ state_to_idx = {s:i for i,s in enumerate(state_list)}
663
+ idx_to_state = {i:s for s,i in state_to_idx.items()}
664
+
665
+ # Your existing HMM code continues here...
666
+ # ========= SUPERVISED HMM =========
667
+ if MODE.lower() == "supervised":
668
+ # Labeled indices (train/test)
669
+ y_train_idx = df_train["state"].map(state_to_idx).values
670
+ y_test_idx = df_test["state"].map(state_to_idx).values
671
+
672
+ # Initialize startprob/transmat from labels
673
+ startprob_, transmat_ = empirical_start_trans(y_train_idx, lengths_train, n_states)
674
+
675
+ # Initialize emissions from labels
676
+ means_, covars_ = emissions_from_labels(X_train_np, y_train_idx, n_states)
677
+
678
+ # Build and fit HMM (few EM iters to refine)
679
+ hmm = GaussianHMM(
680
+ n_components=n_states,
681
+ covariance_type="full",
682
+ n_iter=30,
683
+ init_params="", # do not overwrite our inits
684
+ random_state=RANDOM_SEED,
685
+ tol=1e-3,
686
+ verbose=False
687
+ )
688
+ hmm.startprob_ = startprob_
689
+ hmm.transmat_ = transmat_
690
+ hmm.means_ = means_
691
+ hmm.covars_ = covars_
692
+
693
+ hmm.fit(X_train_np, lengths_train)
694
+
695
+ # Decode test and evaluate
696
+ y_pred_test = viterbi_decode(hmm, X_test_np, lengths_test)
697
+ print_evaluation(y_test_idx, y_pred_test, title="Supervised HMM (Test)")
698
+
699
+ # ========= UNSUPERVISED HMM + label mapping =========
700
+ else:
701
+ # Unsupervised fit on TRAIN, then map discovered states to labels (Hungarian)
702
+ hmm = GaussianHMM(
703
+ n_components=n_states,
704
+ covariance_type="diag", # BETTER FOR STATE SEPARATION
705
+ n_iter=100,
706
+ random_state=RANDOM_SEED,
707
+ tol=1e-6,
708
+ init_params="stmc",
709
+ params="stmc"
710
+ )
711
+ hmm.fit(X_train_np, lengths_train)
712
+
713
+ # Predict hidden labels on TRAIN to build contingency with ground truth
714
+ y_train_true = df_train["state"].map(state_to_idx).values
715
+ y_train_hat = viterbi_decode(hmm, X_train_np, lengths_train)
716
+
717
+ # Build contingency (true x pred)
718
+ K = len(state_list)
719
+ cont = np.zeros((K, K), dtype=int)
720
+ for t, p in zip(y_train_true, y_train_hat):
721
+ if t < K and p < K:
722
+ cont[t, p] += 1
723
+
724
+ # Optimal mapping: rows(true)->cols(pred)
725
+ row_ind, col_ind = linear_sum_assignment(cont.max() - cont)
726
+ mapping = {pred: true for true, pred in zip(row_ind, col_ind)}
727
+
728
+ # Decode TEST, remap discovered states to labels
729
+ y_test_hat = viterbi_decode(hmm, X_test_np, lengths_test)
730
+ y_test_mapped = np.array([mapping.get(s, 0) for s in y_test_hat], dtype=int)
731
+
732
+ y_test_true = df_test["state"].map(state_to_idx).values
733
+ print_evaluation(y_test_true, y_test_mapped, title="Unsupervised HMM (mapped) — Test")
734
+
735
+
736
+ #######################################################################################
737
+
738
+
739
+ # ========= TIMESTAMP NORMALIZATION =========
740
+ print("\n=== Normalizing Timestamps ===")
741
+
742
def normalize_timestamps(df, timestamp_col="timestamp", case_id_col="batch_id", base_date="2023-01-01"):
    """
    Rebase the timestamps of every case so that each case starts at ``base_date``.

    Numeric timestamps are read as seconds; anything else is parsed with
    ``pd.to_datetime``. Within a case the time differences between rows are
    preserved.

    Parameters:
    -----------
    df : pd.DataFrame
        Input data; it is not modified
    timestamp_col : str
        Column holding the timestamps
    case_id_col : str
        Column identifying the case (batch) of each row
    base_date : str
        Date every case is shifted to start at

    Returns:
    --------
    A copy of ``df`` with normalized timestamps, or ``df`` itself (unchanged)
    when the timestamps cannot be parsed
    """
    df_normalized = df.copy()
    base_datetime = pd.to_datetime(base_date)

    # First, ensure we understand the timestamp format
    print(f"Original timestamp sample: {df[timestamp_col].iloc[:5].tolist()}")

    # Check if timestamps are numeric (seconds) or string/datetime.
    # The pandas helper also accepts extension dtypes, on which
    # np.issubdtype raises TypeError; booleans are not treated as seconds.
    raw = df[timestamp_col]
    if pd.api.types.is_numeric_dtype(raw) and not pd.api.types.is_bool_dtype(raw):
        print("Timestamps are numeric - assuming they represent seconds")
        # Convert numeric seconds to datetime
        stamps = base_datetime + pd.to_timedelta(raw, unit='s')
    else:
        # Try to parse as datetime
        try:
            stamps = pd.to_datetime(raw)
        except (ValueError, TypeError, OverflowError):
            print("Could not parse timestamps. Please check the format.")
            return df
        print("Timestamps successfully parsed as datetime")

    # Earliest timestamp of each row's case, aligned row by row.
    case_start = stamps.groupby(df[case_id_col]).transform('min')
    rebased = base_datetime + (stamps - case_start)
    # Rows without a case id belong to no group; they keep their converted
    # timestamp instead of becoming NaT.
    df_normalized[timestamp_col] = rebased.where(case_start.notna(), stamps)

    return df_normalized
776
+
777
+
778
+ df_normalized = normalize_timestamps(df, base_date="2023-01-01")
779
+
780
+
781
def create_interval_event_log_normalized(df, y_pred, state_mapping, case_id_col="batch_id", timestamp_col="timestamp"):
    """
    Create interval-based event log using normalized timestamps

    Within every case the rows are ordered by timestamp and each run of
    consecutive rows with the same predicted activity becomes one interval.

    Parameters:
    -----------
    df : pd.DataFrame
        Rows to convert; must contain ``case_id_col`` and ``timestamp_col``
    y_pred : sequence
        One predicted state per row of ``df``, in row order
    state_mapping : dict
        Maps a predicted state to an activity name; unmapped states are named
        ``Unknown_<state>``
    case_id_col, timestamp_col : str
        Names of the case-id and timestamp columns

    Returns:
    --------
    pd.DataFrame with columns case_id, activity_sequence, activity,
    start_timestamp, end_timestamp, duration_seconds, event_count
    (empty, with the same columns, when there are no rows)
    """
    log_columns = ['case_id', 'activity_sequence', 'activity',
                   'start_timestamp', 'end_timestamp',
                   'duration_seconds', 'event_count']

    if len(y_pred) != len(df):
        raise ValueError("y_pred must contain exactly one prediction per row of df")

    # Work on a positional copy so duplicate index labels in df cannot
    # confuse row lookups.
    work = df[[case_id_col, timestamp_col]].reset_index(drop=True)
    work.columns = ['case_id', 'timestamp']
    work['activity'] = [state_mapping.get(i, f"Unknown_{i}") for i in y_pred]

    event_log_segments = []
    # sort=False keeps the cases in order of first appearance.
    for case_id, case_rows in work.groupby('case_id', sort=False):
        case_rows = case_rows.sort_values('timestamp', kind='stable')
        activities = case_rows['activity'].tolist()
        stamps = case_rows['timestamp'].tolist()

        run_start = 0
        n_rows = len(activities)
        for pos in range(1, n_rows + 1):
            if pos < n_rows and activities[pos] == activities[run_start]:
                continue  # still inside the current run
            # Rows run_start .. pos-1 form one interval.
            segment_start, segment_end = stamps[run_start], stamps[pos - 1]
            duration = (pd.to_datetime(segment_end) - pd.to_datetime(segment_start)).total_seconds()
            event_log_segments.append({
                'case_id': case_id,
                'activity': activities[run_start],
                'start_timestamp': segment_start,
                'end_timestamp': segment_end,
                'duration_seconds': duration,
                'event_count': pos - run_start
            })
            run_start = pos

    # Naming the columns keeps an empty log well-formed, so the groupby
    # below does not fail on a missing 'case_id' column.
    event_log = pd.DataFrame(
        event_log_segments,
        columns=[c for c in log_columns if c != 'activity_sequence'],
    )
    event_log['activity_sequence'] = event_log.groupby('case_id').cumcount() + 1

    return event_log[log_columns]
851
+
852
+
853
# Decode the most likely hidden-state sequence for the whole test set.
X_test_full_np, lengths_test_full = pack_sequences(df_test, pd.DataFrame(X_test_scaled, index=X_test.index))
y_test_full_hat = viterbi_decode(hmm, X_test_full_np, lengths_test_full)

# Translate decoded state indices into state names.
if MODE.lower() == "unsupervised":
    # Decoded states are first mapped through `mapping` (predicted -> true
    # index); entries whose true index has no name are dropped.
    state_mapping = {pred: idx_to_state[true] for pred, true in mapping.items() if true in idx_to_state}
else:
    state_mapping = idx_to_state

# Restrict the normalized frame to the batches that are in the test split.
df_test_normalized = df_normalized[df_normalized["batch_id"].isin(df_test["batch_id"].unique())]

# Align lengths. Raised explicitly (not via `assert`) so the check still runs
# when Python is started with -O.
# NOTE(review): this only verifies the row COUNT; it presumably relies on
# pack_sequences keeping the same row order as df_normalized -- confirm.
if len(df_test_normalized) != len(y_test_full_hat):
    raise ValueError("Mismatch between test rows and predictions!")

interval_event_log_normalized = create_interval_event_log_normalized(
    df_test_normalized, y_test_full_hat, state_mapping
)

# Save to CSV
normalized_log_path = "pasteurization_normalized_event_log_MT.csv"
interval_event_log_normalized.to_csv(normalized_log_path, index=False)
print(f"Normalized event log saved to: {normalized_log_path}")

# Show the first rows of the result
print("\nSample of normalized event log:")
print(interval_event_log_normalized.head(10))
883
+
884
# ========= COMPARE BEFORE/AFTER =========
print("\n=== Timestamp Normalization Comparison ===")

# Use the first case in the event log as the example to compare.
sample_case = interval_event_log_normalized['case_id'].iloc[0]
original_case_data = df[df['batch_id'] == sample_case].copy()
normalized_case_data = df_normalized[df_normalized['batch_id'] == sample_case].copy()

original_stamps = original_case_data['timestamp']
normalized_stamps = normalized_case_data['timestamp']

print(f"Sample Case: {sample_case}")
print(f"Original start: {original_stamps.min()}")
print(f"Normalized start: {normalized_stamps.min()}")

# Elapsed time between the first and last sample of the case, before and
# after normalization.
original_span = pd.to_datetime(original_stamps.max()) - pd.to_datetime(original_stamps.min())
print(f"Original duration: {original_span.total_seconds():.0f} seconds")
normalized_span = pd.to_datetime(normalized_stamps.max()) - pd.to_datetime(normalized_stamps.min())
print(f"Normalized duration: {normalized_span.total_seconds():.0f} seconds (same!)")
897
+
898
+
899
print("\n=== Creating Enhanced Visualizations ===")

plt.figure(figsize=(14, 8))

# One colour per distinct activity
activities = interval_event_log_normalized['activity'].unique()
colors = plt.cm.Set3(np.linspace(0, 1, len(activities)))
color_map = dict(zip(activities, colors))

# The same (at most ten) cases drive both the bars and the y-axis ticks, so
# the chart also works when the log holds fewer than ten cases.
gantt_case_ids = interval_event_log_normalized['case_id'].unique()[:10]

for i, case_id in enumerate(gantt_case_ids):
    case_data = interval_event_log_normalized[interval_event_log_normalized['case_id'] == case_id]

    for _, activity_row in case_data.iterrows():
        start = pd.to_datetime(activity_row['start_timestamp'])
        end = pd.to_datetime(activity_row['end_timestamp'])
        span = end - start
        duration = span.total_seconds() / 3600  # hours; only used for the label threshold

        # Pass the width as a Timedelta so it shares units with the datetime
        # `left`. A bare float would be read in matplotlib date units (days),
        # which would draw every bar 24x too long.
        plt.barh(y=i, width=span, left=start,
                 color=color_map[activity_row['activity']],
                 edgecolor='black', alpha=0.7)

        # Add activity label for longer segments
        if duration > 0.1:  # Only label segments longer than 6 minutes
            plt.text(start + span / 2, i,
                     activity_row['activity'], ha='center', va='center',
                     fontsize=8, fontweight='bold')

plt.yticks(range(len(gantt_case_ids)), gantt_case_ids)
plt.xlabel('Time (from normalized start)')
plt.ylabel('Case ID')
plt.title('Process Execution Gantt Chart (First 10 Cases)')
plt.legend([plt.Rectangle((0,0),1,1, color=color_map[act]) for act in activities],
           activities, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('process_gantt_chart.png', dpi=300, bbox_inches='tight')
plt.show()
938
+
939
+
940
def _merge_segments(case_id, activity, first, second):
    """Combine two neighbouring segments (`first` precedes `second`) into one labelled `activity`."""
    return {
        'case_id': case_id,
        'activity': activity,
        'start_timestamp': first['start_timestamp'],
        'end_timestamp': second['end_timestamp'],
        'duration_seconds': first['duration_seconds'] + second['duration_seconds'],
        'event_count': first['event_count'] + second['event_count']
    }


def filter_brief_states(event_log, min_duration_seconds=5.0):
    """
    Remove state segments that are too brief by merging them with adjacent states

    A segment is "brief" when its duration_seconds is below
    `min_duration_seconds` and its case has more than one segment.
      * A brief first segment is merged into the segment that follows it
        (the result keeps the following segment's activity).
      * Any other brief segment is merged into the segment emitted just
        before it (the result keeps that segment's activity).
    Merged segments span from the earlier start to the later end; their
    duration_seconds and event_count are the sums of the merged parts.

    Parameters:
    -----------
    event_log : pd.DataFrame
        Interval event log with columns case_id, activity, start_timestamp,
        end_timestamp, duration_seconds and event_count. Rows of a case are
        processed in their existing order.
    min_duration_seconds : float
        Segments shorter than this are merged away.

    Returns:
    --------
    pd.DataFrame with the merged segments and a recomputed activity_sequence
    column, using the column order of `event_log`. An empty input yields an
    empty copy.
    """
    if event_log.empty:
        # Nothing to merge; building a frame from no rows would lose the
        # columns and make the groupby below raise KeyError.
        return event_log.copy()

    filtered_segments = []

    for case_id in event_log['case_id'].unique():
        case_rows = event_log[event_log['case_id'] == case_id].to_dict('records')
        case_segments = []

        i = 0
        while i < len(case_rows):
            current_segment = case_rows[i]
            is_brief = (current_segment['duration_seconds'] < min_duration_seconds
                        and len(case_rows) > 1)

            if not is_brief:
                # Keep segments that are long enough
                case_segments.append(dict(current_segment))
                i += 1
            elif not case_segments:
                # Brief first segment: nothing precedes it, so fold it into
                # the next segment and skip that one.
                next_segment = case_rows[i + 1]
                case_segments.append(
                    _merge_segments(case_id, next_segment['activity'],
                                    current_segment, next_segment))
                i += 2
            else:
                # Fold into the segment emitted last. That segment may itself
                # be the product of earlier merges, so it (not the raw
                # previous input row) must be extended to avoid losing them.
                prev_segment = case_segments[-1]
                case_segments[-1] = _merge_segments(
                    case_id, prev_segment['activity'],
                    prev_segment, current_segment)
                i += 1

        filtered_segments.extend(case_segments)

    # Create new event log
    filtered_log = pd.DataFrame(filtered_segments)

    # Recalculate activity sequence
    filtered_log['activity_sequence'] = filtered_log.groupby('case_id').cumcount() + 1

    # Keep the caller's column order no matter which row happened to be first.
    ordered = [c for c in event_log.columns if c in filtered_log.columns]
    ordered += [c for c in filtered_log.columns if c not in ordered]
    return filtered_log[ordered]
1010
+
1011
# Apply the filter to the normalized event log
min_duration = 2.0  # Minimum duration in seconds (adjust as needed)
filtered_event_log = filter_brief_states(interval_event_log_normalized, min_duration_seconds=min_duration)

print(f"Original events: {len(interval_event_log_normalized)}")
# Report the size of the FILTERED log here (not the original one again).
print(f"Filtered events: {len(filtered_event_log)}")
print("Removed", len(interval_event_log_normalized) - len(filtered_event_log), "brief state segments")

# Persist the cleaned log next to the normalized one.
filtered_log_path = "pasteurization_cleaned_event_log_MT.csv"
filtered_event_log.to_csv(filtered_log_path, index=False)
print(f"\nCleaned event log saved to: {filtered_log_path}")
1022
+
1023
+
1024
+
1025
+