Sensor2EventLog 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
core/pipeline.py ADDED
@@ -0,0 +1,189 @@
1
+ """
2
+ Core pipeline orchestrator for Sensor2EventLog framework
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from sklearn.preprocessing import StandardScaler
8
+ from typing import Optional, Dict, Any, Union
9
+
10
+ from abstraction.mt_loop import MachineTeachingLoop
11
+ from contextualization.event_log import EventLog
12
+ from features.feature_library import ModularFeatureLibrary
13
+ from evaluation.rule_analyzer import RuleDiagnosticAnalyzer
14
+ import config
15
+
16
+
17
class Sensor2EventLogPipeline:
    """
    Orchestrator that turns raw sensor recordings into an event log.

    The Machine Teaching workflow is executed in four stages:
    1. Feature extraction
    2. Model training (HMM with supervised/unsupervised modes)
    3. Diagnostic analysis
    4. Event log generation

    Example:
    >>> pipeline = Sensor2EventLogPipeline(config)
    >>> result = pipeline.run(
    ...     data_path="sensor_data.csv",
    ...     feature_plan=feature_plan,
    ...     mode="unsupervised"
    ... )
    >>> event_log = result['event_log']
    >>> event_log.to_xes("output.xes")
    """

    def __init__(self, config_module=None):
        """
        Build the pipeline components from a configuration module.

        Parameters:
        -----------
        config_module : module, optional
            Object exposing FEATURE_CONFIG and DIAGNOSTIC_CONFIG mappings.
            When falsy, the module-level default ``config`` is used.
        """
        self.config = config_module or config

        # Feature extraction component
        feature_cfg = self.config.FEATURE_CONFIG
        self.feature_library = ModularFeatureLibrary(
            window_sizes=feature_cfg["window_sizes"],
            stability_eps=feature_cfg["stability_eps"],
            peak_threshold=feature_cfg["peak_threshold"],
        )

        # Rule diagnostics component
        diagnostic_cfg = self.config.DIAGNOSTIC_CONFIG
        self.diagnostic_analyzer = RuleDiagnosticAnalyzer(
            coverage_threshold=diagnostic_cfg["coverage_threshold"],
            precision_threshold=diagnostic_cfg["precision_threshold"],
            explainability_threshold=diagnostic_cfg["explainability_threshold"],
        )

        # The teaching loop is wired to the two components created above
        self.mt_loop = MachineTeachingLoop(
            model_type="hmm",
            feature_extractor=self.feature_library,
            diagnostic_analyzer=self.diagnostic_analyzer,
            config=self.config,
        )

        self._scaler = StandardScaler()
        self._fitted = False

    def run(self,
            data_path: str,
            feature_plan: Dict[str, list],
            mode: str = "unsupervised",
            use_cip: bool = False,
            n_unsup: Optional[int] = None,
            random_seed: int = 42,
            min_duration_seconds: float = 2.0,
            return_intermediate: bool = False) -> Dict[str, Any]:
        """
        Execute the pipeline end to end.

        Parameters:
        -----------
        data_path : str
            Path to CSV data file
        feature_plan : dict
            Feature extraction plan, forwarded to the Machine Teaching loop
        mode : str
            "supervised" or "unsupervised"
        use_cip : bool
            Whether rows labelled with CIP states are kept as well
        n_unsup : int
            Number of states for unsupervised mode
        random_seed : int
            Random seed forwarded to the Machine Teaching loop
        min_duration_seconds : float
            Minimum duration passed to the brief-state filter
        return_intermediate : bool
            If True, the features and diagnostics are added to the result

        Returns:
        --------
        dict with keys:
        - event_log: EventLog object
        - model: model returned by the Machine Teaching loop
        - predictions: predictions returned by the Machine Teaching loop
        - features (if return_intermediate): extracted features
        - diagnostics (if return_intermediate): diagnostic results
        """
        prepared = self._load_and_prepare_data(data_path, use_cip)

        mt_result = self.mt_loop.run(
            df=prepared,
            feature_plan=feature_plan,
            mode=mode,
            n_unsup=n_unsup,
            random_seed=random_seed,
        )

        # All five entries are read up front, whether or not they are returned.
        features, model, predictions, diagnostics, state_mapping = (
            mt_result[key]
            for key in ('features', 'model', 'predictions', 'diagnostics', 'state_mapping')
        )

        result = {
            'event_log': self._generate_event_log(
                prepared, predictions, state_mapping, min_duration_seconds
            ),
            'model': model,
            'predictions': predictions,
        }
        if return_intermediate:
            result.update(features=features, diagnostics=diagnostics)
        return result

    def _load_and_prepare_data(self, data_path: str, use_cip: bool) -> pd.DataFrame:
        """Read the CSV, keep rows in the configured states, order by batch and time."""
        raw = pd.read_csv(data_path)

        # Work on a copy so the configured production list is never mutated
        allowed_states = self.config.PROCESS_STATES["production"].copy()
        if use_cip:
            allowed_states += self.config.PROCESS_STATES["cip"]

        in_scope = raw["state"].isin(allowed_states)
        return raw.loc[in_scope].sort_values(["batch_id", "timestamp"])

    def _generate_event_log(self, df: pd.DataFrame, predictions: np.ndarray,
                            state_mapping: Dict, min_duration_seconds: float) -> 'EventLog':
        """Build the EventLog from predictions and write it out when PATHS is configured."""
        from contextualization.event_log import create_interval_event_log_normalized
        from utils.hmm_utils import normalize_timestamps
        from utils.hmm_utils import filter_brief_states

        # Normalize timestamps, derive the event frame, then filter by minimum duration
        intervals = create_interval_event_log_normalized(
            normalize_timestamps(df), predictions, state_mapping
        )
        event_log = EventLog(filter_brief_states(intervals, min_duration_seconds))

        # Persist only when the configuration declares output paths
        if hasattr(self.config, 'PATHS'):
            paths = self.config.PATHS
            event_log.to_csv(paths.get("event_log", "event_log.csv"))
            event_log.to_csv(paths.get("filtered_log", "filtered_log.csv"),
                             filtered=True)

        return event_log
evaluation/rule_analyzer.py ADDED
@@ -0,0 +1,246 @@
1
+ """
2
+ Rule diagnostic analyzer for evaluating rule performance metrics
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import Dict, List, Union
8
+ import re
9
+
10
+
11
class RuleDiagnosticAnalyzer:
    """
    Analyzes rule performance using coverage, precision, and explainability metrics.

    A "rule" is any column of the event-feature frame whose name starts with
    ``event_``. For every (rule, state) pair the analyzer computes:

    - coverage: rule activations inside the state / number of rows in the state
    - precision: rule activations inside the state / all activations of the rule
    - effectiveness: sqrt(coverage * precision), or 0 when either is not positive

    A state's explainability is the best coverage any rule achieves for it.
    """

    def __init__(self, coverage_threshold: float = 0.6, precision_threshold: float = 0.7,
                 explainability_threshold: float = 0.3):
        """
        Store the thresholds used to categorize rules and states.

        Parameters:
        -----------
        coverage_threshold : coverage below this value counts as low
        precision_threshold : precision below this value counts as low
        explainability_threshold : states below this explainability are flagged
        """
        self.c_low = coverage_threshold
        self.p_low = precision_threshold
        self.epsilon_unex = explainability_threshold

    def compute_rule_metrics(self, df: pd.DataFrame, event_features: pd.DataFrame,
                             state_column: str = 'state') -> Dict:
        """
        Compute coverage, precision, and explainability metrics for all event features.

        Parameters:
        -----------
        df : DataFrame with state labels
        event_features : DataFrame containing event rule features; rows are
            looked up by the index labels of ``df``
        state_column : Column name containing state labels

        Returns:
        --------
        Dict with keys 'rule_metrics' (rule -> state -> metrics),
        'state_metrics' (state -> metrics), 'recommendations' (list sorted by
        priority) and 'unexplainable_states' (list of states).
        """
        results = {
            'rule_metrics': {},
            'state_metrics': {},
            'recommendations': [],
            'unexplainable_states': []
        }

        # The row labels of each state do not depend on the rule, so they are
        # resolved once here instead of once per rule.
        labels = df[state_column]
        state_rows = []
        for state in labels.unique():
            state_mask = labels == state
            row_labels = state_mask[state_mask].index
            if len(row_labels) > 0:
                state_rows.append((state, row_labels))

        # Compute metrics for each rule-state combination
        rule_metrics = {}
        for rule_col in event_features.columns:
            # Column labels may be non-strings (e.g. integers); those are not rules.
            if not (isinstance(rule_col, str) and rule_col.startswith('event_')):
                continue

            activations = event_features[rule_col]
            # Total activations of the rule is state-independent: hoisted.
            total_rule_activations = activations.sum()

            per_state = {}
            for state, row_labels in state_rows:
                hits = activations.loc[row_labels].sum()
                coverage = hits / len(row_labels)
                precision = (hits / total_rule_activations
                             if total_rule_activations > 0 else 0)
                per_state[state] = {
                    'coverage': coverage,
                    'precision': precision,
                    'effectiveness': np.sqrt(coverage * precision) if coverage > 0 and precision > 0 else 0
                }
            rule_metrics[rule_col] = per_state

        results['rule_metrics'] = rule_metrics

        # Compute state-level metrics
        state_metrics = {}
        for state, row_labels in state_rows:
            # Best coverage across all rules; ties keep the first rule seen
            best_coverage = 0
            best_rule = None
            for rule_col, per_state in rule_metrics.items():
                if state in per_state:
                    coverage = per_state[state]['coverage']
                    if coverage > best_coverage:
                        best_coverage = coverage
                        best_rule = rule_col

            explainability = best_coverage
            unexplainable = explainability < self.epsilon_unex

            state_metrics[state] = {
                'explainability': explainability,
                'gap': 1 - explainability,
                'best_rule': best_rule,
                'best_coverage': best_coverage,
                'state_frequency': len(row_labels) / len(df),
                'unexplainable': unexplainable
            }

            if unexplainable:
                results['unexplainable_states'].append(state)

        results['state_metrics'] = state_metrics

        # Generate recommendations
        results['recommendations'] = self._generate_recommendations(rule_metrics, state_metrics)

        return results

    def _generate_recommendations(self, rule_metrics: Dict, state_metrics: Dict) -> List[Dict]:
        """
        Generate actionable recommendations based on diagnostic metrics.

        Parameters:
        -----------
        rule_metrics : rule -> state -> {'coverage', 'precision', ...}
        state_metrics : state -> {'explainability', 'state_frequency', 'unexplainable', ...}

        Returns:
        --------
        List of recommendation dicts ordered CRITICAL, HIGH, MEDIUM, LOW; the
        sort is stable, so entries of equal priority keep insertion order.
        """
        recommendations = []

        # Analyze each rule-state combination
        for rule_col, state_dict in rule_metrics.items():
            for state, metrics in state_dict.items():
                coverage = metrics['coverage']
                precision = metrics['precision']

                # Rule categorization and recommendations
                if coverage >= self.c_low and precision >= self.p_low:
                    # Optimal rule - no action needed
                    continue

                elif coverage >= self.c_low and precision < self.p_low:
                    # Overly sensitive rule
                    recommendations.append({
                        'type': 'OVERLY_SENSITIVE_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' has good coverage ({coverage:.1%}) but low precision ({precision:.1%}). Add temporal stability constraints or interaction features to reduce false positives.",
                        'priority': 'HIGH',
                        'suggested_families': ['stability', 'interaction']
                    })

                elif coverage < self.c_low and precision >= self.p_low:
                    # Overly specific rule
                    recommendations.append({
                        'type': 'OVERLY_SPECIFIC_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' has high precision ({precision:.1%}) but low coverage ({coverage:.1%}). Relax thresholds or remove restrictive conditions.",
                        'priority': 'MEDIUM',
                        'suggested_families': ['temporal', 'statistical']
                    })

                elif coverage < self.c_low and precision < self.p_low:
                    # Ineffective rule
                    recommendations.append({
                        'type': 'INEFFECTIVE_RULE',
                        'rule': rule_col,
                        'state': state,
                        'coverage': coverage,
                        'precision': precision,
                        'action': f"Rule '{rule_col}' for state '{state}' performs poorly (coverage: {coverage:.1%}, precision: {precision:.1%}). Consider complete redesign with alternative sensor combinations.",
                        'priority': 'HIGH',
                        'suggested_families': ['all']
                    })

        # Analyze unexplainable states
        for state, metrics in state_metrics.items():
            if metrics['unexplainable']:
                recommendations.append({
                    'type': 'UNEXPLAINABLE_STATE',
                    'state': state,
                    'explainability': metrics['explainability'],
                    'frequency': metrics['state_frequency'],
                    'action': f"State '{state}' is largely unexplained (explainability: {metrics['explainability']:.1%}). Consider state decomposition, feature space expansion, or probabilistic approaches.",
                    'priority': 'CRITICAL' if metrics['state_frequency'] > 0.1 else 'HIGH',
                    'suggested_approaches': ['state_decomposition', 'feature_expansion', 'probabilistic_modeling']
                })

        # Sort by priority
        priority_order = {'CRITICAL': 0, 'HIGH': 1, 'MEDIUM': 2, 'LOW': 3}
        recommendations.sort(key=lambda x: priority_order[x['priority']])

        return recommendations

    def print_diagnostic_report(self, diagnostic_results: Dict):
        """
        Print comprehensive diagnostic report.

        Parameters:
        -----------
        diagnostic_results : dict as returned by ``compute_rule_metrics``

        When no states were analyzed, the summary prints a 0.0% unexplainable
        share and "n/a" for the average instead of dividing by zero.
        """
        print("=" * 80)
        print("RULE DIAGNOSTIC REPORT")
        print("=" * 80)

        # Rule performance summary
        print("\n1. RULE PERFORMANCE SUMMARY:")
        print("-" * 40)

        rule_metrics = diagnostic_results['rule_metrics']
        for rule_col, state_dict in rule_metrics.items():
            print(f"\nRule: {rule_col}")
            for state, metrics in state_dict.items():
                print(f"  State: {state:15} | Coverage: {metrics['coverage']:6.1%} | "
                      f"Precision: {metrics['precision']:6.1%} | "
                      f"Effectiveness: {metrics['effectiveness']:6.1%}")

        # State explainability
        print("\n2. STATE EXPLAINABILITY ANALYSIS:")
        print("-" * 40)

        state_metrics = diagnostic_results['state_metrics']
        for state, metrics in state_metrics.items():
            unexplainable_flag = " ⚠ UNEXPLAINABLE" if metrics['unexplainable'] else ""
            print(f"State: {state:15} | Explainability: {metrics['explainability']:6.1%} | "
                  f"Best Rule: {metrics['best_rule'] or 'None'}{unexplainable_flag}")

        # Recommendations
        print("\n3. ACTIONABLE RECOMMENDATIONS:")
        print("-" * 40)

        for i, rec in enumerate(diagnostic_results['recommendations'], 1):
            print(f"\n{i}. [{rec['priority']}] {rec['type']}")
            print(f"   {rec['action']}")

            if 'suggested_families' in rec:
                print(f"   Suggested feature families: {', '.join(rec['suggested_families'])}")
            if 'suggested_approaches' in rec:
                print(f"   Suggested approaches: {', '.join(rec['suggested_approaches'])}")

        # Summary statistics
        print("\n4. SUMMARY STATISTICS:")
        print("-" * 40)

        total_states = len(state_metrics)
        unexplainable_states = len(diagnostic_results['unexplainable_states'])

        # With zero states both the ratio and the mean are undefined.
        if total_states:
            unexplainable_share = unexplainable_states / total_states
            avg_explainability = np.mean([m['explainability'] for m in state_metrics.values()])
            avg_text = f"{avg_explainability:.1%}"
        else:
            unexplainable_share = 0.0
            avg_text = "n/a"

        print(f"Total states analyzed: {total_states}")
        print(f"Unexplainable states: {unexplainable_states} ({unexplainable_share:.1%})")
        print(f"Average explainability: {avg_text}")
        print(f"Recommendations generated: {len(diagnostic_results['recommendations'])}")