Sensor2EventLog 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- S2E_v2.py +1025 -0
- abstraction/mt_loop.py +141 -0
- config.py +46 -0
- contextualization/event_log.py +339 -0
- core/__init__.py +3 -0
- core/pipeline.py +189 -0
- evaluation/rule_analyzer.py +246 -0
- features/feature_library.py +270 -0
- main.py +394 -0
- models/base_model.py +68 -0
- models/hmm_model.py +174 -0
- sensor2eventlog-2.0.0.dist-info/METADATA +53 -0
- sensor2eventlog-2.0.0.dist-info/RECORD +18 -0
- sensor2eventlog-2.0.0.dist-info/WHEEL +5 -0
- sensor2eventlog-2.0.0.dist-info/licenses/LICENSE +21 -0
- sensor2eventlog-2.0.0.dist-info/top_level.txt +10 -0
- utils/__init__.py +30 -0
- utils/hmm_utils.py +277 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Modular feature extraction library with diagnostic capabilities
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import re
|
|
8
|
+
from typing import Dict, List
|
|
9
|
+
from evaluation.rule_analyzer import RuleDiagnosticAnalyzer
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ModularFeatureLibrary:
    """
    Modular feature extraction library supporting multiple feature families
    with integrated rule diagnostics.

    Each family ('statistical', 'temporal', 'stability', 'interaction',
    'event', 'contextual') is implemented by a private ``_compute_*`` method
    registered in ``self.feature_families``. ``compute_features`` dispatches a
    feature plan to those methods and concatenates the results column-wise.
    """

    # Tokens whose presence (after normalisation) marks a string as a rule.
    _RULE_OPERATORS = ('>', '<', '==', '!=', '&', '|', '~')
    # Tokens that mark a sub-expression as a comparison needing parentheses.
    # '>=' and '<=' are already covered by '>' and '<'.
    _COMPARISON_OPERATORS = ('>', '<', '==', '!=')

    def __init__(self, window_sizes=None, stability_eps=1, peak_threshold=0.1):
        """
        Parameters:
        -----------
        window_sizes : rolling-window lengths for statistical features;
            a falsy value (None or empty) falls back to [5]
        stability_eps : a sample is "stable" when |diff| < stability_eps
        peak_threshold : stored on the instance; not read by any method
            of this class
        """
        self.window_sizes = window_sizes or [5]
        self.stability_eps = stability_eps
        self.peak_threshold = peak_threshold

        # Feature family implementations
        self.feature_families = {
            'statistical': self._compute_statistical_features,
            'temporal': self._compute_temporal_features,
            'stability': self._compute_stability_features,
            'interaction': self._compute_interaction_features,
            'event': self._compute_event_features,
            'contextual': self._compute_contextual_features
        }

        self.diagnostic_analyzer = RuleDiagnosticAnalyzer()
        # Initialised here but not used by any method of this class.
        self._feature_cache = {}

    def _normalize_rule_expr(self, expr: str) -> str:
        """
        Convert human-friendly logical ops to pandas-style bitwise ops.

        The whole words AND / OR / NOT (any letter case) become ``&`` / ``|`` /
        ``~``, every run of operator characters is padded with single spaces,
        and repeated whitespace is collapsed.

        Note: ``~`` binds tighter than comparisons in Python, so
        ``NOT x > 1`` becomes ``~ x > 1`` (i.e. ``(~x) > 1``). Write
        ``NOT (x > 1)`` to negate a comparison.
        """
        s = expr.strip()
        # The substitutions are case-insensitive, so one pass per keyword
        # covers 'and', 'And' and 'AND' alike.
        for keyword, symbol in (('AND', '&'), ('OR', '|'), ('NOT', '~')):
            s = re.sub(rf'\b{keyword}\b', symbol, s, flags=re.I)
        s = re.sub(r'\s*([&|~><=!]+)\s*', r' \1 ', s)
        s = re.sub(r'\s+', ' ', s)
        return s.strip()

    def _is_rule_expression(self, text: str) -> bool:
        """
        Return True when ``text`` contains a logical or comparison operator.

        The check runs on the normalised text so that keyword operators
        (AND / OR / NOT) and ``!=`` count as rules too, not only the
        symbolic ``>``, ``<``, ``==``, ``&``, ``|``.
        """
        normalized = self._normalize_rule_expr(text)
        return any(op in normalized for op in self._RULE_OPERATORS)

    def _evaluate_rule(self, rule_expr: str, available_features: pd.DataFrame) -> pd.Series:
        """
        Evaluate a rule expression using available features.

        Every column of ``available_features`` is exposed under its own name,
        together with ``np``, ``pd``, ``abs`` (np.abs), ``min`` (np.minimum)
        and ``max`` (np.maximum). These helpers are added last, so a column
        with one of those names is shadowed.

        Returns a boolean Series indexed like ``available_features``. Any
        failure (syntax error, unknown name, type error) is printed and
        reported as an all-False Series instead of being raised.
        """
        normalized_expr = self._normalize_rule_expr(rule_expr)
        index = available_features.index

        try:
            eval_env = {col: available_features[col] for col in available_features.columns}
            eval_env.update({
                'np': np, 'pd': pd, 'abs': np.abs,
                'min': np.minimum, 'max': np.maximum
            })

            # SECURITY: eval() executes arbitrary Python. Emptying
            # __builtins__ is not a sandbox: `np` and `pd` are reachable and
            # expose file/network helpers. Only pass rule strings that come
            # from a trusted feature plan.
            result = eval(normalized_expr, {"__builtins__": {}}, eval_env)

            if isinstance(result, pd.Series):
                return result.astype(bool)
            if isinstance(result, np.ndarray) and result.shape == (len(index),):
                # e.g. np.where(...) returns a plain array, one value per row;
                # bool() on such an array would raise "ambiguous truth value".
                return pd.Series(result.astype(bool), index=index)
            # Scalar result: broadcast it to every row.
            return pd.Series([bool(result)] * len(index), index=index)

        except Exception as e:
            print(f"Error evaluating rule '{rule_expr}': {e}")
            return pd.Series(False, index=index)

    def _safe_ratio(self, a, b):
        """
        Safe ratio calculation with log transformation.

        Returns ``log1p(|a| + 1e-6) - log1p(|b| + 1e-6)`` multiplied by
        ``sign(a * b)``. No division is performed, and the result is 0
        wherever either operand is 0 (because the sign factor is 0 there).
        """
        a_safe = np.abs(a) + 1e-6
        b_safe = np.abs(b) + 1e-6
        ratio = np.log1p(a_safe) - np.log1p(b_safe)
        sign = np.sign(a * b)
        return ratio * sign

    def _compute_statistical_features(self, df, signals, **kwargs):
        """
        Statistical features: rolling means.

        Adds ``{signal}_roll_mean_{win}`` for every signal and every window
        in ``self.window_sizes``. ``min_periods=1`` means the first rows use
        a shorter window instead of producing NaN.
        """
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            s = df[signal]
            for win in self.window_sizes:
                features[f"{signal}_roll_mean_{win}"] = s.rolling(win, min_periods=1).mean()

        return features

    def _compute_temporal_features(self, df, signals, **kwargs):
        """
        Temporal dynamics features: differences and rates.

        Per signal: first difference (first row filled with 0), its sign,
        an exponentially weighted mean of the difference (span=5) and its
        absolute value.
        """
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            diff = df[signal].diff().fillna(0)
            features[f"{signal}_diff"] = diff
            features[f"{signal}_diff_sign"] = np.sign(diff)
            features[f"{signal}_diff_smooth"] = diff.ewm(span=5).mean()
            features[f"{signal}_abs_diff"] = np.abs(diff)

        return features

    def _compute_stability_features(self, df, signals, **kwargs):
        """
        Stability features: stability flags and consecutive stable periods.

        Per signal:
        - ``{signal}_stability``: 1 / (1 + |diff|)
        - ``{signal}_stable_flag``: 1 where |diff| < ``self.stability_eps``
        - ``{signal}_consecutive_stable``: running count of consecutive
          stable samples, restarting at 0 on every unstable sample
        """
        features = pd.DataFrame(index=df.index)

        for signal in signals:
            abs_diff = np.abs(df[signal].diff().fillna(0))
            stable = abs_diff < self.stability_eps

            features[f"{signal}_stability"] = 1.0 / (1.0 + abs_diff)
            features[f"{signal}_stable_flag"] = stable.astype(int)

            # Each unstable sample bumps the group key, so the cumulative sum
            # inside a group counts the stable samples since the last break.
            run_id = (~stable).cumsum()
            features[f"{signal}_consecutive_stable"] = stable.groupby(run_id).cumsum()

        return features

    def _compute_interaction_features(self, df, signals, **kwargs):
        """
        Interaction features: products and ratios between signals.

        For every unordered pair (in list order) of ``signals`` adds
        ``{a}_x_{b}`` (element-wise product) and ``{a}_ratio_{b}``
        (see ``_safe_ratio``). Fewer than two signals yields an empty frame.
        """
        features = pd.DataFrame(index=df.index)

        for i, sig1 in enumerate(signals):
            for sig2 in signals[i + 1:]:
                features[f"{sig1}_x_{sig2}"] = df[sig1] * df[sig2]
                features[f"{sig1}_ratio_{sig2}"] = self._safe_ratio(df[sig1], df[sig2])

        return features

    def _build_rule_namespace(self, df):
        """
        Return a copy of ``df`` extended with the derived columns rules may use.

        For every numeric column adds ``_diff``, ``_diff_smooth``,
        ``_abs_diff``, ``_stability`` and ``_stable_flag`` variants. An
        existing column with one of those names is overwritten in the copy.
        """
        available_features = df.copy()

        for signal in df.columns:
            if not pd.api.types.is_numeric_dtype(df[signal]):
                continue
            try:
                diff = df[signal].diff().fillna(0)
                abs_diff = np.abs(diff)
                available_features[f"{signal}_diff"] = diff
                available_features[f"{signal}_diff_smooth"] = diff.ewm(span=5).mean()
                available_features[f"{signal}_abs_diff"] = abs_diff
                available_features[f"{signal}_stability"] = 1.0 / (1.0 + abs_diff)
                available_features[f"{signal}_stable_flag"] = (abs_diff < self.stability_eps).astype(int)
            except (TypeError, ValueError) as e:
                print(f"Warning: Could not compute derived features for {signal}: {e}")

        return available_features

    def _compute_event_features(self, df, signals, **kwargs):
        """
        Event/regime features with rule-based definitions.

        ``signals`` is a list whose items are either:
        - a rule string, which becomes ``event_<sanitised first 20 chars>``.
          If that name is already taken, the rule's 1-based position among
          the rule strings is appended.
        - a dict ``{name: rule}``, where each entry becomes ``event_<name>``.

        Strings without any logical/comparison operator are ignored. Rules
        that fail to evaluate produce an all-zero column (see
        ``_evaluate_rule``).
        """
        features = pd.DataFrame(index=df.index)

        # Create comprehensive set of available features
        available_features = self._build_rule_namespace(df)

        # Process event definitions
        rule_counter = 0
        for signal_def in signals:
            if isinstance(signal_def, str):
                # Test the normalised text so 'a != 0' or 'a AND b' are
                # recognised as rules too.
                if not self._is_rule_expression(signal_def):
                    continue
                rule_counter += 1
                try:
                    fixed_expr = self._fix_rule_parentheses(signal_def)
                    rule_result = self._evaluate_rule(fixed_expr, available_features)
                    clean_name = re.sub(r'[^a-zA-Z0-9_]', '_', signal_def[:20])
                    feature_name = f"event_{clean_name}"
                    if feature_name in features.columns:
                        # Truncation to 20 chars makes collisions likely.
                        feature_name = f"event_{clean_name}_{rule_counter}"
                    features[feature_name] = rule_result.astype(int)
                    print(f"Created event feature: {feature_name} from rule: {signal_def}")
                except Exception as e:
                    print(f"Error processing rule '{signal_def}': {e}")
                    features[f"event_rule_error_{rule_counter}"] = 0

            elif isinstance(signal_def, dict):
                for rule_name, rule_expr in signal_def.items():
                    try:
                        fixed_expr = self._fix_rule_parentheses(rule_expr)
                        rule_result = self._evaluate_rule(fixed_expr, available_features)
                        features[f"event_{rule_name}"] = rule_result.astype(int)
                        print(f"Created named event feature: event_{rule_name}")
                    except Exception as e:
                        print(f"Error processing named rule '{rule_name}': {e}")
                        features[f"event_{rule_name}_error"] = 0

        return features

    def _compute_contextual_features(self, df, signals, **kwargs):
        """
        Contextual features: batch position and boundaries.

        Uses the column named by ``kwargs['batch_id']`` (default
        ``'batch_id'``). When that column is absent an empty frame is
        returned. ``signals`` is not used.

        Adds ``batch_position`` (0 at the first row of a batch, 1 at the
        last; 0 for single-row batches), ``is_batch_start`` and
        ``is_batch_end``.
        """
        features = pd.DataFrame(index=df.index)
        batch_id = kwargs.get('batch_id', 'batch_id')

        if batch_id in df.columns:
            batch_pos = df.groupby(batch_id).cumcount()
            batch_last = batch_pos.groupby(df[batch_id]).transform('max')
            # A single-row batch has last position 0; divide by 1 instead so
            # its position is 0.0 rather than 0/0 = NaN.
            features["batch_position"] = batch_pos / batch_last.where(batch_last > 0, 1)
            features["is_batch_start"] = (batch_pos == 0).astype(int)
            features["is_batch_end"] = (batch_pos == batch_last).astype(int)

        return features

    def _fix_rule_parentheses(self, expr: str) -> str:
        """
        Add parentheses around comparison operations to avoid ambiguous truth values.

        The normalised expression is split at every ``&`` / ``|`` and each
        piece that contains a comparison operator is wrapped in ``(...)``.
        This is needed because ``&`` and ``|`` bind tighter than comparisons.
        The split is purely textual and does not look at existing nesting.
        """
        normalized = self._normalize_rule_expr(expr)
        parts = re.split(r'(\s*[&|]\s*)', normalized)

        if len(parts) == 1:
            return normalized

        result_parts = []
        for part in parts:
            is_separator = part.strip() in ('&', '|')
            if not is_separator and any(op in part for op in self._COMPARISON_OPERATORS):
                result_parts.append(f'({part})')
            else:
                result_parts.append(part)

        return ''.join(result_parts)

    def compute_features(self, df, feature_plan: Dict[str, List[str]]):
        """
        Compute features based on a feature plan.

        Parameters:
        -----------
        df : input data; every produced column shares its index
        feature_plan : maps a family name to that family's signal spec.
            For 'interaction' the value is a list of signal groups (each a
            list/tuple of at least two column names); all pairwise
            interactions inside a group are computed. Unknown families and
            malformed interaction groups are reported and skipped.

        Returns:
        --------
        DataFrame with all feature columns, NaN replaced by 0
        """
        # Seed with an empty frame so the result keeps df's index even when
        # no family contributes any column.
        frames = [pd.DataFrame(index=df.index)]

        for family, signals in feature_plan.items():
            compute = self.feature_families.get(family)
            if compute is None:
                print(f"Warning: Unknown feature family '{family}'")
                continue

            if family == 'interaction':
                for signal_group in signals:
                    # Accept groups of any size >= 2; the family method
                    # already expands them into all pairs.
                    if isinstance(signal_group, (list, tuple)) and len(signal_group) >= 2:
                        frames.append(compute(df, signal_group))
                    else:
                        print(f"Warning: Skipping interaction group {signal_group!r}: "
                              f"expected a list of at least two signals")
            else:
                frames.append(compute(df, signals))

        # One concat at the end instead of re-copying the frame per family.
        return pd.concat(frames, axis=1).fillna(0)

    def analyze_rule_performance(self, df: pd.DataFrame, feature_plan: Dict[str, List[str]]) -> Dict:
        """
        Compute features and analyze rule performance.

        Parameters:
        -----------
        df : Input data with sensor signals and state labels
        feature_plan : Feature plan including event rules

        Returns:
        --------
        Dict with features and diagnostic results; ``'diagnostics'`` is
        None when the plan produced no ``event_*`` column
        """
        # Compute features
        features = self.compute_features(df, feature_plan)

        # Extract event features for analysis
        event_columns = [col for col in features.columns if col.startswith('event_')]
        event_features = features[event_columns]

        if event_features.empty:
            print("No event features found for analysis")
            return {'features': features, 'diagnostics': None}

        # Run diagnostic analysis
        diagnostic_results = self.diagnostic_analyzer.compute_rule_metrics(df, event_features)

        return {
            'features': features,
            'diagnostics': diagnostic_results
        }
|
main.py
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main analysis pipeline for HMM process analyzer
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from hmmlearn.hmm import GaussianHMM
|
|
8
|
+
from sklearn.preprocessing import StandardScaler
|
|
9
|
+
from scipy.optimize import linear_sum_assignment
|
|
10
|
+
|
|
11
|
+
from features.feature_library import ModularFeatureLibrary
|
|
12
|
+
from utils.hmm_utils import (
|
|
13
|
+
empirical_start_trans, emissions_from_labels, viterbi_decode,
|
|
14
|
+
print_evaluation, normalize_timestamps, create_interval_event_log_normalized,
|
|
15
|
+
filter_brief_states, create_gantt_chart
|
|
16
|
+
)
|
|
17
|
+
import config
|
|
18
|
+
|
|
19
|
+
import sys
|
|
20
|
+
import io
|
|
21
|
+
import os
|
|
22
|
+
# --- capture print output to save to results.txt ---
|
|
23
|
+
|
|
24
|
+
class _Tee:
|
|
25
|
+
"""Write to multiple streams at once (e.g., console + file buffer)."""
|
|
26
|
+
def __init__(self, *streams):
|
|
27
|
+
self.streams = streams
|
|
28
|
+
def write(self, data):
|
|
29
|
+
for s in self.streams:
|
|
30
|
+
s.write(data)
|
|
31
|
+
def flush(self):
|
|
32
|
+
for s in self.streams:
|
|
33
|
+
s.flush()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def analyze_process(data_path, feature_plan, mode="unsupervised",
                    use_cip=False, n_unsup=None, random_seed=42,
                    results_txt_path=None):
    """
    Main analysis pipeline for process data.

    Everything printed while the pipeline runs is echoed to the console and
    also captured; the captured text is written to ``results_txt_path`` when
    the function exits, whether it succeeded or raised.

    Parameters:
    -----------
    data_path : str
        Path to CSV data file
    feature_plan : dict
        Feature extraction plan
    mode : str
        "supervised" or "unsupervised" (compared case-insensitively; any
        value other than "supervised" selects the unsupervised path)
    use_cip : bool
        Whether to include CIP states
    n_unsup : int
        Number of states for unsupervised mode
    random_seed : int
        Random seed for reproducibility. NOTE: not read inside this
        function; the HMM trainers take their seed from
        ``config.HMM_CONFIG["random_seed"]``.
    results_txt_path : str, optional
        Where to save the captured console output. Defaults to
        ``results.txt`` next to this script.

    Returns:
    --------
    dict with analysis results: 'model', 'features', 'event_log',
    'diagnostics', 'predictions'
    """
    if results_txt_path is None:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        results_txt_path = os.path.join(script_dir, "results.txt")

    # --- mirror stdout into an in-memory buffer for the results file ---
    buffer = io.StringIO()
    original_stdout = sys.stdout
    sys.stdout = _Tee(original_stdout, buffer)

    try:
        # Load and prepare data
        df = load_and_prepare_data(data_path, use_cip)

        # Initialize feature library
        feature_lib = ModularFeatureLibrary(
            window_sizes=config.FEATURE_CONFIG["window_sizes"],
            stability_eps=config.FEATURE_CONFIG["stability_eps"],
            peak_threshold=config.FEATURE_CONFIG["peak_threshold"]
        )

        # Compute features and analyze rule performance
        print("Computing features and analyzing rule performance...")
        result = feature_lib.analyze_rule_performance(df, feature_plan)
        all_features = result['features']
        diagnostics = result['diagnostics']

        # Print diagnostic report
        if diagnostics:
            feature_lib.diagnostic_analyzer.print_diagnostic_report(diagnostics)

        # Prepare features for HMM
        event_features = all_features[[col for col in all_features.columns if col.startswith('event_')]]
        # These columns must be produced by the feature plan (statistical
        # features for T/Q_in/Q_out with window 5, temporal features for T);
        # otherwise the selection below raises KeyError.
        important_raw_features = ['T_roll_mean_5', 'Q_in_roll_mean_5', 'Q_out_roll_mean_5', 'T_diff']

        # Combine features
        features = pd.concat([all_features[important_raw_features], event_features], axis=1)

        # Split data into train/test
        train_features, test_features, df_train, df_test = split_train_test(df, features)

        # Scale features (statistics are fitted on the training split only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(train_features)
        X_test_scaled = scaler.transform(test_features)

        # Pack sequences for HMM
        X_train_np, lengths_train = pack_sequences(df_train, X_train_scaled)
        X_test_np, lengths_test = pack_sequences(df_test, X_test_scaled)

        # Create state mappings
        state_list = get_state_list(use_cip, n_unsup, mode)
        state_to_idx, idx_to_state = create_state_mappings(state_list)

        # Train and evaluate HMM
        if mode.lower() == "supervised":
            results = train_supervised_hmm(
                X_train_np, lengths_train, X_test_np, lengths_test,
                df_train, df_test, state_to_idx, idx_to_state, state_list
            )
        else:
            results = train_unsupervised_hmm(
                X_train_np, lengths_train, X_test_np, lengths_test,
                df_train, df_test, state_to_idx, idx_to_state, state_list, n_unsup
            )

        # Generate event log
        print("\nGenerating event log...")
        event_log = generate_event_log(df, results['model'], results['mapping'],
                                       results['test_predictions'], scaler, features)

        # Create visualizations
        print("\nCreating visualizations...")
        create_visualizations(event_log)

        return {
            'model': results['model'],
            'features': features,
            'event_log': event_log,
            'diagnostics': diagnostics,
            'predictions': results['test_predictions']
        }
    finally:
        # --- always restore stdout and save the captured text ---
        sys.stdout = original_stdout
        with open(results_txt_path, "w", encoding="utf-8") as f:
            f.write(buffer.getvalue())
        print(f"Results text saved to: {results_txt_path}")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def load_and_prepare_data(data_path, use_cip):
    """
    Load and prepare data for analysis.

    Reads the CSV at ``data_path``, keeps only rows whose ``state`` is one of
    the configured production states (plus the CIP states when ``use_cip``
    is true), and sorts the result by ``batch_id`` then ``timestamp``.

    Returns a new DataFrame; ``config.PROCESS_STATES`` is left untouched.
    """
    df = pd.read_csv(data_path)

    # Determine state list. Work on a copy: `+=` on the configured sequence
    # would extend it in place if it is a list, so every call with
    # use_cip=True would append the CIP states to the shared config again.
    state_list = list(config.PROCESS_STATES["production"])
    if use_cip:
        state_list.extend(config.PROCESS_STATES["cip"])

    # Filter to relevant states and sort
    df = df[df["state"].isin(state_list)].copy()
    df.sort_values(["batch_id", "timestamp"], inplace=True)

    return df
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def split_train_test(df, features, train_ratio=0.6):
    """
    Partition rows into train and test sets by whole batches.

    Batches are taken in order of first appearance in ``df["batch_id"]``.
    The first ``max(1, int(train_ratio * n_batches))`` of them form the
    training set and the rest form the test set. ``features`` is sliced
    with the same row masks as ``df``.

    Returns ``(X_train, X_test, df_train, df_test)``.
    """
    batch_column = df["batch_id"]
    ordered_batches = batch_column.unique()
    cutoff = max(1, int(train_ratio * len(ordered_batches)))

    # Row masks, computed once and reused for both features and labels.
    in_train = batch_column.isin(set(ordered_batches[:cutoff]))
    in_test = batch_column.isin(set(ordered_batches[cutoff:]))

    return features[in_train], features[in_test], df[in_train], df[in_test]
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def pack_sequences(df_subset, X_subset):
    """
    Pack sequences for HMM training.

    Returns ``(X, lengths)``: ``X`` is ``X_subset`` as a NumPy array (a
    DataFrame is converted, anything else is passed through) and ``lengths``
    holds the number of rows of each batch.

    Lengths are listed in the order in which batches first appear in
    ``df_subset`` so they line up with the rows of ``X``. The default sorted
    group order would only match when the rows are sorted by ``batch_id``.
    Rows of one batch are assumed to be contiguous.
    """
    lengths = df_subset.groupby("batch_id", sort=False).size().tolist()
    if isinstance(X_subset, pd.DataFrame):
        X_subset = X_subset.values
    return X_subset, lengths
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def get_state_list(use_cip, n_unsup, mode):
    """
    Get the list of states based on configuration.

    Returns a new list holding the configured production states, followed by
    the CIP states when ``use_cip`` is true.

    ``n_unsup`` and ``mode`` are accepted for interface compatibility but do
    not influence the returned list.
    """
    # Copy before extending: `+=` on the configured sequence would mutate
    # config.PROCESS_STATES["production"] in place if it is a list.
    state_list = list(config.PROCESS_STATES["production"])
    if use_cip:
        state_list.extend(config.PROCESS_STATES["cip"])

    return state_list
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def create_state_mappings(state_list):
    """
    Build the name -> index and index -> name lookup tables.

    Indices are the positions of the names in ``state_list``. The reverse
    table is the inversion of the forward one, so a name that occurs more
    than once keeps only its last position in both tables.

    Returns ``(state_to_idx, idx_to_state)``.
    """
    state_to_idx = {}
    for position, name in enumerate(state_list):
        state_to_idx[name] = position

    idx_to_state = dict(zip(state_to_idx.values(), state_to_idx.keys()))
    return state_to_idx, idx_to_state
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def train_supervised_hmm(X_train_np, lengths_train, X_test_np, lengths_test,
                         df_train, df_test, state_to_idx, idx_to_state, state_list):
    """
    Fit a label-initialised Gaussian HMM and evaluate it on the test split.

    Start/transition probabilities and emission parameters are estimated
    from the labelled training rows and installed on the model before
    ``fit`` refines them. The test split is then decoded and scored against
    its true labels.

    Returns a dict with 'model' (the fitted HMM), 'mapping'
    (``idx_to_state``) and 'test_predictions' (decoded test states).
    """
    print("\nTraining supervised HMM...")

    n_components = len(state_list)

    # State names -> integer indices for both splits
    train_labels = df_train["state"].map(state_to_idx).values
    test_labels = df_test["state"].map(state_to_idx).values

    # Label-derived starting parameters
    start_probs, trans_matrix = empirical_start_trans(train_labels, lengths_train, n_components)
    state_means, state_covars = emissions_from_labels(X_train_np, train_labels, n_components)

    # init_params="" is passed so the parameters assigned below serve as the
    # starting point for fit().
    model = GaussianHMM(
        n_components=n_components,
        covariance_type="full",
        n_iter=30,
        init_params="",
        random_state=config.HMM_CONFIG["random_seed"],
        tol=config.HMM_CONFIG["tol"],
        verbose=False
    )
    model.startprob_ = start_probs
    model.transmat_ = trans_matrix
    model.means_ = state_means
    model.covars_ = state_covars

    model.fit(X_train_np, lengths_train)

    # Decode the held-out batches and report metrics
    test_predictions = viterbi_decode(model, X_test_np, lengths_test)
    print_evaluation(test_labels, test_predictions, idx_to_state, state_list,
                     title="Supervised HMM (Test)")

    return {
        'model': model,
        'mapping': idx_to_state,
        'test_predictions': test_predictions
    }
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def train_unsupervised_hmm(X_train_np, lengths_train, X_test_np, lengths_test,
                           df_train, df_test, state_to_idx, idx_to_state, state_list, n_unsup):
    """
    Train and evaluate unsupervised HMM with state mapping.

    The HMM is fitted without labels using ``n_unsup`` hidden states
    (``len(state_list)`` when ``n_unsup`` is None). Hidden states are then
    matched to the named states via a contingency table built on the
    training split, and the mapped test predictions are scored.

    Every hidden state gets a label, also when ``n_unsup`` differs from
    ``len(state_list)``: the Hungarian algorithm assigns one hidden state
    per named state, and any hidden state left over takes the named state
    it co-occurs with most often.

    Returns a dict with 'model' (fitted HMM), 'mapping' (hidden state index
    -> state name) and 'test_predictions' (raw, unmapped decoded hidden
    states for the test split).
    """
    print("\nTraining unsupervised HMM...")

    n_states = n_unsup if n_unsup is not None else len(state_list)

    # Train unsupervised HMM
    hmm = GaussianHMM(
        n_components=n_states,
        covariance_type=config.HMM_CONFIG["covariance_type"],
        n_iter=config.HMM_CONFIG["n_iter"],
        random_state=config.HMM_CONFIG["random_seed"],
        tol=config.HMM_CONFIG["tol"],
        init_params="stmc",
        params="stmc"
    )
    hmm.fit(X_train_np, lengths_train)

    # Predict and map states
    y_train_true = df_train["state"].map(state_to_idx).values
    # NOTE(review): assumes viterbi_decode returns one integer state per row.
    y_train_hat = np.asarray(viterbi_decode(hmm, X_train_np, lengths_train))

    # Build contingency matrix for mapping: rows are named states, columns
    # are hidden states. It is rectangular when n_states != len(state_list),
    # so no hidden state is dropped from the table.
    n_labels = len(state_list)
    cont = np.zeros((n_labels, n_states), dtype=int)
    labelled = pd.notna(y_train_true)  # rows whose state had no index are skipped
    true_idx = y_train_true[labelled].astype(int)
    pred_idx = y_train_hat[labelled].astype(int)
    in_range = ((true_idx >= 0) & (true_idx < n_labels)
                & (pred_idx >= 0) & (pred_idx < n_states))
    # np.add.at accumulates repeated (row, col) pairs; plain fancy-index
    # "+=" would count each pair only once.
    np.add.at(cont, (true_idx[in_range], pred_idx[in_range]), 1)

    # Optimal mapping using Hungarian algorithm (cost = max - count turns
    # the maximisation into a minimisation)
    row_ind, col_ind = linear_sum_assignment(cont.max() - cont)
    mapping = {int(pred): int(true) for true, pred in zip(row_ind, col_ind)}

    # More hidden states than named states: the surplus ones take the named
    # state they overlap with most.
    for hidden in range(n_states):
        if hidden not in mapping:
            mapping[hidden] = int(cont[:, hidden].argmax())

    # Decode test set and map states
    y_test_hat = viterbi_decode(hmm, X_test_np, lengths_test)
    y_test_mapped = np.array([mapping.get(int(s), 0) for s in y_test_hat], dtype=int)
    y_test_true = df_test["state"].map(state_to_idx).values

    print_evaluation(y_test_true, y_test_mapped, idx_to_state, state_list,
                     title="Unsupervised HMM (mapped) — Test")

    # Create full mapping for event log
    state_mapping = {pred: idx_to_state[true] for pred, true in mapping.items() if true in idx_to_state}

    return {
        'model': hmm,
        'mapping': state_mapping,
        'test_predictions': y_test_hat
    }
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def generate_event_log(df, hmm, state_mapping, predictions, scaler, features):
    """
    Decode the full dataset with the trained HMM and write the event logs.

    ``features`` is scaled with the already fitted ``scaler``, decoded for
    all of ``df``, and turned into an interval event log on normalised
    timestamps. Both the raw log and a copy filtered with a 2.0 second
    minimum duration are saved to the paths in ``config.PATHS``.

    ``predictions`` is accepted but not used; states are re-decoded here for
    the complete dataset.

    Returns the filtered event log.
    """
    # Timestamps on a normalised scale
    normalized_df = normalize_timestamps(df)

    # Scale and pack every row, not only the test split
    scaled_all = scaler.transform(features)
    packed_all, sequence_lengths = pack_sequences(df, scaled_all)

    # Most likely state sequence for the full dataset
    decoded_states = viterbi_decode(hmm, packed_all, sequence_lengths)

    # Interval event log
    event_log = create_interval_event_log_normalized(
        normalized_df, decoded_states, state_mapping
    )

    # Filter brief states
    filtered_log = filter_brief_states(event_log, min_duration_seconds=2.0)

    # Persist both versions
    event_log.to_csv(config.PATHS["event_log"], index=False)
    filtered_log.to_csv(config.PATHS["filtered_log"], index=False)

    print(f"Event log saved to: {config.PATHS['event_log']}")
    print(f"Filtered event log saved to: {config.PATHS['filtered_log']}")

    return filtered_log
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def create_visualizations(event_log):
    """
    Render, save and display a Gantt chart of the event log.

    The chart covers at most 10 cases, uses the figure size and colours from
    ``config.VISUALIZATION_CONFIG``, and is written to
    ``config.PATHS["gantt_chart"]`` at 300 dpi.
    """
    chart_options = {
        "max_cases": 10,
        "figsize": config.VISUALIZATION_CONFIG["gantt_figsize"],
        "color_map": config.VISUALIZATION_CONFIG["colors"],
    }
    figure = create_gantt_chart(event_log, **chart_options)

    # Save and show
    output_path = config.PATHS["gantt_chart"]
    figure.savefig(output_path, dpi=300, bbox_inches='tight')
    figure.show()

    print(f"Gantt chart saved to: {config.PATHS['gantt_chart']}")
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
if __name__ == "__main__":
    # Example run: build a feature plan for the three process signals and
    # analyse the bundled synthetic dataset in unsupervised mode.
    base_signals = ['T', 'Q_in', 'Q_out']

    event_rules = [
        '(T_diff_smooth < -1)',

        # Fill rules
        '(Q_in > 0.2) AND (Q_out < 0.05)',
        '(Q_in > 0.2) AND (abs(T_diff) < 0.5) AND (Q_out < 0.05)',

        # Hold rules
        '(abs(T_diff) < 0.3) AND (T > 70)',
        '(abs(T_diff) < 0.2) AND (Q_in < 0.1) AND (Q_out < 0.1) AND (T > 70)',

        # Discharge rules
        '(Q_out > 0.2)',
        '(Q_out > 0.3)',

        # Idle rules
        '(Q_in < 0.05) AND (Q_out < 0.05) AND (abs(T_diff) < 0.2)',
        '(T < 15) AND (Q_in < 0.05) AND (Q_out < 0.05)',
        '(Q_in < 0.05) AND (Q_out < 0.05) AND (abs(T_diff) < 0.2) AND (T < 15)'
    ]

    # Each family gets its own copy of the signal list.
    feature_plan = {
        'statistical': list(base_signals),
        'temporal': list(base_signals),
        'stability': list(base_signals),
        'interaction': [list(base_signals)],
        'event': event_rules,
        'contextual': []
    }

    # Run analysis
    run_options = {
        "data_path": "synthetic_pasteurization_with_cip_signals_100.csv",
        "feature_plan": feature_plan,
        "mode": "unsupervised",
        "use_cip": False,
        "n_unsup": None,
        "random_seed": 42,
    }
    results = analyze_process(**run_options)
|