Sensor2EventLog 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
abstraction/mt_loop.py ADDED
@@ -0,0 +1,141 @@
1
+ """
2
+ Machine Teaching loop for Sensor2EventLog framework.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Dict, Optional, Tuple, List
8
+
9
+ import numpy as np
10
+ import pandas as pd
11
+ from sklearn.preprocessing import StandardScaler
12
+
13
+ from models.hmm_model import HMMModel
14
+
15
+
16
class MachineTeachingLoop:
    """
    Orchestrates feature extraction, diagnostics, and model training.

    The loop extracts features from a sensor DataFrame, splits the data by
    ``batch_id`` into train/test partitions, scales the features, trains the
    configured model and finally predicts a state for every row of the input.

    Parameters:
    -----------
    model_type : str
        Model identifier. Only ``"hmm"`` is accepted.
    feature_extractor : object
        Must provide ``compute_features(df, feature_plan)``; if it also
        provides ``analyze_rule_performance(df, feature_plan)`` that method
        is preferred.
    diagnostic_analyzer : object
        Must provide ``compute_rule_metrics(df, event_features)``.
    config : object
        Passed to ``HMMModel``; an optional ``PROCESS_STATES`` attribute is
        used to order the state indices.

    Raises:
    -------
    ValueError
        If ``model_type`` is not ``"hmm"``.
    """

    def __init__(self, model_type: str, feature_extractor, diagnostic_analyzer, config):
        self.model_type = model_type
        self.feature_extractor = feature_extractor
        self.diagnostic_analyzer = diagnostic_analyzer
        self.config = config
        # Diagnostics of the most recent run(); None until run() completes.
        self._review_summary: Optional[Dict[str, Any]] = None

        if model_type != "hmm":
            raise ValueError(f"Unsupported model_type: {model_type}")

        self.model = HMMModel(config=self.config)
        self._scaler = StandardScaler()

    def get_review_summary(self) -> Optional[Dict[str, Any]]:
        """Return the diagnostics stored by the last ``run`` call (or None)."""
        return self._review_summary

    def run(
        self,
        df: pd.DataFrame,
        feature_plan: Dict[str, list],
        mode: str = "unsupervised",
        n_unsup: Optional[int] = None,
        random_seed: int = 42,
    ) -> Dict[str, Any]:
        """
        Execute one full extract / train / predict cycle.

        Parameters:
        -----------
        df : pd.DataFrame
            Sensor data; must contain ``batch_id`` and ``state`` columns.
        feature_plan : dict
            Forwarded unchanged to the feature extractor.
        mode : str
            ``"supervised"`` (case-insensitive) selects supervised training;
            any other value selects unsupervised training.
        n_unsup : int, optional
            Forwarded to ``HMMModel.train_unsupervised``.
        random_seed : int
            Accepted for interface compatibility; not used by this method.

        Returns:
        --------
        dict with keys ``features``, ``diagnostics``, ``model``,
        ``predictions`` (one prediction per row of ``df``) and
        ``state_mapping``.
        """
        features, diagnostics = self._extract_features_and_diagnostics(df, feature_plan)

        X_train, X_test, df_train, df_test = self._split_train_test(df, features)
        # The scaler is fitted on the training partition only and then reused
        # for the test partition and the full data set.
        X_train_scaled = self._scaler.fit_transform(X_train)
        X_test_scaled = self._scaler.transform(X_test)

        X_train_np, lengths_train = self._pack_sequences(df_train, X_train_scaled)
        X_test_np, lengths_test = self._pack_sequences(df_test, X_test_scaled)

        state_list, state_to_idx, idx_to_state = self._build_state_maps(df)
        y_train = df_train["state"].map(state_to_idx).values
        y_test = df_test["state"].map(state_to_idx).values

        if mode.lower() == "supervised":
            y_pred_test, model, state_mapping = self.model.train_supervised(
                X_train_np,
                lengths_train,
                X_test_np,
                lengths_test,
                y_train,
                y_test,
                state_list,
                idx_to_state,
            )
        else:
            y_pred_test, model, state_mapping = self.model.train_unsupervised(
                X_train_np,
                lengths_train,
                X_test_np,
                lengths_test,
                y_train,
                y_test,
                state_list,
                idx_to_state,
                n_unsup,
            )

        # Predict full sequence for event log generation
        X_full_scaled = self._scaler.transform(features)
        X_full_np, lengths_full = self._pack_sequences(df, X_full_scaled)
        y_pred_full = self.model.predict(X_full_np, lengths_full)

        self._review_summary = diagnostics

        return {
            "features": features,
            "diagnostics": diagnostics,
            "model": model,
            "predictions": y_pred_full,
            "state_mapping": state_mapping,
        }

    def _extract_features_and_diagnostics(
        self, df: pd.DataFrame, feature_plan: Dict[str, list]
    ) -> Tuple[pd.DataFrame, Optional[Dict[str, Any]]]:
        """
        Compute the feature matrix and, when possible, rule diagnostics.

        An extractor exposing ``analyze_rule_performance`` is expected to
        return a mapping with a ``"features"`` entry and an optional
        ``"diagnostics"`` entry. Otherwise features come from
        ``compute_features`` and diagnostics are computed only when at least
        one feature column starts with ``"event_"``.
        """
        if hasattr(self.feature_extractor, "analyze_rule_performance"):
            result = self.feature_extractor.analyze_rule_performance(df, feature_plan)
            return result["features"], result.get("diagnostics")

        features = self.feature_extractor.compute_features(df, feature_plan)
        event_cols = [c for c in features.columns if c.startswith("event_")]
        diagnostics = None
        if event_cols:
            diagnostics = self.diagnostic_analyzer.compute_rule_metrics(df, features[event_cols])
        return features, diagnostics

    def _split_train_test(
        self, df: pd.DataFrame, features: pd.DataFrame, train_ratio: float = 0.6
    ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame, pd.DataFrame]:
        """
        Split rows into train/test partitions by whole batches.

        The first ``train_ratio`` share of batch ids (in order of first
        appearance, at least one batch) goes to training, the rest to test.
        ``features`` is looked up by the index labels of ``df``, so both
        frames must share the same index. With a single batch the test
        partition is empty.
        """
        batch_ids = df["batch_id"].unique()
        n_train = max(1, int(train_ratio * len(batch_ids)))
        train_batch_ids = set(batch_ids[:n_train])
        test_batch_ids = set(batch_ids[n_train:])

        df_train = df[df["batch_id"].isin(train_batch_ids)]
        df_test = df[df["batch_id"].isin(test_batch_ids)]

        X_train = features.loc[df_train.index].values
        X_test = features.loc[df_test.index].values

        return X_train, X_test, df_train, df_test

    @staticmethod
    def _pack_sequences(df_subset: pd.DataFrame, X_subset: np.ndarray) -> Tuple[np.ndarray, List[int]]:
        """
        Return the feature matrix together with per-batch sequence lengths.

        Lengths are listed in order of first appearance of each batch id so
        that they line up with the row order of ``X_subset``. (The default
        sorted group order would misalign them whenever batch ids do not
        appear in sorted order.) Rows of one batch are assumed to be
        contiguous in ``df_subset``.
        """
        lengths = df_subset.groupby("batch_id", sort=False).size().tolist()
        return X_subset, lengths

    def _build_state_maps(self, df: pd.DataFrame) -> Tuple[List[str], Dict[str, int], Dict[int, str]]:
        """
        Build the ordered state list and the state <-> index lookups.

        States named in ``config.PROCESS_STATES`` ("production" first, then
        "cip") keep their configured order. Observed states that are not
        configured are appended in sorted order instead of being dropped, so
        every non-null state in ``df`` receives an index. Without any
        configured match the result is simply the sorted observed states.
        """
        observed = df["state"].dropna().unique().tolist()
        observed_set = set(observed)

        configured: List[str] = []
        for key in ("production", "cip"):
            if hasattr(self.config, "PROCESS_STATES") and key in self.config.PROCESS_STATES:
                configured.extend(self.config.PROCESS_STATES[key])

        state_list: List[str] = []
        seen = set()
        for state in configured:
            # Skip duplicates so each state maps to exactly one index.
            if state in observed_set and state not in seen:
                seen.add(state)
                state_list.append(state)
        state_list.extend(sorted(s for s in observed if s not in seen))

        state_to_idx = {s: i for i, s in enumerate(state_list)}
        idx_to_state = {i: s for s, i in state_to_idx.items()}
        return state_list, state_to_idx, idx_to_state
config.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ Configuration parameters for HMM process analyzer
3
+ """
4
+
5
# Ordered activity names for each process type.
PROCESS_STATES = {
    "production": [
        "Idle",
        "Fill",
        "HeatUp",
        "Hold",
        "Cool",
        "Discharge",
    ],
    "cip": [
        "PreRinse",
        "Caustic",
        "InterRinse",
        "Acid",
        "FinalRinse",
        "Sanitize",
        "Verification",
        "Standby",
    ],
}

# Settings consumed by feature extraction.
FEATURE_CONFIG = dict(
    window_sizes=[5],
    stability_eps=1,
    peak_threshold=0.1,
)

# Settings for the HMM.
HMM_CONFIG = dict(
    covariance_type="diag",
    n_iter=100,
    random_seed=42,
    tol=1e-6,
)

# Thresholds used by the diagnostics.
DIAGNOSTIC_CONFIG = dict(
    coverage_threshold=0.6,
    precision_threshold=0.7,
    explainability_threshold=0.3,
)

# Plot appearance.
VISUALIZATION_CONFIG = dict(
    gantt_figsize=(14, 8),
    colors="Set3",
    min_duration_for_label=0.1,  # hours
)

# Output file names.
PATHS = dict(
    event_log="pasteurization_event_log.csv",
    filtered_log="pasteurization_cleaned_event_log.csv",
    gantt_chart="process_gantt_chart.png",
)
@@ -0,0 +1,339 @@
1
+ """
2
+ Event log object with PM4Py compatibility
3
+ """
4
+
5
+ import pandas as pd
6
+ import numpy as np
7
+ from typing import Optional, List, Dict, Any, Union
8
+ from datetime import datetime
9
+
10
+
11
class Event:
    """
    Single event in an event log.

    Attributes:
    -----------
    case_id : str
        Identifier for the process case (always stored as a string)
    activity : str
        Name of the activity/state
    start_time : datetime
        Start timestamp of the event
    end_time : datetime
        End timestamp of the event
    duration : float
        Duration in seconds
    attributes : dict
        Any additional keyword arguments passed to the constructor
    """

    def __init__(self, case_id: str, activity: str, start_time: datetime,
                 end_time: datetime, duration: float = None, **kwargs):
        self.case_id = str(case_id)
        self.activity = activity
        self.start_time = start_time
        self.end_time = end_time
        # Only derive the duration when none was supplied. A truthiness test
        # would discard an explicit duration of 0 and recompute it.
        if duration is None:
            duration = (end_time - start_time).total_seconds()
        self.duration = duration
        self.attributes = kwargs

    def to_dict(self) -> Dict:
        """Convert event to dictionary (extra attributes are merged last)."""
        return {
            'case_id': self.case_id,
            'activity': self.activity,
            'start_time': self.start_time,
            'end_time': self.end_time,
            'duration': self.duration,
            **self.attributes
        }
48
+
49
+
50
+ class EventLog:
51
+ """
52
+ Event log container with PM4Py compatibility.
53
+
54
+ This class provides a standardized interface for event logs
55
+ that can be exported to various formats (CSV, XES) and used
56
+ with process mining tools like PM4Py.
57
+
58
+ Example:
59
+ >>> log = EventLog(df)
60
+ >>> log.to_csv("event_log.csv")
61
+ >>> log.to_xes("event_log.xes")
62
+ >>> pm4py_log = log.to_pm4py() # Use with PM4Py
63
+ """
64
+
65
+ def __init__(self, data: Union[pd.DataFrame, List[Event]]):
66
+ """
67
+ Initialize event log from DataFrame or list of Events.
68
+
69
+ Parameters:
70
+ -----------
71
+ data : pd.DataFrame or List[Event]
72
+ Input event log data
73
+ """
74
+ if isinstance(data, pd.DataFrame):
75
+ self._df = self._validate_dataframe(data)
76
+ elif isinstance(data, list):
77
+ self._df = self._from_events(data)
78
+ else:
79
+ raise ValueError("Data must be DataFrame or list of Events")
80
+
81
+ self._pm4py_log = None
82
+
83
+ def _validate_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
84
+ """Validate and standardize DataFrame format."""
85
+ required_cols = ['case_id', 'activity', 'start_timestamp', 'end_timestamp']
86
+
87
+ for col in required_cols:
88
+ if col not in df.columns:
89
+ raise ValueError(f"DataFrame missing required column: {col}")
90
+
91
+ # Ensure timestamp columns are datetime
92
+ for col in ['start_timestamp', 'end_timestamp']:
93
+ if not pd.api.types.is_datetime64_any_dtype(df[col]):
94
+ df[col] = pd.to_datetime(df[col])
95
+
96
+ # Add duration if missing
97
+ if 'duration_seconds' not in df.columns:
98
+ df['duration_seconds'] = (
99
+ pd.to_datetime(df['end_timestamp']) -
100
+ pd.to_datetime(df['start_timestamp'])
101
+ ).dt.total_seconds()
102
+
103
+ return df
104
+
105
+ def _from_events(self, events: List[Event]) -> pd.DataFrame:
106
+ """Convert list of Events to DataFrame."""
107
+ return pd.DataFrame([e.to_dict() for e in events])
108
+
109
+ def to_dataframe(self) -> pd.DataFrame:
110
+ """Get event log as pandas DataFrame."""
111
+ return self._df.copy()
112
+
113
+ def to_csv(self, path: str, filtered: bool = False) -> None:
114
+ """
115
+ Export event log to CSV.
116
+
117
+ Parameters:
118
+ -----------
119
+ path : str
120
+ Output file path
121
+ filtered : bool
122
+ If True, saves the filtered version (if available)
123
+ """
124
+ df_to_save = self._df
125
+ if filtered and 'filtered' in self._df.columns:
126
+ df_to_save = self._df[self._df['filtered'] == True]
127
+
128
+ df_to_save.to_csv(path, index=False)
129
+ print(f"Event log saved to: {path}")
130
+
131
+ def to_xes(self, path: str, case_id_key: str = 'case:concept:name',
132
+ timestamp_key: str = 'time:timestamp') -> None:
133
+ """
134
+ Export event log to XES format using PM4Py.
135
+
136
+ Parameters:
137
+ -----------
138
+ path : str
139
+ Output file path
140
+ case_id_key : str
141
+ Column name to use as case identifier in XES
142
+ timestamp_key : str
143
+ Column name to use as timestamp in XES
144
+ """
145
+ try:
146
+ import pm4py
147
+ except ImportError:
148
+ raise ImportError("PM4Py is required for XES export. Install with: pip install pm4py")
149
+
150
+ # Convert to PM4Py format
151
+ pm4py_log = self.to_pm4py(case_id_key, timestamp_key)
152
+
153
+ # Export to XES
154
+ pm4py.write_xes(pm4py_log, path)
155
+ print(f"Event log exported to XES: {path}")
156
+
157
+ def to_pm4py(self, case_id_key: str = 'case:concept:name',
158
+ timestamp_key: str = 'time:timestamp') -> 'pm4py.objects.log.obj.EventLog':
159
+ """
160
+ Convert to PM4Py EventLog object for further analysis.
161
+
162
+ Parameters:
163
+ -----------
164
+ case_id_key : str
165
+ Column name to use as case identifier
166
+ timestamp_key : str
167
+ Column name to use as timestamp
168
+
169
+ Returns:
170
+ --------
171
+ pm4py.objects.log.obj.EventLog
172
+ PM4Py event log object
173
+ """
174
+ try:
175
+ import pm4py
176
+ except ImportError:
177
+ raise ImportError("PM4Py is required for this functionality. Install with: pip install pm4py")
178
+
179
+ # Prepare data for PM4Py format
180
+ df_for_pm4py = self._df.copy()
181
+
182
+ # Rename columns for PM4Py
183
+ df_for_pm4py = df_for_pm4py.rename(columns={
184
+ 'case_id': case_id_key,
185
+ 'activity': 'concept:name',
186
+ 'start_timestamp': timestamp_key
187
+ })
188
+
189
+ # Add end timestamp as separate attribute if available
190
+ if 'end_timestamp' in df_for_pm4py.columns:
191
+ df_for_pm4py['end_timestamp'] = df_for_pm4py['end_timestamp'].astype(str)
192
+
193
+ # Convert to PM4Py event log
194
+ event_log = pm4py.format_dataframe_to_event_log(
195
+ df_for_pm4py,
196
+ case_id=case_id_key,
197
+ activity_key='concept:name',
198
+ timestamp_key=timestamp_key
199
+ )
200
+
201
+ self._pm4py_log = event_log
202
+ return event_log
203
+
204
+ def filter_duration(self, min_seconds: float = 0, max_seconds: float = float('inf')) -> 'EventLog':
205
+ """
206
+ Filter events by duration.
207
+
208
+ Parameters:
209
+ -----------
210
+ min_seconds : float
211
+ Minimum duration in seconds
212
+ max_seconds : float
213
+ Maximum duration in seconds
214
+
215
+ Returns:
216
+ --------
217
+ EventLog
218
+ Filtered event log
219
+ """
220
+ filtered_df = self._df[
221
+ (self._df['duration_seconds'] >= min_seconds) &
222
+ (self._df['duration_seconds'] <= max_seconds)
223
+ ].copy()
224
+ filtered_df['filtered'] = True
225
+
226
+ return EventLog(filtered_df)
227
+
228
+ def get_cases(self) -> List[str]:
229
+ """Get list of unique case IDs."""
230
+ return self._df['case_id'].unique().tolist()
231
+
232
+ def get_activities(self) -> List[str]:
233
+ """Get list of unique activities."""
234
+ return self._df['activity'].unique().tolist()
235
+
236
+ def get_case(self, case_id: str) -> 'EventLog':
237
+ """Get all events for a specific case."""
238
+ case_df = self._df[self._df['case_id'] == str(case_id)].copy()
239
+ return EventLog(case_df)
240
+
241
+ def get_statistics(self) -> Dict[str, Any]:
242
+ """
243
+ Compute basic statistics about the event log.
244
+
245
+ Returns:
246
+ --------
247
+ dict with:
248
+ - total_cases: number of cases
249
+ - total_events: number of events
250
+ - unique_activities: number of distinct activities
251
+ - avg_case_duration: average case duration in seconds
252
+ - activity_frequencies: frequency of each activity
253
+ """
254
+ stats = {
255
+ 'total_cases': self._df['case_id'].nunique(),
256
+ 'total_events': len(self._df),
257
+ 'unique_activities': self._df['activity'].nunique(),
258
+ 'avg_case_duration': self._df.groupby('case_id')['duration_seconds'].sum().mean(),
259
+ 'activity_frequencies': self._df['activity'].value_counts().to_dict()
260
+ }
261
+ return stats
262
+
263
+ def __len__(self) -> int:
264
+ """Return number of events."""
265
+ return len(self._df)
266
+
267
+ def __repr__(self) -> str:
268
+ """String representation."""
269
+ return f"EventLog(cases={self.get_statistics()['total_cases']}, events={len(self._df)}, activities={self.get_statistics()['unique_activities']})"
270
+
271
+ def head(self, n: int = 5) -> pd.DataFrame:
272
+ """Return first n events."""
273
+ return self._df.head(n)
274
+
275
+
276
def create_interval_event_log_normalized(df, y_pred, state_mapping,
                                         case_id_col="batch_id", timestamp_col="timestamp"):
    """
    Create interval-based event log using normalized timestamps.

    Consecutive rows of a case that share the same predicted state are
    merged into one interval. This function is kept for backward
    compatibility.

    Parameters:
    -----------
    df : pd.DataFrame
        Row-level data containing ``case_id_col`` and ``timestamp_col``
    y_pred : sequence
        One predicted state index per row of ``df``
    state_mapping : dict
        Maps a state index to an activity name; unmapped indices become
        ``"Unknown_<index>"``
    case_id_col : str
        Column holding the case identifier
    timestamp_col : str
        Column holding the row timestamp

    Returns:
    --------
    pd.DataFrame
        One row per interval with columns case_id, activity_sequence,
        activity, start_timestamp, end_timestamp, duration_seconds and
        event_count. Cases keep their order of first appearance. Empty
        input yields an empty frame with these columns.
    """
    columns = ['case_id', 'activity_sequence', 'activity',
               'start_timestamp', 'end_timestamp',
               'duration_seconds', 'event_count']

    df_with_pred = df.copy()
    df_with_pred['predicted_state'] = [state_mapping.get(i, f"Unknown_{i}") for i in y_pred]

    segments = []
    for case_id, case_data in df_with_pred.groupby(case_id_col, sort=False):
        # Stable sort keeps the input order of rows with equal timestamps.
        case_data = case_data.sort_values(timestamp_col, kind="stable")

        # A new run starts wherever the state differs from the previous row.
        states = case_data['predicted_state']
        run_ids = states.ne(states.shift()).cumsum().to_numpy()

        for sequence, (_, run) in enumerate(case_data.groupby(run_ids), start=1):
            # Positional access stays correct even with duplicate index labels.
            segment_start = run[timestamp_col].iloc[0]
            segment_end = run[timestamp_col].iloc[-1]
            duration = (pd.to_datetime(segment_end) - pd.to_datetime(segment_start)).total_seconds()
            segments.append({
                'case_id': case_id,
                'activity_sequence': sequence,
                'activity': run['predicted_state'].iloc[0],
                'start_timestamp': segment_start,
                'end_timestamp': segment_end,
                'duration_seconds': duration,
                'event_count': len(run)
            })

    return pd.DataFrame(segments, columns=columns)
core/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .pipeline import Sensor2EventLogPipeline
2
+
3
+ __all__ = ['Sensor2EventLogPipeline']