Sensor2EventLog 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
models/base_model.py ADDED
@@ -0,0 +1,68 @@
1
+ """
2
+ Base model interface for pluggable models
3
+ """
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Dict, List, Optional, Tuple, Any
7
+ import numpy as np
8
+
9
+
10
class BaseModel(ABC):
    """
    Common contract for every pluggable model in Sensor2EventLog.

    Concrete models implement ``fit``, ``predict`` and ``get_state_mapping``
    so that they can be swapped freely inside the Machine Teaching loop.
    """

    @abstractmethod
    def fit(self, X: np.ndarray, lengths: List[int], y: Optional[np.ndarray] = None) -> 'BaseModel':
        """
        Train the model.

        Parameters:
        -----------
        X : np.ndarray
            Feature matrix (n_samples, n_features)
        lengths : List[int]
            Length of each sequence contained in X
        y : np.ndarray, optional
            Labels; when given, implementations may train in a supervised way

        Returns:
        --------
        self : BaseModel
            The fitted model
        """

    @abstractmethod
    def predict(self, X: np.ndarray, lengths: List[int]) -> np.ndarray:
        """
        Infer the state of every sample in unseen data.

        Parameters:
        -----------
        X : np.ndarray
            Feature matrix (n_samples, n_features)
        lengths : List[int]
            Length of each sequence contained in X

        Returns:
        --------
        predictions : np.ndarray
            Predicted state indices (n_samples,)
        """

    @abstractmethod
    def get_state_mapping(self) -> Dict[int, str]:
        """
        Return the lookup table that turns state indices into state names.

        Returns:
        --------
        Dict[int, str]
            Mapping from state index to state name
        """
models/hmm_model.py ADDED
@@ -0,0 +1,174 @@
1
+ """
2
+ Hidden Markov Model implementation
3
+ """
4
+
5
+ import numpy as np
6
+ from typing import Dict, List, Optional, Tuple
7
+ from hmmlearn.hmm import GaussianHMM
8
+ from scipy.optimize import linear_sum_assignment
9
+
10
+ from models.base_model import BaseModel
11
+ from utils.hmm_utils import (
12
+ empirical_start_trans, emissions_from_labels, viterbi_decode, print_evaluation
13
+ )
14
+
15
+
16
class HMMModel(BaseModel):
    """
    Gaussian Hidden Markov Model for process state discovery.

    Supports both supervised (label-initialised) and unsupervised training.
    The wrapped ``hmmlearn`` model is stored in ``self.model`` once trained.
    """

    def __init__(self, config=None):
        """
        Initialize HMM model.

        Parameters:
        -----------
        config : module, optional
            Configuration object; if it exposes an ``HMM_CONFIG`` dict, the keys
            ``covariance_type``, ``n_iter``, ``random_seed``, ``tol`` and
            ``n_components`` are read from it. Missing keys fall back to the
            defaults below.
        """
        self.config = config
        self.model = None
        self._state_mapping = None
        self._idx_to_state = None
        self._state_list = None

        # Read the config dict once; tolerate a missing or None HMM_CONFIG.
        hmm_config = getattr(config, 'HMM_CONFIG', None) or {}
        self.covariance_type = hmm_config.get("covariance_type", "diag")
        self.n_iter = hmm_config.get("n_iter", 100)
        self.random_seed = hmm_config.get("random_seed", 42)
        self.tol = hmm_config.get("tol", 1e-6)
        # Number of hidden states for label-free ``fit``; may also be assigned
        # on the instance by the caller before calling ``fit``.
        self.n_components = hmm_config.get("n_components")

    @staticmethod
    def _covars_for_type(covars, covariance_type):
        """Convert per-state full covariance matrices to the layout of ``covariance_type``.

        ``emissions_from_labels`` produces an array of shape
        (n_states, n_features, n_features). hmmlearn expects (n_states, n_features)
        for "diag", (n_states,) for "spherical" and (n_features, n_features) for
        "tied"; only "full" accepts the 3-D layout unchanged.
        """
        covars = np.asarray(covars, dtype=float)
        if covars.ndim != 3:
            # Already in some reduced layout; leave validation to hmmlearn.
            return covars
        if covariance_type == "diag":
            return np.diagonal(covars, axis1=1, axis2=2).copy()
        if covariance_type == "spherical":
            return np.diagonal(covars, axis1=1, axis2=2).mean(axis=1)
        if covariance_type == "tied":
            return covars.mean(axis=0)
        return covars

    def _fit_supervised(self, X, lengths, y, n_states):
        """Build a GaussianHMM initialised from labels, store it in ``self.model`` and fit it."""
        startprob, transmat = empirical_start_trans(y, lengths, n_states)
        means, covars = emissions_from_labels(X, y, n_states)

        # init_params="" keeps hmmlearn from overwriting the label-derived
        # parameters assigned below before EM starts.
        self.model = GaussianHMM(
            n_components=n_states,
            covariance_type=self.covariance_type,
            n_iter=self.n_iter,
            init_params="",
            random_state=self.random_seed,
            tol=self.tol
        )
        self.model.startprob_ = startprob
        self.model.transmat_ = transmat
        self.model.means_ = means
        self.model.covars_ = self._covars_for_type(covars, self.covariance_type)

        self.model.fit(X, lengths)

    def _fit_unsupervised(self, X, lengths, n_states):
        """Build a GaussianHMM with hmmlearn's own initialisation, store and fit it."""
        self.model = GaussianHMM(
            n_components=n_states,
            covariance_type=self.covariance_type,
            n_iter=self.n_iter,
            random_state=self.random_seed,
            tol=self.tol,
            init_params="stmc",
            params="stmc"
        )
        self.model.fit(X, lengths)

    def fit(self, X: np.ndarray, lengths: List[int], y: Optional[np.ndarray] = None) -> 'HMMModel':
        """
        Fit HMM to data.

        If y is provided, the model is initialised from the labels and then
        refined by ``GaussianHMM.fit``. Otherwise an unsupervised model with
        ``self.n_components`` states is trained; if ``n_components`` is not
        set, a warning is issued and the model is left untouched.

        Parameters:
        -----------
        X : np.ndarray
            Feature matrix (n_samples, n_features)
        lengths : List[int]
            Lengths of each sequence
        y : np.ndarray, optional
            Integer state labels, one per row of X

        Returns:
        --------
        self : HMMModel
        """
        if y is not None:
            # Size the model by the largest label so that non-contiguous label
            # sets (e.g. {0, 2}) do not index past the parameter arrays.
            n_states = int(np.max(y)) + 1
            self._fit_supervised(X, lengths, y, n_states)
            return self

        n_components = getattr(self, "n_components", None)
        if n_components is None:
            import warnings

            warnings.warn(
                "HMMModel.fit called without labels and without n_components; "
                "the model was not trained. Set HMM_CONFIG['n_components'] or "
                "the n_components attribute, or use train_unsupervised().",
                RuntimeWarning,
                stacklevel=2,
            )
            return self

        self._fit_unsupervised(X, lengths, int(n_components))
        return self

    def predict(self, X: np.ndarray, lengths: List[int]) -> np.ndarray:
        """Predict the state sequence via ``viterbi_decode``.

        Raises:
        -------
        ValueError
            If no model has been trained yet.
        """
        if self.model is None:
            raise ValueError("Model must be fitted before prediction")
        return viterbi_decode(self.model, X, lengths)

    def get_state_mapping(self) -> Dict[int, str]:
        """Get mapping from state indices to state names (empty dict if none was built)."""
        return self._state_mapping or {}

    def train_supervised(self, X_train, lengths_train, X_test, lengths_test,
                         y_train, y_test, state_list, idx_to_state) -> Tuple:
        """
        Train supervised HMM with labeled data and print a test-set evaluation.

        Returns:
        --------
        Tuple
            (test predictions, fitted GaussianHMM, state mapping). The mapping
            is 1:1, i.e. state index ``i`` maps to ``state_list[i]``.
        """
        print("\nTraining supervised HMM...")

        n_states = len(state_list)
        self._state_list = state_list
        self._idx_to_state = idx_to_state

        # Initialize from labels, then build and fit the HMM
        self._fit_supervised(X_train, lengths_train, y_train, n_states)

        # Decode and evaluate
        y_pred_test = self.predict(X_test, lengths_test)
        print_evaluation(y_test, y_pred_test, idx_to_state, state_list,
                         title="Supervised HMM (Test)")

        # Create state mapping (1:1 for supervised)
        state_mapping = {i: state_list[i] for i in range(n_states)}
        self._state_mapping = state_mapping

        return y_pred_test, self.model, state_mapping

    def train_unsupervised(self, X_train, lengths_train, X_test, lengths_test,
                           y_train, y_test, state_list, idx_to_state, n_unsup) -> Tuple:
        """
        Train unsupervised HMM and map its hidden states onto the labeled states.

        Parameters:
        -----------
        n_unsup : int or None
            Number of hidden states; ``len(state_list)`` is used when None.

        Returns:
        --------
        Tuple
            (raw, unmapped test predictions, fitted GaussianHMM, mapping from
            hidden-state index to state name).
        """
        print("\nTraining unsupervised HMM...")

        n_states = n_unsup if n_unsup is not None else len(state_list)
        self._state_list = state_list
        self._idx_to_state = idx_to_state

        # Train unsupervised HMM
        self._fit_unsupervised(X_train, lengths_train, n_states)

        # Predict and map states
        y_train_hat = self.predict(X_train, lengths_train)

        # Contingency table: rows are true states, columns are hidden states.
        # The table is at least K wide (matching the square table used when
        # n_states <= K) and grows to n_states columns so that hidden states
        # beyond K are counted instead of being dropped.
        K = len(state_list)
        width = max(K, int(n_states))
        cont = np.zeros((K, width), dtype=int)
        true_idx = np.asarray(y_train)
        pred_idx = np.asarray(y_train_hat)
        valid = (true_idx >= 0) & (true_idx < K) & (pred_idx >= 0) & (pred_idx < width)
        np.add.at(cont, (true_idx[valid], pred_idx[valid]), 1)

        # Optimal one-to-one mapping using the Hungarian algorithm
        row_ind, col_ind = linear_sum_assignment(cont.max() - cont)
        mapping = {int(pred): int(true) for true, pred in zip(row_ind, col_ind)}

        # With more hidden states than labeled states some columns stay
        # unassigned; give each of them its most frequent true state.
        for pred in range(width):
            if pred not in mapping:
                mapping[pred] = int(np.argmax(cont[:, pred]))

        # Decode test set and map states
        y_test_hat = self.predict(X_test, lengths_test)
        y_test_mapped = np.array([mapping.get(int(s), 0) for s in y_test_hat], dtype=int)

        print_evaluation(y_test, y_test_mapped, idx_to_state, state_list,
                         title="Unsupervised HMM (mapped) — Test")

        # Create full mapping for event log
        state_mapping = {pred: idx_to_state[true] for pred, true in mapping.items() if true in idx_to_state}
        self._state_mapping = state_mapping

        return y_test_hat, self.model, state_mapping
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.4
2
+ Name: Sensor2EventLog
3
+ Version: 2.0.0
4
+ Summary: Knowledge-guided framework for transforming sensor data into event logs
5
+ Author: Azin Moradbeikie
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/azinmoradbeikie/Sensor2EventLog
8
+ Project-URL: Repository, https://github.com/azinmoradbeikie/Sensor2EventLog
9
+ Requires-Python: >=3.9
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: numpy>=1.21.0
13
+ Requires-Dist: pandas>=1.3.0
14
+ Requires-Dist: scipy>=1.7.0
15
+ Requires-Dist: scikit-learn>=0.24.0
16
+ Requires-Dist: hmmlearn>=0.2.8
17
+ Requires-Dist: matplotlib>=3.4.0
18
+ Requires-Dist: seaborn>=0.11.0
19
+ Dynamic: license-file
20
+
21
+ <p align="center">
22
+ <img src="https://raw.githubusercontent.com/azinmoradbeikie/Sensor2EventLog/main/images/Sensor2EventLog_001.png" width="300" />
23
+ </p>
24
+
25
+ [![Documentation Status](https://readthedocs.org/projects/sensor2eventlog/badge/?version=latest)](https://sensor2eventlog.readthedocs.io/en/latest/?badge=latest)
26
+
27
+ # Sensor2EventLog
28
+ Sensor2EventLog is a knowledge-guided framework that transforms raw sensor data into process-aware event logs by incorporating Machine Teaching (MT) principles. The core of the framework is a modular abstraction layer embedded in an interactive teaching loop (planning, explaining, and reviewing) in which human experts map sensor behaviors to their corresponding process states.
29
+
30
+
31
+
32
+ ## Setup
33
+
34
+ - Clone the repository:
35
+ ```bash
36
+ git clone https://github.com/azinmoradbeikie/Sensor2EventLog.git
37
+ - Install dependencies
38
+ ```bash
39
+ pip install -r requirements.txt
40
+ cd Sensor2EventLog
41
+ - Running the Analysis
42
+ ```bash
43
+ python3 main.py
44
+
45
+ ## Tutorial
46
+
47
+ A self-contained toy walkthrough is available in [`tutorial/`](tutorial/README.md).
48
+
49
+ Run it with:
50
+
51
+ ```bash
52
+ python3 tutorial/toy_walkthrough.py
53
+ ```
@@ -0,0 +1,18 @@
1
+ S2E_v2.py,sha256=_M6i1fUQ801E_a83pSffE0Mh9io_Rpi11op7MmHQEck,46621
2
+ config.py,sha256=SyGlw-g3OcxSG5l3x57rjVgkpaBOLGRMxLDXW0QyfpI,1079
3
+ main.py,sha256=8ZNdhT86oZFw28cRr5aeqwAxB4rAzf_fcMRJl6hlqrQ,13768
4
+ abstraction/mt_loop.py,sha256=U4RSiigFTv-qgGt-csaid95sE2wjsJk-vL1UBXNi7dY,5338
5
+ contextualization/event_log.py,sha256=_2SE7dbfcukKZkHcqd6rl0k7M8j0v9cx7DtaeVl_D2w,12196
6
+ core/__init__.py,sha256=ZSybjdo1g9g9dSWL3r1-KeC7WEatvT_UqX_zxyd5Flc,86
7
+ core/pipeline.py,sha256=vqvEfFnvaCazSdzQg8hofkrs1Khujm-bVzsLpsZ66vE,6828
8
+ evaluation/rule_analyzer.py,sha256=mi7tsxO8TWdE6F4y6akqThWjrk5kKc-_bqPPCQDOU5c,11352
9
+ features/feature_library.py,sha256=jXZ4qH3NG7B_Vp9k4H27jbSN6bpjlAKyNAIZ2TVTXBc,11879
10
+ models/base_model.py,sha256=T9inyiwAeF30KRQRXJN4xUstvl44BG5mEoO3TbSbKkk,1764
11
+ models/hmm_model.py,sha256=6HAFXQtfp9WTMvMf9LA_MElg9rEB-jdZuUZsKuvSWUE,6593
12
+ sensor2eventlog-2.0.0.dist-info/licenses/LICENSE,sha256=IMc9ORhSJ07tr8EolCdcfbQG4k0f6-9OwDaE22yGkns,1077
13
+ utils/__init__.py,sha256=ZmQOyX7EFv5xkDBg1FAfoyf16qqcweapZYA_2H1ZqGI,718
14
+ utils/hmm_utils.py,sha256=lERCRRDqB9zB-aPuBrPNJZs5zPlK6I0s711sHFjEGxk,11976
15
+ sensor2eventlog-2.0.0.dist-info/METADATA,sha256=QiGodv2Vzcc1gB-h7E8ejbLRGBkyN36zARvXxBlDhEo,1803
16
+ sensor2eventlog-2.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
17
+ sensor2eventlog-2.0.0.dist-info/top_level.txt,sha256=IiPSdf5ZLi1NUq0dwUwB8aTFqOXEC-Z1xGeDw_Q1aoc,87
18
+ sensor2eventlog-2.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Azin Moradbeikie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,10 @@
1
+ S2E_v2
2
+ abstraction
3
+ config
4
+ contextualization
5
+ core
6
+ evaluation
7
+ features
8
+ main
9
+ models
10
+ utils
utils/__init__.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ HMM Process Analyzer Utilities
3
+ """
4
+
5
+ # Re-export from their new locations
6
+ from features.feature_library import ModularFeatureLibrary
7
+ from evaluation.rule_analyzer import RuleDiagnosticAnalyzer
8
+
9
+ # Import from local hmm_utils
10
+ from .hmm_utils import (
11
+ empirical_start_trans,
12
+ emissions_from_labels,
13
+ viterbi_decode,
14
+ print_evaluation,
15
+ create_interval_event_log_normalized,
16
+ filter_brief_states,
17
+ normalize_timestamps
18
+ )
19
+
20
+ __all__ = [
21
+ 'ModularFeatureLibrary',
22
+ 'RuleDiagnosticAnalyzer',
23
+ 'empirical_start_trans',
24
+ 'emissions_from_labels',
25
+ 'viterbi_decode',
26
+ 'print_evaluation',
27
+ 'create_interval_event_log_normalized',
28
+ 'filter_brief_states',
29
+ 'normalize_timestamps'
30
+ ]
utils/hmm_utils.py ADDED
@@ -0,0 +1,277 @@
1
+ """
2
+ HMM utility functions for training, evaluation, and event log generation
3
+ """
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from hmmlearn.hmm import GaussianHMM
8
+ from sklearn.preprocessing import StandardScaler
9
+ from sklearn.metrics import confusion_matrix, classification_report
10
+ from scipy.optimize import linear_sum_assignment
11
+ import matplotlib.pyplot as plt
12
+
13
+
14
def empirical_start_trans(labels, lengths, n_states):
    """
    Estimate startprob_ and transmat_ from labeled sequences.

    Parameters:
    -----------
    labels : array-like of int
        State indices of all sequences, concatenated; each value is used as
        an index into arrays of size ``n_states``.
    lengths : iterable of int
        Length of each sequence, in the order the sequences appear in
        ``labels``. Zero-length sequences are skipped.
    n_states : int
        Number of states.

    Returns:
    --------
    start : np.ndarray, shape (n_states,)
        Smoothed, normalized initial-state frequencies.
    trans : np.ndarray, shape (n_states, n_states)
        Smoothed, row-normalized transition frequencies.
    """
    labels = np.asarray(labels)
    start = np.zeros(n_states)
    trans = np.zeros((n_states, n_states))

    offset = 0
    for length in lengths:
        seq = labels[offset:offset + length]
        offset += length
        if len(seq) == 0:
            # An empty sequence has neither a start state nor transitions.
            continue
        start[seq[0]] += 1
        # Count every consecutive (from, to) pair; np.add.at accumulates
        # repeated index pairs, unlike plain fancy-index assignment.
        np.add.at(trans, (seq[:-1], seq[1:]), 1)

    # normalize with small epsilon to avoid zeros
    start = (start + 1e-6) / (start.sum() + 1e-6 * n_states)
    trans = trans + 1e-6
    trans /= trans.sum(axis=1, keepdims=True)
    return start, trans
30
+
31
+
32
def emissions_from_labels(X_np, labels_np, n_states):
    """
    Compute means and covariances per labeled state.

    Parameters:
    -----------
    X_np : array-like, shape (n_samples, n_features)
        Feature matrix.
    labels_np : array-like, shape (n_samples,)
        State index of every row of ``X_np``.
    n_states : int
        Number of states.

    Returns:
    --------
    means : np.ndarray, shape (n_states, n_features)
    covars : np.ndarray, shape (n_states, n_features, n_features)
        Full covariance matrix per state. States with fewer than two
        observations get ``0.01 * I``; a state with exactly one observation
        uses that observation as its mean, a state with none keeps a zero mean.
    """
    X_np = np.asarray(X_np, dtype=float)
    labels_np = np.asarray(labels_np)

    D = X_np.shape[1]
    means = np.zeros((n_states, D))
    covars = np.zeros((n_states, D, D))

    for s in range(n_states):
        Xi = X_np[labels_np == s]
        if len(Xi) >= 2:
            means[s] = Xi.mean(axis=0)
            # atleast_2d: np.cov collapses to a 0-d array when D == 1.
            # The small ridge keeps the matrix positive definite.
            covars[s] = np.atleast_2d(np.cov(Xi, rowvar=False)) + np.eye(D) * 1e-6
        else:
            # Too few points to estimate a covariance: fall back to a fixed
            # isotropic one, but keep the single observation (if any) as the mean.
            if len(Xi) == 1:
                means[s] = Xi[0]
            covars[s] = np.eye(D) * 1e-2
    return means, covars
48
+
49
+
50
def viterbi_decode(model, X_np, lengths):
    """
    Decode the state sequence of ``X_np`` by delegating to ``model.predict``.

    Parameters:
    -----------
    model : object
        Any object exposing ``predict(X, lengths)`` (e.g. a fitted hmmlearn model).
    X_np : np.ndarray
        Feature matrix.
    lengths : list of int
        Lengths of the sequences in ``X_np``.

    Returns:
    --------
    Whatever ``model.predict`` returns.
    """
    decoded_states = model.predict(X_np, lengths)
    return decoded_states
53
+
54
+
55
def print_evaluation(y_true_idx, y_pred_idx, idx_to_state, state_list, title=""):
    """
    Print a classification report and a confusion matrix for state predictions.

    Parameters:
    -----------
    y_true_idx : iterable
        True state indices; every index must be a key of ``idx_to_state``.
    y_pred_idx : iterable
        Predicted state indices; indices missing from ``idx_to_state`` are
        reported as ``UNK<index>``.
    idx_to_state : dict
        Mapping from state index to state name.
    state_list : list
        State names, used as the label order of the report and the matrix.
    title : str
        Heading printed above the report.
    """
    true_names = [idx_to_state[idx] for idx in y_true_idx]
    pred_names = [idx_to_state.get(idx, f"UNK{idx}") for idx in y_pred_idx]

    print(f"\n== {title} ==")
    report = classification_report(true_names, pred_names, labels=state_list, zero_division=0)
    print(report)

    matrix = confusion_matrix(true_names, pred_names, labels=state_list)
    print("Confusion matrix (rows=true, cols=pred):")
    matrix_table = pd.DataFrame(matrix, index=state_list, columns=state_list)
    print(matrix_table)
64
+
65
+
66
def normalize_timestamps(df, timestamp_col="timestamp", case_id_col="batch_id", base_date="2023-01-01"):
    """
    Convert timestamps to datetimes and shift every case so it starts at ``base_date``.

    Parameters:
    -----------
    df : pd.DataFrame
        Input data; it is not modified.
    timestamp_col : str
        Column holding the timestamps. Numeric values are treated as seconds;
        anything else is parsed with ``pd.to_datetime``.
    case_id_col : str
        Column identifying the case each row belongs to.
    base_date : str
        Date that the earliest timestamp of every case is moved to.

    Returns:
    --------
    pd.DataFrame
        A normalized copy of ``df``. If the timestamps cannot be parsed, a
        message is printed and the original ``df`` is returned unchanged.
    """
    df_normalized = df.copy()
    base_datetime = pd.to_datetime(base_date)

    # Check timestamp format
    print(f"Original timestamp sample: {df[timestamp_col].iloc[:5].tolist()}")

    # Convert numeric seconds to datetime or parse as datetime
    if np.issubdtype(df[timestamp_col].dtype, np.number):
        print("Timestamps are numeric - assuming they represent seconds")
        df_normalized[timestamp_col] = base_datetime + pd.to_timedelta(df[timestamp_col], unit='s')
    else:
        try:
            df_normalized[timestamp_col] = pd.to_datetime(df[timestamp_col])
        except (ValueError, TypeError, OverflowError):
            # Only parsing failures are handled here; a bare ``except`` would
            # also swallow KeyboardInterrupt and SystemExit.
            print("Could not parse timestamps. Please check the format.")
            return df
        else:
            print("Timestamps successfully parsed as datetime")

    # Normalize each case to start at base_date
    for _, case_data in df_normalized.groupby(case_id_col):
        case_timestamps = case_data[timestamp_col]
        time_deltas = case_timestamps - case_timestamps.min()
        df_normalized.loc[case_data.index, timestamp_col] = base_datetime + time_deltas

    return df_normalized
97
+
98
+
99
def create_interval_event_log_normalized(df, y_pred, state_mapping,
                                         case_id_col="batch_id", timestamp_col="timestamp"):
    """
    Create interval-based event log using normalized timestamps.

    Within each case the rows are ordered by timestamp and every maximal run
    of identical predicted states becomes one event-log row.

    Parameters:
    -----------
    df : pd.DataFrame
        Sensor rows; must contain ``case_id_col`` and ``timestamp_col``.
    y_pred : sequence
        Predicted state index for every row of ``df`` (same order).
    state_mapping : dict
        Maps a state index to an activity name; unmapped indices become
        ``Unknown_<index>``.
    case_id_col, timestamp_col : str
        Column names of the case identifier and the timestamp.

    Returns:
    --------
    pd.DataFrame
        Columns: case_id, activity_sequence, activity, start_timestamp,
        end_timestamp, duration_seconds, event_count. An empty frame with
        these columns is returned when there are no rows.
    """
    segment_columns = ['case_id', 'activity', 'start_timestamp', 'end_timestamp',
                       'duration_seconds', 'event_count']

    df_with_pred = df.copy()
    df_with_pred['predicted_state'] = [state_mapping.get(i, f"Unknown_{i}") for i in y_pred]

    event_log_segments = []

    for case_id in df_with_pred[case_id_col].unique():
        case_data = df_with_pred[df_with_pred[case_id_col] == case_id]
        # Stable sort keeps the input order of rows that share a timestamp.
        case_data = case_data.sort_values(timestamp_col, kind="mergesort")
        if case_data.empty:
            continue

        # Work on plain lists: positional access is immune to duplicate or
        # non-unique index labels and avoids the cost of iterrows().
        states = case_data['predicted_state'].tolist()
        stamps = case_data[timestamp_col].tolist()
        n_rows = len(states)

        run_start = 0
        for pos in range(1, n_rows + 1):
            if pos < n_rows and states[pos] == states[run_start]:
                continue
            # The run [run_start, pos) just ended; emit it as one segment.
            segment_start = stamps[run_start]
            segment_end = stamps[pos - 1]
            duration = (pd.to_datetime(segment_end) - pd.to_datetime(segment_start)).total_seconds()
            event_log_segments.append({
                'case_id': case_id,
                'activity': states[run_start],
                'start_timestamp': segment_start,
                'end_timestamp': segment_end,
                'duration_seconds': duration,
                'event_count': pos - run_start
            })
            run_start = pos

    # Passing ``columns`` guarantees the schema even when no segment exists.
    event_log = pd.DataFrame(event_log_segments, columns=segment_columns)
    event_log['activity_sequence'] = event_log.groupby('case_id').cumcount() + 1

    event_log = event_log[['case_id', 'activity_sequence', 'activity',
                           'start_timestamp', 'end_timestamp',
                           'duration_seconds', 'event_count']]

    return event_log
161
+
162
+
163
def filter_brief_states(event_log, min_duration_seconds=5.0):
    """
    Remove state segments that are too brief by merging them with adjacent states.

    A segment shorter than ``min_duration_seconds`` is absorbed into the
    segment kept before it in the same case; a brief first segment is merged
    into the segment that follows it. Cases with a single segment are left
    untouched. Durations and event counts of merged segments are summed.

    Parameters:
    -----------
    event_log : pd.DataFrame
        Event log with the columns case_id, activity, start_timestamp,
        end_timestamp, duration_seconds and event_count; rows of a case are
        expected to be in chronological order.
    min_duration_seconds : float
        Segments strictly shorter than this are merged away.

    Returns:
    --------
    pd.DataFrame
        Filtered event log with the input's columns and a recomputed
        ``activity_sequence`` (1-based position within each case).
    """
    if event_log.empty:
        return event_log.copy()

    filtered_segments = []
    for case_id in event_log['case_id'].unique():
        case_records = event_log[event_log['case_id'] == case_id].to_dict('records')
        filtered_segments.extend(_merge_brief_segments(case_records, min_duration_seconds))

    # Fix the column order to the input's so it does not depend on which
    # segment happens to come first.
    filtered_log = pd.DataFrame(filtered_segments, columns=list(event_log.columns))

    # Recalculate activity sequence
    filtered_log['activity_sequence'] = filtered_log.groupby('case_id').cumcount() + 1

    return filtered_log


def _merge_brief_segments(records, min_duration_seconds):
    """Merge the brief segments of one case (list of row dicts) and return the kept segments."""
    if len(records) <= 1:
        return [dict(record) for record in records]

    kept = []
    leading_brief = None
    for record in records:
        segment = dict(record)

        if leading_brief is not None:
            # The case opened with a brief segment: fold it into this one,
            # which keeps its own activity and end but starts earlier.
            segment['start_timestamp'] = leading_brief['start_timestamp']
            segment['duration_seconds'] = leading_brief['duration_seconds'] + segment['duration_seconds']
            segment['event_count'] = leading_brief['event_count'] + segment['event_count']
            leading_brief = None
            kept.append(segment)
        elif not segment['duration_seconds'] < min_duration_seconds:
            # Long enough (NaN durations also land here and are kept).
            kept.append(segment)
        elif kept:
            # Extend the segment already kept, not the raw previous row, so
            # that several brief segments in a row accumulate correctly.
            previous = kept[-1]
            previous['end_timestamp'] = segment['end_timestamp']
            previous['duration_seconds'] = previous['duration_seconds'] + segment['duration_seconds']
            previous['event_count'] = previous['event_count'] + segment['event_count']
        else:
            leading_brief = segment
    return kept
233
+
234
+
235
def create_gantt_chart(event_log, max_cases=10, figsize=(14, 8), color_map='Set3'):
    """
    Create Gantt chart visualization of process execution.

    Parameters:
    -----------
    event_log : pd.DataFrame
        Event log with the columns case_id, activity, start_timestamp and
        end_timestamp.
    max_cases : int
        Only the first ``max_cases`` distinct case ids are drawn.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    color_map : str
        Name of the matplotlib colormap used to color the activities.

    Returns:
    --------
    module
        The ``matplotlib.pyplot`` module, with the chart on the current figure.
    """
    plt.figure(figsize=figsize)

    activities = event_log['activity'].unique()
    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
    # matplotlib 3.9.
    colors = plt.get_cmap(color_map)(np.linspace(0, 1, len(activities)))
    color_dict = dict(zip(activities, colors))

    case_ids = event_log['case_id'].unique()[:max_cases]

    for i, case_id in enumerate(case_ids):
        case_data = event_log[event_log['case_id'] == case_id]

        for _, activity_row in case_data.iterrows():
            start = pd.to_datetime(activity_row['start_timestamp'])
            end = pd.to_datetime(activity_row['end_timestamp'])
            # Keep the width as a timedelta: on a date axis a bare float
            # width is read in days, so a width expressed in hours would be
            # drawn 24 times too long.
            span = end - start
            duration_hours = span.total_seconds() / 3600

            plt.barh(y=i, width=span, left=start,
                     color=color_dict[activity_row['activity']],
                     edgecolor='black', alpha=0.7)

            # Add activity label for longer segments
            if duration_hours > 0.1:  # Only label segments longer than 6 minutes
                plt.text(start + span / 2, i,
                         activity_row['activity'], ha='center', va='center',
                         fontsize=8, fontweight='bold')

    plt.yticks(range(len(case_ids)), case_ids)
    plt.xlabel('Time (from normalized start)')
    plt.ylabel('Case ID')
    plt.title(f'Process Execution Gantt Chart (First {len(case_ids)} Cases)')

    # Create legend
    legend_patches = [plt.Rectangle((0, 0), 1, 1, color=color_dict[act]) for act in activities]
    plt.legend(legend_patches, activities, bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    return plt