Sensor2EventLog 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- S2E_v2.py +1025 -0
- abstraction/mt_loop.py +141 -0
- config.py +46 -0
- contextualization/event_log.py +339 -0
- core/__init__.py +3 -0
- core/pipeline.py +189 -0
- evaluation/rule_analyzer.py +246 -0
- features/feature_library.py +270 -0
- main.py +394 -0
- models/base_model.py +68 -0
- models/hmm_model.py +174 -0
- sensor2eventlog-2.0.0.dist-info/METADATA +53 -0
- sensor2eventlog-2.0.0.dist-info/RECORD +18 -0
- sensor2eventlog-2.0.0.dist-info/WHEEL +5 -0
- sensor2eventlog-2.0.0.dist-info/licenses/LICENSE +21 -0
- sensor2eventlog-2.0.0.dist-info/top_level.txt +10 -0
- utils/__init__.py +30 -0
- utils/hmm_utils.py +277 -0
models/base_model.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base model interface for pluggable models
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import Dict, List, Optional, Tuple, Any
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseModel(ABC):
    """
    Common contract for every pluggable model in Sensor2EventLog.

    Concrete models implement the three abstract methods below so that
    they can be swapped for one another inside the Machine Teaching loop.
    """

    @abstractmethod
    def fit(self, X: np.ndarray, lengths: List[int], y: Optional[np.ndarray] = None) -> 'BaseModel':
        """
        Train the model on the given data.

        Parameters:
        -----------
        X : np.ndarray
            Feature matrix (n_samples, n_features)
        lengths : List[int]
            Length of every sequence stored in X
        y : np.ndarray, optional
            Labels, when supervised learning is wanted

        Returns:
        --------
        self : BaseModel
            The fitted model
        """

    @abstractmethod
    def predict(self, X: np.ndarray, lengths: List[int]) -> np.ndarray:
        """
        Predict a state for every sample of new data.

        Parameters:
        -----------
        X : np.ndarray
            Feature matrix (n_samples, n_features)
        lengths : List[int]
            Length of every sequence stored in X

        Returns:
        --------
        predictions : np.ndarray
            Predicted state indices (n_samples,)
        """

    @abstractmethod
    def get_state_mapping(self) -> Dict[int, str]:
        """
        Return the lookup table from state index to state name.

        Returns:
        --------
        Dict[int, str]
            State name for each state index
        """
|
models/hmm_model.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hidden Markov Model implementation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from typing import Dict, List, Optional, Tuple
|
|
7
|
+
from hmmlearn.hmm import GaussianHMM
|
|
8
|
+
from scipy.optimize import linear_sum_assignment
|
|
9
|
+
|
|
10
|
+
from models.base_model import BaseModel
|
|
11
|
+
from utils.hmm_utils import (
|
|
12
|
+
empirical_start_trans, emissions_from_labels, viterbi_decode, print_evaluation
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class HMMModel(BaseModel):
    """
    Gaussian Hidden Markov Model for process state discovery.

    Supports both supervised (label-initialised) and unsupervised training.
    """

    def __init__(self, config=None):
        """
        Initialize HMM model.

        Parameters:
        -----------
        config : module
            Configuration module; its optional ``HMM_CONFIG`` dict may define
            ``covariance_type``, ``n_iter``, ``random_seed`` and ``tol``.
        """
        self.config = config
        self.model = None
        self._state_mapping = None
        self._idx_to_state = None
        self._state_list = None

        # Look the settings dict up once; getattr also copes with config=None.
        hmm_config = getattr(config, 'HMM_CONFIG', {})
        self.covariance_type = hmm_config.get("covariance_type", "diag")
        self.n_iter = hmm_config.get("n_iter", 100)
        self.random_seed = hmm_config.get("random_seed", 42)
        self.tol = hmm_config.get("tol", 1e-6)

    def _covars_for_type(self, covars: np.ndarray) -> np.ndarray:
        """
        Convert full per-state covariances to the layout of ``covariance_type``.

        ``emissions_from_labels`` yields (n_states, D, D) matrices, while
        hmmlearn validates ``covars_`` against the configured covariance type:
        (n_states,) for "spherical", (n_states, D) for "diag",
        (D, D) for "tied" and (n_states, D, D) for "full".
        """
        covars = np.asarray(covars)
        if covars.ndim != 3 or self.covariance_type == "full":
            return covars
        if self.covariance_type == "diag":
            # np.diagonal returns a read-only view, hence the copy.
            return np.diagonal(covars, axis1=1, axis2=2).copy()
        if self.covariance_type == "spherical":
            return np.diagonal(covars, axis1=1, axis2=2).mean(axis=1)
        if self.covariance_type == "tied":
            return covars.mean(axis=0)
        return covars

    def _fit_supervised(self, X, lengths, y, n_states):
        """Build a label-initialised GaussianHMM, refine it with EM and store it."""
        startprob, transmat = empirical_start_trans(y, lengths, n_states)
        means, covars = emissions_from_labels(X, y, n_states)

        # init_params="" keeps hmmlearn from overwriting the label-based estimates.
        self.model = GaussianHMM(
            n_components=n_states,
            covariance_type=self.covariance_type,
            n_iter=self.n_iter,
            init_params="",
            random_state=self.random_seed,
            tol=self.tol
        )
        self.model.startprob_ = startprob
        self.model.transmat_ = transmat
        self.model.means_ = means
        self.model.covars_ = self._covars_for_type(covars)

        self.model.fit(X, lengths)
        return self.model

    def fit(self, X: np.ndarray, lengths: List[int], y: Optional[np.ndarray] = None) -> 'HMMModel':
        """
        Fit HMM to data.

        If y is provided, the model is initialised from the labels and then
        refined. Otherwise the already configured ``self.model`` (whose number
        of components was chosen by the caller) is fitted without labels.

        Raises:
        -------
        ValueError
            If y is None and no model has been configured yet.
        """
        if y is not None:
            labels = np.asarray(y)
            # Labels are used as array indices, so size by the largest label
            # rather than by the number of distinct labels.
            n_states = int(labels.max()) + 1 if labels.size else 0
            self._fit_supervised(X, lengths, y, n_states)
        else:
            if self.model is None:
                raise ValueError(
                    "Unsupervised fit needs a configured model: assign `model` "
                    "with n_components set, or use train_unsupervised()"
                )
            self.model.fit(X, lengths)

        return self

    def predict(self, X: np.ndarray, lengths: List[int]) -> np.ndarray:
        """Predict the state sequence with the fitted model.

        Raises ValueError when no model has been fitted yet.
        """
        if self.model is None:
            raise ValueError("Model must be fitted before prediction")
        return viterbi_decode(self.model, X, lengths)

    def get_state_mapping(self) -> Dict[int, str]:
        """Get mapping from state indices to state names (empty before training)."""
        return self._state_mapping or {}

    def train_supervised(self, X_train, lengths_train, X_test, lengths_test,
                         y_train, y_test, state_list, idx_to_state) -> Tuple:
        """
        Train supervised HMM with labeled data and report test performance.

        Returns:
        --------
        (y_pred_test, model, state_mapping) where state_mapping maps each
        state index to the name at the same position of ``state_list``.
        """
        print("\nTraining supervised HMM...")

        n_states = len(state_list)
        self._state_list = state_list
        self._idx_to_state = idx_to_state

        # Initialise from labels, then refine with EM.
        self._fit_supervised(X_train, lengths_train, y_train, n_states)

        # Decode and evaluate
        y_pred_test = self.predict(X_test, lengths_test)
        print_evaluation(y_test, y_pred_test, idx_to_state, state_list,
                         title="Supervised HMM (Test)")

        # Create state mapping (1:1 for supervised)
        state_mapping = {i: state_list[i] for i in range(n_states)}
        self._state_mapping = state_mapping

        return y_pred_test, self.model, state_mapping

    def train_unsupervised(self, X_train, lengths_train, X_test, lengths_test,
                           y_train, y_test, state_list, idx_to_state, n_unsup) -> Tuple:
        """
        Train unsupervised HMM and map its hidden states onto the labeled states.

        ``n_unsup`` is the number of hidden states; when None, the number of
        labeled states is used.

        Returns:
        --------
        (y_test_hat, model, state_mapping) where y_test_hat holds the raw
        (unmapped) hidden-state indices and state_mapping translates a
        hidden-state index into a state name.
        """
        print("\nTraining unsupervised HMM...")

        n_states = n_unsup if n_unsup is not None else len(state_list)
        self._state_list = state_list
        self._idx_to_state = idx_to_state

        # Train unsupervised HMM
        self.model = GaussianHMM(
            n_components=n_states,
            covariance_type=self.covariance_type,
            n_iter=self.n_iter,
            random_state=self.random_seed,
            tol=self.tol,
            init_params="stmc",
            params="stmc"
        )
        self.model.fit(X_train, lengths_train)

        # Predict and map states
        y_train_hat = self.predict(X_train, lengths_train)

        # Contingency matrix: rows = labeled states, columns = hidden states.
        # It is rectangular so that hidden states beyond len(state_list)
        # are counted instead of being dropped.
        K = len(state_list)
        cont = np.zeros((K, n_states), dtype=int)
        for t, p in zip(y_train, y_train_hat):
            if t < K:
                cont[t, p] += 1

        # Optimal one-to-one mapping using Hungarian algorithm
        row_ind, col_ind = linear_sum_assignment(cont.max() - cont)
        mapping = {int(pred): int(true) for true, pred in zip(row_ind, col_ind)}

        # Hidden states left over by the one-to-one assignment (n_states > K)
        # take the labeled state they co-occur with most often.
        for pred in range(n_states):
            if pred not in mapping:
                mapping[pred] = int(cont[:, pred].argmax())

        # Decode test set and map states
        y_test_hat = self.predict(X_test, lengths_test)
        y_test_mapped = np.array([mapping.get(s, 0) for s in y_test_hat], dtype=int)

        print_evaluation(y_test, y_test_mapped, idx_to_state, state_list,
                         title="Unsupervised HMM (mapped) — Test")

        # Create full mapping for event log
        state_mapping = {pred: idx_to_state[true] for pred, true in mapping.items() if true in idx_to_state}
        self._state_mapping = state_mapping

        return y_test_hat, self.model, state_mapping
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: Sensor2EventLog
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Knowledge-guided framework for transforming sensor data into event logs
|
|
5
|
+
Author: Azin Moradbeikie
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/azinmoradbeikie/Sensor2EventLog
|
|
8
|
+
Project-URL: Repository, https://github.com/azinmoradbeikie/Sensor2EventLog
|
|
9
|
+
Requires-Python: >=3.9
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: numpy>=1.21.0
|
|
13
|
+
Requires-Dist: pandas>=1.3.0
|
|
14
|
+
Requires-Dist: scipy>=1.7.0
|
|
15
|
+
Requires-Dist: scikit-learn>=0.24.0
|
|
16
|
+
Requires-Dist: hmmlearn>=0.2.8
|
|
17
|
+
Requires-Dist: matplotlib>=3.4.0
|
|
18
|
+
Requires-Dist: seaborn>=0.11.0
|
|
19
|
+
Dynamic: license-file
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<img src="https://raw.githubusercontent.com/azinmoradbeikie/Sensor2EventLog/main/images/Sensor2EventLog_001.png" width="300" />
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
[Documentation](https://sensor2eventlog.readthedocs.io/en/latest/?badge=latest)
|
|
26
|
+
|
|
27
|
+
# Sensor2EventLog
|
|
28
|
+
A knowledge-guided framework that transforms raw sensor data into process-aware event logs by incorporating Machine Teaching (MT) principles. The core of the framework is a modular abstraction layer embedded in an interactive teaching loop (planning, explaining, and reviewing) in which human experts map sensor behaviors to their corresponding process states.
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
## Setup
|
|
33
|
+
|
|
34
|
+
- Clone the repository:
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/azinmoradbeikie/Sensor2EventLog.git
```
|
|
37
|
+
- Install dependencies
|
|
38
|
+
```bash
|
|
39
|
+
pip install -r requirements.txt
|
|
40
|
+
cd Sensor2EventLog
```
|
|
41
|
+
- Running the Analysis
|
|
42
|
+
```bash
|
|
43
|
+
python3 main.py
```
|
|
44
|
+
|
|
45
|
+
## Tutorial
|
|
46
|
+
|
|
47
|
+
A self-contained toy walkthrough is available in [`tutorial/`](tutorial/README.md).
|
|
48
|
+
|
|
49
|
+
Run it with:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
python3 tutorial/toy_walkthrough.py
|
|
53
|
+
```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
S2E_v2.py,sha256=_M6i1fUQ801E_a83pSffE0Mh9io_Rpi11op7MmHQEck,46621
|
|
2
|
+
config.py,sha256=SyGlw-g3OcxSG5l3x57rjVgkpaBOLGRMxLDXW0QyfpI,1079
|
|
3
|
+
main.py,sha256=8ZNdhT86oZFw28cRr5aeqwAxB4rAzf_fcMRJl6hlqrQ,13768
|
|
4
|
+
abstraction/mt_loop.py,sha256=U4RSiigFTv-qgGt-csaid95sE2wjsJk-vL1UBXNi7dY,5338
|
|
5
|
+
contextualization/event_log.py,sha256=_2SE7dbfcukKZkHcqd6rl0k7M8j0v9cx7DtaeVl_D2w,12196
|
|
6
|
+
core/__init__.py,sha256=ZSybjdo1g9g9dSWL3r1-KeC7WEatvT_UqX_zxyd5Flc,86
|
|
7
|
+
core/pipeline.py,sha256=vqvEfFnvaCazSdzQg8hofkrs1Khujm-bVzsLpsZ66vE,6828
|
|
8
|
+
evaluation/rule_analyzer.py,sha256=mi7tsxO8TWdE6F4y6akqThWjrk5kKc-_bqPPCQDOU5c,11352
|
|
9
|
+
features/feature_library.py,sha256=jXZ4qH3NG7B_Vp9k4H27jbSN6bpjlAKyNAIZ2TVTXBc,11879
|
|
10
|
+
models/base_model.py,sha256=T9inyiwAeF30KRQRXJN4xUstvl44BG5mEoO3TbSbKkk,1764
|
|
11
|
+
models/hmm_model.py,sha256=6HAFXQtfp9WTMvMf9LA_MElg9rEB-jdZuUZsKuvSWUE,6593
|
|
12
|
+
sensor2eventlog-2.0.0.dist-info/licenses/LICENSE,sha256=IMc9ORhSJ07tr8EolCdcfbQG4k0f6-9OwDaE22yGkns,1077
|
|
13
|
+
utils/__init__.py,sha256=ZmQOyX7EFv5xkDBg1FAfoyf16qqcweapZYA_2H1ZqGI,718
|
|
14
|
+
utils/hmm_utils.py,sha256=lERCRRDqB9zB-aPuBrPNJZs5zPlK6I0s711sHFjEGxk,11976
|
|
15
|
+
sensor2eventlog-2.0.0.dist-info/METADATA,sha256=QiGodv2Vzcc1gB-h7E8ejbLRGBkyN36zARvXxBlDhEo,1803
|
|
16
|
+
sensor2eventlog-2.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
17
|
+
sensor2eventlog-2.0.0.dist-info/top_level.txt,sha256=IiPSdf5ZLi1NUq0dwUwB8aTFqOXEC-Z1xGeDw_Q1aoc,87
|
|
18
|
+
sensor2eventlog-2.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Azin Moradbeikie
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HMM Process Analyzer Utilities
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Re-export from their new locations
|
|
6
|
+
from features.feature_library import ModularFeatureLibrary
|
|
7
|
+
from evaluation.rule_analyzer import RuleDiagnosticAnalyzer
|
|
8
|
+
|
|
9
|
+
# Import from local hmm_utils
|
|
10
|
+
from .hmm_utils import (
|
|
11
|
+
empirical_start_trans,
|
|
12
|
+
emissions_from_labels,
|
|
13
|
+
viterbi_decode,
|
|
14
|
+
print_evaluation,
|
|
15
|
+
create_interval_event_log_normalized,
|
|
16
|
+
filter_brief_states,
|
|
17
|
+
normalize_timestamps
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
__all__ = [
|
|
21
|
+
'ModularFeatureLibrary',
|
|
22
|
+
'RuleDiagnosticAnalyzer',
|
|
23
|
+
'empirical_start_trans',
|
|
24
|
+
'emissions_from_labels',
|
|
25
|
+
'viterbi_decode',
|
|
26
|
+
'print_evaluation',
|
|
27
|
+
'create_interval_event_log_normalized',
|
|
28
|
+
'filter_brief_states',
|
|
29
|
+
'normalize_timestamps'
|
|
30
|
+
]
|
utils/hmm_utils.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HMM utility functions for training, evaluation, and event log generation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from hmmlearn.hmm import GaussianHMM
|
|
8
|
+
from sklearn.preprocessing import StandardScaler
|
|
9
|
+
from sklearn.metrics import confusion_matrix, classification_report
|
|
10
|
+
from scipy.optimize import linear_sum_assignment
|
|
11
|
+
import matplotlib.pyplot as plt
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def empirical_start_trans(labels, lengths, n_states):
    """Estimate startprob_ and transmat_ from labeled sequences.

    Parameters:
    -----------
    labels : array-like of int
        State index of every sample, all sequences concatenated.
    lengths : iterable of int
        Length of each sequence inside ``labels``.
    n_states : int
        Number of states; every label must be smaller than this.

    Returns:
    --------
    (start, trans) : start probabilities of shape (n_states,) and a
    row-stochastic transition matrix of shape (n_states, n_states).
    """
    # Work on a positional integer array: a pandas Series would be indexed by
    # label rather than by position, and float labels cannot be used as indices.
    labels = np.asarray(labels).astype(int)

    start = np.zeros(n_states)
    trans = np.zeros((n_states, n_states))
    offset = 0
    for length in lengths:
        seq = labels[offset:offset + length]
        offset += length
        if seq.size == 0:
            # An empty sequence has no start state and no transitions.
            continue
        start[seq[0]] += 1
        for src, dst in zip(seq[:-1], seq[1:]):
            trans[src, dst] += 1

    # normalize with small epsilon to avoid zeros
    start = (start + 1e-6) / (start.sum() + 1e-6 * n_states)
    trans = trans + 1e-6
    trans /= trans.sum(axis=1, keepdims=True)
    return start, trans
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def emissions_from_labels(X_np, labels_np, n_states):
    """Compute means and covariances per labeled state.

    Parameters:
    -----------
    X_np : array-like, shape (n_samples, D)
        Feature matrix.
    labels_np : array-like, shape (n_samples,)
        State index of every sample.
    n_states : int
        Number of states to estimate parameters for.

    Returns:
    --------
    (means, covars) : arrays of shape (n_states, D) and (n_states, D, D).
    States with fewer than two samples get a zero mean and a 1e-2 * identity
    covariance.
    """
    # Coerce to arrays so that `labels == s` is an element-wise mask even for
    # plain lists, and so that row selection is positional.
    X_np = np.asarray(X_np)
    labels_np = np.asarray(labels_np)

    D = X_np.shape[1]
    means = np.zeros((n_states, D))
    covars = np.zeros((n_states, D, D))
    for s in range(n_states):
        Xi = X_np[labels_np == s]
        if len(Xi) < 2:
            # fallback tiny variance: a covariance cannot be estimated
            means[s] = 0.0
            covars[s] = np.eye(D) * 1e-2
        else:
            means[s] = Xi.mean(axis=0)
            # np.cov returns a 0-d array for a single feature; make it (D, D).
            # The small diagonal term keeps the matrix positive definite.
            covars[s] = np.atleast_2d(np.cov(Xi, rowvar=False)) + np.eye(D) * 1e-6
    return means, covars
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def viterbi_decode(model, X_np, lengths):
    """Decode states by delegating to ``model.predict(X_np, lengths)``."""
    decoded_states = model.predict(X_np, lengths)
    return decoded_states
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def print_evaluation(y_true_idx, y_pred_idx, idx_to_state, state_list, title=""):
    """Print a classification report followed by the confusion matrix.

    True indices are looked up directly in ``idx_to_state``; predicted
    indices missing from it are reported as ``UNK<index>``.
    """
    true_names = [idx_to_state[idx] for idx in y_true_idx]
    pred_names = []
    for idx in y_pred_idx:
        pred_names.append(idx_to_state.get(idx, f"UNK{idx}"))

    print(f"\n== {title} ==")
    report = classification_report(true_names, pred_names, labels=state_list, zero_division=0)
    print(report)

    matrix = confusion_matrix(true_names, pred_names, labels=state_list)
    print("Confusion matrix (rows=true, cols=pred):")
    matrix_table = pd.DataFrame(matrix, index=state_list, columns=state_list)
    print(matrix_table)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def normalize_timestamps(df, timestamp_col="timestamp", case_id_col="batch_id", base_date="2023-01-01"):
    """
    Normalize timestamps so that every case starts at ``base_date``.

    Numeric timestamps are interpreted as seconds; anything else is parsed
    with ``pd.to_datetime``. The input frame is never modified.

    Parameters:
    -----------
    df : pd.DataFrame
        Data containing ``timestamp_col`` and ``case_id_col``.
    timestamp_col : str
        Name of the timestamp column.
    case_id_col : str
        Name of the column identifying the case each row belongs to.
    base_date : str
        Date every case is shifted to start at.

    Returns:
    --------
    pd.DataFrame
        A copy of ``df`` with normalized timestamps, or ``df`` itself
        (unchanged) when the timestamps cannot be parsed.
    """
    df_normalized = df.copy()

    # Check timestamp format
    print(f"Original timestamp sample: {df[timestamp_col].iloc[:5].tolist()}")

    base_datetime = pd.to_datetime(base_date)
    raw = df[timestamp_col]

    # Convert numeric seconds to datetime or parse as datetime.
    # The pandas type helpers also accept extension dtypes such as "Int64".
    if pd.api.types.is_numeric_dtype(raw) and not pd.api.types.is_bool_dtype(raw):
        print("Timestamps are numeric - assuming they represent seconds")
        df_normalized[timestamp_col] = base_datetime + pd.to_timedelta(raw, unit='s')
    else:
        try:
            df_normalized[timestamp_col] = pd.to_datetime(raw)
        except (ValueError, TypeError, OverflowError):
            # Only parsing failures are treated as "bad format".
            print("Could not parse timestamps. Please check the format.")
            return df
        print("Timestamps successfully parsed as datetime")

    # Normalize each case to start at base_date
    stamps = df_normalized[timestamp_col]
    case_starts = df_normalized.groupby(case_id_col, dropna=False)[timestamp_col].transform("min")
    shifted = base_datetime + (stamps - case_starts)

    # Rows without a case id belong to no case and keep their converted timestamp.
    missing_case = df_normalized[case_id_col].isna()
    if missing_case.any():
        shifted = shifted.where(~missing_case, stamps)
    df_normalized[timestamp_col] = shifted

    return df_normalized
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def create_interval_event_log_normalized(df, y_pred, state_mapping,
                                         case_id_col="batch_id", timestamp_col="timestamp"):
    """
    Create interval-based event log using normalized timestamps.

    Within every case the rows are ordered by timestamp, and each run of
    consecutive rows sharing the same predicted state becomes one event.

    Parameters:
    -----------
    df : pd.DataFrame
        Sample-level data containing ``case_id_col`` and ``timestamp_col``.
    y_pred : iterable
        Predicted state index for every row of ``df``, in row order.
    state_mapping : dict
        State index -> activity name; missing indices become ``Unknown_<index>``.
    case_id_col, timestamp_col : str
        Column names of the case identifier and the timestamp.

    Returns:
    --------
    pd.DataFrame
        Columns: case_id, activity_sequence, activity, start_timestamp,
        end_timestamp, duration_seconds, event_count. Empty input gives an
        empty frame with these columns.

    Raises:
    -------
    ValueError
        If ``y_pred`` and ``df`` differ in length.
    """
    columns = ['case_id', 'activity_sequence', 'activity',
               'start_timestamp', 'end_timestamp',
               'duration_seconds', 'event_count']

    predicted = [state_mapping.get(i, f"Unknown_{i}") for i in y_pred]
    if len(predicted) != len(df):
        raise ValueError(
            f"y_pred has {len(predicted)} entries but df has {len(df)} rows"
        )

    df_with_pred = df.copy()
    df_with_pred['predicted_state'] = predicted

    event_log_segments = []

    # sort=False keeps the cases in order of first appearance.
    for case_id, case_data in df_with_pred.groupby(case_id_col, sort=False):
        # Stable sort: rows with equal timestamps keep their original order.
        case_data = case_data.sort_values(timestamp_col, kind="stable")

        # Plain lists give positional access, so duplicate index labels are harmless.
        states = case_data['predicted_state'].tolist()
        stamps = case_data[timestamp_col].tolist()

        run_start = 0
        n_rows = len(states)
        for pos in range(1, n_rows + 1):
            if pos < n_rows and states[pos] == states[run_start]:
                continue  # still inside the current run

            segment_start = stamps[run_start]
            segment_end = stamps[pos - 1]
            duration = (pd.to_datetime(segment_end) - pd.to_datetime(segment_start)).total_seconds()
            event_log_segments.append({
                'case_id': case_id,
                'activity': states[run_start],
                'start_timestamp': segment_start,
                'end_timestamp': segment_end,
                'duration_seconds': duration,
                'event_count': pos - run_start
            })
            run_start = pos

    # Explicit columns keep the frame well-formed when no segment was produced.
    event_log = pd.DataFrame(
        event_log_segments,
        columns=[name for name in columns if name != 'activity_sequence']
    )
    event_log['activity_sequence'] = event_log.groupby('case_id').cumcount() + 1

    return event_log[columns]
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def filter_brief_states(event_log, min_duration_seconds=5.0):
    """
    Remove state segments that are too brief by merging them with adjacent states.

    A brief first segment is absorbed by the segment that follows it; any
    other brief segment is absorbed by the segment kept before it, so a chain
    of brief segments accumulates into one. A case with a single segment is
    kept as is. A merged segment keeps the activity of the absorbing segment,
    and its duration and event count are the sums of the merged parts.

    Parameters:
    -----------
    event_log : pd.DataFrame
        Interval event log with columns case_id, activity, start_timestamp,
        end_timestamp, duration_seconds and event_count.
    min_duration_seconds : float
        Segments shorter than this are merged away.

    Returns:
    --------
    pd.DataFrame
        Filtered log with the input's columns and a recomputed
        ``activity_sequence`` (appended when the input had none).
    """
    columns = list(event_log.columns)
    if 'activity_sequence' not in columns:
        columns.append('activity_sequence')

    if event_log.empty:
        return pd.DataFrame(columns=columns)

    filtered_segments = []

    # sort=False keeps the cases in order of first appearance.
    for _, case_data in event_log.groupby('case_id', sort=False):
        segments = case_data.to_dict('records')
        kept = []

        i = 0
        while i < len(segments):
            current = segments[i]
            is_brief = (current['duration_seconds'] < min_duration_seconds
                        and len(segments) > 1)

            if not is_brief:
                # Keep segments that are long enough
                kept.append(current)
                i += 1
            elif not kept:
                # First segment - merge with next (exists because len > 1)
                following = segments[i + 1]
                merged = dict(following)
                merged['start_timestamp'] = current['start_timestamp']
                merged['duration_seconds'] = current['duration_seconds'] + following['duration_seconds']
                merged['event_count'] = current['event_count'] + following['event_count']
                kept.append(merged)
                i += 2  # Skip next segment since we merged it
            else:
                # Extend the segment kept last, which may itself be the result
                # of earlier merges; this way nothing merged before is lost.
                previous = kept[-1]
                previous['end_timestamp'] = current['end_timestamp']
                previous['duration_seconds'] = previous['duration_seconds'] + current['duration_seconds']
                previous['event_count'] = previous['event_count'] + current['event_count']
                i += 1

        filtered_segments.extend(kept)

    # Create new event log with a fixed column order
    filtered_log = pd.DataFrame(filtered_segments, columns=columns)

    # Recalculate activity sequence
    filtered_log['activity_sequence'] = filtered_log.groupby('case_id').cumcount() + 1

    return filtered_log
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def create_gantt_chart(event_log, max_cases=10, figsize=(14, 8), color_map='Set3'):
    """
    Create Gantt chart visualization of process execution.

    Every case becomes one horizontal lane; every event-log row becomes a bar
    spanning its start and end timestamps, colored by activity.

    Parameters:
    -----------
    event_log : pd.DataFrame
        Event log with columns case_id, activity, start_timestamp, end_timestamp.
    max_cases : int
        Only the first ``max_cases`` case ids are drawn.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    color_map : str
        Name of the Matplotlib colormap the activity colors are sampled from.

    Returns:
    --------
    The ``matplotlib.pyplot`` module, with the chart as the current figure.
    """
    plt.figure(figsize=figsize)

    activities = event_log['activity'].unique()
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap was removed in
    # Matplotlib 3.9.
    colors = plt.get_cmap(color_map)(np.linspace(0, 1, len(activities)))
    color_dict = dict(zip(activities, colors))

    case_ids = event_log['case_id'].unique()[:max_cases]

    for lane, case_id in enumerate(case_ids):
        case_data = event_log[event_log['case_id'] == case_id]

        for _, activity_row in case_data.iterrows():
            start = pd.to_datetime(activity_row['start_timestamp'])
            end = pd.to_datetime(activity_row['end_timestamp'])
            span = end - start
            duration = span.total_seconds() / 3600  # Convert to hours

            # The width is passed as a Timedelta: on a date axis a plain
            # number would be read in the axis' own units rather than hours.
            plt.barh(y=lane, width=span, left=start,
                     color=color_dict[activity_row['activity']],
                     edgecolor='black', alpha=0.7)

            # Add activity label for longer segments
            if duration > 0.1:  # Only label segments longer than 6 minutes
                plt.text(start + span / 2, lane,
                         activity_row['activity'], ha='center', va='center',
                         fontsize=8, fontweight='bold')

    plt.yticks(range(len(case_ids)), case_ids)
    plt.xlabel('Time (from normalized start)')
    plt.ylabel('Case ID')
    plt.title(f'Process Execution Gantt Chart (First {len(case_ids)} Cases)')

    # Create legend
    legend_patches = [plt.Rectangle((0, 0), 1, 1, color=color_dict[act]) for act in activities]
    plt.legend(legend_patches, activities, bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    return plt
|