pattern_detector-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pattern_detection/__init__.py +3 -0
- pattern_detection/detector.py +152 -0
- pattern_detection/utils.py +87 -0
- pattern_detector-0.1.0.dist-info/LICENSE +21 -0
- pattern_detector-0.1.0.dist-info/METADATA +17 -0
- pattern_detector-0.1.0.dist-info/RECORD +10 -0
- pattern_detector-0.1.0.dist-info/WHEEL +5 -0
- pattern_detector-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_detector.py +27 -0
pattern_detection/detector.py
ADDED
@@ -0,0 +1,152 @@
import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
from joblib import Parallel, delayed

class PatternDetector:
    def __init__(self, df, pattern, column_pattern):
        self.df = df.copy()
        self.pattern = pattern
        self.column_pattern = column_pattern
        self.similarity_dict = {}
        self.pattern1 = None
        self.window_size = None
        self.bin_parser = None
        self.len_iter = None
        self.pattern_constraints = {}

    def preprocess_pattern(self):
        """Preprocess the pattern data."""
        len_iter = 400 if len(self.pattern) >= 400 else 200
        bin_parser = 3 if len_iter == 400 else 2

        self.pattern['bin'] = self.pattern.index // bin_parser
        self.pattern1 = self.pattern.groupby('bin')[self.column_pattern].mean().to_numpy()
        self.len_iter = len_iter
        self.bin_parser = bin_parser
        self.window_size = len(self.pattern1)

        # Compute constraints
        self.pattern_constraints = {
            "max_pos": np.max(self.pattern1) + 0.1 * np.ptp(self.pattern1),
            "min_pos": np.min(self.pattern1) - 0.1 * np.ptp(self.pattern1),
            "mean_pos_upper": np.mean(self.pattern1) + 0.1 * np.ptp(self.pattern1),
            "mean_pos_lower": np.mean(self.pattern1) - 0.1 * np.ptp(self.pattern1),
            "pattern_skewness": skew(self.pattern1),
            "pattern_kurtosis": kurtosis(self.pattern1),
            "pattern_std": np.std(self.pattern1),
            "starting_point_lower": self.pattern1[0] - 0.2 * np.ptp(self.pattern1),
            "starting_point_upper": self.pattern1[0] + 0.2 * np.ptp(self.pattern1),
            "ending_point_lower": self.pattern1[-1] - 0.2 * np.ptp(self.pattern1),
            "ending_point_upper": self.pattern1[-1] + 0.2 * np.ptp(self.pattern1),
            "cross_correlation_threshold": 0.5,
        }

    def preprocess_data(self):
        """Preprocess the main data."""
        self.df['bin'] = self.df.index // self.bin_parser
        return self.df.groupby('bin')[self.column_pattern].mean().to_numpy().reshape(-1, 1)

    def apply_constraints(self, window):
        """Apply constraints to filter valid windows."""
        pc = self.pattern_constraints
        corr_coef = np.corrcoef(window, self.pattern1)[0][1]

        if (
            np.max(window) <= pc["max_pos"]
            and np.min(window) >= pc["min_pos"]
            and pc["mean_pos_upper"] >= np.mean(window) >= pc["mean_pos_lower"]
            and abs(skew(window) - pc["pattern_skewness"]) < 0.5
            and abs(kurtosis(window) - pc["pattern_kurtosis"]) < 1.0
            and pc["pattern_std"] * 0.9 <= np.std(window) <= pc["pattern_std"] * 1.1
            and corr_coef >= pc["cross_correlation_threshold"]
            and pc["starting_point_lower"] <= window[0] <= pc["starting_point_upper"]
            and pc["ending_point_lower"] <= window[-1] <= pc["ending_point_upper"]
        ):
            return True
        return False

    def compute_cosine_sim(self, data1, i, j):
        """Compute cosine similarity for a given sliding window."""
        window = data1[i:i + self.window_size - (self.len_iter // 2) + j, :].reshape(-1,)

        if len(window) != len(self.pattern1):  # Ensure dimensions match
            return i, j, 0

        # Apply constraints
        if not self.apply_constraints(window):
            return i, j, 0

        fft_pattern = np.fft.fft(self.pattern1)
        fft_window = np.fft.fft(window)

        dot_product = np.dot(np.abs(fft_pattern), np.abs(fft_window))
        norm_pattern = np.linalg.norm(np.abs(fft_pattern))
        norm_window = np.linalg.norm(np.abs(fft_window))
        similarity = dot_product / (norm_pattern * norm_window)

        return i, j, similarity

    def calculate_similarity(self):
        """Calculate sliding window cosine similarity."""
        data1 = self.preprocess_data()

        results = Parallel(n_jobs=-1)(
            delayed(self.compute_cosine_sim)(data1, i, j)
            for i in range(0, len(data1) - self.window_size, 2)
            for j in range(0, self.len_iter, self.len_iter // 40)
        )

        for i, j, similarity in results:
            if similarity > 0:
                self.similarity_dict.setdefault(i, {})[j] = similarity

    def get_top_similarities(self):
        """Extract top similarities from the similarity dictionary."""
        results = [
            {'key': key1, 'max_key': max(value, key=value.get), 'max_value': max(value.values())}
            for key1, value in self.similarity_dict.items()
        ]
        return pd.DataFrame(results)

    def find_area_of_interest(self):
        """Find areas of interest in the data."""
        self.preprocess_pattern()
        self.calculate_similarity()
        df_dist = self.get_top_similarities()

        approx_cycle_length = len(self.pattern1) * 0.95
        df_dist['app_cycle'] = (df_dist['key'] // approx_cycle_length).astype(int)
        grouped = df_dist.groupby('app_cycle')

        cyc_concat_df = pd.concat(
            [
                group.loc[[group['max_value'].idxmax()]].assign(cycle=idx_cyc)  # [[...]] keeps a one-row DataFrame so .assign works
                for idx_cyc, (_, group) in enumerate(grouped)
                if not group.empty and group['max_value'].max() != 0
            ],
            ignore_index=True
        )

        cyc_concat_df['start_index'] = cyc_concat_df['key']
        cyc_concat_df['end_index'] = (
            cyc_concat_df['start_index'] + self.window_size +
            cyc_concat_df['max_key'] - (self.len_iter // 2)
        )
        cyc_concat_df['shift_start'] = cyc_concat_df['start_index'].shift(1, fill_value=len(self.pattern1))
        cyc_concat_df['diff'] = cyc_concat_df['shift_start'] - cyc_concat_df['start_index']
        limit = len(self.pattern1) * 0.7
        cyc_concat_df = cyc_concat_df[cyc_concat_df['diff'] < -limit].reset_index(drop=True)
        cyc_concat_df['cycle'] = cyc_concat_df.index

        cyc_concat_df['shift_end'] = cyc_concat_df['end_index'].shift(1, fill_value=cyc_concat_df['diff'].iloc[0])
        cyc_concat_df['diff_end'] = cyc_concat_df['shift_end'] - cyc_concat_df['start_index']
        overlap = cyc_concat_df['diff_end'] > 0
        cyc_concat_df.loc[overlap, 'start_index'] += cyc_concat_df.loc[overlap, 'diff_end'] + 1

        self.df['cycle'] = np.nan
        for _, row in cyc_concat_df.iterrows():
            start, stop = int(row['start_index'] * self.bin_parser), int(row['end_index'] * self.bin_parser)
            self.df.loc[start:stop, 'cycle'] = row['cycle']

        return self.df
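For orientation, a minimal usage sketch of PatternDetector (not part of the package contents above). The sine-bump signal and the column name "value" are illustrative assumptions, and the hard-coded constraints (e.g. the 0.5 cross-correlation threshold) can reject every window on other data:

import numpy as np
import pandas as pd
from pattern_detection.detector import PatternDetector

# Synthetic signal: the same bump repeated four times at slightly different scales.
bump = np.sin(np.linspace(0, np.pi, 500))
signal = np.concatenate([bump, 0.95 * bump, 1.05 * bump, bump])
df = pd.DataFrame({"value": signal})

# The query pattern is a single occurrence of the bump.
pattern = pd.DataFrame({"value": bump})

detector = PatternDetector(df, pattern, column_pattern="value")
result = detector.find_area_of_interest()

# Rows inside a detected occurrence carry a cycle id; all others stay NaN.
print(result["cycle"].dropna().unique())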
pattern_detection/utils.py
ADDED
@@ -0,0 +1,87 @@
import numpy as np
from scipy.stats import skew, kurtosis

def calculate_statistics(data):
    """
    Calculate various statistics for a given dataset.

    Parameters:
    - data (array-like): The input data.

    Returns:
    - dict: Dictionary containing max, min, mean, std, skewness, and kurtosis.
    """
    return {
        "max": np.max(data),
        "min": np.min(data),
        "mean": np.mean(data),
        "std": np.std(data),
        "skewness": skew(data),
        "kurtosis": kurtosis(data),
    }

def normalize_data(data):
    """
    Normalize the input data to range [0, 1].

    Parameters:
    - data (array-like): Input data to normalize.

    Returns:
    - np.ndarray: Normalized data.
    """
    min_val = np.min(data)
    max_val = np.max(data)
    return (data - min_val) / (max_val - min_val)

def validate_window(window, constraints):
    """
    Validate a sliding window against constraints.

    Parameters:
    - window (array-like): The input window data.
    - constraints (dict): A dictionary of constraints with keys like 'max_pos', 'min_pos', etc.

    Returns:
    - bool: True if all constraints are satisfied, False otherwise.
    """
    corr_coef = np.corrcoef(window, constraints['pattern'])[0, 1]

    return (
        np.max(window) <= constraints["max_pos"]
        and np.min(window) >= constraints["min_pos"]
        and constraints["mean_pos_upper"] >= np.mean(window) >= constraints["mean_pos_lower"]
        and abs(skew(window) - constraints["pattern_skewness"]) < 0.5
        and abs(kurtosis(window) - constraints["pattern_kurtosis"]) < 1.0
        and constraints["pattern_std"] * 0.9 <= np.std(window) <= constraints["pattern_std"] * 1.1
        and corr_coef >= constraints["cross_correlation_threshold"]
        and constraints["starting_point_lower"] <= window[0] <= constraints["starting_point_upper"]
        and constraints["ending_point_lower"] <= window[-1] <= constraints["ending_point_upper"]
    )

def calculate_correlation(data1, data2):
    """
    Calculate correlation coefficient between two datasets.

    Parameters:
    - data1 (array-like): First dataset.
    - data2 (array-like): Second dataset.

    Returns:
    - float: Correlation coefficient.
    """
    return np.corrcoef(data1, data2)[0, 1]

def bin_data(data, bin_size):
    """
    Bin the data into averages over fixed-size intervals.

    Parameters:
    - data (array-like): The input data.
    - bin_size (int): The size of the bins.

    Returns:
    - np.ndarray: Binned data.
    """
    binned = data[:len(data) // bin_size * bin_size].reshape(-1, bin_size)
    return np.mean(binned, axis=1)
pattern_detector-0.1.0.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Yigit Utku Bulut and Ahmet Faruk Minareci

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
pattern_detector-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,17 @@
Metadata-Version: 2.1
Name: pattern_detector
Version: 0.1.0
Summary: A library for detecting patterns in time-series data.
Author: Yigit Utku Bulut and Ahmet Faruk Minareci
Author-email: yigit.utku.bulut@gmail.com, ahmetfaruk.minareci@gmail.com
License: MIT
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.7
License-File: LICENSE
Requires-Dist: numpy
Requires-Dist: pandas
Requires-Dist: joblib
Requires-Dist: scipy
pattern_detector-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
pattern_detection/__init__.py,sha256=hzZhj16aof1dhcIOAI9Tfo5EoPJgexGChsnCLgXq-68,68
pattern_detection/detector.py,sha256=XaYYRledSolgTPTYLRSWhjZhmnAAvZT2pD3h5g9kP9k,6585
pattern_detection/utils.py,sha256=jqkfeTqWEnrnvzKBxr_UdKj9cj7zPF15UhHPr9sBSa0,2646
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/test_detector.py,sha256=biHGbqd6Uc8dFTtyK-BxYeqFYtJOO9lk_6rQMeQCbQs,972
pattern_detector-0.1.0.dist-info/LICENSE,sha256=RslT26mCGxF9nQEyEQL8kFMFUgmYvCA0UivIWefxWmU,1098
pattern_detector-0.1.0.dist-info/METADATA,sha256=9pgMSSHwImJHjA1AOAOJm-ou2MsYnNFwPa6AGvl-fow,536
pattern_detector-0.1.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
pattern_detector-0.1.0.dist-info/top_level.txt,sha256=W_tzR9Ah3_TgQsiGDs46ELePkuxfcPmFOosXSYw2hl0,24
pattern_detector-0.1.0.dist-info/RECORD,,
tests/__init__.py
ADDED
File without changes
tests/test_detector.py
ADDED
@@ -0,0 +1,27 @@
import pytest
import numpy as np
import pandas as pd
from pattern_detection.detector import PatternDetector

def test_preprocess_pattern():
    pattern = pd.DataFrame({"value": range(10)})
    detector = PatternDetector(pd.DataFrame(), pattern, "value")  # empty frame: __init__ calls df.copy(), so None would raise
    detector.preprocess_pattern()
    assert detector.pattern1 is not None
    assert len(detector.pattern1) > 0

def test_calculate_similarity():
    data = pd.DataFrame({"value": range(100)})
    pattern = pd.DataFrame({"value": range(10)})
    detector = PatternDetector(data, pattern, "value")
    detector.preprocess_pattern()
    detector.calculate_similarity()
    assert len(detector.similarity_dict) > 0

def test_find_area_of_interest():
    data = pd.DataFrame({"value": range(100)})
    pattern = pd.DataFrame({"value": range(10)})
    detector = PatternDetector(data, pattern, "value")
    result = detector.find_area_of_interest()
    assert "cycle" in result.columns
    assert not result["cycle"].isnull().all()