pattern_detector-0.1.0-py3-none-any.whl
- pattern_detection/__init__.py +3 -0
- pattern_detection/detector.py +152 -0
- pattern_detection/utils.py +87 -0
- pattern_detector-0.1.0.dist-info/LICENSE +21 -0
- pattern_detector-0.1.0.dist-info/METADATA +17 -0
- pattern_detector-0.1.0.dist-info/RECORD +10 -0
- pattern_detector-0.1.0.dist-info/WHEEL +5 -0
- pattern_detector-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_detector.py +27 -0
pattern_detection/detector.py
ADDED
@@ -0,0 +1,152 @@
+import numpy as np
+import pandas as pd
+from scipy.stats import skew, kurtosis
+from joblib import Parallel, delayed
+
+class PatternDetector:
+    def __init__(self, df, pattern, column_pattern):
+        self.df = df.copy()
+        self.pattern = pattern
+        self.column_pattern = column_pattern
+        self.similarity_dict = {}
+        self.pattern1 = None
+        self.window_size = None
+        self.bin_parser = None
+        self.len_iter = None
+        self.pattern_constraints = {}
+
+    def preprocess_pattern(self):
+        """Preprocess the pattern data."""
+        len_iter = 400 if len(self.pattern) >= 400 else 200
+        bin_parser = 3 if len_iter == 400 else 2
+
+        self.pattern['bin'] = self.pattern.index // bin_parser
+        self.pattern1 = self.pattern.groupby('bin')[self.column_pattern].mean().to_numpy()
+        self.len_iter = len_iter
+        self.bin_parser = bin_parser
+        self.window_size = len(self.pattern1)
+
+        # Compute constraints
+        self.pattern_constraints = {
+            "max_pos": np.max(self.pattern1) + 0.1 * np.ptp(self.pattern1),
+            "min_pos": np.min(self.pattern1) - 0.1 * np.ptp(self.pattern1),
+            "mean_pos_upper": np.mean(self.pattern1) + 0.1 * np.ptp(self.pattern1),
+            "mean_pos_lower": np.mean(self.pattern1) - 0.1 * np.ptp(self.pattern1),
+            "pattern_skewness": skew(self.pattern1),
+            "pattern_kurtosis": kurtosis(self.pattern1),
+            "pattern_std": np.std(self.pattern1),
+            "starting_point_lower": self.pattern1[0] - 0.2 * np.ptp(self.pattern1),
+            "starting_point_upper": self.pattern1[0] + 0.2 * np.ptp(self.pattern1),
+            "ending_point_lower": self.pattern1[-1] - 0.2 * np.ptp(self.pattern1),
+            "ending_point_upper": self.pattern1[-1] + 0.2 * np.ptp(self.pattern1),
+            "cross_correlation_threshold": 0.5,
+        }
+
+    def preprocess_data(self):
+        """Preprocess the main data."""
+        self.df['bin'] = self.df.index // self.bin_parser
+        return self.df.groupby('bin')[self.column_pattern].mean().to_numpy().reshape(-1, 1)
+
+    def apply_constraints(self, window):
+        """Apply constraints to filter valid windows."""
+        pc = self.pattern_constraints
+        corr_coef = np.corrcoef(window, self.pattern1)[0][1]
+
+        if (
+            np.max(window) <= pc["max_pos"]
+            and np.min(window) >= pc["min_pos"]
+            and pc["mean_pos_upper"] >= np.mean(window) >= pc["mean_pos_lower"]
+            and abs(skew(window) - pc["pattern_skewness"]) < 0.5
+            and abs(kurtosis(window) - pc["pattern_kurtosis"]) < 1.0
+            and pc["pattern_std"] * 0.9 <= np.std(window) <= pc["pattern_std"] * 1.1
+            and corr_coef >= pc["cross_correlation_threshold"]
+            and pc["starting_point_lower"] <= window[0] <= pc["starting_point_upper"]
+            and pc["ending_point_lower"] <= window[-1] <= pc["ending_point_upper"]
+        ):
+            return True
+        return False
+
+    def compute_cosine_sim(self, data1, i, j):
+        """Compute cosine similarity for a given sliding window."""
+        window = data1[i:i + self.window_size - (self.len_iter // 2) + j, :].reshape(-1,)
+
+        if len(window) != len(self.pattern1):  # Ensure dimensions match
+            return i, j, 0
+
+        # Apply constraints
+        if not self.apply_constraints(window):
+            return i, j, 0
+
+        fft_pattern = np.fft.fft(self.pattern1)
+        fft_window = np.fft.fft(window)
+
+        dot_product = np.dot(np.abs(fft_pattern), np.abs(fft_window))
+        norm_pattern = np.linalg.norm(np.abs(fft_pattern))
+        norm_window = np.linalg.norm(np.abs(fft_window))
+        similarity = dot_product / (norm_pattern * norm_window)
+
+        return i, j, similarity
+
+    def calculate_similarity(self):
+        """Calculate sliding window cosine similarity."""
+        data1 = self.preprocess_data()
+
+        results = Parallel(n_jobs=-1)(
+            delayed(self.compute_cosine_sim)(data1, i, j)
+            for i in range(0, len(data1) - self.window_size, 2)
+            for j in range(0, self.len_iter, self.len_iter // 40)
+        )
+
+        for i, j, similarity in results:
+            if similarity > 0:
+                self.similarity_dict.setdefault(i, {})[j] = similarity
+
+    def get_top_similarities(self):
+        """Extract top similarities from the similarity dictionary."""
+        results = [
+            {'key': key1, 'max_key': max(value, key=value.get), 'max_value': max(value.values())}
+            for key1, value in self.similarity_dict.items()
+        ]
+        return pd.DataFrame(results)
+
+    def find_area_of_interest(self):
+        """Find areas of interest in the data."""
+        self.preprocess_pattern()
+        self.calculate_similarity()
+        df_dist = self.get_top_similarities()
+
+        if df_dist.empty:
+            # No window passed the constraints; return the data unlabeled.
+            self.df['cycle'] = np.nan
+            return self.df
+
+        approx_cycle_length = len(self.pattern1) * 0.95
+        df_dist['app_cycle'] = (df_dist['key'] // approx_cycle_length).astype(int)
+        grouped = df_dist.groupby('app_cycle')
+
+        cyc_concat_df = pd.concat(
+            [
+                # Double brackets keep each best-match row as a one-row DataFrame;
+                # a plain .loc[idx] yields a Series, which has no .assign().
+                group.loc[[group['max_value'].idxmax()]].assign(cycle=idx_cyc)
+                for idx_cyc, (_, group) in enumerate(grouped)
+                if not group.empty and group['max_value'].max() != 0
+            ],
+            ignore_index=True
+        )
+
+        cyc_concat_df['start_index'] = cyc_concat_df['key']
+        cyc_concat_df['end_index'] = (
+            cyc_concat_df['start_index'] + self.window_size +
+            cyc_concat_df['max_key'] - (self.len_iter // 2)
+        )
+        # Sentinel -len(pattern1) guarantees the first detected cycle passes the
+        # separation filter below (diff is then always below -limit for row 0).
+        cyc_concat_df['shift_start'] = cyc_concat_df['start_index'].shift(1, fill_value=-len(self.pattern1))
+        cyc_concat_df['diff'] = cyc_concat_df['shift_start'] - cyc_concat_df['start_index']
+        limit = len(self.pattern1) * 0.7
+        cyc_concat_df = cyc_concat_df[cyc_concat_df['diff'] < -limit].reset_index(drop=True)
+        if cyc_concat_df.empty:
+            # All candidates were too close together; return the data unlabeled.
+            self.df['cycle'] = np.nan
+            return self.df
+        cyc_concat_df['cycle'] = cyc_concat_df.index
+
+        cyc_concat_df['shift_end'] = cyc_concat_df['end_index'].shift(1, fill_value=cyc_concat_df['diff'].iloc[0])
+        cyc_concat_df['diff_end'] = cyc_concat_df['shift_end'] - cyc_concat_df['start_index']
+        overlap = cyc_concat_df['diff_end'] > 0
+        cyc_concat_df.loc[overlap, 'start_index'] += cyc_concat_df.loc[overlap, 'diff_end'] + 1
+
+        self.df['cycle'] = np.nan
+        for _, row in cyc_concat_df.iterrows():
+            start, stop = int(row['start_index'] * self.bin_parser), int(row['end_index'] * self.bin_parser)
+            self.df.loc[start:stop, 'cycle'] = row['cycle']
+
+        return self.df
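The whole pipeline is driven by find_area_of_interest(): both series are binned, every sliding window is screened by apply_constraints, surviving windows are scored by FFT-magnitude cosine similarity, and the best match per approximate cycle is written back as a 'cycle' label. A minimal end-to-end sketch (not part of the wheel; the synthetic sine data, seed, and column name "value" are illustrative assumptions):

import numpy as np
import pandas as pd
from pattern_detection.detector import PatternDetector

# Synthetic signal: five noisy periods of a sine wave.
rng = np.random.default_rng(0)
t = np.arange(2000)
signal = np.sin(2 * np.pi * t / 400) + 0.05 * rng.standard_normal(t.size)
data = pd.DataFrame({"value": signal})
pattern = data.iloc[:400].reset_index(drop=True)  # one full period as the template

detector = PatternDetector(data, pattern, "value")
labeled = detector.find_area_of_interest()   # returns data with a 'cycle' column
print(labeled["cycle"].dropna().unique())    # detected cycle ids (data-dependent)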
pattern_detection/utils.py
ADDED
@@ -0,0 +1,87 @@
+import numpy as np
+from scipy.stats import skew, kurtosis
+
+def calculate_statistics(data):
+    """
+    Calculate various statistics for a given dataset.
+
+    Parameters:
+    - data (array-like): The input data.
+
+    Returns:
+    - dict: Dictionary containing max, min, mean, std, skewness, and kurtosis.
+    """
+    return {
+        "max": np.max(data),
+        "min": np.min(data),
+        "mean": np.mean(data),
+        "std": np.std(data),
+        "skewness": skew(data),
+        "kurtosis": kurtosis(data),
+    }
+
+def normalize_data(data):
+    """
+    Normalize the input data to the range [0, 1].
+
+    Parameters:
+    - data (array-like): Input data to normalize.
+
+    Returns:
+    - np.ndarray: Normalized data.
+    """
+    min_val = np.min(data)
+    max_val = np.max(data)
+    return (data - min_val) / (max_val - min_val)
+
+def validate_window(window, constraints):
+    """
+    Validate a sliding window against constraints.
+
+    Parameters:
+    - window (array-like): The input window data.
+    - constraints (dict): Constraint dictionary; must include the reference
+      'pattern' plus bounds such as 'max_pos', 'min_pos', etc.
+
+    Returns:
+    - bool: True if all constraints are satisfied, False otherwise.
+    """
+    corr_coef = np.corrcoef(window, constraints['pattern'])[0, 1]
+
+    return (
+        np.max(window) <= constraints["max_pos"]
+        and np.min(window) >= constraints["min_pos"]
+        and constraints["mean_pos_upper"] >= np.mean(window) >= constraints["mean_pos_lower"]
+        and abs(skew(window) - constraints["pattern_skewness"]) < 0.5
+        and abs(kurtosis(window) - constraints["pattern_kurtosis"]) < 1.0
+        and constraints["pattern_std"] * 0.9 <= np.std(window) <= constraints["pattern_std"] * 1.1
+        and corr_coef >= constraints["cross_correlation_threshold"]
+        and constraints["starting_point_lower"] <= window[0] <= constraints["starting_point_upper"]
+        and constraints["ending_point_lower"] <= window[-1] <= constraints["ending_point_upper"]
+    )
+
+def calculate_correlation(data1, data2):
+    """
+    Calculate the correlation coefficient between two datasets.
+
+    Parameters:
+    - data1 (array-like): First dataset.
+    - data2 (array-like): Second dataset.
+
+    Returns:
+    - float: Correlation coefficient.
+    """
+    return np.corrcoef(data1, data2)[0, 1]
+
+def bin_data(data, bin_size):
+    """
+    Bin the data into averages over fixed-size intervals.
+
+    Parameters:
+    - data (array-like): The input data.
+    - bin_size (int): The size of the bins.
+
+    Returns:
+    - np.ndarray: Binned data.
+    """
+    data = np.asarray(data)  # accept plain lists as well as arrays
+    binned = data[:len(data) // bin_size * bin_size].reshape(-1, bin_size)
+    return np.mean(binned, axis=1)
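The helpers mirror the checks used inside PatternDetector but work on plain arrays. A short standalone sketch (illustrative values only):

import numpy as np
from pattern_detection.utils import calculate_statistics, normalize_data, bin_data

x = np.array([1.0, 4.0, 2.0, 8.0, 5.0, 7.0])
print(calculate_statistics(x)["mean"])  # 4.5
print(normalize_data(x))                # values rescaled into [0, 1]
print(bin_data(x, 2))                   # pairwise means: [2.5, 5.0, 6.0]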
pattern_detector-0.1.0.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Yigit Utku Bulut and Ahmet Faruk Minareci
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
pattern_detector-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,17 @@
+Metadata-Version: 2.1
+Name: pattern_detector
+Version: 0.1.0
+Summary: A library for detecting patterns in time-series data.
+Author: Yigit Utku Bulut and Ahmet Faruk Minareci
+Author-email: yigit.utku.bulut@gmail.com, ahmetfaruk.minareci@gmail.com
+License: MIT
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.7
+License-File: LICENSE
+Requires-Dist: numpy
+Requires-Dist: pandas
+Requires-Dist: joblib
+Requires-Dist: scipy
+
pattern_detector-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+pattern_detection/__init__.py,sha256=hzZhj16aof1dhcIOAI9Tfo5EoPJgexGChsnCLgXq-68,68
+pattern_detection/detector.py,sha256=XaYYRledSolgTPTYLRSWhjZhmnAAvZT2pD3h5g9kP9k,6585
+pattern_detection/utils.py,sha256=jqkfeTqWEnrnvzKBxr_UdKj9cj7zPF15UhHPr9sBSa0,2646
+tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/test_detector.py,sha256=biHGbqd6Uc8dFTtyK-BxYeqFYtJOO9lk_6rQMeQCbQs,972
+pattern_detector-0.1.0.dist-info/LICENSE,sha256=RslT26mCGxF9nQEyEQL8kFMFUgmYvCA0UivIWefxWmU,1098
+pattern_detector-0.1.0.dist-info/METADATA,sha256=9pgMSSHwImJHjA1AOAOJm-ou2MsYnNFwPa6AGvl-fow,536
+pattern_detector-0.1.0.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+pattern_detector-0.1.0.dist-info/top_level.txt,sha256=W_tzR9Ah3_TgQsiGDs46ELePkuxfcPmFOosXSYw2hl0,24
+pattern_detector-0.1.0.dist-info/RECORD,,
tests/__init__.py
ADDED
File without changes
tests/test_detector.py
ADDED
@@ -0,0 +1,27 @@
+import pytest
+import numpy as np
+import pandas as pd
+from pattern_detection.detector import PatternDetector
+
+def test_preprocess_pattern():
+    pattern = pd.DataFrame({"value": range(10)})
+    # An empty frame stands in for the data; passing None would break df.copy().
+    detector = PatternDetector(pd.DataFrame({"value": []}), pattern, "value")
+    detector.preprocess_pattern()
+    assert detector.pattern1 is not None
+    assert len(detector.pattern1) > 0
+
+def test_calculate_similarity():
+    data = pd.DataFrame({"value": range(100)})
+    pattern = pd.DataFrame({"value": range(10)})
+    detector = PatternDetector(data, pattern, "value")
+    detector.preprocess_pattern()
+    detector.calculate_similarity()
+    assert len(detector.similarity_dict) > 0
+
+def test_find_area_of_interest():
+    data = pd.DataFrame({"value": range(100)})
+    pattern = pd.DataFrame({"value": range(10)})
+    detector = PatternDetector(data, pattern, "value")
+    result = detector.find_area_of_interest()
+    assert "cycle" in result.columns
+    assert not result["cycle"].isnull().all()