ts-shape 0.0.0.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ts-shape-0.0.0.19/LICENSE.txt +21 -0
- ts-shape-0.0.0.19/MANIFEST.in +0 -0
- ts-shape-0.0.0.19/PKG-INFO +75 -0
- ts-shape-0.0.0.19/README.md +60 -0
- ts-shape-0.0.0.19/setup.cfg +4 -0
- ts-shape-0.0.0.19/setup.py +26 -0
- ts-shape-0.0.0.19/src/ts_shape/context/__init__.py +1 -0
- ts-shape-0.0.0.19/src/ts_shape/context/value_mapping.py +89 -0
- ts-shape-0.0.0.19/src/ts_shape/events/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/events/maintenance/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/events/production/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/events/quality/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/events/quality/outlier_detection.py +120 -0
- ts-shape-0.0.0.19/src/ts_shape/events/quality/statistical_process_control.py +191 -0
- ts-shape-0.0.0.19/src/ts_shape/events/quality/tolerance_deviation.py +87 -0
- ts-shape-0.0.0.19/src/ts_shape/features/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/features/cycles/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/features/cycles/cycle_processor.py +121 -0
- ts-shape-0.0.0.19/src/ts_shape/features/cycles/cycles_extractor.py +109 -0
- ts-shape-0.0.0.19/src/ts_shape/features/stats/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/features/stats/boolean_stats.py +71 -0
- ts-shape-0.0.0.19/src/ts_shape/features/stats/feature_table.py +118 -0
- ts-shape-0.0.0.19/src/ts_shape/features/stats/numeric_stats.py +122 -0
- ts-shape-0.0.0.19/src/ts_shape/features/stats/string_stats.py +124 -0
- ts-shape-0.0.0.19/src/ts_shape/features/stats/timestamp_stats.py +103 -0
- ts-shape-0.0.0.19/src/ts_shape/features/time_stats/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/features/time_stats/time_stats_numeric.py +89 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/combine/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/combine/integrator.py +99 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/context/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/metadata/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/metadata/metadata_api_loader.py +109 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/metadata/metadata_db_loader.py +107 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/timeseries/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/timeseries/parquet_loader.py +169 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/timeseries/s3proxy_parquet_loader.py +83 -0
- ts-shape-0.0.0.19/src/ts_shape/loader/timeseries/timescale_loader.py +55 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/calculator/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/calculator/numeric_calc.py +120 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/filter/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/filter/boolean_filter.py +37 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/filter/custom_filter.py +32 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/filter/datetime_filter.py +123 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/filter/numeric_filter.py +39 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/filter/string_filter.py +44 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/functions/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/functions/lambda_func.py +28 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/time_functions/__init__.py +0 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/time_functions/timestamp_converter.py +41 -0
- ts-shape-0.0.0.19/src/ts_shape/transform/time_functions/timezone_shift.py +150 -0
- ts-shape-0.0.0.19/src/ts_shape.egg-info/PKG-INFO +75 -0
- ts-shape-0.0.0.19/src/ts_shape.egg-info/SOURCES.txt +54 -0
- ts-shape-0.0.0.19/src/ts_shape.egg-info/dependency_links.txt +1 -0
- ts-shape-0.0.0.19/src/ts_shape.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Jakob Gabriel
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
File without changes
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ts-shape
|
|
3
|
+
Version: 0.0.0.19
|
|
4
|
+
Summary: ts-shape filters, transforms and engineers your timeseries dataframe
|
|
5
|
+
Home-page: https://jakobgabriel.github.io/ts-shape/
|
|
6
|
+
Author: Jakob Gabriel
|
|
7
|
+
Author-email: jakob.gabriel5@googlemail.com
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/jakobgabriel/ts-shape
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
License-File: LICENSE.txt
|
|
15
|
+
|
|
16
|
+
# ts-shape | Timeseries Shaper
|
|
17
|
+
|
|
18
|
+
[](https://pypi.org/project/timeseries-shaper/)
|
|
19
|
+
[](https://pepy.tech/projects/timeseries-shaper)
|
|
20
|
+

|
|
21
|
+
[](https://jakobgabriel.github.io/ts-shape/)
|
|
22
|
+
|
|
23
|
+
----
|
|
24
|
+
|
|
25
|
+
This repository contains the *ts-shape* python package. The abbreviation stands for
|
|
26
|
+
|
|
27
|
+
*"Time Series shaping with rule based methods"*.
|
|
28
|
+
|
|
29
|
+
ts-shape is a Python library for efficiently transforms, contextualizes and extracts events from time series data. It provides a set of tools to handle various transformations, making data preparation tasks easier and more intuitive.
|
|
30
|
+
|
|
31
|
+
Besides that multiple engineering specific methods are utilized to make it fast and easy to work with time series data.
|
|
32
|
+
|
|
33
|
+
## Features | Concept
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
| **Category** | **Feature** | **Status** |
|
|
37
|
+
|---------------|--------------------------------------------------------|------------|
|
|
38
|
+
| **Transform** | Filters: Datatype-specific filters | ✔️ |
|
|
39
|
+
| | Functions: Lambda functions for transformations | ✔️ |
|
|
40
|
+
| | Time Functions: Time-specific transformations | ✔️ |
|
|
41
|
+
| | Calculator: Calculation-based transformations | ✔️ |
|
|
42
|
+
| **Features** | Stats: Datatype-specific statistics | ✔️ |
|
|
43
|
+
| | Time Stats: Timestamp-specific statistics | ✔️ |
|
|
44
|
+
| **Context** | Contextualize Timeseries datasets with foreign sources | ❌ |
|
|
45
|
+
| **Events** | Quality Events | ❌ |
|
|
46
|
+
| | Maintenance Events | ❌ |
|
|
47
|
+
| | Production Events | ❌ |
|
|
48
|
+
| | Engineering Events | ❌ |
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
Install ts-shape using pip:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install timeseries-shaper
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Documentation
|
|
60
|
+
|
|
61
|
+
For full documentation, visit GitHub Pages or check out the docstrings in the code.
|
|
62
|
+
|
|
63
|
+
## Contributing
|
|
64
|
+
|
|
65
|
+
Contributions are welcome! For major changes, please open an issue first to discuss what you would like to change.
|
|
66
|
+
|
|
67
|
+
Please ensure to update tests as appropriate.
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
Distributed under the MIT License. See LICENSE for more information.
|
|
72
|
+
|
|
73
|
+
## Acknowledgements
|
|
74
|
+
|
|
75
|
+
!TODO
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# ts-shape | Timeseries Shaper
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/timeseries-shaper/)
|
|
4
|
+
[](https://pepy.tech/projects/timeseries-shaper)
|
|
5
|
+

|
|
6
|
+
[](https://jakobgabriel.github.io/ts-shape/)
|
|
7
|
+
|
|
8
|
+
----
|
|
9
|
+
|
|
10
|
+
This repository contains the *ts-shape* python package. The abbreviation stands for
|
|
11
|
+
|
|
12
|
+
*"Time Series shaping with rule based methods"*.
|
|
13
|
+
|
|
14
|
+
ts-shape is a Python library that efficiently transforms, contextualizes and extracts events from time series data. It provides a set of tools to handle various transformations, making data preparation tasks easier and more intuitive.
|
|
15
|
+
|
|
16
|
+
Besides that multiple engineering specific methods are utilized to make it fast and easy to work with time series data.
|
|
17
|
+
|
|
18
|
+
## Features | Concept
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
| **Category** | **Feature** | **Status** |
|
|
22
|
+
|---------------|--------------------------------------------------------|------------|
|
|
23
|
+
| **Transform** | Filters: Datatype-specific filters | ✔️ |
|
|
24
|
+
| | Functions: Lambda functions for transformations | ✔️ |
|
|
25
|
+
| | Time Functions: Time-specific transformations | ✔️ |
|
|
26
|
+
| | Calculator: Calculation-based transformations | ✔️ |
|
|
27
|
+
| **Features** | Stats: Datatype-specific statistics | ✔️ |
|
|
28
|
+
| | Time Stats: Timestamp-specific statistics | ✔️ |
|
|
29
|
+
| **Context** | Contextualize Timeseries datasets with foreign sources | ❌ |
|
|
30
|
+
| **Events** | Quality Events | ❌ |
|
|
31
|
+
| | Maintenance Events | ❌ |
|
|
32
|
+
| | Production Events | ❌ |
|
|
33
|
+
| | Engineering Events | ❌ |
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
Install ts-shape using pip:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install timeseries-shaper
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Documentation
|
|
45
|
+
|
|
46
|
+
For full documentation, visit GitHub Pages or check out the docstrings in the code.
|
|
47
|
+
|
|
48
|
+
## Contributing
|
|
49
|
+
|
|
50
|
+
Contributions are welcome! For major changes, please open an issue first to discuss what you would like to change.
|
|
51
|
+
|
|
52
|
+
Please ensure to update tests as appropriate.
|
|
53
|
+
|
|
54
|
+
## License
|
|
55
|
+
|
|
56
|
+
Distributed under the MIT License. See LICENSE for more information.
|
|
57
|
+
|
|
58
|
+
## Acknowledgements
|
|
59
|
+
|
|
60
|
+
!TODO
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Packaging configuration for the ts-shape distribution.

Builds the ``ts-shape`` package from the ``src`` layout and publishes the
README as the PyPI long description.
"""
import setuptools

# Use the README as the long description shown on PyPI.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="ts-shape",
    version="0.0.0.19",
    author="Jakob Gabriel",
    author_email="jakob.gabriel5@googlemail.com",
    description="ts-shape filters, transforms and engineers your timeseries dataframe",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://jakobgabriel.github.io/ts-shape/",
    project_urls={
        "Bug Tracker": "https://github.com/jakobgabriel/ts-shape",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # src layout: packages live under src/ts_shape.
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    python_requires=">=3.10",
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Context subpackage: contextualize timeseries values from foreign sources (e.g. value mapping)."""
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import pandas as pd # type: ignore
|
|
2
|
+
from typing import Union
|
|
3
|
+
from ts_shape.utils.base import Base
|
|
4
|
+
|
|
5
|
+
class ValueMapper(Base):
    """Map values of a DataFrame column using an external lookup table.

    The lookup table is loaded from a CSV or JSON file and left-joined
    against the target column; matched keys are replaced by their mapped
    values. Inherits the sorted base DataFrame handling from :class:`Base`.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        mapping_file: str,
        map_column: str,
        mapping_key_column: str,
        mapping_value_column: str,
        file_type: str = 'csv',
        sep: str = ',',
        encoding: str = 'utf-8',
        column_name: str = 'systime'
    ) -> None:
        """
        Initialize the mapper and load the mapping table.

        Args:
            dataframe (pd.DataFrame): The DataFrame whose values will be mapped.
            mapping_file (str): Path to the mapping table (CSV or JSON).
            map_column (str): Column of ``dataframe`` whose values are replaced.
            mapping_key_column (str): Mapping-table column matched against ``map_column``.
            mapping_value_column (str): Mapping-table column providing the replacement values.
            file_type (str): Mapping file format, ``'csv'`` or ``'json'``. Defaults to ``'csv'``.
            sep (str): Field separator for CSV files. Defaults to ``','``.
            encoding (str): Text encoding used to read the file. Defaults to ``'utf-8'``.
            column_name (str): Column the Base class sorts by. Defaults to ``'systime'``.
        """
        # Let Base take ownership of (and sort) the DataFrame.
        super().__init__(dataframe, column_name)

        # Mapping configuration.
        self.map_column: str = map_column
        self.mapping_key_column: str = mapping_key_column
        self.mapping_value_column: str = mapping_value_column
        self.sep: str = sep
        self.encoding: str = encoding

        # Eagerly load the lookup table so mapping errors surface early.
        self.mapping_table: pd.DataFrame = self._load_mapping_table(mapping_file, file_type)

    def _load_mapping_table(self, mapping_file: str, file_type: str) -> pd.DataFrame:
        """
        Read the mapping table from disk.

        Args:
            mapping_file (str): Path to the mapping table.
            file_type (str): File format, ``'csv'`` or ``'json'``.

        Returns:
            pd.DataFrame: The mapping table.

        Raises:
            ValueError: If ``file_type`` is neither ``'csv'`` nor ``'json'``.
        """
        if file_type == 'json':
            return pd.read_json(mapping_file, encoding=self.encoding)
        if file_type == 'csv':
            return pd.read_csv(mapping_file, sep=self.sep, encoding=self.encoding)
        raise ValueError("Unsupported file type. Please use 'csv' or 'json'.")

    def map_values(self) -> pd.DataFrame:
        """
        Replace values of the target column using the mapping table.

        Unmatched keys yield NaN (left join semantics).

        Returns:
            pd.DataFrame: A new DataFrame with the mapped values; the helper
            key/value columns from the mapping table are dropped.
        """
        # Restrict the lookup to the two columns we actually need.
        lookup = self.mapping_table[[self.mapping_key_column, self.mapping_value_column]]

        # Left-join keeps every original row, attaching its mapped value.
        merged = self.dataframe.merge(
            lookup,
            left_on=self.map_column,
            right_on=self.mapping_key_column,
            how='left'
        )

        # Overwrite the original column with the mapped values, then drop
        # the helper columns brought in by the merge.
        merged[self.map_column] = merged[self.mapping_value_column]
        return merged.drop([self.mapping_key_column, self.mapping_value_column], axis=1)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import pandas as pd # type: ignore
|
|
2
|
+
import numpy as np
|
|
3
|
+
from scipy.stats import zscore
|
|
4
|
+
from typing import Callable, Union
|
|
5
|
+
from ts_shape.utils.base import Base
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class OutlierDetectionEvents(Base):
    """
    Detects outlier events in time series data using statistical methods.

    Outliers are flagged per-row (Z-score or IQR rule), then outliers closer
    together than ``time_threshold`` are grouped into events identified by
    ``event_uuid``.
    """

    def __init__(self, dataframe: pd.DataFrame, value_column: str, event_uuid: str = 'outlier_event',
                 time_threshold: str = '5min') -> None:
        """
        Initializes the OutlierDetectionEvents with specific attributes for outlier detection.

        Args:
            dataframe (pd.DataFrame): The input time series DataFrame.
            value_column (str): The name of the column containing the values for outlier detection.
            event_uuid (str): A UUID or identifier for detected outlier events.
            time_threshold (str): The time threshold (pandas offset string) to group close events together.
        """
        super().__init__(dataframe)
        self.value_column = value_column
        self.event_uuid = event_uuid
        self.time_threshold = time_threshold

    def _prepared_frame(self) -> pd.DataFrame:
        """
        Return a copy of the data with 'systime' parsed as datetime, newest first.

        Shared preparation step for all detection methods (was duplicated in
        each ``detect_outliers_*`` method).
        """
        df = self.dataframe.copy()
        df['systime'] = pd.to_datetime(df['systime'])
        return df.sort_values(by='systime', ascending=False)

    def _group_outliers(self, outliers_df: pd.DataFrame) -> pd.DataFrame:
        """
        Groups detected outliers that are close in time and prepares the final events DataFrame.

        Args:
            outliers_df (pd.DataFrame): Rows flagged as outliers, with an 'outlier' column.

        Returns:
            pd.DataFrame: A DataFrame of grouped outlier events (first and last
            row of each group), with 'uuid' set to the event UUID.
        """
        # Work on a copy: outliers_df is a boolean-indexed slice of the
        # caller's frame, and writing into it would raise SettingWithCopyWarning.
        outliers_df = outliers_df.copy()

        # A new group starts whenever the gap to the previous outlier exceeds
        # the configured time threshold.
        outliers_df['group_id'] = (
            outliers_df['systime'].diff().abs() > pd.to_timedelta(self.time_threshold)
        ).cumsum()

        events_data = []
        for group_id in outliers_df['group_id'].unique():
            group_data = outliers_df[outliers_df['group_id'] == group_id]
            # NOTE(review): single-point groups are skipped — only groups with
            # a distinct start and end row become events (original behavior).
            if group_data.shape[0] > 1:
                first_row = group_data.nsmallest(1, 'systime')
                last_row = group_data.nlargest(1, 'systime')
                events_data.append(pd.concat([first_row, last_row]))

        if events_data:
            events_df = pd.concat(events_data)
            events_df['uuid'] = self.event_uuid
        else:
            # Keep the same columns so the drop below still succeeds.
            events_df = pd.DataFrame(columns=outliers_df.columns)

        # Events mark boundaries only; the measured value is cleared.
        events_df[self.value_column] = np.nan
        events_df['is_delta'] = True

        return events_df.drop(['outlier', 'group_id'], axis=1)

    def detect_outliers_zscore(self, threshold: float = 3.0) -> pd.DataFrame:
        """
        Detects outliers using the Z-score method.

        Args:
            threshold (float): The Z-score threshold for detecting outliers.

        Returns:
            pd.DataFrame: A DataFrame of detected outliers and grouped events.
        """
        df = self._prepared_frame()

        # A point is an outlier if its |z-score| exceeds the threshold.
        df['outlier'] = np.abs(zscore(df[self.value_column])) > threshold

        return self._group_outliers(df[df['outlier']])

    def detect_outliers_iqr(self, threshold: tuple = (1.5, 1.5)) -> pd.DataFrame:
        """
        Detects outliers using the IQR method.

        Args:
            threshold (tuple): The multipliers for the IQR range for detecting outliers (lower, upper).

        Returns:
            pd.DataFrame: A DataFrame of detected outliers and grouped events.
        """
        df = self._prepared_frame()

        # Classic Tukey fences: Q1 - k_lo*IQR and Q3 + k_hi*IQR.
        q1 = df[self.value_column].quantile(0.25)
        q3 = df[self.value_column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold[0] * iqr
        upper_bound = q3 + threshold[1] * iqr
        df['outlier'] = (df[self.value_column] < lower_bound) | (df[self.value_column] > upper_bound)

        return self._group_outliers(df[df['outlier']])
|
|
116
|
+
|
|
117
|
+
# Example usage:
|
|
118
|
+
# outlier_detector = OutlierDetectionEvents(dataframe=df, value_column='value')
|
|
119
|
+
# detected_outliers_zscore = outlier_detector.detect_outliers_zscore(threshold=3.0)
|
|
120
|
+
# detected_outliers_iqr = outlier_detector.detect_outliers_iqr(threshold=(1.5, 1.5))
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import pandas as pd # type: ignore
|
|
2
|
+
import numpy as np
|
|
3
|
+
from typing import Callable, List, Optional
|
|
4
|
+
from ts_shape.utils.base import Base
|
|
5
|
+
|
|
6
|
+
class StatisticalProcessControlRuleBased(Base):
    """
    Inherits from Base and applies SPC rules (Western Electric Rules) to a DataFrame for event detection.
    Processes data based on control limit UUIDs, actual value UUIDs, and generates events with an event UUID.
    """

    def __init__(self, dataframe: pd.DataFrame, value_column: str, tolerance_uuid: str, actual_uuid: str, event_uuid: str) -> None:
        """
        Initializes the SPC monitor with UUIDs for tolerance, actual, and event values.
        Inherits the sorted dataframe from the Base class.

        Args:
            dataframe (pd.DataFrame): The input DataFrame containing the data to be processed.
            value_column (str): The column containing the values to monitor.
            tolerance_uuid (str): UUID identifier for rows that set tolerance values.
            actual_uuid (str): UUID identifier for rows containing actual values.
            event_uuid (str): UUID to assign to generated events.
        """
        super().__init__(dataframe)  # Initialize the Base class
        self.value_column: str = value_column
        self.tolerance_uuid: str = tolerance_uuid
        self.actual_uuid: str = actual_uuid
        self.event_uuid: str = event_uuid

    def calculate_control_limits(self) -> pd.DataFrame:
        """
        Calculate the control limits (mean ± 1σ, 2σ, 3σ) for the tolerance values.

        Returns:
            pd.DataFrame: Single-row DataFrame with the mean and sigma bands.
        """
        df = self.dataframe[self.dataframe['uuid'] == self.tolerance_uuid]
        mean = df[self.value_column].mean()
        sigma = df[self.value_column].std()

        control_limits = {
            'mean': mean,
            '1sigma_upper': mean + sigma,
            '1sigma_lower': mean - sigma,
            '2sigma_upper': mean + 2 * sigma,
            '2sigma_lower': mean - 2 * sigma,
            '3sigma_upper': mean + 3 * sigma,
            '3sigma_lower': mean - 3 * sigma,
        }

        return pd.DataFrame([control_limits])

    def rule_1(self, df: pd.DataFrame, limits: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 1: One point beyond the 3σ control limits.

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        df['rule_1'] = (df[self.value_column] > limits['3sigma_upper'].values[0]) | (df[self.value_column] < limits['3sigma_lower'].values[0])
        return df[df['rule_1']]

    def rule_2(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 2: Nine consecutive points on one side of the mean.

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        mean = df[self.value_column].mean()
        df['above_mean'] = df[self.value_column] > mean
        df['below_mean'] = df[self.value_column] < mean
        df['rule_2'] = (df['above_mean'].rolling(window=9).sum() == 9) | (df['below_mean'].rolling(window=9).sum() == 9)
        return df[df['rule_2']]

    def rule_3(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 3: Six consecutive points steadily increasing or decreasing.

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        df['increasing'] = df[self.value_column].diff().gt(0)
        df['decreasing'] = df[self.value_column].diff().lt(0)
        df['rule_3'] = (df['increasing'].rolling(window=6).sum() == 6) | (df['decreasing'].rolling(window=6).sum() == 6)
        return df[df['rule_3']]

    def rule_4(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 4: Fourteen consecutive points alternating up and down.

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        df['alternating'] = df[self.value_column].diff().apply(np.sign)
        # BUGFIX: with raw=True the lambda receives a numpy ndarray, which has
        # no .shift(); compare neighbouring entries directly instead. All 13
        # adjacent sign pairs must differ for a strict alternation.
        df['rule_4'] = (
            df['alternating']
            .rolling(window=14)
            .apply(lambda x: float((x[1:] != x[:-1]).sum() == 13), raw=True)
            .fillna(0.0)
            .astype(bool)  # rolling.apply returns floats; boolean indexing needs bools
        )
        return df[df['rule_4']]

    def rule_5(self, df: pd.DataFrame, limits: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 5: Two out of three consecutive points near the control limit (beyond 2σ but within 3σ).

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        df['rule_5'] = df[self.value_column].apply(
            lambda x: 1 if ((x > limits['2sigma_upper'].values[0] and x < limits['3sigma_upper'].values[0]) or
                            (x < limits['2sigma_lower'].values[0] and x > limits['3sigma_lower'].values[0])) else 0
        )
        df['rule_5'] = df['rule_5'].rolling(window=3).sum() >= 2
        return df[df['rule_5']]

    def rule_6(self, df: pd.DataFrame, limits: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 6: Four out of five consecutive points near the control limit (beyond 1σ but within 2σ).

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        df['rule_6'] = df[self.value_column].apply(
            lambda x: 1 if ((x > limits['1sigma_upper'].values[0] and x < limits['2sigma_upper'].values[0]) or
                            (x < limits['1sigma_lower'].values[0] and x > limits['2sigma_lower'].values[0])) else 0
        )
        df['rule_6'] = df['rule_6'].rolling(window=5).sum() >= 4
        return df[df['rule_6']]

    def rule_7(self, df: pd.DataFrame, limits: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 7: Fifteen consecutive points within 1σ of the centerline.

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        df['rule_7'] = df[self.value_column].apply(
            lambda x: 1 if (x < limits['1sigma_upper'].values[0] and x > limits['1sigma_lower'].values[0]) else 0
        )
        df['rule_7'] = df['rule_7'].rolling(window=15).sum() == 15
        return df[df['rule_7']]

    def rule_8(self, df: pd.DataFrame, limits: pd.DataFrame) -> pd.DataFrame:
        """
        Rule 8: Eight consecutive points on both sides of the mean within 1σ.

        Returns:
            pd.DataFrame: Filtered DataFrame with rule violations.
        """
        df['rule_8'] = df[self.value_column].apply(
            lambda x: 1 if (x < limits['1sigma_upper'].values[0] and x > limits['1sigma_lower'].values[0]) else 0
        )
        df['rule_8'] = df['rule_8'].rolling(window=8).sum() == 8
        return df[df['rule_8']]

    def process(self, selected_rules: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Applies the selected SPC rules and generates a DataFrame of events where any rules are violated.

        Args:
            selected_rules (Optional[List[str]]): List of rule names (e.g., ['rule_1', 'rule_3']) to apply.
                If None, all rules are applied.

        Returns:
            pd.DataFrame: DataFrame with rule violations and detected events.
        """
        # .copy() so the rule methods can add helper columns without writing
        # into a view of self.dataframe (SettingWithCopyWarning).
        df = self.dataframe[self.dataframe['uuid'] == self.actual_uuid].copy()
        df['systime'] = pd.to_datetime(df['systime'])
        df = df.sort_values(by='systime')

        limits = self.calculate_control_limits()

        # Dictionary of rule functions
        rules = {
            'rule_1': lambda df: self.rule_1(df, limits),
            'rule_2': lambda df: self.rule_2(df),
            'rule_3': lambda df: self.rule_3(df),
            'rule_4': lambda df: self.rule_4(df),
            'rule_5': lambda df: self.rule_5(df, limits),
            'rule_6': lambda df: self.rule_6(df, limits),
            'rule_7': lambda df: self.rule_7(df, limits),
            'rule_8': lambda df: self.rule_8(df, limits)
        }

        # If no specific rules are provided, use all rules
        if selected_rules is None:
            selected_rules = list(rules.keys())

        # Apply selected rules and concatenate results
        events = pd.concat([rules[rule](df) for rule in selected_rules if rule in rules]).drop_duplicates()

        # Add the event UUID to the detected events
        events['uuid'] = self.event_uuid

        return events[['systime', self.value_column, 'uuid']].drop_duplicates()
|