gensor 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gensor-0.2.3 → gensor-0.2.5}/PKG-INFO +1 -1
- {gensor-0.2.3 → gensor-0.2.5}/gensor/__init__.py +29 -29
- {gensor-0.2.3 → gensor-0.2.5}/gensor/analysis/outliers.py +173 -173
- {gensor-0.2.3 → gensor-0.2.5}/gensor/analysis/stats.py +28 -28
- {gensor-0.2.3 → gensor-0.2.5}/gensor/config.py +17 -17
- {gensor-0.2.3 → gensor-0.2.5}/gensor/core/base.py +412 -409
- {gensor-0.2.3 → gensor-0.2.5}/gensor/core/dataset.py +207 -203
- {gensor-0.2.3 → gensor-0.2.5}/gensor/core/indexer.py +37 -37
- {gensor-0.2.3 → gensor-0.2.5}/gensor/core/timeseries.py +78 -78
- {gensor-0.2.3 → gensor-0.2.5}/gensor/db/__init__.py +14 -14
- {gensor-0.2.3 → gensor-0.2.5}/gensor/db/connection.py +197 -197
- {gensor-0.2.3 → gensor-0.2.5}/gensor/exceptions.py +55 -55
- {gensor-0.2.3 → gensor-0.2.5}/gensor/io/read.py +192 -192
- {gensor-0.2.3 → gensor-0.2.5}/gensor/log.py +7 -7
- {gensor-0.2.3 → gensor-0.2.5}/gensor/parse/__init__.py +4 -4
- {gensor-0.2.3 → gensor-0.2.5}/gensor/parse/plain.py +61 -61
- {gensor-0.2.3 → gensor-0.2.5}/gensor/parse/utils.py +87 -87
- {gensor-0.2.3 → gensor-0.2.5}/gensor/parse/vanessen.py +89 -89
- {gensor-0.2.3 → gensor-0.2.5}/gensor/processing/compensation.py +195 -195
- {gensor-0.2.3 → gensor-0.2.5}/gensor/processing/smoothing.py +66 -66
- {gensor-0.2.3 → gensor-0.2.5}/gensor/processing/transform.py +148 -148
- {gensor-0.2.3 → gensor-0.2.5}/gensor/testdata/__init__.py +25 -25
- {gensor-0.2.3 → gensor-0.2.5}/pyproject.toml +2 -1
- {gensor-0.2.3 → gensor-0.2.5}/LICENSE +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/README.md +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/gensor/analysis/__init__.py +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/gensor/core/__init__.py +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/gensor/io/__init__.py +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/gensor/processing/__init__.py +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/gensor/testdata/Barodiver_220427183008_BY222.csv +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/gensor/testdata/PB01A_moni_AV319_220427183019_AV319.csv +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/gensor/testdata/PB02A_plain.csv +0 -0
- {gensor-0.2.3 → gensor-0.2.5}/py.typed +0 -0
|
@@ -1,29 +1,29 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
|
|
3
|
-
from .core.dataset import Dataset
|
|
4
|
-
from .core.timeseries import Timeseries
|
|
5
|
-
from .io.read import read_from_csv, read_from_sql
|
|
6
|
-
from .log import set_log_level
|
|
7
|
-
from .processing.compensation import compensate
|
|
8
|
-
|
|
9
|
-
__all__ = [
|
|
10
|
-
# basic data types
|
|
11
|
-
"Dataset",
|
|
12
|
-
"Timeseries",
|
|
13
|
-
"compensate",
|
|
14
|
-
# getters
|
|
15
|
-
"read_from_csv",
|
|
16
|
-
"read_from_sql",
|
|
17
|
-
"set_log_level",
|
|
18
|
-
]
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
logger = logging.getLogger(__name__)
|
|
22
|
-
logger.setLevel(logging.INFO)
|
|
23
|
-
|
|
24
|
-
if not logger.hasHandlers():
|
|
25
|
-
console_handler = logging.StreamHandler()
|
|
26
|
-
console_handler.setLevel(logging.INFO)
|
|
27
|
-
formatter = logging.Formatter("%(levelname)s: %(message)s")
|
|
28
|
-
console_handler.setFormatter(formatter)
|
|
29
|
-
logger.addHandler(console_handler)
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from .core.dataset import Dataset
|
|
4
|
+
from .core.timeseries import Timeseries
|
|
5
|
+
from .io.read import read_from_csv, read_from_sql
|
|
6
|
+
from .log import set_log_level
|
|
7
|
+
from .processing.compensation import compensate
|
|
8
|
+
|
|
9
|
+
# Names re-exported as the package's public API; each one is imported above.
__all__ = [
    # basic data types
    "Dataset",
    "Timeseries",
    # processing (imported from .processing.compensation)
    "compensate",
    # getters
    "read_from_csv",
    "read_from_sql",
    # logging configuration (imported from .log)
    "set_log_level",
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Package-level logger; records below INFO are dropped.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Only attach a console handler when neither this logger nor any of its
# ancestors already has one, so an application that configured logging
# before importing the package does not get duplicated output.
if not logger.hasHandlers():
    formatter = logging.Formatter("%(levelname)s: %(message)s")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(logging.INFO)
    logger.addHandler(console_handler)
|
|
@@ -1,173 +1,173 @@
|
|
|
1
|
-
from collections.abc import Callable
|
|
2
|
-
from typing import Any, Literal
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
from pandas import Series
|
|
6
|
-
from sklearn.ensemble import IsolationForest
|
|
7
|
-
from sklearn.neighbors import LocalOutlierFactor
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
class OutlierDetection:
|
|
11
|
-
"""Detecting outliers in groundwater timeseries data.
|
|
12
|
-
|
|
13
|
-
Each method in this class returns a pandas.Series containing predicted outliers in
|
|
14
|
-
the dataset.
|
|
15
|
-
|
|
16
|
-
Methods:
|
|
17
|
-
iqr: Use interquartile range (IQR).
|
|
18
|
-
zscore: Use the z-score method.
|
|
19
|
-
isolation_forest: Using the isolation forest algorithm.
|
|
20
|
-
lof: Using the local outlier factor (LOF) method.
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
|
-
def __init__(
|
|
24
|
-
self,
|
|
25
|
-
data: Series,
|
|
26
|
-
method: Literal["iqr", "zscore", "isolation_forest", "lof"],
|
|
27
|
-
rolling: bool,
|
|
28
|
-
window: int,
|
|
29
|
-
**kwargs: Any,
|
|
30
|
-
) -> None:
|
|
31
|
-
"""Find outliers in a time series using the specified method, with an option for rolling window."""
|
|
32
|
-
|
|
33
|
-
FUNCS: dict[str, Callable] = {
|
|
34
|
-
"iqr": self.iqr,
|
|
35
|
-
"zscore": self.zscore,
|
|
36
|
-
"isolation_forest": self.isolation_forest,
|
|
37
|
-
"lof": self.lof,
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
method_func = FUNCS[method]
|
|
41
|
-
|
|
42
|
-
if method in ["iqr", "zscore"]:
|
|
43
|
-
# For 'iqr' and 'zscore' methods
|
|
44
|
-
y = (
|
|
45
|
-
kwargs.get("k", 1.5)
|
|
46
|
-
if method == "iqr"
|
|
47
|
-
else kwargs.get("threshold", 3.0)
|
|
48
|
-
)
|
|
49
|
-
if rolling:
|
|
50
|
-
roll = data.rolling(window=window)
|
|
51
|
-
mask = roll.apply(lambda x: method_func(x, y, rolling=True), raw=True)
|
|
52
|
-
else:
|
|
53
|
-
mask = method_func(data.to_numpy(), y, rolling=False)
|
|
54
|
-
|
|
55
|
-
bool_mask = mask.astype(bool)
|
|
56
|
-
bool_mask_series = Series(bool_mask, index=data.index)
|
|
57
|
-
self.outliers = data[bool_mask_series]
|
|
58
|
-
|
|
59
|
-
else:
|
|
60
|
-
# For 'isolation_forest' and 'lof' methods
|
|
61
|
-
self.outliers = method_func(data, **kwargs)
|
|
62
|
-
|
|
63
|
-
@staticmethod
|
|
64
|
-
def iqr(data: np.ndarray, k: float, rolling: bool) -> np.ndarray:
|
|
65
|
-
"""Use interquartile range (IQR).
|
|
66
|
-
|
|
67
|
-
Parameters:
|
|
68
|
-
data (pandas.Series): The time series data.
|
|
69
|
-
|
|
70
|
-
Keyword Args:
|
|
71
|
-
k (float): The multiplier for the IQR to define the range. Defaults to 1.5.
|
|
72
|
-
|
|
73
|
-
Returns:
|
|
74
|
-
np.ndarray: Binary mask representing the outliers as 1.
|
|
75
|
-
"""
|
|
76
|
-
|
|
77
|
-
Q1 = np.percentile(data, 0.25)
|
|
78
|
-
Q3 = np.percentile(data, 0.75)
|
|
79
|
-
IQR = Q3 - Q1
|
|
80
|
-
|
|
81
|
-
lower_bound = Q1 - k * IQR
|
|
82
|
-
upper_bound = Q3 + k * IQR
|
|
83
|
-
|
|
84
|
-
if rolling:
|
|
85
|
-
return (
|
|
86
|
-
np.array([1])
|
|
87
|
-
if (data[-1] < lower_bound or data[-1] > upper_bound)
|
|
88
|
-
else np.array([0])
|
|
89
|
-
)
|
|
90
|
-
|
|
91
|
-
return np.where((data < lower_bound) | (data > upper_bound), 1, 0)
|
|
92
|
-
|
|
93
|
-
@staticmethod
|
|
94
|
-
def zscore(data: np.ndarray, threshold: float, rolling: bool) -> np.ndarray:
|
|
95
|
-
"""Use the z-score method.
|
|
96
|
-
|
|
97
|
-
Parameters:
|
|
98
|
-
data (pandas.Series): The time series data.
|
|
99
|
-
|
|
100
|
-
Keyword Args:
|
|
101
|
-
threshold (float): The threshold for the z-score method. Defaults to 3.0.
|
|
102
|
-
|
|
103
|
-
Returns:
|
|
104
|
-
pandas.Series: Binary mask representing outliers.
|
|
105
|
-
"""
|
|
106
|
-
|
|
107
|
-
mean = np.mean(data)
|
|
108
|
-
std_dev = np.std(data)
|
|
109
|
-
|
|
110
|
-
z_scores = np.abs((data - mean) / std_dev)
|
|
111
|
-
|
|
112
|
-
if rolling:
|
|
113
|
-
return np.array([1]) if z_scores[-1] > threshold else np.array([0])
|
|
114
|
-
return np.where(z_scores > threshold, 1, 0)
|
|
115
|
-
|
|
116
|
-
def isolation_forest(self, data: Series, **kwargs: Any) -> Series:
|
|
117
|
-
"""Using the isolation forest algorithm.
|
|
118
|
-
|
|
119
|
-
Parameters:
|
|
120
|
-
data (pandas.Series): The time series data.
|
|
121
|
-
|
|
122
|
-
Keyword Args:
|
|
123
|
-
n_estimators (int): The number of base estimators in the ensemble. Defaults to 100.
|
|
124
|
-
max_samples (int | 'auto' | float): The number of samples to draw from X to train each base estimator. Defaults to 'auto'.
|
|
125
|
-
contamination (float): The proportion of outliers in the data. Defaults to 0.01.
|
|
126
|
-
max_features (int | float): The number of features to draw from X to train each base estimator. Defaults to 1.0.
|
|
127
|
-
bootstrap (bool): Whether to use bootstrapping when sampling the data. Defaults to False.
|
|
128
|
-
n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
|
|
129
|
-
random_state (int | RandomState | None): The random state to use. Defaults to None.
|
|
130
|
-
verbose (int): The verbosity level. Defaults to 0.
|
|
131
|
-
warm_start (bool): Whether to reuse the solution of the previous call to fit and add more estimators to the ensemble. Defaults to False.
|
|
132
|
-
|
|
133
|
-
Note:
|
|
134
|
-
For details on kwargs see: sklearn.ensemble.IsolationForest.
|
|
135
|
-
"""
|
|
136
|
-
|
|
137
|
-
X = data.to_numpy().reshape(-1, 1)
|
|
138
|
-
|
|
139
|
-
clf = IsolationForest(**kwargs)
|
|
140
|
-
clf.fit(X)
|
|
141
|
-
|
|
142
|
-
is_outlier = clf.predict(X)
|
|
143
|
-
outliers: Series = data[is_outlier == -1]
|
|
144
|
-
|
|
145
|
-
return outliers
|
|
146
|
-
|
|
147
|
-
def lof(self, data: Series, **kwargs: Any) -> Series:
|
|
148
|
-
"""Using the local outlier factor (LOF) method.
|
|
149
|
-
|
|
150
|
-
Parameters:
|
|
151
|
-
data (pandas.Series): The time series data.
|
|
152
|
-
|
|
153
|
-
Keyword Args:
|
|
154
|
-
n_neighbors (int): The number of neighbors to consider for each sample. Defaults to 20.
|
|
155
|
-
algorithm (str): The algorithm to use. Either 'auto', 'ball_tree', 'kd_tree' or 'brute'. Defaults to 'auto'.
|
|
156
|
-
leaf_size (int): The leaf size of the tree. Defaults to 30.
|
|
157
|
-
metric (str): The distance metric to use. Defaults to 'minkowski'.
|
|
158
|
-
p (int): The power parameter for the Minkowski metric. Defaults to 2.
|
|
159
|
-
contamination (float): The proportion of outliers in the data. Defaults to 0.01.
|
|
160
|
-
novelty (bool): Whether to consider the samples as normal or outliers. Defaults to False.
|
|
161
|
-
n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
|
|
162
|
-
Note:
|
|
163
|
-
For details on kwargs see: sklearn.neighbors.LocalOutlierFactor.
|
|
164
|
-
"""
|
|
165
|
-
|
|
166
|
-
X = data.to_numpy().reshape(-1, 1)
|
|
167
|
-
|
|
168
|
-
clf = LocalOutlierFactor(**kwargs)
|
|
169
|
-
|
|
170
|
-
is_outlier = clf.fit_predict(X)
|
|
171
|
-
outliers: Series = data[is_outlier == -1]
|
|
172
|
-
|
|
173
|
-
return outliers
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import Any, Literal
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from pandas import Series
|
|
6
|
+
from sklearn.ensemble import IsolationForest
|
|
7
|
+
from sklearn.neighbors import LocalOutlierFactor
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class OutlierDetection:
    """Detecting outliers in groundwater timeseries data.

    Instantiating the class runs the selected detector once and stores the
    flagged observations in the ``outliers`` attribute.

    Attributes:
        outliers (pandas.Series): Subset of the input data flagged as outliers.

    Methods:
        iqr: Use interquartile range (IQR).
        zscore: Use the z-score method.
        isolation_forest: Using the isolation forest algorithm.
        lof: Using the local outlier factor (LOF) method.
    """

    def __init__(
        self,
        data: Series,
        method: Literal["iqr", "zscore", "isolation_forest", "lof"],
        rolling: bool,
        window: int,
        **kwargs: Any,
    ) -> None:
        """Find outliers in a time series using the specified method, with an option for rolling window.

        Parameters:
            data (pandas.Series): The time series data.
            method (str): Name of the detector: 'iqr', 'zscore',
                'isolation_forest' or 'lof'.
            rolling (bool): For 'iqr' and 'zscore' only. If True, each
                observation is tested against the trailing window that ends
                at it instead of against the whole series.
            window (int): Window size handed to ``Series.rolling``. Only used
                when ``rolling`` is True.

        Keyword Args:
            k (float): IQR multiplier, read when method is 'iqr'. Defaults to 1.5.
            threshold (float): Z-score limit, read when method is 'zscore'.
                Defaults to 3.0.
            **kwargs: For 'isolation_forest' and 'lof' every keyword argument
                is forwarded unchanged to the scikit-learn estimator.

        Raises:
            KeyError: If ``method`` is not one of the supported names.
        """

        detectors: dict[str, Callable] = {
            "iqr": self.iqr,
            "zscore": self.zscore,
            "isolation_forest": self.isolation_forest,
            "lof": self.lof,
        }
        detect = detectors[method]

        if method not in ("iqr", "zscore"):
            # For 'isolation_forest' and 'lof' methods
            self.outliers = detect(data, **kwargs)
            return

        # For 'iqr' and 'zscore' methods
        limit = (
            kwargs.get("k", 1.5)
            if method == "iqr"
            else kwargs.get("threshold", 3.0)
        )

        if rolling:
            # Rolling.apply expects a scalar per window, so unwrap the
            # one-element array the detectors return in rolling mode.
            flags = data.rolling(window=window).apply(
                lambda values: float(detect(values, limit, rolling=True)[0]),
                raw=True,
            )
            # Windows that are not yet full evaluate to NaN, and NaN casts to
            # True under astype(bool); treat them as "not an outlier" instead.
            mask = flags.fillna(0).to_numpy()
        else:
            mask = detect(data.to_numpy(), limit, rolling=False)

        bool_mask_series = Series(mask.astype(bool), index=data.index)
        self.outliers = data[bool_mask_series]

    @staticmethod
    def iqr(data: np.ndarray, k: float, rolling: bool) -> np.ndarray:
        """Use interquartile range (IQR).

        Parameters:
            data (np.ndarray): Values to evaluate: a whole series, or a
                single rolling window.
            k (float): The multiplier for the IQR to define the range.
            rolling (bool): If True, only the last element of ``data`` is
                tested and a one-element array is returned.

        Returns:
            np.ndarray: Binary mask representing the outliers as 1.
        """

        data = np.asarray(data)
        if data.size == 0:
            # Nothing to evaluate; avoid indexing/percentile errors.
            return np.array([0]) if rolling else np.zeros(0, dtype=int)

        # np.percentile takes percentages in [0, 100], not fractions.
        q1, q3 = np.percentile(data, [25, 75])
        spread = q3 - q1

        lower_bound = q1 - k * spread
        upper_bound = q3 + k * spread

        if rolling:
            latest = data[-1]
            return (
                np.array([1])
                if (latest < lower_bound or latest > upper_bound)
                else np.array([0])
            )

        return np.where((data < lower_bound) | (data > upper_bound), 1, 0)

    @staticmethod
    def zscore(data: np.ndarray, threshold: float, rolling: bool) -> np.ndarray:
        """Use the z-score method.

        Parameters:
            data (np.ndarray): Values to evaluate: a whole series, or a
                single rolling window.
            threshold (float): Absolute z-score above which a value is
                flagged.
            rolling (bool): If True, only the last element of ``data`` is
                tested and a one-element array is returned.

        Returns:
            np.ndarray: Binary mask representing the outliers as 1.
        """

        data = np.asarray(data)
        if data.size == 0:
            # Nothing to evaluate; avoid indexing an empty array.
            return np.array([0]) if rolling else np.zeros(0, dtype=int)

        mean = np.mean(data)
        std_dev = np.std(data)

        if std_dev == 0:
            # Constant input: every deviation is zero, so skip the 0/0 division.
            z_scores = np.zeros(data.shape)
        else:
            z_scores = np.abs((data - mean) / std_dev)

        if rolling:
            return np.array([1]) if z_scores[-1] > threshold else np.array([0])
        return np.where(z_scores > threshold, 1, 0)

    def isolation_forest(self, data: Series, **kwargs: Any) -> Series:
        """Using the isolation forest algorithm.

        Parameters:
            data (pandas.Series): The time series data.

        Keyword Args:
            n_estimators (int): The number of base estimators in the ensemble.
            max_samples (int | 'auto' | float): The number of samples to draw from X to train each base estimator.
            contamination (float | 'auto'): The proportion of outliers in the data.
            max_features (int | float): The number of features to draw from X to train each base estimator.
            bootstrap (bool): Whether to use bootstrapping when sampling the data.
            n_jobs (int): The number of jobs to run in parallel.
            random_state (int | RandomState | None): The random state to use.
            verbose (int): The verbosity level.
            warm_start (bool): Whether to reuse the solution of the previous call to fit and add more estimators to the ensemble.

        Returns:
            pandas.Series: The observations the fitted model labels as -1.

        Note:
            Keyword arguments are passed through untouched, so any argument
            not given takes scikit-learn's own default.
            For details on kwargs see: sklearn.ensemble.IsolationForest.
        """

        # scikit-learn expects a 2-D (n_samples, n_features) array.
        X = data.to_numpy().reshape(-1, 1)

        clf = IsolationForest(**kwargs)
        clf.fit(X)

        is_outlier = clf.predict(X)
        outliers: Series = data[is_outlier == -1]

        return outliers

    def lof(self, data: Series, **kwargs: Any) -> Series:
        """Using the local outlier factor (LOF) method.

        Parameters:
            data (pandas.Series): The time series data.

        Keyword Args:
            n_neighbors (int): The number of neighbors to consider for each sample.
            algorithm (str): The algorithm to use. Either 'auto', 'ball_tree', 'kd_tree' or 'brute'.
            leaf_size (int): The leaf size of the tree.
            metric (str): The distance metric to use.
            p (int): The power parameter for the Minkowski metric.
            contamination (float | 'auto'): The proportion of outliers in the data.
            novelty (bool): Switches the estimator to novelty detection.
                NOTE(review): per the scikit-learn docs ``fit_predict`` is
                unavailable when this is True, so it should stay False here.
            n_jobs (int): The number of jobs to run in parallel.

        Returns:
            pandas.Series: The observations the fitted model labels as -1.

        Note:
            Keyword arguments are passed through untouched, so any argument
            not given takes scikit-learn's own default.
            For details on kwargs see: sklearn.neighbors.LocalOutlierFactor.
        """

        # scikit-learn expects a 2-D (n_samples, n_features) array.
        X = data.to_numpy().reshape(-1, 1)

        clf = LocalOutlierFactor(**kwargs)

        is_outlier = clf.fit_predict(X)
        outliers: Series = data[is_outlier == -1]

        return outliers
|
|
@@ -1,28 +1,28 @@
|
|
|
1
|
-
"""Module to compute timeseries statistics, similar to pastas.stats.signatures module
|
|
2
|
-
and following Heudorfer et al. 2019
|
|
3
|
-
|
|
4
|
-
To be implemented:
|
|
5
|
-
|
|
6
|
-
- Structure
|
|
7
|
-
* Flashiness
|
|
8
|
-
- Distribution
|
|
9
|
-
* Modality
|
|
10
|
-
* Density
|
|
11
|
-
- Shape
|
|
12
|
-
* Scale
|
|
13
|
-
* Slope
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
import numpy as np
|
|
17
|
-
|
|
18
|
-
from gensor.core.timeseries import Timeseries
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def trend(ts: Timeseries) -> tuple:
|
|
22
|
-
time_numeric = np.arange(len(ts.timeseries))
|
|
23
|
-
|
|
24
|
-
# Perform linear regression using numpy's polyfit
|
|
25
|
-
# This returns the slope and intercept of the best fit line
|
|
26
|
-
slope, intercept = np.polyfit(time_numeric, ts.timeseries, 1)
|
|
27
|
-
|
|
28
|
-
return slope, intercept
|
|
1
|
+
"""Module to compute timeseries statistics, similar to pastas.stats.signatures module
|
|
2
|
+
and following Heudorfer et al. 2019
|
|
3
|
+
|
|
4
|
+
To be implemented:
|
|
5
|
+
|
|
6
|
+
- Structure
|
|
7
|
+
* Flashiness
|
|
8
|
+
- Distribution
|
|
9
|
+
* Modality
|
|
10
|
+
* Density
|
|
11
|
+
- Shape
|
|
12
|
+
* Scale
|
|
13
|
+
* Slope
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
from gensor.core.timeseries import Timeseries
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def trend(ts: Timeseries) -> tuple:
    """Fit a straight line through the series by least squares.

    The regressor is the 0-based position of each observation, not its
    timestamp, so the slope is expressed per sample step.

    Parameters:
        ts (Timeseries): Object whose ``timeseries`` attribute holds the values.

    Returns:
        tuple: ``(slope, intercept)`` of the fitted line.
    """
    values = ts.timeseries
    positions = np.arange(len(values))

    # A degree-1 polyfit yields the coefficients highest power first.
    coefficients = np.polyfit(positions, values, deg=1)
    slope, intercept = coefficients

    return slope, intercept
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
"""
|
|
2
|
-
!!! warning
|
|
3
|
-
|
|
4
|
-
Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
|
|
5
|
-
'vanessen'), the timestamps are localized and converted to UTC. Therefore, if the
|
|
6
|
-
user creates his own timeseries outside the read_from_csv, they should ensure that
|
|
7
|
-
the timestamps are in UTC format.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
VARIABLE_TYPES_AND_UNITS = {
|
|
11
|
-
"temperature": ["degc"],
|
|
12
|
-
"pressure": ["cmh2o", "mmh2o"],
|
|
13
|
-
"conductivity": ["ms/cm"],
|
|
14
|
-
"flux": ["m/s"],
|
|
15
|
-
"head": ["m asl"],
|
|
16
|
-
"depth": ["m"],
|
|
17
|
-
}
|
|
1
|
+
"""
|
|
2
|
+
!!! warning
|
|
3
|
+
|
|
4
|
+
Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
|
|
5
|
+
'vanessen'), the timestamps are localized and converted to UTC. Therefore, if the
|
|
6
|
+
user creates their own timeseries outside of read_from_csv, they should ensure that
|
|
7
|
+
the timestamps are in UTC format.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
# Maps each variable type name to the list of unit strings given for it.
# All keys and units are written in lower case.
# NOTE(review): presumably used to validate the variable/unit pair of a
# Timeseries - confirm against the callers.
VARIABLE_TYPES_AND_UNITS: dict[str, list[str]] = {
    "temperature": ["degc"],
    "pressure": ["cmh2o", "mmh2o"],
    "conductivity": ["ms/cm"],
    "flux": ["m/s"],
    "head": ["m asl"],
    "depth": ["m"],
}
|