gensor 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gensor/__init__.py +14 -0
- gensor/compensation.py +110 -0
- gensor/db/__init__.py +3 -0
- gensor/db/connection.py +49 -0
- gensor/dtypes.py +429 -0
- gensor/exceptions.py +56 -0
- gensor/getters.py +79 -0
- gensor/parse/__init__.py +3 -0
- gensor/parse/vanessen.py +125 -0
- gensor/preprocessing.py +280 -0
- gensor/smoothing.py +66 -0
- gensor/trend.py +31 -0
- gensor-0.0.1.dist-info/LICENSE +21 -0
- gensor-0.0.1.dist-info/METADATA +75 -0
- gensor-0.0.1.dist-info/RECORD +17 -0
- gensor-0.0.1.dist-info/WHEEL +4 -0
- py.typed +0 -0
gensor/getters.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Fetching the data from various sources."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from pandas import read_sql
|
|
7
|
+
from sqlalchemy import MetaData, Table, select
|
|
8
|
+
|
|
9
|
+
from .db.connection import DatabaseConnection
|
|
10
|
+
from .dtypes import Dataset, Timeseries
|
|
11
|
+
from .exceptions import NoFilesToLoad
|
|
12
|
+
from .parse import parse_vanessen_csv
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_from_csv(path: Path, file_format: Literal["vanessen"] = "vanessen", **kwargs):
    """Load sensor data from one CSV file, or from every file in a directory.

    Each file is handed to the parser registered for `file_format` and whatever
    the parser returns is added to a single `Dataset`.

    Args:
        path (Path): A single file, or a directory. For a directory only its
            direct children that are files are loaded; subdirectories are ignored.
        file_format (str): Key of the parser to use. Only "vanessen" is registered.
        **kwargs (dict): Forwarded unchanged to the parser. For
            `parse_vanessen_csv()` these are:
            serial_number_pattern (str): The regex pattern to extract the serial number from the file.
            location_pattern (str): The regex pattern to extract the station from the file.
            col_names (list): The column names for the dataframe.

    Returns:
        Dataset: The dataset holding everything the parser produced.

    Raises:
        TypeError: If `path` is not a `Path` object.
        NoFilesToLoad: If `path` is a directory that contains no files.
        KeyError: If `file_format` has no registered parser.
    """
    if not isinstance(path, Path):
        message = "The path argument must be a Path object."
        raise TypeError(message)

    if path.is_dir():
        # Sorted so the load order does not depend on the file system's listing order.
        files = sorted(entry for entry in path.iterdir() if entry.is_file())
        # Check the filtered list: a directory holding only subdirectories
        # has nothing to load either.
        if not files:
            raise NoFilesToLoad()
    else:
        files = [path]

    parsers = {
        "vanessen": parse_vanessen_csv,
    }
    parser = parsers[file_format]

    ds = Dataset()
    for f in files:
        print(f"Loading file: {f}")
        ts_in_file: list = parser(f, **kwargs)
        ds.add(ts_in_file)

    return ds
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def read_from_sql(
    db: DatabaseConnection, location: str, sensor: str, variable: str, unit: str
) -> Timeseries:
    """Return one timeseries read from a SQL database.

    The table name is built as ``{location}_{sensor}_{variable}`` and the whole
    table is read into a dataframe.

    Parameters:
        db (DatabaseConnection): The database connection object; its `engine`
            attribute is used for reflection and for the query.
        location (str): The station name
        sensor (str): Sensor name
        variable (str): The measurement type
        unit (str): Unit of the measurement

    Returns:
        Timeseries: A single object wrapping the table contents.
    """
    metadata = MetaData()
    # Reflect the columns from the database: a Table declared without any
    # columns gives select() nothing to select.
    schema = Table(
        f"{location}_{sensor}_{variable}", metadata, autoload_with=db.engine
    )

    query = select(schema)
    df = read_sql(query, con=db.engine)

    # NOTE(review): the van Essen parser builds Timeseries with `ts=`, this call
    # uses `timeseries=` - confirm which keyword the model actually accepts.
    ts_object = Timeseries(
        timeseries=df, variable=variable, location=location, sensor=sensor, unit=unit
    )

    return ts_object
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def read_from_api() -> "Dataset":
    """Fetch data from the API.

    Raises:
        NotImplementedError: Always; fetching from an API is not implemented.
    """
    # `NotImplemented` is the sentinel reserved for binary special methods;
    # returning it here would hand callers a non-Dataset value silently.
    raise NotImplementedError("read_from_api() is not implemented yet.")
|
gensor/parse/__init__.py
ADDED
gensor/parse/vanessen.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Logic parsing CSV files from van Essen Instruments Divers."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from io import StringIO
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import chardet
|
|
9
|
+
import pytz
|
|
10
|
+
from pandas import DataFrame, read_csv, to_datetime
|
|
11
|
+
|
|
12
|
+
from ..dtypes import Timeseries
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
    """Guess the text encoding of a file with chardet.

    Args:
        path (Path): The path to the file.
        num_bytes (int): How many leading bytes of the file are given to
            chardet (default is 1024).

    Returns:
        str: The encoding reported by chardet, or "utf-8" when chardet reports none.
    """
    with path.open("rb") as handle:
        sample = handle.read(num_bytes)

    guess = chardet.detect(sample)["encoding"]
    if guess:
        return guess
    return "utf-8"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def handle_timestamps(df: DataFrame, tz: str) -> DataFrame:
    """Make the dataframe index timezone-aware.

    The timezone string is obtained from the file metadata. A string starting
    with "UTC" is read as "UTC" followed by a whole-hour offset (e.g. "UTC+1",
    "UTC-3"); a bare "UTC" is treated as a zero offset. Any other string falls
    back to plain UTC.

    The index is replaced on `df` itself, and the same object is returned.

    Args:
        df (pd.DataFrame): The dataframe with the data, indexed by timestamp.
        tz (str): The timezone string obtained from the file metadata.

    Returns:
        pd.DataFrame: `df`, with a timezone-aware index.
    """
    if tz.startswith("UTC"):
        # A bare "UTC" has no digits after the prefix; int("") would raise.
        offset_hours = int(tz[3:] or 0)
        timezone = pytz.FixedOffset(offset_hours * 60)
    else:
        timezone = pytz.UTC

    # NOTE(review): the naive timestamps are labelled as UTC and then shifted to
    # the offset. If the logger wrote local times in the stated offset, this
    # should be tz_localize(timezone) instead - confirm against a real file.
    df.index = to_datetime(df.index).tz_localize("UTC").tz_convert(timezone)

    return df
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def parse_vanessen_csv(path: Path, **kwargs) -> list[Any]:
    """Parse a van Essen csv file and return a list of Timeseries objects.

    At this point it does not matter whether the file is a barometric or
    piezometric logger file.

    The function uses regex patterns to extract the serial number, the station
    and the timezone from the file text. It is important to use appropriate
    patterns, particularly for the station. If the default patterns do not
    work (which most likely will be the case), provide your own as keyword
    arguments; OR (|) may be used inside a pattern.

    The measurements are read from the text between the "Date/time" header and
    the "END OF DATA FILE OF DATALOGGER FOR WINDOWS" marker. A file in which any
    pattern or either marker is missing is skipped with a printed notice.

    Args:
        path (Path): The path to the file.
        **kwargs (dict): Optional keyword arguments.
            serial_number_pattern (str): The regex pattern to extract the serial number from the file.
            location_pattern (str): The regex pattern to extract the station from the file.
            timezone_pattern (str): The regex pattern to extract the timezone string from the file.
            col_names (list): The column names for the dataframe; must include "timestamp".

    Returns:
        list: One Timeseries per data column, or an empty list when the file is skipped.
    """
    patterns = {
        "sensor": kwargs.get("serial_number_pattern", r"[A-Za-z]{2}\d{3,4}"),
        "location": kwargs.get(
            "location_pattern", r"[A-Za-z]{2}\d{2}[A-Za-z]{1}|Barodiver"
        ),
        "timezone": kwargs.get("timezone_pattern", r"UTC[+-]?\d+"),
    }

    column_names = kwargs.get("col_names", ["timestamp", "pressure", "temperature"])

    encoding = detect_encoding(path, num_bytes=10_000)

    with path.open(mode="r", encoding=encoding) as f:
        text = f.read()

    matches = {key: re.search(pattern, text) for key, pattern in patterns.items()}
    data_start = text.find("Date/time")
    # Search for the end marker after the header so the slice cannot be inverted.
    data_end = (
        text.find("END OF DATA FILE OF DATALOGGER FOR WINDOWS", data_start)
        if data_start != -1
        else -1
    )

    # Files that lack any required piece are skipped, not allowed to abort a
    # whole directory load with AttributeError/ValueError.
    if data_end == -1 or any(match is None for match in matches.values()):
        print(
            f"Skipping file {path} due to missing patterns. If this is not expected, please provide the correct patterns."
        )
        return []

    data = {key: matches[key].group() for key in ("sensor", "location")}

    data_io = StringIO(text[data_start:data_end])

    # skiprows=1 drops the "Date/time,..." header line itself.
    df = read_csv(
        data_io, skiprows=1, header=None, names=column_names, index_col="timestamp"
    )

    df = handle_timestamps(df, matches["timezone"].group())

    ts_list = [
        Timeseries(
            ts=df[col],
            variable=col,
            location=data.get("location"),
            sensor=data.get("sensor"),
            # Every column other than "pressure" is labelled as degC.
            unit="cmH2O" if col == "pressure" else "degC",
        )
        for col in df.columns
    ]

    return ts_list
|
gensor/preprocessing.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""Class and methods for preprocessing groundwater level data."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Literal
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from pandas import Series
|
|
7
|
+
from scipy import stats
|
|
8
|
+
from sklearn.ensemble import IsolationForest
|
|
9
|
+
from sklearn.neighbors import LocalOutlierFactor
|
|
10
|
+
from sklearn.preprocessing import (
|
|
11
|
+
MaxAbsScaler,
|
|
12
|
+
MinMaxScaler,
|
|
13
|
+
RobustScaler,
|
|
14
|
+
StandardScaler,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Transform:
|
|
19
|
+
def __init__(
|
|
20
|
+
self,
|
|
21
|
+
data: Series,
|
|
22
|
+
method: Literal[
|
|
23
|
+
"difference",
|
|
24
|
+
"log",
|
|
25
|
+
"square_root",
|
|
26
|
+
"box_cox",
|
|
27
|
+
"standard_scaler",
|
|
28
|
+
"minmax_scaler",
|
|
29
|
+
"robust_scaler",
|
|
30
|
+
"maxabs_scaler",
|
|
31
|
+
],
|
|
32
|
+
**kwargs: Any,
|
|
33
|
+
) -> None:
|
|
34
|
+
self.data = data
|
|
35
|
+
|
|
36
|
+
if method == "difference":
|
|
37
|
+
self.transformed_data, self.scaler = self.difference(**kwargs)
|
|
38
|
+
elif method == "log":
|
|
39
|
+
self.transformed_data, self.scaler = self.log()
|
|
40
|
+
elif method == "square_root":
|
|
41
|
+
self.transformed_data, self.scaler = self.square_root()
|
|
42
|
+
elif method == "box_cox":
|
|
43
|
+
self.transformed_data, self.scaler = self.box_cox(**kwargs)
|
|
44
|
+
elif method == "standard_scaler":
|
|
45
|
+
self.transformed_data, self.scaler = self.standard_scaler()
|
|
46
|
+
elif method == "minmax_scaler":
|
|
47
|
+
self.transformed_data, self.scaler = self.minmax_scaler()
|
|
48
|
+
elif method == "robust_scaler":
|
|
49
|
+
self.transformed_data, self.scaler = self.robust_scaler()
|
|
50
|
+
elif method == "maxabs_scaler":
|
|
51
|
+
self.transformed_data, self.scaler = self.maxabs_scaler()
|
|
52
|
+
else:
|
|
53
|
+
raise NotImplementedError()
|
|
54
|
+
|
|
55
|
+
def get_transformation(self) -> tuple:
|
|
56
|
+
return self.transformed_data, self.scaler
|
|
57
|
+
|
|
58
|
+
def difference(self, **kwargs: int) -> tuple[Series, str]:
|
|
59
|
+
"""Difference the time series data.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
periods (int): The number of periods to shift. Defaults to 1.
|
|
63
|
+
|
|
64
|
+
Returns:
|
|
65
|
+
pandas.Series: The differenced time series data.
|
|
66
|
+
"""
|
|
67
|
+
periods = kwargs.get("periods", 1)
|
|
68
|
+
transformed = self.data.diff(periods=periods).dropna()
|
|
69
|
+
return (transformed, "difference")
|
|
70
|
+
|
|
71
|
+
def log(self) -> tuple[Series, str]:
|
|
72
|
+
"""Take the natural logarithm of the time series data.
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
pandas.Series: The natural logarithm of the time series data.
|
|
76
|
+
"""
|
|
77
|
+
transformed = self.data.apply(lambda x: x if x <= 0 else np.log(x))
|
|
78
|
+
return (transformed, "log")
|
|
79
|
+
|
|
80
|
+
def square_root(self) -> tuple[Series, str]:
|
|
81
|
+
"""Take the square root of the time series data.
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
pandas.Series: The square root of the time series data.
|
|
85
|
+
"""
|
|
86
|
+
transformed = self.data.apply(lambda x: x if x <= 0 else np.sqrt(x))
|
|
87
|
+
return (transformed, "square_root")
|
|
88
|
+
|
|
89
|
+
def box_cox(self, **kwargs: float) -> tuple[Series, str]:
|
|
90
|
+
"""Apply the Box-Cox transformation to the time series data. Only works
|
|
91
|
+
for all positive datasets!
|
|
92
|
+
|
|
93
|
+
Args:
|
|
94
|
+
lmbda (float): The transformation parameter. Defaults to 0.
|
|
95
|
+
|
|
96
|
+
Returns:
|
|
97
|
+
pandas.Series: The Box-Cox transformed time series data.
|
|
98
|
+
"""
|
|
99
|
+
|
|
100
|
+
lmbda = kwargs.get("lmbda", None)
|
|
101
|
+
|
|
102
|
+
if (self.data <= 0).any():
|
|
103
|
+
message = (
|
|
104
|
+
"Box-Cox transformation requires all values to be strictly positive."
|
|
105
|
+
)
|
|
106
|
+
raise ValueError(message)
|
|
107
|
+
|
|
108
|
+
if not lmbda:
|
|
109
|
+
result = stats.boxcox(self.data, lmbda=lmbda)
|
|
110
|
+
transformed_series = Series(result, index=self.data.index)
|
|
111
|
+
else:
|
|
112
|
+
result = stats.boxcox(self.data, lmbda=lmbda)
|
|
113
|
+
transformed_series = Series(result[0], index=self.data.index)
|
|
114
|
+
|
|
115
|
+
return transformed_series, "box-cox"
|
|
116
|
+
|
|
117
|
+
def standard_scaler(self) -> tuple[Series, Any]:
|
|
118
|
+
"""Normalize a pandas Series using StandardScaler."""
|
|
119
|
+
scaler = StandardScaler()
|
|
120
|
+
scaled_values = scaler.fit_transform(
|
|
121
|
+
self.data.to_numpy().reshape(-1, 1)
|
|
122
|
+
).flatten()
|
|
123
|
+
scaled_series = Series(scaled_values, index=self.data.index)
|
|
124
|
+
return scaled_series, scaler
|
|
125
|
+
|
|
126
|
+
def minmax_scaler(self) -> tuple[Series, Any]:
|
|
127
|
+
"""Normalize a pandas Series using MinMaxScaler."""
|
|
128
|
+
scaler = MinMaxScaler()
|
|
129
|
+
scaled_values = scaler.fit_transform(
|
|
130
|
+
self.data.to_numpy().reshape(-1, 1)
|
|
131
|
+
).flatten()
|
|
132
|
+
scaled_series = Series(scaled_values, index=self.data.index)
|
|
133
|
+
return scaled_series, scaler
|
|
134
|
+
|
|
135
|
+
def robust_scaler(self) -> tuple[Series, Any]:
|
|
136
|
+
"""Normalize a pandas Series using RobustScaler."""
|
|
137
|
+
scaler = RobustScaler()
|
|
138
|
+
scaled_values = scaler.fit_transform(
|
|
139
|
+
self.data.to_numpy().reshape(-1, 1)
|
|
140
|
+
).flatten()
|
|
141
|
+
scaled_series = Series(scaled_values, index=self.data.index)
|
|
142
|
+
return scaled_series, scaler
|
|
143
|
+
|
|
144
|
+
def maxabs_scaler(self) -> tuple[Series, Any]:
|
|
145
|
+
"""Normalize a pandas Series using MaxAbsScaler."""
|
|
146
|
+
scaler = MaxAbsScaler()
|
|
147
|
+
scaled_values = scaler.fit_transform(
|
|
148
|
+
self.data.to_numpy().reshape(-1, 1)
|
|
149
|
+
).flatten()
|
|
150
|
+
scaled_series = Series(scaled_values, index=self.data.index)
|
|
151
|
+
return scaled_series, scaler
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class OutlierDetection:
|
|
155
|
+
"""Class for detecting outliers in time series data."""
|
|
156
|
+
|
|
157
|
+
def __init__(
|
|
158
|
+
self,
|
|
159
|
+
data: Series,
|
|
160
|
+
method: Literal["iqr", "zscore", "isolation_forest", "lof"],
|
|
161
|
+
**kwargs: Any,
|
|
162
|
+
) -> None:
|
|
163
|
+
"""Find outliers in a time series using the specified method."""
|
|
164
|
+
if method == "iqr":
|
|
165
|
+
self.outliers = self.iqr(data, **kwargs)
|
|
166
|
+
elif method == "zscore":
|
|
167
|
+
self.outliers = self.zscore(data, **kwargs)
|
|
168
|
+
elif method == "isolation_forest":
|
|
169
|
+
self.outliers = self.isolation_forest(data, **kwargs)
|
|
170
|
+
elif method == "lof":
|
|
171
|
+
self.outliers = self.lof(data, **kwargs)
|
|
172
|
+
else:
|
|
173
|
+
raise NotImplementedError()
|
|
174
|
+
|
|
175
|
+
def iqr(self, data: Series, **kwargs: float) -> Series:
|
|
176
|
+
"""Use interquartile range (IQR).
|
|
177
|
+
|
|
178
|
+
Parameters:
|
|
179
|
+
data (pandas.Series): The time series data.
|
|
180
|
+
|
|
181
|
+
Keyword Args:
|
|
182
|
+
k (float): The multiplier for the IQR to define the range. Defaults to 1.5.
|
|
183
|
+
|
|
184
|
+
Returns:
|
|
185
|
+
pandas.Series: Outliers detected in the data.
|
|
186
|
+
"""
|
|
187
|
+
|
|
188
|
+
k: float = kwargs.get("k", 1.5)
|
|
189
|
+
|
|
190
|
+
Q1 = data.quantile(0.25)
|
|
191
|
+
Q3 = data.quantile(0.75)
|
|
192
|
+
IQR = Q3 - Q1
|
|
193
|
+
|
|
194
|
+
lower_bound = Q1 - k * IQR
|
|
195
|
+
upper_bound = Q3 + k * IQR
|
|
196
|
+
|
|
197
|
+
outliers = data[(data < lower_bound) | (data > upper_bound)]
|
|
198
|
+
|
|
199
|
+
return outliers
|
|
200
|
+
|
|
201
|
+
def zscore(self, data: Series, **kwargs: float) -> Series:
|
|
202
|
+
"""Detect outliers in a time series using the z-score method.
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
data (pandas.Series): The time series data.
|
|
206
|
+
|
|
207
|
+
Keyword Args:
|
|
208
|
+
threshold (float): The threshold for the z-score method. Defaults to 3.0.
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
pandas.Series: Outliers detected in the data.
|
|
212
|
+
"""
|
|
213
|
+
|
|
214
|
+
threshold = kwargs.get("threshold", 3.0)
|
|
215
|
+
|
|
216
|
+
mean = data.mean()
|
|
217
|
+
std_dev = data.std()
|
|
218
|
+
|
|
219
|
+
outliers: Series = data[(data - mean).abs() > threshold * std_dev]
|
|
220
|
+
|
|
221
|
+
return outliers
|
|
222
|
+
|
|
223
|
+
def isolation_forest(self, data: Series, **kwargs: Any) -> Series:
|
|
224
|
+
"""Detect outliers in a time series using the isolation forest method.
|
|
225
|
+
|
|
226
|
+
Args:
|
|
227
|
+
data (pandas.Series): The time series data.
|
|
228
|
+
|
|
229
|
+
Keyword Args:
|
|
230
|
+
n_estimators (int): The number of base estimators in the ensemble. Defaults to 100.
|
|
231
|
+
max_samples (int | 'auto' | float): The number of samples to draw from X to train each base estimator. Defaults to 'auto'.
|
|
232
|
+
contamination (float): The proportion of outliers in the data. Defaults to 0.01.
|
|
233
|
+
max_features (int | float): The number of features to draw from X to train each base estimator. Defaults to 1.0.
|
|
234
|
+
bootstrap (bool): Whether to use bootstrapping when sampling the data. Defaults to False.
|
|
235
|
+
n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
|
|
236
|
+
random_state (int | RandomState | None): The random state to use. Defaults to None.
|
|
237
|
+
verbose (int): The verbosity level. Defaults to 0.
|
|
238
|
+
warm_start (bool): Whether to reuse the solution of the previous call to fit and add more estimators to the ensemble. Defaults to False.
|
|
239
|
+
|
|
240
|
+
Note:
|
|
241
|
+
For details on kwargs see: sklearn.ensemble.IsolationForest.
|
|
242
|
+
"""
|
|
243
|
+
|
|
244
|
+
X = data.to_numpy().reshape(-1, 1)
|
|
245
|
+
|
|
246
|
+
clf = IsolationForest(**kwargs)
|
|
247
|
+
clf.fit(X)
|
|
248
|
+
|
|
249
|
+
is_outlier = clf.predict(X)
|
|
250
|
+
outliers: Series = data[is_outlier == -1]
|
|
251
|
+
|
|
252
|
+
return outliers
|
|
253
|
+
|
|
254
|
+
def lof(self, data: Series, **kwargs: Any) -> Series:
|
|
255
|
+
"""Detect outliers in a time series using the local outlier factor (LOF) method.
|
|
256
|
+
|
|
257
|
+
Args:
|
|
258
|
+
data (pandas.Series): The time series data.
|
|
259
|
+
|
|
260
|
+
Keyword Args:
|
|
261
|
+
n_neighbors (int): The number of neighbors to consider for each sample. Defaults to 20.
|
|
262
|
+
algorithm (str): The algorithm to use. Either 'auto', 'ball_tree', 'kd_tree' or 'brute'. Defaults to 'auto'.
|
|
263
|
+
leaf_size (int): The leaf size of the tree. Defaults to 30.
|
|
264
|
+
metric (str): The distance metric to use. Defaults to 'minkowski'.
|
|
265
|
+
p (int): The power parameter for the Minkowski metric. Defaults to 2.
|
|
266
|
+
contamination (float): The proportion of outliers in the data. Defaults to 0.01.
|
|
267
|
+
novelty (bool): Whether to consider the samples as normal or outliers. Defaults to False.
|
|
268
|
+
n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
|
|
269
|
+
Note:
|
|
270
|
+
For details on kwargs see: sklearn.neighbors.LocalOutlierFactor.
|
|
271
|
+
"""
|
|
272
|
+
|
|
273
|
+
X = data.to_numpy().reshape(-1, 1)
|
|
274
|
+
|
|
275
|
+
clf = LocalOutlierFactor(**kwargs)
|
|
276
|
+
|
|
277
|
+
is_outlier = clf.fit_predict(X)
|
|
278
|
+
outliers: Series = data[is_outlier == -1]
|
|
279
|
+
|
|
280
|
+
return outliers
|
gensor/smoothing.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Tools for smoothing the data."""
|
|
2
|
+
|
|
3
|
+
from matplotlib import pyplot as plt
|
|
4
|
+
from pandas import Series
|
|
5
|
+
from sklearn.metrics import mean_squared_error
|
|
6
|
+
|
|
7
|
+
from .dtypes import Timeseries
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def smooth_data(
|
|
11
|
+
data: Timeseries,
|
|
12
|
+
window: int = 5,
|
|
13
|
+
method: str = "rolling_mean",
|
|
14
|
+
print_statistics: bool = False,
|
|
15
|
+
inplace: bool = False,
|
|
16
|
+
plot: bool = False,
|
|
17
|
+
) -> Series | None:
|
|
18
|
+
"""Smooth a time series using a rolling mean or median.
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
data (pandas.Series): The time series data.
|
|
22
|
+
window (int): The size of the window for the rolling mean or median. Defaults to 5.
|
|
23
|
+
method (str): The method to use for smoothing. Either 'rolling_mean' or 'rolling_median'. Defaults to 'rolling_mean'.
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
pandas.Series: The smoothed time series.
|
|
27
|
+
"""
|
|
28
|
+
if method == "rolling_mean":
|
|
29
|
+
smoothed_data = data.ts.rolling(window=window, center=True).mean()
|
|
30
|
+
elif method == "rolling_median":
|
|
31
|
+
smoothed_data = data.ts.rolling(window=window, center=True).median()
|
|
32
|
+
else:
|
|
33
|
+
raise NotImplementedError()
|
|
34
|
+
|
|
35
|
+
valid_indices = smoothed_data.notna()
|
|
36
|
+
original_data_aligned = data.ts[valid_indices]
|
|
37
|
+
smoothed_data_aligned = smoothed_data[valid_indices]
|
|
38
|
+
|
|
39
|
+
if print_statistics:
|
|
40
|
+
mse = mean_squared_error(original_data_aligned, smoothed_data_aligned)
|
|
41
|
+
print(f"Mean Squared Error of {method}: {mse:.2f}")
|
|
42
|
+
|
|
43
|
+
if plot:
|
|
44
|
+
plt.figure(figsize=(12, 6))
|
|
45
|
+
plt.plot(
|
|
46
|
+
data.timeseries.index, data.timeseries, label="Original Data", color="black"
|
|
47
|
+
)
|
|
48
|
+
plt.plot(
|
|
49
|
+
smoothed_data.index,
|
|
50
|
+
smoothed_data,
|
|
51
|
+
label=f"Moving Average ({method})",
|
|
52
|
+
color="green",
|
|
53
|
+
linestyle="dotted",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
plt.legend()
|
|
57
|
+
plt.title("Groundwater Level with Moving Average")
|
|
58
|
+
plt.xlabel("Date")
|
|
59
|
+
plt.ylabel("Groundwater Level")
|
|
60
|
+
plt.show()
|
|
61
|
+
|
|
62
|
+
if inplace:
|
|
63
|
+
data.ts = smoothed_data
|
|
64
|
+
return None
|
|
65
|
+
else:
|
|
66
|
+
return smoothed_data
|
gensor/trend.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Analyse trends in the logger data."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
from matplotlib import pyplot as plt
|
|
5
|
+
|
|
6
|
+
from .dtypes import Timeseries
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def trend_analysis(ts: Timeseries, plot=True) -> None:
    """Fit a straight line through the series and print its slope and intercept.

    The line is fitted against the position of each observation (0, 1, 2, ...),
    not against the timestamps themselves.

    Args:
        ts (Timeseries): Object whose `timeseries` attribute holds the values.
        plot (bool): Show the series together with the fitted line. Defaults to True.
    """
    # NOTE(review): other modules read the series from `.ts`; confirm that
    # `.timeseries` exists on Timeseries.
    positions = np.arange(len(ts.timeseries))

    # A degree-1 polyfit returns the coefficients highest power first.
    slope, intercept = np.polyfit(positions, ts.timeseries, 1)

    print(f"Slope: {slope}, Intercept: {intercept}")

    if not plot:
        return

    fitted = intercept + slope * positions

    plt.figure(figsize=(10, 5))
    plt.plot(ts.timeseries.index, ts.timeseries, label="Original Data")
    plt.plot(ts.timeseries.index, fitted, color="red", label="Trend Line")
    plt.xlabel("Time")
    plt.ylabel("Groundwater Level")
    plt.title("Groundwater Level Trend Analysis")
    plt.legend()
    plt.show()
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024, Mateusz Zawadzki
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: gensor
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Library for handling groundwater sensor data.
|
|
5
|
+
Home-page: https://github.com/zawadzkim/gensor
|
|
6
|
+
Author: Mateusz Zawadzki
|
|
7
|
+
Author-email: fzawadzkimat@outlook.com
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
|
13
|
+
Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
|
|
14
|
+
Requires-Dist: numpy (>=2.1.0,<3.0.0)
|
|
15
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
16
|
+
Requires-Dist: pandera (>=0.20.3,<0.21.0)
|
|
17
|
+
Requires-Dist: pydantic (>=2.8.2,<3.0.0)
|
|
18
|
+
Requires-Dist: pytz (>=2024.1,<2025.0)
|
|
19
|
+
Requires-Dist: scikit-learn (>=1.5.1,<2.0.0)
|
|
20
|
+
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
|
21
|
+
Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
|
|
22
|
+
Project-URL: Documentation, https://zawadzkim.github.io/gensor/
|
|
23
|
+
Project-URL: Repository, https://github.com/zawadzkim/gensor
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# gensor
|
|
27
|
+
|
|
28
|
+
[](https://img.shields.io/github/v/release/zawadzkim/gensor)
|
|
29
|
+
[](https://github.com/zawadzkim/gensor/actions/workflows/main.yml?query=branch%3Amain)
|
|
30
|
+
[](https://codecov.io/gh/zawadzkim/gensor)
|
|
31
|
+
[](https://img.shields.io/github/commit-activity/m/zawadzkim/gensor)
|
|
32
|
+
[](https://img.shields.io/github/license/zawadzkim/gensor)
|
|
33
|
+
|
|
34
|
+
Library for handling groundwater sensor data.
|
|
35
|
+
|
|
36
|
+
- **GitHub repository**: <https://github.com/zawadzkim/gensor/>
|
|
37
|
+
- **Documentation**: <https://zawadzkim.github.io/gensor/>
|
|
38
|
+
|
|
39
|
+
## Getting started with your project
|
|
40
|
+
|
|
41
|
+
First, create a repository on GitHub with the same name as this project, and then run the following commands:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
git init -b main
|
|
45
|
+
git add .
|
|
46
|
+
git commit -m "init commit"
|
|
47
|
+
git remote add origin git@github.com:zawadzkim/gensor.git
|
|
48
|
+
git push -u origin main
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Finally, install the environment and the pre-commit hooks with
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
make install
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
You are now ready to start development on your project!
|
|
58
|
+
The CI/CD pipeline will be triggered when you open a pull request, merge to main, or when you create a new release.
|
|
59
|
+
|
|
60
|
+
To finalize the set-up for publishing to PyPI or Artifactory, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/publishing/#set-up-for-pypi).
|
|
61
|
+
For activating the automatic documentation with MkDocs, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/mkdocs/#enabling-the-documentation-on-github).
|
|
62
|
+
To enable the code coverage reports, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/codecov/).
|
|
63
|
+
|
|
64
|
+
## Releasing a new version
|
|
65
|
+
|
|
66
|
+
- Create an API Token on [PyPI](https://pypi.org/).
|
|
67
|
+
- Add the API Token to your project's secrets with the name `PYPI_TOKEN` by visiting [this page](https://github.com/zawadzkim/gensor/settings/secrets/actions/new).
|
|
68
|
+
- Create a [new release](https://github.com/zawadzkim/gensor/releases/new) on GitHub.
|
|
69
|
+
- Create a new tag in the form `*.*.*`.
|
|
70
|
+
- For more details, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/cicd/#how-to-trigger-a-release).
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
Repository initiated with [fpgmaas/cookiecutter-poetry](https://github.com/fpgmaas/cookiecutter-poetry).
|
|
75
|
+
|