gensor 0.1.7__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {gensor-0.1.7 → gensor-0.2.2}/LICENSE +21 -21
  2. {gensor-0.1.7 → gensor-0.2.2}/PKG-INFO +1 -1
  3. gensor-0.2.2/gensor/__init__.py +29 -0
  4. {gensor-0.1.7 → gensor-0.2.2}/gensor/analysis/outliers.py +173 -173
  5. {gensor-0.1.7 → gensor-0.2.2}/gensor/analysis/stats.py +28 -28
  6. {gensor-0.1.7 → gensor-0.2.2}/gensor/config.py +17 -17
  7. {gensor-0.1.7 → gensor-0.2.2}/gensor/core/base.py +409 -374
  8. {gensor-0.1.7 → gensor-0.2.2}/gensor/core/dataset.py +203 -186
  9. {gensor-0.1.7 → gensor-0.2.2}/gensor/core/indexer.py +37 -32
  10. gensor-0.2.2/gensor/core/timeseries.py +78 -0
  11. {gensor-0.1.7 → gensor-0.2.2}/gensor/db/__init__.py +14 -14
  12. {gensor-0.1.7 → gensor-0.2.2}/gensor/db/connection.py +197 -144
  13. {gensor-0.1.7 → gensor-0.2.2}/gensor/exceptions.py +55 -55
  14. gensor-0.2.2/gensor/io/read.py +192 -0
  15. gensor-0.2.2/gensor/log.py +7 -0
  16. {gensor-0.1.7 → gensor-0.2.2}/gensor/parse/__init__.py +4 -4
  17. {gensor-0.1.7 → gensor-0.2.2}/gensor/parse/plain.py +61 -61
  18. {gensor-0.1.7 → gensor-0.2.2}/gensor/parse/utils.py +87 -67
  19. {gensor-0.1.7 → gensor-0.2.2}/gensor/parse/vanessen.py +89 -86
  20. {gensor-0.1.7 → gensor-0.2.2}/gensor/processing/compensation.py +195 -195
  21. {gensor-0.1.7 → gensor-0.2.2}/gensor/processing/smoothing.py +66 -66
  22. {gensor-0.1.7 → gensor-0.2.2}/gensor/processing/transform.py +148 -148
  23. {gensor-0.1.7 → gensor-0.2.2}/gensor/testdata/__init__.py +25 -25
  24. {gensor-0.1.7 → gensor-0.2.2}/pyproject.toml +121 -121
  25. gensor-0.1.7/gensor/__init__.py +0 -20
  26. gensor-0.1.7/gensor/core/timeseries.py +0 -147
  27. gensor-0.1.7/gensor/io/read.py +0 -169
  28. {gensor-0.1.7 → gensor-0.2.2}/README.md +0 -0
  29. {gensor-0.1.7 → gensor-0.2.2}/gensor/analysis/__init__.py +0 -0
  30. {gensor-0.1.7 → gensor-0.2.2}/gensor/core/__init__.py +0 -0
  31. {gensor-0.1.7 → gensor-0.2.2}/gensor/io/__init__.py +0 -0
  32. {gensor-0.1.7 → gensor-0.2.2}/gensor/processing/__init__.py +0 -0
  33. {gensor-0.1.7 → gensor-0.2.2}/gensor/testdata/Barodiver_220427183008_BY222.csv +0 -0
  34. {gensor-0.1.7 → gensor-0.2.2}/gensor/testdata/PB01A_moni_AV319_220427183019_AV319.csv +0 -0
  35. {gensor-0.1.7 → gensor-0.2.2}/gensor/testdata/PB02A_plain.csv +0 -0
  36. {gensor-0.1.7 → gensor-0.2.2}/py.typed +0 -0
@@ -1,21 +1,21 @@
1
- MIT License
2
-
3
- Copyright (c) 2024, Mateusz Zawadzki
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
1
+ MIT License
2
+
3
+ Copyright (c) 2024, Mateusz Zawadzki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gensor
3
- Version: 0.1.7
3
+ Version: 0.2.2
4
4
  Summary: Library for handling groundwater sensor data.
5
5
  Home-page: https://github.com/zawadzkim/gensor
6
6
  Author: Mateusz Zawadzki
@@ -0,0 +1,29 @@
1
+ import logging
2
+
3
+ from .core.dataset import Dataset
4
+ from .core.timeseries import Timeseries
5
+ from .io.read import read_from_csv, read_from_sql
6
+ from .log import set_log_level
7
+ from .processing.compensation import compensate
8
+
9
# Names re-exported as the public API of the package.
__all__ = [
    # core containers and the compensation entry point
    "Dataset",
    "Timeseries",
    "compensate",
    # readers and logging control
    "read_from_csv",
    "read_from_sql",
    "set_log_level",
]


# Package-level logger; records at INFO and above are let through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Only attach a console handler when no handler is reachable from this logger
# (hasHandlers() also looks at ancestor loggers), so an application that has
# already configured logging does not get duplicated output.
if not logger.hasHandlers():
    formatter = logging.Formatter("%(levelname)s: %(message)s")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(logging.INFO)
    logger.addHandler(console_handler)
@@ -1,173 +1,173 @@
1
- from collections.abc import Callable
2
- from typing import Any, Literal
3
-
4
- import numpy as np
5
- from pandas import Series
6
- from sklearn.ensemble import IsolationForest
7
- from sklearn.neighbors import LocalOutlierFactor
8
-
9
-
10
- class OutlierDetection:
11
- """Detecting outliers in groundwater timeseries data.
12
-
13
- Each method in this class returns a pandas.Series containing predicted outliers in
14
- the dataset.
15
-
16
- Methods:
17
- iqr: Use interquartile range (IQR).
18
- zscore: Use the z-score method.
19
- isolation_forest: Using the isolation forest algorithm.
20
- lof: Using the local outlier factor (LOF) method.
21
- """
22
-
23
- def __init__(
24
- self,
25
- data: Series,
26
- method: Literal["iqr", "zscore", "isolation_forest", "lof"],
27
- rolling: bool,
28
- window: int,
29
- **kwargs: Any,
30
- ) -> None:
31
- """Find outliers in a time series using the specified method, with an option for rolling window."""
32
-
33
- FUNCS: dict[str, Callable] = {
34
- "iqr": self.iqr,
35
- "zscore": self.zscore,
36
- "isolation_forest": self.isolation_forest,
37
- "lof": self.lof,
38
- }
39
-
40
- method_func = FUNCS[method]
41
-
42
- if method in ["iqr", "zscore"]:
43
- # For 'iqr' and 'zscore' methods
44
- y = (
45
- kwargs.get("k", 1.5)
46
- if method == "iqr"
47
- else kwargs.get("threshold", 3.0)
48
- )
49
- if rolling:
50
- roll = data.rolling(window=window)
51
- mask = roll.apply(lambda x: method_func(x, y, rolling=True), raw=True)
52
- else:
53
- mask = method_func(data.to_numpy(), y, rolling=False)
54
-
55
- bool_mask = mask.astype(bool)
56
- bool_mask_series = Series(bool_mask, index=data.index)
57
- self.outliers = data[bool_mask_series]
58
-
59
- else:
60
- # For 'isolation_forest' and 'lof' methods
61
- self.outliers = method_func(data, **kwargs)
62
-
63
- @staticmethod
64
- def iqr(data: np.ndarray, k: float, rolling: bool) -> np.ndarray:
65
- """Use interquartile range (IQR).
66
-
67
- Parameters:
68
- data (pandas.Series): The time series data.
69
-
70
- Keyword Args:
71
- k (float): The multiplier for the IQR to define the range. Defaults to 1.5.
72
-
73
- Returns:
74
- np.ndarray: Binary mask representing the outliers as 1.
75
- """
76
-
77
- Q1 = np.percentile(data, 0.25)
78
- Q3 = np.percentile(data, 0.75)
79
- IQR = Q3 - Q1
80
-
81
- lower_bound = Q1 - k * IQR
82
- upper_bound = Q3 + k * IQR
83
-
84
- if rolling:
85
- return (
86
- np.array([1])
87
- if (data[-1] < lower_bound or data[-1] > upper_bound)
88
- else np.array([0])
89
- )
90
-
91
- return np.where((data < lower_bound) | (data > upper_bound), 1, 0)
92
-
93
- @staticmethod
94
- def zscore(data: np.ndarray, threshold: float, rolling: bool) -> np.ndarray:
95
- """Use the z-score method.
96
-
97
- Parameters:
98
- data (pandas.Series): The time series data.
99
-
100
- Keyword Args:
101
- threshold (float): The threshold for the z-score method. Defaults to 3.0.
102
-
103
- Returns:
104
- pandas.Series: Binary mask representing outliers.
105
- """
106
-
107
- mean = np.mean(data)
108
- std_dev = np.std(data)
109
-
110
- z_scores = np.abs((data - mean) / std_dev)
111
-
112
- if rolling:
113
- return np.array([1]) if z_scores[-1] > threshold else np.array([0])
114
- return np.where(z_scores > threshold, 1, 0)
115
-
116
- def isolation_forest(self, data: Series, **kwargs: Any) -> Series:
117
- """Using the isolation forest algorithm.
118
-
119
- Parameters:
120
- data (pandas.Series): The time series data.
121
-
122
- Keyword Args:
123
- n_estimators (int): The number of base estimators in the ensemble. Defaults to 100.
124
- max_samples (int | 'auto' | float): The number of samples to draw from X to train each base estimator. Defaults to 'auto'.
125
- contamination (float): The proportion of outliers in the data. Defaults to 0.01.
126
- max_features (int | float): The number of features to draw from X to train each base estimator. Defaults to 1.0.
127
- bootstrap (bool): Whether to use bootstrapping when sampling the data. Defaults to False.
128
- n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
129
- random_state (int | RandomState | None): The random state to use. Defaults to None.
130
- verbose (int): The verbosity level. Defaults to 0.
131
- warm_start (bool): Whether to reuse the solution of the previous call to fit and add more estimators to the ensemble. Defaults to False.
132
-
133
- Note:
134
- For details on kwargs see: sklearn.ensemble.IsolationForest.
135
- """
136
-
137
- X = data.to_numpy().reshape(-1, 1)
138
-
139
- clf = IsolationForest(**kwargs)
140
- clf.fit(X)
141
-
142
- is_outlier = clf.predict(X)
143
- outliers: Series = data[is_outlier == -1]
144
-
145
- return outliers
146
-
147
- def lof(self, data: Series, **kwargs: Any) -> Series:
148
- """Using the local outlier factor (LOF) method.
149
-
150
- Parameters:
151
- data (pandas.Series): The time series data.
152
-
153
- Keyword Args:
154
- n_neighbors (int): The number of neighbors to consider for each sample. Defaults to 20.
155
- algorithm (str): The algorithm to use. Either 'auto', 'ball_tree', 'kd_tree' or 'brute'. Defaults to 'auto'.
156
- leaf_size (int): The leaf size of the tree. Defaults to 30.
157
- metric (str): The distance metric to use. Defaults to 'minkowski'.
158
- p (int): The power parameter for the Minkowski metric. Defaults to 2.
159
- contamination (float): The proportion of outliers in the data. Defaults to 0.01.
160
- novelty (bool): Whether to consider the samples as normal or outliers. Defaults to False.
161
- n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
162
- Note:
163
- For details on kwargs see: sklearn.neighbors.LocalOutlierFactor.
164
- """
165
-
166
- X = data.to_numpy().reshape(-1, 1)
167
-
168
- clf = LocalOutlierFactor(**kwargs)
169
-
170
- is_outlier = clf.fit_predict(X)
171
- outliers: Series = data[is_outlier == -1]
172
-
173
- return outliers
1
+ from collections.abc import Callable
2
+ from typing import Any, Literal
3
+
4
+ import numpy as np
5
+ from pandas import Series
6
+ from sklearn.ensemble import IsolationForest
7
+ from sklearn.neighbors import LocalOutlierFactor
8
+
9
+
10
class OutlierDetection:
    """Detecting outliers in groundwater timeseries data.

    The detection runs inside the constructor. The observations that were
    flagged are stored in the ``outliers`` attribute as a pandas.Series that
    is a subset of the input data.

    Methods:
        iqr: Use interquartile range (IQR).
        zscore: Use the z-score method.
        isolation_forest: Using the isolation forest algorithm.
        lof: Using the local outlier factor (LOF) method.
    """

    def __init__(
        self,
        data: Series,
        method: Literal["iqr", "zscore", "isolation_forest", "lof"],
        rolling: bool,
        window: int,
        **kwargs: Any,
    ) -> None:
        """Find outliers in a time series using the specified method, with an option for rolling window.

        Parameters:
            data (pandas.Series): The time series data.
            method (str): One of 'iqr', 'zscore', 'isolation_forest' or 'lof'.
            rolling (bool): For 'iqr' and 'zscore' only: evaluate each
                observation against the window that ends at it instead of
                against the whole series.
            window (int): Rolling window size; only used when `rolling` is True.

        Keyword Args:
            k (float): IQR multiplier for the 'iqr' method. Defaults to 1.5.
            threshold (float): Z-score threshold for the 'zscore' method.
                Defaults to 3.0.
            **kwargs: For 'isolation_forest' and 'lof' every keyword argument
                is forwarded unchanged to the scikit-learn estimator.

        Raises:
            KeyError: If `method` is not one of the supported names.
        """

        FUNCS: dict[str, Callable] = {
            "iqr": self.iqr,
            "zscore": self.zscore,
            "isolation_forest": self.isolation_forest,
            "lof": self.lof,
        }

        method_func = FUNCS[method]

        if method in ["iqr", "zscore"]:
            # For 'iqr' and 'zscore' methods: pick the single tuning parameter.
            y = (
                kwargs.get("k", 1.5)
                if method == "iqr"
                else kwargs.get("threshold", 3.0)
            )
            if rolling:
                roll = data.rolling(window=window)
                # The callback must hand back a scalar, so unwrap the
                # one-element mask returned in rolling mode.
                mask = roll.apply(
                    lambda x: float(method_func(x, y, rolling=True)[0]),
                    raw=True,
                )
                # Positions without a complete window come back as NaN, and
                # NaN would become True under astype(bool). Treat them as
                # "not an outlier" instead of flagging them all.
                mask = mask.fillna(0)
            else:
                mask = method_func(data.to_numpy(), y, rolling=False)

            bool_mask = mask.astype(bool)
            bool_mask_series = Series(bool_mask, index=data.index)
            self.outliers = data[bool_mask_series]

        else:
            # For 'isolation_forest' and 'lof' methods
            self.outliers = method_func(data, **kwargs)

    @staticmethod
    def iqr(data: np.ndarray, k: float, rolling: bool) -> np.ndarray:
        """Use interquartile range (IQR).

        Parameters:
            data (np.ndarray): The values to evaluate (the whole series, or one
                rolling window).
            k (float): The multiplier for the IQR to define the range.
            rolling (bool): If True, only the last element of `data` is
                classified and a one-element array is returned.

        Returns:
            np.ndarray: Binary mask representing the outliers as 1.
        """

        # np.percentile expects q on a 0-100 scale.
        Q1 = np.percentile(data, 25)
        Q3 = np.percentile(data, 75)
        IQR = Q3 - Q1

        lower_bound = Q1 - k * IQR
        upper_bound = Q3 + k * IQR

        if rolling:
            return (
                np.array([1])
                if (data[-1] < lower_bound or data[-1] > upper_bound)
                else np.array([0])
            )

        return np.where((data < lower_bound) | (data > upper_bound), 1, 0)

    @staticmethod
    def zscore(data: np.ndarray, threshold: float, rolling: bool) -> np.ndarray:
        """Use the z-score method.

        Parameters:
            data (np.ndarray): The values to evaluate (the whole series, or one
                rolling window).
            threshold (float): Absolute z-score above which a value is an outlier.
            rolling (bool): If True, only the last element of `data` is
                classified and a one-element array is returned.

        Returns:
            np.ndarray: Binary mask representing the outliers as 1.
        """

        mean = np.mean(data)
        std_dev = np.std(data)

        if std_dev == 0:
            # All values are identical: nothing deviates from the mean, and
            # dividing by zero would only produce NaNs and a runtime warning.
            return np.array([0]) if rolling else np.zeros(np.shape(data), dtype=int)

        z_scores = np.abs((data - mean) / std_dev)

        if rolling:
            return np.array([1]) if z_scores[-1] > threshold else np.array([0])
        return np.where(z_scores > threshold, 1, 0)

    def isolation_forest(self, data: Series, **kwargs: Any) -> Series:
        """Using the isolation forest algorithm.

        Parameters:
            data (pandas.Series): The time series data.

        Keyword Args:
            **kwargs: Forwarded unchanged to sklearn.ensemble.IsolationForest
                (e.g. n_estimators, max_samples, contamination, max_features,
                bootstrap, n_jobs, random_state, verbose, warm_start). Options
                that are not given keep scikit-learn's own defaults.

        Returns:
            pandas.Series: The observations the fitted model predicts as -1.

        Note:
            For details on kwargs see: sklearn.ensemble.IsolationForest.
        """

        # The estimator expects a 2-D array: one column, one row per observation.
        X = data.to_numpy().reshape(-1, 1)

        clf = IsolationForest(**kwargs)
        clf.fit(X)

        is_outlier = clf.predict(X)
        outliers: Series = data[is_outlier == -1]

        return outliers

    def lof(self, data: Series, **kwargs: Any) -> Series:
        """Using the local outlier factor (LOF) method.

        Parameters:
            data (pandas.Series): The time series data.

        Keyword Args:
            **kwargs: Forwarded unchanged to sklearn.neighbors.LocalOutlierFactor
                (e.g. n_neighbors, algorithm, leaf_size, metric, p,
                contamination, novelty, n_jobs). Options that are not given
                keep scikit-learn's own defaults.

        Returns:
            pandas.Series: The observations for which fit_predict returns -1.

        Note:
            For details on kwargs see: sklearn.neighbors.LocalOutlierFactor.
        """

        # The estimator expects a 2-D array: one column, one row per observation.
        X = data.to_numpy().reshape(-1, 1)

        clf = LocalOutlierFactor(**kwargs)

        is_outlier = clf.fit_predict(X)
        outliers: Series = data[is_outlier == -1]

        return outliers
@@ -1,28 +1,28 @@
1
- """Module to compute timeseries statistics, similar to pastas.stats.signatures module
2
- and following Heudorfer et al. 2019
3
-
4
- To be implemented:
5
-
6
- - Structure
7
- * Flashiness
8
- - Distribution
9
- * Modality
10
- * Density
11
- - Shape
12
- * Scale
13
- * Slope
14
- """
15
-
16
- import numpy as np
17
-
18
- from gensor.core.timeseries import Timeseries
19
-
20
-
21
- def trend(ts: Timeseries) -> tuple:
22
- time_numeric = np.arange(len(ts.timeseries))
23
-
24
- # Perform linear regression using numpy's polyfit
25
- # This returns the slope and intercept of the best fit line
26
- slope, intercept = np.polyfit(time_numeric, ts.timeseries, 1)
27
-
28
- return slope, intercept
1
+ """Module to compute timeseries statistics, similar to pastas.stats.signatures module
2
+ and following Heudorfer et al. 2019
3
+
4
+ To be implemented:
5
+
6
+ - Structure
7
+ * Flashiness
8
+ - Distribution
9
+ * Modality
10
+ * Density
11
+ - Shape
12
+ * Scale
13
+ * Slope
14
+ """
15
+
16
+ import numpy as np
17
+
18
+ from gensor.core.timeseries import Timeseries
19
+
20
+
21
def trend(ts: Timeseries) -> tuple:
    """Fit a straight line through the values of a timeseries.

    The regression is done against the sample position (0, 1, 2, ...), not
    against the timestamps themselves, so the slope is expressed per sample.

    Parameters:
        ts (Timeseries): Object whose `timeseries` attribute holds the values
            (presumably a pandas.Series; anything np.polyfit accepts works).

    Returns:
        tuple: (slope, intercept) of the least-squares line.
    """
    sample_positions = np.arange(len(ts.timeseries))

    # A degree-1 polynomial fit yields the coefficients highest power first.
    coefficients = np.polyfit(sample_positions, ts.timeseries, 1)
    slope, intercept = coefficients

    return slope, intercept
@@ -1,17 +1,17 @@
1
- """
2
- !!! warning
3
-
4
- Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
5
- 'vanessen'), the timestamps are localized and converted to UTC. Therefore, if the
6
- user creates his own timeseries outside the read_from_csv, they should ensure that
7
- the timestamps are in UTC format.
8
- """
9
-
10
- VARIABLE_TYPES_AND_UNITS = {
11
- "temperature": ["degc"],
12
- "pressure": ["cmh2o", "mmh2o"],
13
- "conductivity": ["ms/cm"],
14
- "flux": ["m/s"],
15
- "head": ["m asl"],
16
- "depth": ["m"],
17
- }
1
+ """
2
+ !!! warning
3
+
4
+ Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
5
+ 'vanessen'), the timestamps are localized and converted to UTC. Therefore, users
6
+ who create their own timeseries outside of read_from_csv should ensure that
7
+ the timestamps are in UTC.
8
+ """
9
+
10
# Maps each variable type name to the list of unit strings defined for it.
# All names and units are written in lower case.
VARIABLE_TYPES_AND_UNITS = {
    "temperature": ["degc"],
    "pressure": ["cmh2o", "mmh2o"],
    "conductivity": ["ms/cm"],
    "flux": ["m/s"],
    "head": ["m asl"],
    "depth": ["m"],
}