gensor 0.2.4__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {gensor-0.2.4 → gensor-0.4.0}/PKG-INFO +1 -1
  2. {gensor-0.2.4 → gensor-0.4.0}/gensor/__init__.py +33 -29
  3. {gensor-0.2.4 → gensor-0.4.0}/gensor/analysis/outliers.py +173 -173
  4. {gensor-0.2.4 → gensor-0.4.0}/gensor/analysis/stats.py +28 -28
  5. {gensor-0.2.4 → gensor-0.4.0}/gensor/config.py +17 -17
  6. {gensor-0.2.4 → gensor-0.4.0}/gensor/core/base.py +423 -412
  7. gensor-0.4.0/gensor/core/dataset.py +944 -0
  8. {gensor-0.2.4 → gensor-0.4.0}/gensor/core/indexer.py +37 -37
  9. {gensor-0.2.4 → gensor-0.4.0}/gensor/core/timeseries.py +78 -78
  10. {gensor-0.2.4 → gensor-0.4.0}/gensor/db/__init__.py +14 -14
  11. {gensor-0.2.4 → gensor-0.4.0}/gensor/db/connection.py +197 -197
  12. {gensor-0.2.4 → gensor-0.4.0}/gensor/exceptions.py +55 -55
  13. {gensor-0.2.4 → gensor-0.4.0}/gensor/io/read.py +192 -192
  14. {gensor-0.2.4 → gensor-0.4.0}/gensor/log.py +7 -7
  15. {gensor-0.2.4 → gensor-0.4.0}/gensor/parse/__init__.py +4 -4
  16. {gensor-0.2.4 → gensor-0.4.0}/gensor/parse/plain.py +61 -61
  17. gensor-0.4.0/gensor/parse/utils.py +143 -0
  18. {gensor-0.2.4 → gensor-0.4.0}/gensor/parse/vanessen.py +119 -89
  19. gensor-0.4.0/gensor/processing/compensation.py +317 -0
  20. {gensor-0.2.4 → gensor-0.4.0}/gensor/processing/smoothing.py +66 -66
  21. {gensor-0.2.4 → gensor-0.4.0}/gensor/processing/transform.py +148 -148
  22. {gensor-0.2.4 → gensor-0.4.0}/gensor/testdata/__init__.py +25 -25
  23. {gensor-0.2.4 → gensor-0.4.0}/pyproject.toml +2 -1
  24. gensor-0.2.4/gensor/core/dataset.py +0 -207
  25. gensor-0.2.4/gensor/parse/utils.py +0 -87
  26. gensor-0.2.4/gensor/processing/compensation.py +0 -195
  27. {gensor-0.2.4 → gensor-0.4.0}/LICENSE +0 -0
  28. {gensor-0.2.4 → gensor-0.4.0}/README.md +0 -0
  29. {gensor-0.2.4 → gensor-0.4.0}/gensor/analysis/__init__.py +0 -0
  30. {gensor-0.2.4 → gensor-0.4.0}/gensor/core/__init__.py +0 -0
  31. {gensor-0.2.4 → gensor-0.4.0}/gensor/io/__init__.py +0 -0
  32. {gensor-0.2.4 → gensor-0.4.0}/gensor/processing/__init__.py +0 -0
  33. {gensor-0.2.4 → gensor-0.4.0}/gensor/testdata/Barodiver_220427183008_BY222.csv +0 -0
  34. {gensor-0.2.4 → gensor-0.4.0}/gensor/testdata/PB01A_moni_AV319_220427183019_AV319.csv +0 -0
  35. {gensor-0.2.4 → gensor-0.4.0}/gensor/testdata/PB02A_plain.csv +0 -0
  36. {gensor-0.2.4 → gensor-0.4.0}/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gensor
3
- Version: 0.2.4
3
+ Version: 0.4.0
4
4
  Summary: Library for handling groundwater sensor data.
5
5
  Home-page: https://github.com/zawadzkim/gensor
6
6
  Author: Mateusz Zawadzki
@@ -1,29 +1,33 @@
1
- import logging
2
-
3
- from .core.dataset import Dataset
4
- from .core.timeseries import Timeseries
5
- from .io.read import read_from_csv, read_from_sql
6
- from .log import set_log_level
7
- from .processing.compensation import compensate
8
-
9
- __all__ = [
10
- # basic data types
11
- "Dataset",
12
- "Timeseries",
13
- "compensate",
14
- # getters
15
- "read_from_csv",
16
- "read_from_sql",
17
- "set_log_level",
18
- ]
19
-
20
-
21
- logger = logging.getLogger(__name__)
22
- logger.setLevel(logging.INFO)
23
-
24
- if not logger.hasHandlers():
25
- console_handler = logging.StreamHandler()
26
- console_handler.setLevel(logging.INFO)
27
- formatter = logging.Formatter("%(levelname)s: %(message)s")
28
- console_handler.setFormatter(formatter)
29
- logger.addHandler(console_handler)
1
+ import logging
2
+
3
+ from .core.dataset import Dataset, Where, diff
4
+ from .core.timeseries import Timeseries
5
+ from .io.read import read_from_csv, read_from_sql
6
+ from .log import set_log_level
7
+ from .processing.compensation import compensate, water_column
8
+
9
# Names re-exported as the public API of the package.
__all__ = [
    # basic data types
    "Dataset",
    "Timeseries",
    "Where",
    "compensate",
    "water_column",
    # comparison
    "diff",
    # getters
    "read_from_csv",
    "read_from_sql",
    "set_log_level",
]


# Package-level logger; records below INFO are dropped.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Install a console handler only when hasHandlers() reports that no handler is
# configured yet, so an application's own logging setup is not duplicated.
if not logger.hasHandlers():
    formatter = logging.Formatter("%(levelname)s: %(message)s")
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    console_handler.setLevel(logging.INFO)
    logger.addHandler(console_handler)
@@ -1,173 +1,173 @@
1
- from collections.abc import Callable
2
- from typing import Any, Literal
3
-
4
- import numpy as np
5
- from pandas import Series
6
- from sklearn.ensemble import IsolationForest
7
- from sklearn.neighbors import LocalOutlierFactor
8
-
9
-
10
- class OutlierDetection:
11
- """Detecting outliers in groundwater timeseries data.
12
-
13
- Each method in this class returns a pandas.Series containing predicted outliers in
14
- the dataset.
15
-
16
- Methods:
17
- iqr: Use interquartile range (IQR).
18
- zscore: Use the z-score method.
19
- isolation_forest: Using the isolation forest algorithm.
20
- lof: Using the local outlier factor (LOF) method.
21
- """
22
-
23
- def __init__(
24
- self,
25
- data: Series,
26
- method: Literal["iqr", "zscore", "isolation_forest", "lof"],
27
- rolling: bool,
28
- window: int,
29
- **kwargs: Any,
30
- ) -> None:
31
- """Find outliers in a time series using the specified method, with an option for rolling window."""
32
-
33
- FUNCS: dict[str, Callable] = {
34
- "iqr": self.iqr,
35
- "zscore": self.zscore,
36
- "isolation_forest": self.isolation_forest,
37
- "lof": self.lof,
38
- }
39
-
40
- method_func = FUNCS[method]
41
-
42
- if method in ["iqr", "zscore"]:
43
- # For 'iqr' and 'zscore' methods
44
- y = (
45
- kwargs.get("k", 1.5)
46
- if method == "iqr"
47
- else kwargs.get("threshold", 3.0)
48
- )
49
- if rolling:
50
- roll = data.rolling(window=window)
51
- mask = roll.apply(lambda x: method_func(x, y, rolling=True), raw=True)
52
- else:
53
- mask = method_func(data.to_numpy(), y, rolling=False)
54
-
55
- bool_mask = mask.astype(bool)
56
- bool_mask_series = Series(bool_mask, index=data.index)
57
- self.outliers = data[bool_mask_series]
58
-
59
- else:
60
- # For 'isolation_forest' and 'lof' methods
61
- self.outliers = method_func(data, **kwargs)
62
-
63
- @staticmethod
64
- def iqr(data: np.ndarray, k: float, rolling: bool) -> np.ndarray:
65
- """Use interquartile range (IQR).
66
-
67
- Parameters:
68
- data (pandas.Series): The time series data.
69
-
70
- Keyword Args:
71
- k (float): The multiplier for the IQR to define the range. Defaults to 1.5.
72
-
73
- Returns:
74
- np.ndarray: Binary mask representing the outliers as 1.
75
- """
76
-
77
- Q1 = np.percentile(data, 0.25)
78
- Q3 = np.percentile(data, 0.75)
79
- IQR = Q3 - Q1
80
-
81
- lower_bound = Q1 - k * IQR
82
- upper_bound = Q3 + k * IQR
83
-
84
- if rolling:
85
- return (
86
- np.array([1])
87
- if (data[-1] < lower_bound or data[-1] > upper_bound)
88
- else np.array([0])
89
- )
90
-
91
- return np.where((data < lower_bound) | (data > upper_bound), 1, 0)
92
-
93
- @staticmethod
94
- def zscore(data: np.ndarray, threshold: float, rolling: bool) -> np.ndarray:
95
- """Use the z-score method.
96
-
97
- Parameters:
98
- data (pandas.Series): The time series data.
99
-
100
- Keyword Args:
101
- threshold (float): The threshold for the z-score method. Defaults to 3.0.
102
-
103
- Returns:
104
- pandas.Series: Binary mask representing outliers.
105
- """
106
-
107
- mean = np.mean(data)
108
- std_dev = np.std(data)
109
-
110
- z_scores = np.abs((data - mean) / std_dev)
111
-
112
- if rolling:
113
- return np.array([1]) if z_scores[-1] > threshold else np.array([0])
114
- return np.where(z_scores > threshold, 1, 0)
115
-
116
- def isolation_forest(self, data: Series, **kwargs: Any) -> Series:
117
- """Using the isolation forest algorithm.
118
-
119
- Parameters:
120
- data (pandas.Series): The time series data.
121
-
122
- Keyword Args:
123
- n_estimators (int): The number of base estimators in the ensemble. Defaults to 100.
124
- max_samples (int | 'auto' | float): The number of samples to draw from X to train each base estimator. Defaults to 'auto'.
125
- contamination (float): The proportion of outliers in the data. Defaults to 0.01.
126
- max_features (int | float): The number of features to draw from X to train each base estimator. Defaults to 1.0.
127
- bootstrap (bool): Whether to use bootstrapping when sampling the data. Defaults to False.
128
- n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
129
- random_state (int | RandomState | None): The random state to use. Defaults to None.
130
- verbose (int): The verbosity level. Defaults to 0.
131
- warm_start (bool): Whether to reuse the solution of the previous call to fit and add more estimators to the ensemble. Defaults to False.
132
-
133
- Note:
134
- For details on kwargs see: sklearn.ensemble.IsolationForest.
135
- """
136
-
137
- X = data.to_numpy().reshape(-1, 1)
138
-
139
- clf = IsolationForest(**kwargs)
140
- clf.fit(X)
141
-
142
- is_outlier = clf.predict(X)
143
- outliers: Series = data[is_outlier == -1]
144
-
145
- return outliers
146
-
147
- def lof(self, data: Series, **kwargs: Any) -> Series:
148
- """Using the local outlier factor (LOF) method.
149
-
150
- Parameters:
151
- data (pandas.Series): The time series data.
152
-
153
- Keyword Args:
154
- n_neighbors (int): The number of neighbors to consider for each sample. Defaults to 20.
155
- algorithm (str): The algorithm to use. Either 'auto', 'ball_tree', 'kd_tree' or 'brute'. Defaults to 'auto'.
156
- leaf_size (int): The leaf size of the tree. Defaults to 30.
157
- metric (str): The distance metric to use. Defaults to 'minkowski'.
158
- p (int): The power parameter for the Minkowski metric. Defaults to 2.
159
- contamination (float): The proportion of outliers in the data. Defaults to 0.01.
160
- novelty (bool): Whether to consider the samples as normal or outliers. Defaults to False.
161
- n_jobs (int): The number of jobs to run in parallel. Defaults to 1.
162
- Note:
163
- For details on kwargs see: sklearn.neighbors.LocalOutlierFactor.
164
- """
165
-
166
- X = data.to_numpy().reshape(-1, 1)
167
-
168
- clf = LocalOutlierFactor(**kwargs)
169
-
170
- is_outlier = clf.fit_predict(X)
171
- outliers: Series = data[is_outlier == -1]
172
-
173
- return outliers
1
+ from collections.abc import Callable
2
+ from typing import Any, Literal
3
+
4
+ import numpy as np
5
+ from pandas import Series
6
+ from sklearn.ensemble import IsolationForest
7
+ from sklearn.neighbors import LocalOutlierFactor
8
+
9
+
10
class OutlierDetection:
    """Detect outliers in groundwater timeseries data.

    Detection runs when the object is constructed. The observations that were
    classified as outliers are stored in the ``outliers`` attribute as a
    pandas.Series that is a subset of the input data.

    Methods:
        iqr: Use interquartile range (IQR).
        zscore: Use the z-score method.
        isolation_forest: Using the isolation forest algorithm.
        lof: Using the local outlier factor (LOF) method.
    """

    def __init__(
        self,
        data: Series,
        method: Literal["iqr", "zscore", "isolation_forest", "lof"],
        rolling: bool,
        window: int,
        **kwargs: Any,
    ) -> None:
        """Find outliers in a time series using the specified method.

        Parameters:
            data (pandas.Series): The time series data.
            method (str): One of 'iqr', 'zscore', 'isolation_forest' or 'lof'.
            rolling (bool): Evaluate 'iqr' / 'zscore' over a rolling window and
                classify only the newest observation of each window. Not used
                by 'isolation_forest' and 'lof'.
            window (int): Size of the rolling window; only read when `rolling`
                is True.

        Keyword Args:
            k (float): IQR multiplier for 'iqr'. Defaults to 1.5.
            threshold (float): z-score limit for 'zscore'. Defaults to 3.0.
            **kwargs: For 'isolation_forest' and 'lof' all keyword arguments
                are forwarded unchanged to the scikit-learn estimator.

        Raises:
            KeyError: If `method` is not one of the supported names.
        """

        FUNCS: dict[str, Callable] = {
            "iqr": self.iqr,
            "zscore": self.zscore,
            "isolation_forest": self.isolation_forest,
            "lof": self.lof,
        }

        method_func = FUNCS[method]

        if method not in ("iqr", "zscore"):
            # 'isolation_forest' and 'lof' return the outlying observations
            # directly.
            self.outliers = method_func(data, **kwargs)
            return

        limit = (
            kwargs.get("k", 1.5) if method == "iqr" else kwargs.get("threshold", 3.0)
        )

        if rolling:
            # The detectors return a one-element array in rolling mode, while
            # Rolling.apply expects a scalar, so the flag is unwrapped here.
            rolled = data.rolling(window=window).apply(
                lambda values: float(method_func(values, limit, rolling=True)[0]),
                raw=True,
            )
            # Positions without a complete window come back as NaN. NaN casts
            # to True, which would report them as outliers, so they are
            # explicitly marked as "not an outlier".
            flags = rolled.fillna(0).to_numpy()
        else:
            flags = method_func(data.to_numpy(), limit, rolling=False)

        is_outlier = Series(flags.astype(bool), index=data.index)
        self.outliers = data[is_outlier]

    @staticmethod
    def iqr(data: np.ndarray, k: float, rolling: bool) -> np.ndarray:
        """Use interquartile range (IQR).

        Parameters:
            data (np.ndarray): The observations (a full series or one window).
            k (float): The multiplier for the IQR to define the accepted range.
            rolling (bool): If True, classify only the last element of `data`.

        Returns:
            np.ndarray: Binary mask representing the outliers as 1. With
                `rolling` the mask holds a single element describing the last
                observation. An empty input yields an empty mask.
        """

        values = np.asarray(data)
        if values.size == 0:
            return np.zeros(0, dtype=int)

        # np.percentile expects percentages in [0, 100], not fractions.
        q1, q3 = np.percentile(values, [25, 75])
        spread = q3 - q1

        lower_bound = q1 - k * spread
        upper_bound = q3 + k * spread

        if rolling:
            newest = values[-1]
            return (
                np.array([1])
                if (newest < lower_bound or newest > upper_bound)
                else np.array([0])
            )

        return np.where((values < lower_bound) | (values > upper_bound), 1, 0)

    @staticmethod
    def zscore(data: np.ndarray, threshold: float, rolling: bool) -> np.ndarray:
        """Use the z-score method.

        Parameters:
            data (np.ndarray): The observations (a full series or one window).
            threshold (float): Absolute z-score above which a value is an outlier.
            rolling (bool): If True, classify only the last element of `data`.

        Returns:
            np.ndarray: Binary mask representing the outliers as 1. With
                `rolling` the mask holds a single element describing the last
                observation. An empty input yields an empty mask.
        """

        values = np.asarray(data)
        if values.size == 0:
            return np.zeros(0, dtype=int)

        mean = np.mean(values)
        std_dev = np.std(values)

        if std_dev == 0 or not np.isfinite(std_dev):
            # Constant (or non-finite) input has no defined z-score. Nothing is
            # flagged, and the division by zero is avoided.
            return np.array([0]) if rolling else np.zeros(values.shape, dtype=int)

        z_scores = np.abs((values - mean) / std_dev)

        if rolling:
            return np.array([1]) if z_scores[-1] > threshold else np.array([0])
        return np.where(z_scores > threshold, 1, 0)

    def isolation_forest(self, data: Series, **kwargs: Any) -> Series:
        """Using the isolation forest algorithm.

        Parameters:
            data (pandas.Series): The time series data.

        Keyword Args:
            **kwargs: Forwarded unchanged to sklearn.ensemble.IsolationForest
                (for example n_estimators, max_samples, contamination,
                max_features, bootstrap, n_jobs, random_state, verbose,
                warm_start). No defaults are set here; the scikit-learn
                defaults apply.

        Returns:
            pandas.Series: The observations predicted to be outliers.

        Note:
            For details on kwargs see: sklearn.ensemble.IsolationForest.
        """

        # The estimator expects a 2-D array: one row per observation, one feature.
        X = data.to_numpy().reshape(-1, 1)

        clf = IsolationForest(**kwargs)
        clf.fit(X)

        # The code treats a prediction of -1 as an outlier.
        is_outlier = clf.predict(X)
        outliers: Series = data[is_outlier == -1]

        return outliers

    def lof(self, data: Series, **kwargs: Any) -> Series:
        """Using the local outlier factor (LOF) method.

        Parameters:
            data (pandas.Series): The time series data.

        Keyword Args:
            **kwargs: Forwarded unchanged to sklearn.neighbors.LocalOutlierFactor
                (for example n_neighbors, algorithm, leaf_size, metric, p,
                contamination, novelty, n_jobs). No defaults are set here; the
                scikit-learn defaults apply.

        Returns:
            pandas.Series: The observations predicted to be outliers.

        Note:
            For details on kwargs see: sklearn.neighbors.LocalOutlierFactor.
        """

        # The estimator expects a 2-D array: one row per observation, one feature.
        X = data.to_numpy().reshape(-1, 1)

        clf = LocalOutlierFactor(**kwargs)

        # The code treats a prediction of -1 as an outlier.
        is_outlier = clf.fit_predict(X)
        outliers: Series = data[is_outlier == -1]

        return outliers
@@ -1,28 +1,28 @@
1
- """Module to compute timeseries statistics, similar to pastas.stats.signatures module
2
- and following Heudorfer et al. 2019
3
-
4
- To be implemented:
5
-
6
- - Structure
7
- * Flashiness
8
- - Distribution
9
- * Modality
10
- * Density
11
- - Shape
12
- * Scale
13
- * Slope
14
- """
15
-
16
- import numpy as np
17
-
18
- from gensor.core.timeseries import Timeseries
19
-
20
-
21
- def trend(ts: Timeseries) -> tuple:
22
- time_numeric = np.arange(len(ts.timeseries))
23
-
24
- # Perform linear regression using numpy's polyfit
25
- # This returns the slope and intercept of the best fit line
26
- slope, intercept = np.polyfit(time_numeric, ts.timeseries, 1)
27
-
28
- return slope, intercept
1
+ """Module to compute timeseries statistics, similar to pastas.stats.signatures module
2
+ and following Heudorfer et al. 2019
3
+
4
+ To be implemented:
5
+
6
+ - Structure
7
+ * Flashiness
8
+ - Distribution
9
+ * Modality
10
+ * Density
11
+ - Shape
12
+ * Scale
13
+ * Slope
14
+ """
15
+
16
+ import numpy as np
17
+
18
+ from gensor.core.timeseries import Timeseries
19
+
20
+
21
def trend(ts: Timeseries) -> tuple:
    """Fit a straight line through the values of a timeseries.

    The values are regressed against their ordinal position (0, 1, 2, ...),
    not against their timestamps, so the slope is expressed per observation.

    Parameters:
        ts (Timeseries): Object whose ``timeseries`` attribute holds the values.

    Returns:
        tuple: ``(slope, intercept)`` of the first-degree least-squares fit.
    """
    values = ts.timeseries
    positions = np.arange(len(values))

    # A first-degree polyfit yields the coefficients highest power first.
    coefficients = np.polyfit(positions, values, 1)

    return coefficients[0], coefficients[1]
@@ -1,17 +1,17 @@
1
- """
2
- !!! warning
3
-
4
- Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
5
- 'vanessen'), the timestamps are localized and converted to UTC. Therefore, if the
6
- user creates his own timeseries outside the read_from_csv, they should ensure that
7
- the timestamps are in UTC format.
8
- """
9
-
10
- VARIABLE_TYPES_AND_UNITS = {
11
- "temperature": ["degc"],
12
- "pressure": ["cmh2o", "mmh2o"],
13
- "conductivity": ["ms/cm"],
14
- "flux": ["m/s"],
15
- "head": ["m asl"],
16
- "depth": ["m"],
17
- }
1
+ """
2
+ !!! warning
3
+
4
+ Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
5
+ 'vanessen'), the timestamps are localized and converted to UTC. Therefore, if the
6
+ user creates their own timeseries outside of read_from_csv, they should ensure that
7
+ the timestamps are in UTC format.
8
+ """
9
+
10
# Variable type name -> list of unit strings; presumably the units accepted
# for that variable (confirm against the code that reads this mapping).
VARIABLE_TYPES_AND_UNITS: dict[str, list[str]] = {
    "temperature": ["degc"],
    "pressure": ["cmh2o", "mmh2o"],
    "conductivity": ["ms/cm"],
    "flux": ["m/s"],
    "head": ["m asl"],
    "depth": ["m"],
}