gensor 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gensor-0.0.4 → gensor-0.0.6}/PKG-INFO +6 -3
- {gensor-0.0.4 → gensor-0.0.6}/gensor/__init__.py +6 -2
- gensor-0.0.6/gensor/compensation.py +194 -0
- gensor-0.0.6/gensor/db/connection.py +144 -0
- {gensor-0.0.4 → gensor-0.0.6}/gensor/dtypes.py +186 -59
- gensor-0.0.6/gensor/getters.py +162 -0
- gensor-0.0.6/gensor/parse/__init__.py +4 -0
- gensor-0.0.6/gensor/parse/plain.py +60 -0
- gensor-0.0.6/gensor/parse/utils.py +67 -0
- {gensor-0.0.4 → gensor-0.0.6}/gensor/parse/vanessen.py +12 -72
- {gensor-0.0.4 → gensor-0.0.6}/gensor/preprocessing.py +62 -37
- gensor-0.0.6/gensor/testdata/Barodiver_220427183008_BY222.csv +15279 -0
- gensor-0.0.6/gensor/testdata/PB01A_moni_AV319_220427183019_AV319.csv +15275 -0
- gensor-0.0.6/gensor/testdata/PB02A_plain.csv +14005 -0
- gensor-0.0.6/gensor/testdata/__init__.py +25 -0
- {gensor-0.0.4 → gensor-0.0.6}/pyproject.toml +10 -4
- gensor-0.0.4/gensor/compensation.py +0 -129
- gensor-0.0.4/gensor/db/connection.py +0 -53
- gensor-0.0.4/gensor/getters.py +0 -95
- gensor-0.0.4/gensor/parse/__init__.py +0 -3
- {gensor-0.0.4 → gensor-0.0.6}/LICENSE +0 -0
- {gensor-0.0.4 → gensor-0.0.6}/README.md +0 -0
- {gensor-0.0.4 → gensor-0.0.6}/gensor/db/__init__.py +0 -0
- {gensor-0.0.4 → gensor-0.0.6}/gensor/exceptions.py +0 -0
- {gensor-0.0.4 → gensor-0.0.6}/gensor/smoothing.py +0 -0
- {gensor-0.0.4 → gensor-0.0.6}/gensor/trend.py +0 -0
- {gensor-0.0.4 → gensor-0.0.6}/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: gensor
|
|
3
|
-
Version: 0.0.4
|
|
3
|
+
Version: 0.0.6
|
|
4
4
|
Summary: Library for handling groundwater sensor data.
|
|
5
5
|
Home-page: https://github.com/zawadzkim/gensor
|
|
6
6
|
Author: Mateusz Zawadzki
|
|
@@ -11,14 +11,17 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
12
|
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
|
13
13
|
Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
|
|
14
|
-
Requires-Dist:
|
|
14
|
+
Requires-Dist: numba (>=0.60.0,<0.61.0)
|
|
15
|
+
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
|
15
16
|
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
16
17
|
Requires-Dist: pandera (>=0.20.3,<0.21.0)
|
|
17
18
|
Requires-Dist: pydantic (>=2.8.2,<3.0.0)
|
|
18
|
-
Requires-Dist:
|
|
19
|
+
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
19
20
|
Requires-Dist: scikit-learn (>=1.5.1,<2.0.0)
|
|
20
21
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
|
22
|
+
Requires-Dist: seaborn (>=0.13.2,<0.14.0)
|
|
21
23
|
Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
|
|
24
|
+
Requires-Dist: statsmodels (>=0.14.3,<0.15.0)
|
|
22
25
|
Project-URL: Documentation, https://zawadzkim.github.io/gensor/
|
|
23
26
|
Project-URL: Repository, https://github.com/zawadzkim/gensor
|
|
24
27
|
Description-Content-Type: text/markdown
|
|
@@ -1,14 +1,18 @@
|
|
|
1
1
|
from .compensation import Compensator, compensate
|
|
2
2
|
from .dtypes import Dataset, Timeseries
|
|
3
|
-
from .getters import read_from_csv
|
|
3
|
+
from .getters import read_from_csv, read_from_sql
|
|
4
4
|
from .preprocessing import OutlierDetection, Transform
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
|
+
# basic data types
|
|
7
8
|
"Dataset",
|
|
8
9
|
"Timeseries",
|
|
9
|
-
|
|
10
|
+
# data transformation
|
|
10
11
|
"OutlierDetection",
|
|
11
12
|
"Transform",
|
|
12
13
|
"Compensator",
|
|
13
14
|
"compensate",
|
|
15
|
+
# getters
|
|
16
|
+
"read_from_csv",
|
|
17
|
+
"read_from_sql",
|
|
14
18
|
]
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Compensating the raw data from the absolute pressure transducer to the actual water
|
|
2
|
+
level using the barometric pressure data.
|
|
3
|
+
|
|
4
|
+
Because van Essen Instrument divers are non-vented pressure transducers, to obtain the
|
|
5
|
+
pressure resulting from the water column above the logger (i.e. the water level), the
|
|
6
|
+
barometric pressure must be subtracted from the raw pressure measurements. In the
|
|
7
|
+
first step the function aligns the two series to the same time step and then subtracts
|
|
8
|
+
the barometric pressure from the raw pressure measurements. For short time periods (when
|
|
9
|
+
for instance a slug test is performed) the barometric pressure can be provided as a
|
|
10
|
+
single float value.
|
|
11
|
+
|
|
12
|
+
Subsequently the function filters out all records where the absolute water column is
|
|
13
|
+
less than or equal to the cutoff value. This is because when the logger is out of the
|
|
14
|
+
water when the measurement is taken, the absolute water column is close to zero,
|
|
15
|
+
producing erroneous results and spikes in the plots. No cutoff is applied by
|
|
16
|
+
default; one can be enabled with the threshold_wc kwarg.
|
|
17
|
+
|
|
18
|
+
Functions:
|
|
19
|
+
|
|
20
|
+
compensate: Compensate raw sensor pressure measurement with barometric pressure.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from typing import Literal
|
|
24
|
+
|
|
25
|
+
import pandas as pd
|
|
26
|
+
import pydantic as pyd
|
|
27
|
+
|
|
28
|
+
from .dtypes import Dataset, Timeseries
|
|
29
|
+
from .exceptions import (
|
|
30
|
+
InvalidMeasurementTypeError,
|
|
31
|
+
MissingInputError,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Compensator(pyd.BaseModel):
    """Compensate raw sensor pressure measurements with barometric pressure.

    Attributes:
        ts (Timeseries): Raw sensor timeseries.
        barometric (Timeseries | float): Barometric pressure timeseries or a single
            float value. If a float value is provided, it is assumed to be in cmH2O.

    Raises:
        InvalidMeasurementTypeError: If a Timeseries passed as `ts` or `barometric`
            has a `variable` other than "pressure".
        MissingInputError: If `ts` has a sensor set but no `sensor_alt`.
    """

    ts: Timeseries
    barometric: Timeseries | float

    @pyd.field_validator("ts", "barometric", mode="before")
    @classmethod
    def validate_timeseries_type(cls, v: Timeseries | float) -> Timeseries | float:
        """Reject any Timeseries input that does not hold pressure measurements."""
        if isinstance(v, Timeseries) and v.variable != "pressure":
            raise InvalidMeasurementTypeError()
        return v

    @pyd.field_validator("ts")
    @classmethod
    def validate_sensor_information(cls, v: Timeseries) -> Timeseries:
        """Require `sensor_alt` whenever the raw timeseries names a sensor."""
        # NOTE(review): a falsy sensor_alt (including 0, if it is numeric) is
        # treated as missing here - confirm that this is intended.
        if v.sensor is not None and not v.sensor_alt:
            raise MissingInputError("sensor_alt")
        return v

    def compensate(
        self,
        alignment_period: Literal["D", "ME", "SME", "MS", "YE", "YS", "h", "min", "s"],
        threshold_wc: float | None,
        fieldwork_dates: list | None,
    ) -> Timeseries | None:
        """Perform compensation.

        Parameters:
            alignment_period (Literal['D', 'ME', 'SME', 'MS', 'YE', 'YS', 'h', 'min', 's']):
                The period both series are resampled to (mean aggregation) before
                subtraction. See pandas offset aliases for definitions.
            threshold_wc (float | None): Threshold for the absolute water column,
                compared against the water column after the division by 100. Records
                at or below it are removed from the result and stored as outliers.
                A falsy value (None or 0) disables the filter.
            fieldwork_dates (list | None): Dates when fieldwork was done. All
                measurements from a fieldwork day are set to None.

        Returns:
            Timeseries | None: A copy of the resampled timeseries holding the
                compensated data with unit "m asl" and variable "head" (plus the
                dropped records as `outliers` when a threshold was applied), or
                None when `ts` and `barometric` are the same timeseries.

        Raises:
            TypeError: If `barometric` is neither a Timeseries nor a float.
        """
        resample_params = {"freq": alignment_period, "agg_func": pd.Series.mean}
        resampled_ts = self.ts.resample(**resample_params)

        if isinstance(self.barometric, Timeseries):
            if self.ts == self.barometric:
                print("Skipping compensation: both timeseries are the same.")
                return None
            resampled_baro = self.barometric.resample(**resample_params).ts
        elif isinstance(self.barometric, float):
            # Broadcast the single value over the index of the resampled raw series.
            resampled_baro = pd.Series(
                [self.barometric] * len(resampled_ts.ts), index=resampled_ts.ts.index
            )
        else:
            # Without this branch the subtraction below would fail with an
            # unbound local variable instead of a meaningful message.
            message = (
                "barometric must be a Timeseries or a float, got "
                f"{type(self.barometric).__name__}"
            )
            raise TypeError(message)

        # dividing by 100 to convert water column from cmH2O to mH2O
        watercolumn_ts = resampled_ts.ts.sub(resampled_baro).divide(100).dropna()

        if not isinstance(watercolumn_ts.index, pd.DatetimeIndex):
            watercolumn_ts.index = pd.to_datetime(watercolumn_ts.index)

        if fieldwork_dates:
            # Give the fieldwork dates the timezone of the data so they can be
            # matched against the normalized (midnight) timestamps.
            fieldwork_timestamps = pd.to_datetime(fieldwork_dates).tz_localize(
                watercolumn_ts.index.tz
            )
            watercolumn_ts.loc[
                watercolumn_ts.index.normalize().isin(fieldwork_timestamps)
            ] = None

        update: dict = {"unit": "m asl", "variable": "head"}

        if threshold_wc:
            # Both masks are evaluated on the unfiltered series; NaN records
            # (e.g. fieldwork days) satisfy neither comparison.
            dropped_outliers = watercolumn_ts[watercolumn_ts.abs() <= threshold_wc]
            watercolumn_ts = watercolumn_ts[watercolumn_ts.abs() > threshold_wc]

            print(f"{len(dropped_outliers)} records dropped due to low water column.")
            update["outliers"] = dropped_outliers

        # Shift the water column by the sensor altitude (0 when it is not set).
        update["ts"] = watercolumn_ts.add(float(resampled_ts.sensor_alt or 0))

        return resampled_ts.model_copy(update=update, deep=True)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def compensate(
    raw: Timeseries | Dataset,
    barometric: Timeseries | float,
    alignment_period: Literal[
        "D", "ME", "SME", "MS", "YE", "YS", "h", "min", "s"
    ] = "h",
    threshold_wc: float | None = None,
    fieldwork_dates: dict | None = None,
    interpolate_method: str | None = None,
) -> Timeseries | Dataset | None:
    """Build a Compensator for each raw timeseries and run the compensation.

    Parameters:
        raw (Timeseries | Dataset): Raw sensor timeseries, or a dataset of them.
        barometric (Timeseries | float): Barometric pressure timeseries or a single
            float value. If a float value is provided, it is assumed to be in cmH2O.
        alignment_period (Literal['D', 'ME', 'SME', 'MS', 'YE', 'YS', 'h', 'min', 's']):
            The alignment period for the timeseries. Default is 'h'. See pandas
            offset aliases for definitions.
        threshold_wc (float): The threshold for the absolute water column. If it is
            provided, the records below that threshold are dropped.
        fieldwork_dates (Dict[str, list]): Dictionary of location name and a list of
            fieldwork days. All records on the fieldwork day are set to None.
        interpolate_method (str): String representing the interpolate method as in
            pd.Series.interpolate() method.

    Returns:
        Timeseries | Dataset | None: The compensated Timeseries for a Timeseries
            input, a copy of the Dataset with its `timeseries` replaced by the
            per-item results for a Dataset input, and None for any other input or
            when a single compensation is skipped.
    """
    dates_by_location = {} if fieldwork_dates is None else fieldwork_dates

    def _process(series: Timeseries) -> Timeseries | None:
        # Compensate one timeseries using the fieldwork days of its location.
        result = Compensator(ts=series, barometric=barometric).compensate(
            alignment_period=alignment_period,
            threshold_wc=threshold_wc,
            fieldwork_dates=dates_by_location.get(series.location),
        )
        if result is None or not interpolate_method:
            return result
        # .interpolate() called on Timeseries object is wrapped to return a
        # Timeseries object from the original pandas.Series.interpolate().
        return result.interpolate(method=interpolate_method)  # type: ignore[no-any-return]

    if isinstance(raw, Timeseries):
        return _process(raw)

    if isinstance(raw, Dataset):
        return raw.model_copy(
            update={"timeseries": [_process(item) for item in raw]}, deep=True
        )

    return None
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Module defining database connection object.
|
|
2
|
+
|
|
3
|
+
Classes:
|
|
4
|
+
DatabaseConnection: Database connection object
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import pydantic as pyd
|
|
11
|
+
from sqlalchemy import (
|
|
12
|
+
Column,
|
|
13
|
+
Connection,
|
|
14
|
+
Engine,
|
|
15
|
+
Float,
|
|
16
|
+
Integer,
|
|
17
|
+
MetaData,
|
|
18
|
+
String,
|
|
19
|
+
Table,
|
|
20
|
+
create_engine,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
from ..exceptions import DatabaseNotFound
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DatabaseConnection(pyd.BaseModel):
    """SQLite database connection object.

    The connection URL is built from `db_directory` and `db_name`. The directory
    must already exist, otherwise `DatabaseNotFound` is raised when connecting.
    With the defaults, the database is `gensor.db` in the working directory that
    was current when this class was defined.

    The object can be used in a `with` block: entering yields an open connection,
    leaving closes that connection, clears the metadata and disposes the engine.
    """

    model_config = pyd.ConfigDict(
        arbitrary_types_allowed=True, validate_assignment=True
    )

    metadata: MetaData = MetaData()
    db_directory: Path = Path.cwd()
    db_name: str = "gensor.db"
    engine: Engine | None = None

    # Connection handed out by __enter__, remembered so __exit__ can close it.
    _connection: Connection | None = pyd.PrivateAttr(default=None)

    def _verify_path(self) -> str:
        """Return the SQLite URL, raising DatabaseNotFound if the directory is missing."""
        if not self.db_directory.exists():
            raise DatabaseNotFound()
        return f"sqlite:///{self.db_directory}/{self.db_name}"

    def connect(self) -> Connection:
        """Connect to the database and initialize the engine.

        If engine is None > create it with verified path > reflect.
        After connecting, ensure the timeseries_metadata table is present.

        Returns:
            Connection: A new connection; the caller is responsible for closing it.
        """
        if self.engine is None:
            self.engine = create_engine(self._verify_path())

        connection = self.engine.connect()
        try:
            self.create_metadata()
        except Exception:
            # Do not leave the freshly opened connection dangling on failure.
            connection.close()
            raise

        return connection

    def dispose(self) -> None:
        """Clear the metadata and dispose of the engine."""
        if self.metadata:
            self.metadata.clear()
        if self.engine:
            self.engine.dispose()

    def __enter__(self) -> Connection:
        """Enable usage in a `with` block by returning an open connection."""
        con = self.connect()
        self._connection = con
        if self.engine:
            self.metadata.reflect(bind=self.engine)
        return con

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Close the connection from `__enter__` and dispose of the engine."""
        con, self._connection = self._connection, None
        try:
            if con is not None:
                con.close()
        finally:
            self.dispose()

    def get_tables(self) -> list | None:
        """Return the list of tables, excluding the 'timeseries_metadata' table.

        Returns:
            list | None: Table names, or None (after printing a notice) when the
                metadata holds no tables at all.
        """
        with self:
            # Copy the names now: leaving the block runs dispose(), which calls
            # self.metadata.clear().
            table_names = list(self.metadata.tables)

        if not table_names:
            print("This database has no tables.")
            return None

        return [name for name in table_names if name != "__timeseries_metadata__"]

    def create_metadata(self) -> str | Table:
        """Create a metadata table if it doesn't exist yet and store ts metadata.

        Returns:
            str | Table: The metadata table, or the string "Engine does not exist."
                when no engine has been created yet.
        """
        table_name = "__timeseries_metadata__"

        if table_name in self.metadata.tables:
            # Reuse the registered table, mirroring the guard in create_table();
            # defining the same table name again on this MetaData would fail.
            metadata_table = self.metadata.tables[table_name]
        else:
            metadata_table = Table(
                table_name,
                self.metadata,
                Column("id", Integer, primary_key=True),
                Column("table_name", String, unique=True),
                Column("location", String),
                Column("sensor", String),
                Column("variable", String),
                Column("unit", String),
                Column("logger_alt", Float, nullable=True),
                Column("location_alt", Float, nullable=True),
                Column("timestamp_start", String, nullable=True),
                Column("timestamp_end", String, nullable=True),
            )

        if self.engine:
            metadata_table.create(self.engine, checkfirst=True)
            self.metadata.reflect(bind=self.engine)
            return metadata_table
        else:
            return "Engine does not exist."

    def create_table(self, schema_name: str, column_name: str) -> Table | str:
        """Create a table in the database.

        Schema name is a string representing the location, sensor, variable measured and
        unit of measurement. This is a way of preserving the metadata of the Timeseries.
        The index is always `timestamp` and the column name is dynamically created from
        the measured variable.

        Returns:
            Table | str: The existing or newly created table, or the string
                "Engine does not exist." when no engine has been created yet.
        """
        if schema_name in self.metadata.tables:
            return self.metadata.tables[schema_name]

        ts_table = Table(
            schema_name,
            self.metadata,
            Column("timestamp", String, primary_key=True),
            Column(column_name, Float),
            info={},
        )

        if self.engine:
            ts_table.create(self.engine, checkfirst=True)
            self.metadata.reflect(bind=self.engine)
            return ts_table
        else:
            return "Engine does not exist."
|