gensor 0.0.4__tar.gz → 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gensor
3
- Version: 0.0.4
3
+ Version: 0.0.6
4
4
  Summary: Library for handling groundwater sensor data.
5
5
  Home-page: https://github.com/zawadzkim/gensor
6
6
  Author: Mateusz Zawadzki
@@ -11,14 +11,17 @@ Classifier: Programming Language :: Python :: 3.11
11
11
  Classifier: Programming Language :: Python :: 3.12
12
12
  Requires-Dist: chardet (>=5.2.0,<6.0.0)
13
13
  Requires-Dist: matplotlib (>=3.9.2,<4.0.0)
14
- Requires-Dist: numpy (>=2.1.0,<3.0.0)
14
+ Requires-Dist: numba (>=0.60.0,<0.61.0)
15
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
15
16
  Requires-Dist: pandas (>=2.2.2,<3.0.0)
16
17
  Requires-Dist: pandera (>=0.20.3,<0.21.0)
17
18
  Requires-Dist: pydantic (>=2.8.2,<3.0.0)
18
- Requires-Dist: pytz (>=2024.1,<2025.0)
19
+ Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
19
20
  Requires-Dist: scikit-learn (>=1.5.1,<2.0.0)
20
21
  Requires-Dist: scipy (>=1.14.1,<2.0.0)
22
+ Requires-Dist: seaborn (>=0.13.2,<0.14.0)
21
23
  Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
24
+ Requires-Dist: statsmodels (>=0.14.3,<0.15.0)
22
25
  Project-URL: Documentation, https://zawadzkim.github.io/gensor/
23
26
  Project-URL: Repository, https://github.com/zawadzkim/gensor
24
27
  Description-Content-Type: text/markdown
@@ -1,14 +1,18 @@
1
1
  from .compensation import Compensator, compensate
2
2
  from .dtypes import Dataset, Timeseries
3
- from .getters import read_from_csv
3
+ from .getters import read_from_csv, read_from_sql
4
4
  from .preprocessing import OutlierDetection, Transform
5
5
 
6
6
  __all__ = [
7
+ # basic data types
7
8
  "Dataset",
8
9
  "Timeseries",
9
- "read_from_csv",
10
+ # data transformation
10
11
  "OutlierDetection",
11
12
  "Transform",
12
13
  "Compensator",
13
14
  "compensate",
15
+ # getters
16
+ "read_from_csv",
17
+ "read_from_sql",
14
18
  ]
@@ -0,0 +1,194 @@
1
+ """Compensating the raw data from the absolute pressure transducer to the actual water
2
+ level using the barometric pressure data.
3
+
4
+ Because van Essen Instrument divers are non-vented pressure transducers, to obtain the
5
+ pressure resulting from the water column above the logger (i.e. the water level), the
6
+ barometric pressure must be subtracted from the raw pressure measurements. In the
7
+ first step the function aligns the two series to the same time step and then subtracts
8
+ the barometric pressure from the raw pressure measurements. For short time periods (when
9
+ for instance a slug test is performed) the barometric pressure can be provided as a
10
+ single float value.
11
+
12
+ Subsequently the function filters out all records where the absolute water column is
13
+ less than or equal to the cutoff value. This is because when the logger is out of the
14
+ water when the measurement is taken, the absolute water column is close to zero,
15
+ producing erroneous results and spikes in the plots. No cutoff is applied by
16
+ default; one can be set with the threshold_wc argument (compared in m of water column).
17
+
18
+ Functions:
19
+
20
+ compensate: Compensate raw sensor pressure measurement with barometric pressure.
21
+ """
22
+
23
+ from typing import Literal
24
+
25
+ import pandas as pd
26
+ import pydantic as pyd
27
+
28
+ from .dtypes import Dataset, Timeseries
29
+ from .exceptions import (
30
+ InvalidMeasurementTypeError,
31
+ MissingInputError,
32
+ )
33
+
34
+
35
class Compensator(pyd.BaseModel):
    """Compensate raw sensor pressure measurement with barometric pressure.

    Attributes:
        ts (Timeseries): Raw sensor timeseries. Must hold the "pressure" variable.
        barometric (Timeseries | float): Barometric pressure timeseries or a single
            float value. If a float value is provided, it is assumed to be in cmH2O.
    """

    ts: Timeseries
    barometric: Timeseries | float

    @pyd.field_validator("ts", "barometric", mode="before")
    @classmethod
    def validate_timeseries_type(cls, v: Timeseries | float) -> Timeseries | float:
        """Reject any Timeseries input whose variable is not "pressure".

        Non-Timeseries inputs (e.g. a float barometric value) pass through
        untouched and are validated by pydantic afterwards.

        Raises:
            InvalidMeasurementTypeError: If a Timeseries with another variable
                is supplied.
        """
        if isinstance(v, Timeseries) and v.variable != "pressure":
            raise InvalidMeasurementTypeError()
        return v

    @pyd.field_validator("ts")
    @classmethod
    def validate_sensor_information(cls, v: Timeseries) -> Timeseries:
        """Require a sensor altitude whenever the timeseries names a sensor.

        Raises:
            MissingInputError: If `sensor` is set but `sensor_alt` is falsy.
        """
        # NOTE(review): a sensor altitude of exactly 0 is falsy and is therefore
        # also treated as missing here - confirm that this is intended.
        if v.sensor is not None and not v.sensor_alt:
            raise MissingInputError("sensor_alt")
        return v

    def compensate(
        self,
        alignment_period: Literal["D", "ME", "SME", "MS", "YE", "YS", "h", "min", "s"],
        threshold_wc: float | None,
        fieldwork_dates: list | None,
    ) -> Timeseries | None:
        """Perform compensation.

        Parameters:
            alignment_period (Literal['D', 'ME', 'SME', 'MS', 'YE', 'YS', 'h', 'min', 's']):
                The period both series are resampled to (mean aggregation) before
                subtraction. See pandas offset aliases for definitions.
            threshold_wc (float | None): The threshold for the absolute water
                column. Records whose absolute water column is less than or
                equal to it are removed from the result and stored as outliers.
                A falsy value (None or 0) disables the filtering.
            fieldwork_dates (list | None): List of dates when fieldwork was done. All
                measurements from a fieldwork day will be set to None.

        Returns:
            Timeseries | None: A copy of the resampled timeseries with the
                compensated data, unit "m asl" and variable "head". When a
                threshold is applied, the removed records are stored under
                "outliers". None is returned when `ts` and `barometric` compare
                equal, in which case compensation is skipped.
        """
        resample_params = {"freq": alignment_period, "agg_func": pd.Series.mean}
        resampled_ts = self.ts.resample(**resample_params)

        if isinstance(self.barometric, Timeseries):
            if self.ts == self.barometric:
                print("Skipping compensation: both timeseries are the same.")
                return None
            resampled_baro = self.barometric.resample(**resample_params).ts
        else:
            # A scalar barometric pressure is broadcast over the resampled index.
            resampled_baro = pd.Series(self.barometric, index=resampled_ts.ts.index)

        # dividing by 100 to convert water column from cmH2O to mH2O
        watercolumn_ts = resampled_ts.ts.sub(resampled_baro).divide(100).dropna()

        if not isinstance(watercolumn_ts.index, pd.DatetimeIndex):
            watercolumn_ts.index = pd.to_datetime(watercolumn_ts.index)

        if fieldwork_dates:
            # Localize the fieldwork days to the timezone of the data so that
            # they can be matched against the normalized (midnight) timestamps.
            fieldwork_timestamps = pd.to_datetime(fieldwork_dates).tz_localize(
                watercolumn_ts.index.tz
            )
            on_fieldwork_day = watercolumn_ts.index.normalize().isin(
                fieldwork_timestamps
            )
            watercolumn_ts.loc[on_fieldwork_day] = None

        update: dict = {"unit": "m asl", "variable": "head"}

        if threshold_wc:
            absolute_wc = watercolumn_ts.abs()
            # Missing values satisfy neither comparison, so they end up neither
            # in the kept records nor in the outliers.
            kept = watercolumn_ts[absolute_wc > threshold_wc]
            dropped_outliers = watercolumn_ts[absolute_wc <= threshold_wc]

            # Single-line literal: a backslash continuation inside the string
            # would embed the source indentation in the printed message.
            print(f"{len(dropped_outliers)} records dropped due to low water column.")
            update["outliers"] = dropped_outliers
        else:
            kept = watercolumn_ts

        # Add the sensor altitude (0 when it is not set) to the water column.
        update["ts"] = kept.add(float(resampled_ts.sensor_alt or 0))

        return resampled_ts.model_copy(update=update, deep=True)
137
+
138
+
139
def compensate(
    raw: Timeseries | Dataset,
    barometric: Timeseries | float,
    alignment_period: Literal[
        "D", "ME", "SME", "MS", "YE", "YS", "h", "min", "s"
    ] = "h",
    threshold_wc: float | None = None,
    fieldwork_dates: dict | None = None,
    interpolate_method: str | None = None,
) -> Timeseries | Dataset | None:
    """Compensate one Timeseries, or every Timeseries of a Dataset, via Compensator.

    Parameters:
        raw (Timeseries | Dataset): Raw sensor timeseries
        barometric (Timeseries | float): Barometric pressure timeseries or a single
            float value. If a float value is provided, it is assumed to be in cmH2O.
        alignment_period (Literal['D', 'ME', 'SME', 'MS', 'YE', 'YS', 'h', 'min', 's']):
            The alignment period for the timeseries. Default is 'h'. See pandas
            offset aliases for definitions.
        threshold_wc (float): The threshold for the absolute water column. If it is
            provided, the records below that threshold are dropped.
        fieldwork_dates (Dict[str, list]): Dictionary of location name and a list of
            fieldwork days. All records on the fieldwork day are set to None.
        interpolate_method (str): String representing the interpolate method as in
            pd.Series.interpolate() method.

    Returns:
        Timeseries | Dataset | None: For a Timeseries input, the compensated
            Timeseries, or None when compensation was skipped. For a Dataset
            input, a deep copy of the Dataset whose "timeseries" holds the
            per-item results. None for any other input type.
    """
    dates_by_location = {} if fieldwork_dates is None else fieldwork_dates

    def _compensate_one(series: Timeseries, dates: list | None) -> Timeseries | None:
        result = Compensator(ts=series, barometric=barometric).compensate(
            alignment_period=alignment_period,
            threshold_wc=threshold_wc,
            fieldwork_dates=dates,
        )
        if result is None or not interpolate_method:
            return result
        # .interpolate() called on Timeseries object is wrapped to return a
        # Timeseries object from the original pandas.Series.interpolate().
        return result.interpolate(method=interpolate_method)  # type: ignore[no-any-return]

    if isinstance(raw, Timeseries):
        return _compensate_one(raw, dates_by_location.get(raw.location))

    if isinstance(raw, Dataset):
        compensated_series = [
            _compensate_one(item, dates_by_location.get(item.location))
            for item in raw
        ]
        return raw.model_copy(update={"timeseries": compensated_series}, deep=True)

    return None
@@ -0,0 +1,144 @@
1
+ """Module defining database connection object.
2
+
3
+ Classes:
4
+ DatabaseConnection: Database connection object
5
+ """
6
+
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import pydantic as pyd
11
+ from sqlalchemy import (
12
+ Column,
13
+ Connection,
14
+ Engine,
15
+ Float,
16
+ Integer,
17
+ MetaData,
18
+ String,
19
+ Table,
20
+ create_engine,
21
+ )
22
+
23
+ from ..exceptions import DatabaseNotFound
24
+
25
+
26
class DatabaseConnection(pyd.BaseModel):
    """Database connection object for a SQLite database file.

    The database URL is built from `db_directory` and `db_name`. The directory
    must exist, otherwise `DatabaseNotFound` is raised on connect.

    Attributes:
        metadata (MetaData): SQLAlchemy metadata holding the known tables.
        db_directory (Path): Directory of the database file. Defaults to the
            current working directory at instantiation time.
        db_name (str): File name of the database.
        engine (Engine | None): SQLAlchemy engine, created on first connect.
    """

    model_config = pyd.ConfigDict(
        arbitrary_types_allowed=True, validate_assignment=True
    )

    # Factories give every instance its own MetaData and evaluate the working
    # directory when the object is created rather than when the class is defined.
    metadata: MetaData = pyd.Field(default_factory=MetaData)
    db_directory: Path = pyd.Field(default_factory=Path.cwd)
    db_name: str = "gensor.db"
    engine: Engine | None = None

    # Connections handed out by `__enter__`, closed again by `__exit__`.
    _open_connections: list[Connection] = pyd.PrivateAttr(default_factory=list)

    def _verify_path(self) -> str:
        """Verify database path and return the SQLite URL.

        Only the directory is checked; the database file is not required to
        exist at this point.

        Raises:
            DatabaseNotFound: If `db_directory` does not exist.
        """
        if not self.db_directory.exists():
            raise DatabaseNotFound()
        return f"sqlite:///{self.db_directory}/{self.db_name}"

    def connect(self) -> Connection:
        """Connect to the database and initialize the engine.

        If engine is None > create it with verified path > reflect.
        After connecting, ensure the timeseries_metadata table is present.

        Returns:
            Connection: A new connection. A caller using `connect()` directly is
                responsible for closing it.
        """
        if self.engine is None:
            sqlite_path = self._verify_path()
            self.engine = create_engine(sqlite_path)

        connection = self.engine.connect()
        try:
            self.create_metadata()
        except Exception:
            # Do not leak the connection when the metadata table cannot be set up.
            connection.close()
            raise

        return connection

    def dispose(self) -> None:
        """Clear the metadata and dispose of the engine's connection pool."""
        if self.metadata:
            self.metadata.clear()
        if self.engine:
            self.engine.dispose()

    def __enter__(self) -> Connection:
        """Enable usage in a `with` block by returning an open connection."""
        con = self.connect()
        try:
            if self.engine:
                self.metadata.reflect(bind=self.engine)
        except Exception:
            con.close()
            raise
        self._open_connections.append(con)
        return con

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Close the connection of this `with` block and dispose of the engine."""
        try:
            # Engine.dispose() leaves checked-out connections open, so the
            # connection handed out by __enter__ is closed explicitly.
            if self._open_connections:
                self._open_connections.pop().close()
        finally:
            self.dispose()

    def get_tables(self) -> list | None:
        """Return the list of tables, excluding the 'timeseries_metadata' table.

        Returns:
            list | None: Table names, or None if the metadata holds no tables.
        """
        with self:
            # Read the names while connected: leaving the block clears the
            # metadata, which would empty the table mapping.
            tables = self.metadata.tables

            if not tables:
                print("This database has no tables.")
                return None

            return [table for table in tables if table != "__timeseries_metadata__"]

    def create_metadata(self) -> str | Table:
        """Create a metadata table if it doesn't exist yet and store ts metadata.

        Returns:
            str | Table: The metadata table, or the string
                "Engine does not exist." when no engine has been created.
        """
        # Reuse the table when it is already registered (e.g. on a repeated
        # connect); defining the same table twice on one MetaData raises.
        metadata_table = self.metadata.tables.get("__timeseries_metadata__")
        if metadata_table is None:
            metadata_table = Table(
                "__timeseries_metadata__",
                self.metadata,
                Column("id", Integer, primary_key=True),
                Column("table_name", String, unique=True),
                Column("location", String),
                Column("sensor", String),
                Column("variable", String),
                Column("unit", String),
                Column("logger_alt", Float, nullable=True),
                Column("location_alt", Float, nullable=True),
                Column("timestamp_start", String, nullable=True),
                Column("timestamp_end", String, nullable=True),
            )

        if self.engine:
            metadata_table.create(self.engine, checkfirst=True)
            self.metadata.reflect(bind=self.engine)
            return metadata_table
        else:
            return "Engine does not exist."

    def create_table(self, schema_name: str, column_name: str) -> Table | str:
        """Create a table in the database.

        Schema name is a string representing the location, sensor, variable measured and
        unit of measurement. This is a way of preserving the metadata of the Timeseries.
        The index is always `timestamp` and the column name is dynamically created from
        the measured variable.

        Returns:
            Table | str: The existing or newly created table, or the string
                "Engine does not exist." when no engine has been created.
        """
        if schema_name in self.metadata.tables:
            return self.metadata.tables[schema_name]

        ts_table = Table(
            schema_name,
            self.metadata,
            Column("timestamp", String, primary_key=True),
            Column(column_name, Float),
            info={},
        )

        if self.engine:
            ts_table.create(self.engine, checkfirst=True)
            self.metadata.reflect(bind=self.engine)
            return ts_table
        else:
            return "Engine does not exist."