gensor 0.0.1__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,10 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gensor
3
- Version: 0.0.1
3
+ Version: 0.0.3
4
4
  Summary: Library for handling groundwater sensor data.
5
5
  Home-page: https://github.com/zawadzkim/gensor
6
6
  Author: Mateusz Zawadzki
7
- Author-email: fzawadzkimat@outlook.com
7
+ Author-email: zawadzkimat@outlook.com
8
8
  Requires-Python: >=3.11
9
9
  Classifier: Programming Language :: Python :: 3
10
10
  Classifier: Programming Language :: Python :: 3.11
@@ -20,8 +20,9 @@ Functions:
20
20
  compensate: Compensate raw sensor pressure measurement with barometric pressure.
21
21
  """
22
22
 
23
- from typing import Self
23
+ from typing import Any
24
24
 
25
+ import pandas as pd
25
26
  import pydantic as pyd
26
27
 
27
28
  from .dtypes import Timeseries
@@ -32,32 +33,35 @@ from .exceptions import (
32
33
 
33
34
 
34
35
  class Compensator(pyd.BaseModel):
36
+ """Compensate raw sensor pressure measurement with barometric pressure.
37
+
38
+ Attributes:
39
+ ts (Timeseries): Raw sensor timeseries
40
+ barometric (Timeseries | float): Barometric pressure timeseries or a single
41
+ float value. If a float value is provided, it is assumed to be in cmH2O.
42
+ drop_low_wc (bool): Whether to drop records where the absolute water column is
43
+ less than or equal to the cutoff value. Defaults to True.
44
+
45
+ """
46
+
35
47
  ts: Timeseries
36
48
  barometric: Timeseries | float
37
49
  drop_low_wc: bool = True
38
50
 
39
51
  @pyd.field_validator("ts", "barometric", mode="before")
40
- def validate_timeseries_type(cls, v):
52
+ def validate_timeseries_type(cls, v: Timeseries) -> Timeseries:
41
53
  if isinstance(v, Timeseries) and v.variable != "pressure":
42
- raise InvalidMeasurementTypeError(v.location)
54
+ raise InvalidMeasurementTypeError()
43
55
  return v
44
56
 
45
57
  @pyd.field_validator("ts")
46
- def validate_sensor_information(cls, v: Timeseries):
58
+ def validate_sensor_information(cls, v: Timeseries) -> Timeseries:
47
59
  if v.sensor is not None and not v.sensor_alt:
48
60
  raise MissingInputError("sensor_alt")
49
61
  return v
50
62
 
51
- def compensate(self, **kwargs) -> Self | None:
52
- """Compensate raw sensor pressure measurement with barometric pressure.
53
-
54
- Parameters:
55
- ts (Timeseries): Raw sensor timeseries
56
- barometric (Timeseries or float): Barometric pressure timeseries or a single
57
- float value. If a float value is provided, it is assumed to be in cmH2O.
58
- drop_low_wc (bool): Whether to drop records where the absolute water column is
59
- less than or equal to the cutoff value. Defaults to True.
60
- inplace (bool): Whether to update the timeseries in place. Defaults to True.
63
+ def compensate(self, **kwargs: Any) -> Timeseries | None:
64
+ """Perform compensation.
61
65
 
62
66
  Keyword Arguments:
63
67
  alignment_period (str): The alignment period for the timeseries.
@@ -72,16 +76,17 @@ class Compensator(pyd.BaseModel):
72
76
  alignment_period = kwargs.get("alignment_period", "h")
73
77
  threshold_wc = kwargs.get("threshold_wc", 0.5)
74
78
  resample_params = {"freq": alignment_period, "agg_func": "mean"}
79
+ resampled_ts = self.ts.resample(**resample_params)
75
80
 
76
81
  if isinstance(self.barometric, Timeseries):
77
82
  if self.ts == self.barometric:
78
83
  print("Skipping compensation: both timeseries are the same.")
79
84
  return None
80
85
  baro = self.barometric.resample(**resample_params).ts
81
- else:
82
- baro = self.barometric
83
-
84
- resampled_ts = self.ts.resample(**resample_params)
86
+ elif isinstance(self.barometric, float):
87
+ baro = pd.Series(
88
+ [self.barometric] * len(resampled_ts.ts), index=resampled_ts.ts.index
89
+ )
85
90
 
86
91
  # dividing by 100 to convert water column from cmH2O to mH2O
87
92
  watercolumn_ts = resampled_ts.ts.sub(baro).divide(100).dropna()
@@ -94,9 +99,9 @@ class Compensator(pyd.BaseModel):
94
99
  f"{len(watercolumn_ts) - len(watercolumn_ts_filtered)} records \
95
100
  dropped due to low water column."
96
101
  )
97
- gwl = watercolumn_ts_filtered.add(float(resampled_ts.sensor_alt))
102
+ gwl = watercolumn_ts_filtered.add(float(resampled_ts.sensor_alt or 0))
98
103
  else:
99
- gwl = watercolumn_ts.add(float(resampled_ts.sensor_alt))
104
+ gwl = watercolumn_ts.add(float(resampled_ts.sensor_alt or 0))
100
105
 
101
106
  compensated = resampled_ts.model_copy(
102
107
  update={"ts": gwl, "unit": "m asl", "variable": "head"}
@@ -105,6 +110,20 @@ class Compensator(pyd.BaseModel):
105
110
  return compensated
106
111
 
107
112
 
108
- def compensate(ts, barometric, drop_low_wc, **kwargs) -> Timeseries:
113
+ def compensate(
114
+ ts: Timeseries,
115
+ barometric: Timeseries | float,
116
+ drop_low_wc: bool,
117
+ **kwargs: Any,
118
+ ) -> Timeseries | None:
119
+ """Constructor for the Compensator class object.
120
+
121
+ Parameters:
122
+ ts (Timeseries): Raw sensor timeseries
123
+ barometric (Timeseries | float): Barometric pressure timeseries or a single
124
+ float value. If a float value is provided, it is assumed to be in cmH2O.
125
+ drop_low_wc (bool): Whether to drop records where the absolute water column is
126
+ less than or equal to the cutoff value. Defaults to True.
127
+ """
109
128
  comp = Compensator(ts=ts, barometric=barometric, drop_low_wc=drop_low_wc)
110
129
  return comp.compensate(**kwargs)
@@ -0,0 +1,14 @@
1
+ """
2
+ # DB
3
+
4
+ Module handling database connection in case saving and loading from SQLite database is
5
+ used.
6
+
7
+ Modules:
8
+
9
+ connection.py
10
+ """
11
+
12
+ from .connection import DatabaseConnection
13
+
14
+ __all__ = ["DatabaseConnection"]
@@ -1,4 +1,8 @@
1
- """Module for database connection."""
1
+ """Module defining database connection object.
2
+
3
+ Classes:
4
+ DatabaseConnection: Database connection object
5
+ """
2
6
 
3
7
  from pathlib import Path
4
8
 
@@ -10,7 +14,7 @@ from ..exceptions import DatabaseNotFound
10
14
 
11
15
 
12
16
  class DatabaseConnection(pyd.BaseModel):
13
- """Class for handling the database connection.
17
+ """Database connection object.
14
18
  If no database exists at the specified path, it will be created.
15
19
  If no database is specified, an in-memory database will be used.
16
20
 
@@ -18,6 +18,15 @@ ts_schema = pa.SeriesSchema(
18
18
  coerce=True,
19
19
  )
20
20
 
21
+ VARIABLE_TYPES_AND_UNITS = {
22
+ "temperature": ["degC"],
23
+ "pressure": ["cmH2O", "mmH2O"],
24
+ "conductivity": ["mS/cm"],
25
+ "flux": ["m/s"],
26
+ "head": ["m asl"],
27
+ "depth": ["m"],
28
+ }
29
+
21
30
 
22
31
  class Timeseries(pyd.BaseModel):
23
32
  """Timeseries from a sensor including measurement metadata.
@@ -215,8 +224,14 @@ class Timeseries(pyd.BaseModel):
215
224
  str: A message indicating the number of rows inserted into the database.
216
225
  """
217
226
  schema_name = f"{self.location}_{self.sensor}_{self.variable}_{self.unit}"
218
- con = db.engine.connect()
219
- self.ts.to_sql(name=schema_name, con=con, if_exists="append", index=False)
227
+ if db.engine is not None:
228
+ with db.engine.connect() as con:
229
+ self.ts.to_sql(
230
+ name=schema_name, con=con, if_exists="append", index=False
231
+ )
232
+ else:
233
+ message = "Database engine is not initialized."
234
+ raise ValueError(message)
220
235
 
221
236
  return f"{schema_name} table updated."
222
237
 
@@ -293,7 +308,7 @@ class Dataset(pyd.BaseModel):
293
308
  def __repr__(self) -> str:
294
309
  return f"Dataset({len(self)})"
295
310
 
296
- def __getitem__(self, index: int) -> Timeseries:
311
+ def __getitem__(self, index: int) -> Timeseries | None:
297
312
  """Retrieve a Timeseries object by its index in the dataset.
298
313
 
299
314
  Parameters:
@@ -310,11 +325,11 @@ class Dataset(pyd.BaseModel):
310
325
  except IndexError:
311
326
  raise IndexOutOfRangeError(index, len(self)) from None
312
327
 
313
- def get_stations(self):
328
+ def get_stations(self) -> list:
314
329
  """List all unique locations in the dataset."""
315
330
  return [ts.location for ts in self.timeseries if ts is not None]
316
331
 
317
- def add(self, other: Timeseries):
332
+ def add(self, other: Timeseries | list[Timeseries]) -> None:
318
333
  """Appends a new series to the Dataset or merges series if an equal
319
334
  one exists.
320
335
 
@@ -331,7 +346,9 @@ class Dataset(pyd.BaseModel):
331
346
  else:
332
347
  self._add_single_timeseries(other)
333
348
 
334
- def _add_single_timeseries(self, ts: Timeseries):
349
+ return
350
+
351
+ def _add_single_timeseries(self, ts: Timeseries) -> None:
335
352
  """Adds a single Timeseries to the Dataset or merges if an equal one exists."""
336
353
  for i, existing_ts in enumerate(self.timeseries):
337
354
  if existing_ts == ts:
@@ -340,6 +357,8 @@ class Dataset(pyd.BaseModel):
340
357
 
341
358
  self.timeseries.append(ts)
342
359
 
360
+ return
361
+
343
362
  def filter(
344
363
  self,
345
364
  station: str | None = None,
@@ -358,9 +377,11 @@ class Dataset(pyd.BaseModel):
358
377
  Timeseries or Dataset: A single Timeseries if exactly one match is found,
359
378
  or a new Dataset if multiple matches are found.
360
379
  """
380
+
361
381
  matching_timeseries = [
362
382
  ts
363
383
  for ts in self.timeseries
384
+ if ts is not None
364
385
  if (station is None or ts.location == station)
365
386
  and (sensor is None or ts.sensor == sensor)
366
387
  and (variable is None or ts.variable == variable)
@@ -1,10 +1,9 @@
1
1
  class InvalidMeasurementTypeError(ValueError):
2
2
  """Raised when a timeseries of a wrong measurement type is operated upon."""
3
3
 
4
- def __init__(self, timeseries_name: str, expected_type: str = "pressure") -> None:
5
- self.timeseries_name = timeseries_name
4
+ def __init__(self, expected_type: str = "pressure") -> None:
6
5
  self.expected_type = expected_type
7
- message = f"Timeseries '{self.timeseries_name}' must be of measurement type '{self.expected_type}'."
6
+ message = f"Timeseries must be of measurement type '{self.expected_type}'."
8
7
  super().__init__(message)
9
8
 
10
9
 
@@ -1,9 +1,12 @@
1
- """Fetching the data from various sources."""
1
+ """Fetching the data from various sources.
2
+
3
+ TODO: Fix up the read_from_sql() function to actually work properly.
4
+ """
2
5
 
3
6
  from pathlib import Path
4
- from typing import Literal
7
+ from typing import Any, Literal
5
8
 
6
- from pandas import read_sql
9
+ from pandas import Series, read_sql
7
10
  from sqlalchemy import MetaData, Table, select
8
11
 
9
12
  from .db.connection import DatabaseConnection
@@ -12,7 +15,9 @@ from .exceptions import NoFilesToLoad
12
15
  from .parse import parse_vanessen_csv
13
16
 
14
17
 
15
- def read_from_csv(path: Path, file_format: Literal["vanessen"] = "vanessen", **kwargs):
18
+ def read_from_csv(
19
+ path: Path, file_format: Literal["vanessen"] = "vanessen", **kwargs: Any
20
+ ) -> Dataset:
16
21
  """Loads the data from the Van Essen CSV file(s) and returns a list of Timeseries objects.
17
22
 
18
23
  Args:
@@ -42,7 +47,7 @@ def read_from_csv(path: Path, file_format: Literal["vanessen"] = "vanessen", **k
42
47
  ds = Dataset()
43
48
  for f in files:
44
49
  print(f"Loading file: {f}")
45
- ts_in_file: list = parser(f, **kwargs)
50
+ ts_in_file = parser(f, **kwargs)
46
51
  ds.add(ts_in_file)
47
52
 
48
53
  return ds
@@ -65,10 +70,21 @@ def read_from_sql(
65
70
  schema = Table(f"{location}_{sensor}_{variable}", metadata)
66
71
 
67
72
  query = select(schema)
68
- df = read_sql(query, con=db.engine)
73
+ if db.engine:
74
+ with db.engine.connect() as con:
75
+ df = read_sql(query, con=con, index_col="timestamp")
76
+
77
+ if not isinstance(df, Series):
78
+ raise TypeError
69
79
 
70
80
  ts_object = Timeseries(
71
- timeseries=df, variable=variable, location=location, sensor=sensor, unit=unit
81
+ ts=df,
82
+ # Validation done in Pydantic
83
+ variable=variable,
84
+ location=location,
85
+ sensor=sensor,
86
+ # Validation done in Pydantic
87
+ unit=unit,
72
88
  )
73
89
 
74
90
  return ts_object
@@ -9,7 +9,7 @@ import chardet
9
9
  import pytz
10
10
  from pandas import DataFrame, read_csv, to_datetime
11
11
 
12
- from ..dtypes import Timeseries
12
+ from ..dtypes import VARIABLE_TYPES_AND_UNITS, Timeseries
13
13
 
14
14
 
15
15
  def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
@@ -51,7 +51,7 @@ def handle_timestamps(df: DataFrame, tz: str) -> DataFrame:
51
51
  return df
52
52
 
53
53
 
54
- def parse_vanessen_csv(path: Path, **kwargs) -> list[Any]:
54
+ def parse_vanessen_csv(path: Path, **kwargs: Any) -> list[Timeseries]:
55
55
  """Parses a van Essen csv file and returns a list of Timeseries objects. At this point it
56
56
  does not matter whether the file is a barometric or piezometric logger file.
57
57
 
@@ -60,12 +60,17 @@ def parse_vanessen_csv(path: Path, **kwargs) -> list[Any]:
60
60
  are not working (which most likely will be the case), the user should provide their own patterns. The patterns
61
61
  can be provided as keyword arguments to the function and it is possible to use OR (|) in the regex pattern.
62
62
 
63
- Args:
63
+ !!! warning
64
+
65
+ A better check for the variable type and units has to be implemented.
66
+
67
+ Parameters:
64
68
  path (Path): The path to the file.
65
- **kwargs (dict): Optional keyword arguments to specify the regex patterns for the serial number and station.
66
- serial_number_pattern (str): The regex pattern to extract the serial number from the file.
67
- location_pattern (str): The regex pattern to extract the station from the file.
68
- col_names (list): The column names for the dataframe.
69
+
70
+ Other Parameters:
71
+ serial_number_pattern (str): The regex pattern to extract the serial number from the file.
72
+ location_pattern (str): The regex pattern to extract the station from the file.
73
+ col_names (list): The column names for the dataframe.
69
74
 
70
75
  Returns:
71
76
  list: A list of Timeseries objects.
@@ -86,7 +91,11 @@ def parse_vanessen_csv(path: Path, **kwargs) -> list[Any]:
86
91
  text = f.read()
87
92
 
88
93
  try:
89
- data = {k: re.search(v, text).group() for k, v in data.items()}
94
+ data = {
95
+ k: (match.group() if (match := re.search(v, text)) else None)
96
+ for k, v in data.items()
97
+ }
98
+
90
99
  except AttributeError:
91
100
  print(
92
101
  f"Skipping file {path} due to missing patterns. If this is not expected, please provide the correct patterns."
@@ -104,22 +113,33 @@ def parse_vanessen_csv(path: Path, **kwargs) -> list[Any]:
104
113
  df = read_csv(
105
114
  data_io, skiprows=1, header=None, names=column_names, index_col="timestamp"
106
115
  )
107
-
108
- timezone_match = re.search(
109
- kwargs.get("timezone_pattern", r"UTC[+-]?\d+"), text
110
- ).group()
111
-
112
- df = handle_timestamps(df, timezone_match)
113
-
114
- ts_list = [
115
- Timeseries(
116
- ts=df[col],
117
- variable=col,
118
- location=data.get("location"),
119
- sensor=data.get("sensor"),
120
- unit="cmH2O" if col == "pressure" else "degC",
121
- )
122
- for col in df.columns
123
- ]
116
+ timezone_pattern = kwargs.get("timezone_pattern", r"UTC[+-]?\d+")
117
+ timezone_match = re.search(timezone_pattern, text)
118
+
119
+ timezone = timezone_match.group() if timezone_match else "UTC"
120
+
121
+ df = handle_timestamps(df, timezone)
122
+
123
+ ts_list = []
124
+
125
+ for col in df.columns:
126
+ if col in VARIABLE_TYPES_AND_UNITS:
127
+ unit = VARIABLE_TYPES_AND_UNITS[col][0]
128
+ ts_list.append(
129
+ Timeseries(
130
+ ts=df[col],
131
+ # Validation will be done in Pydantic
132
+ variable=col, # type: ignore[arg-type]
133
+ location=data.get("location"),
134
+ sensor=data.get("sensor"),
135
+ # Validation will be done in Pydantic
136
+ unit=unit, # type: ignore[arg-type]
137
+ )
138
+ )
139
+ else:
140
+ message = (
141
+ "Unsupported variable: {col}. Please provide a valid variable type."
142
+ )
143
+ raise ValueError(message)
124
144
 
125
145
  return ts_list
@@ -58,7 +58,7 @@ class Transform:
58
58
  def difference(self, **kwargs: int) -> tuple[Series, str]:
59
59
  """Difference the time series data.
60
60
 
61
- Args:
61
+ Keyword Arguments:
62
62
  periods (int): The number of periods to shift. Defaults to 1.
63
63
 
64
64
  Returns:
@@ -90,7 +90,7 @@ class Transform:
90
90
  """Apply the Box-Cox transformation to the time series data. Only works
91
91
  for all positive datasets!
92
92
 
93
- Args:
93
+ Keyword Arguments:
94
94
  lmbda (float): The transformation parameter. Defaults to 0.
95
95
 
96
96
  Returns:
@@ -6,7 +6,7 @@ from matplotlib import pyplot as plt
6
6
  from .dtypes import Timeseries
7
7
 
8
8
 
9
- def trend_analysis(ts: Timeseries, plot=True) -> None:
9
+ def trend_analysis(ts: Timeseries, plot: bool = True) -> None:
10
10
  time_numeric = np.arange(len(ts.timeseries))
11
11
 
12
12
  # Perform linear regression using numpy's polyfit
@@ -1,8 +1,8 @@
1
1
  [tool.poetry]
2
2
  name = "gensor"
3
- version = "v0.0.1"
3
+ version = "0.0.3"
4
4
  description = "Library for handling groundwater sensor data."
5
- authors = ["Mateusz Zawadzki <fzawadzkimat@outlook.com>"]
5
+ authors = ["Mateusz Zawadzki <zawadzkimat@outlook.com>"]
6
6
  repository = "https://github.com/zawadzkim/gensor"
7
7
  documentation = "https://zawadzkim.github.io/gensor/"
8
8
  readme = "README.md"
@@ -38,6 +38,7 @@ pandas-stubs = "^2.2.2.240807"
38
38
  mkdocs = "^1.6.1"
39
39
  mkdocs-material = "^9.5.34"
40
40
  mkdocstrings-python = "^1.11.1"
41
+ mkdocs-autorefs = "^1.2.0"
41
42
 
42
43
  [build-system]
43
44
  requires = ["poetry-core>=1.0.0"]
@@ -1,3 +0,0 @@
1
- from .connection import DatabaseConnection
2
-
3
- __all__ = ["DatabaseConnection"]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes