gensor 0.1.6__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gensor-0.1.6 → gensor-0.2.1}/PKG-INFO +1 -1
- gensor-0.2.1/gensor/__init__.py +29 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/core/base.py +64 -36
- gensor-0.2.1/gensor/core/dataset.py +203 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/core/indexer.py +5 -0
- gensor-0.2.1/gensor/core/timeseries.py +78 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/db/connection.py +5 -5
- {gensor-0.1.6 → gensor-0.2.1}/gensor/io/read.py +42 -28
- gensor-0.2.1/gensor/log.py +7 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/parse/utils.py +23 -3
- {gensor-0.1.6 → gensor-0.2.1}/gensor/parse/vanessen.py +4 -1
- {gensor-0.1.6 → gensor-0.2.1}/gensor/processing/transform.py +1 -1
- {gensor-0.1.6 → gensor-0.2.1}/pyproject.toml +1 -1
- gensor-0.1.6/gensor/__init__.py +0 -20
- gensor-0.1.6/gensor/core/dataset.py +0 -174
- gensor-0.1.6/gensor/core/timeseries.py +0 -169
- {gensor-0.1.6 → gensor-0.2.1}/LICENSE +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/README.md +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/analysis/__init__.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/analysis/outliers.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/analysis/stats.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/config.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/core/__init__.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/db/__init__.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/exceptions.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/io/__init__.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/parse/__init__.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/parse/plain.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/processing/__init__.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/processing/compensation.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/processing/smoothing.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/testdata/Barodiver_220427183008_BY222.csv +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/testdata/PB01A_moni_AV319_220427183019_AV319.csv +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/testdata/PB02A_plain.csv +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/gensor/testdata/__init__.py +0 -0
- {gensor-0.1.6 → gensor-0.2.1}/py.typed +0 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
from .core.dataset import Dataset
|
|
4
|
+
from .core.timeseries import Timeseries
|
|
5
|
+
from .io.read import read_from_csv, read_from_sql
|
|
6
|
+
from .log import set_log_level
|
|
7
|
+
from .processing.compensation import compensate
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
# basic data types
|
|
11
|
+
"Dataset",
|
|
12
|
+
"Timeseries",
|
|
13
|
+
"compensate",
|
|
14
|
+
# getters
|
|
15
|
+
"read_from_csv",
|
|
16
|
+
"read_from_sql",
|
|
17
|
+
"set_log_level",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
logger.setLevel(logging.INFO)
|
|
23
|
+
|
|
24
|
+
if not logger.hasHandlers():
|
|
25
|
+
console_handler = logging.StreamHandler()
|
|
26
|
+
console_handler.setLevel(logging.INFO)
|
|
27
|
+
formatter = logging.Formatter("%(levelname)s: %(message)s")
|
|
28
|
+
console_handler.setFormatter(formatter)
|
|
29
|
+
logger.addHandler(console_handler)
|
|
@@ -6,6 +6,8 @@ import pandas as pd
|
|
|
6
6
|
import pandera as pa
|
|
7
7
|
import pydantic as pyd
|
|
8
8
|
from matplotlib import pyplot as plt
|
|
9
|
+
from matplotlib.axes import Axes
|
|
10
|
+
from matplotlib.figure import Figure
|
|
9
11
|
from sqlalchemy import Table
|
|
10
12
|
from sqlalchemy.dialects.sqlite import insert as sqlite_insert
|
|
11
13
|
|
|
@@ -47,14 +49,14 @@ class BaseTimeseries(pyd.BaseModel):
|
|
|
47
49
|
arbitrary_types_allowed=True, validate_assignment=True
|
|
48
50
|
)
|
|
49
51
|
|
|
50
|
-
ts: pd.Series = pyd.Field(repr=False)
|
|
52
|
+
ts: pd.Series = pyd.Field(repr=False, exclude=True)
|
|
51
53
|
variable: Literal[
|
|
52
54
|
"temperature", "pressure", "conductivity", "flux", "head", "depth"
|
|
53
55
|
]
|
|
54
56
|
unit: Literal["degc", "cmh2o", "ms/cm", "m/s", "m asl", "m"]
|
|
55
57
|
location: str | None = None
|
|
56
|
-
outliers: pd.Series | None = pyd.Field(default=None, repr=False)
|
|
57
|
-
transformation: Any = pyd.Field(default=None, repr=False)
|
|
58
|
+
outliers: pd.Series | None = pyd.Field(default=None, repr=False, exclude=True)
|
|
59
|
+
transformation: Any = pyd.Field(default=None, repr=False, exclude=True)
|
|
58
60
|
|
|
59
61
|
@pyd.computed_field() # type: ignore[prop-decorator]
|
|
60
62
|
@property
|
|
@@ -66,6 +68,11 @@ class BaseTimeseries(pyd.BaseModel):
|
|
|
66
68
|
def end(self) -> pd.Timestamp | Any:
|
|
67
69
|
return self.ts.index.max()
|
|
68
70
|
|
|
71
|
+
@pyd.field_serializer("start", "end")
|
|
72
|
+
def serialize_timestamps(self, value: pd.Timestamp | None) -> str | None:
|
|
73
|
+
"""Serialize `pd.Timestamp` to a compact `YYYYMMDDHHMMSS` string (or None if missing)."""
|
|
74
|
+
return value.strftime("%Y%m%d%H%M%S") if value is not None else None
|
|
75
|
+
|
|
69
76
|
def __eq__(self, other: object) -> bool:
|
|
70
77
|
"""Check equality based on location, sensor, variable, unit and sensor_alt."""
|
|
71
78
|
if not isinstance(other, BaseTimeseries):
|
|
@@ -85,6 +92,9 @@ class BaseTimeseries(pyd.BaseModel):
|
|
|
85
92
|
if attr == "loc":
|
|
86
93
|
return TimeseriesIndexer(self, self.ts.loc)
|
|
87
94
|
|
|
95
|
+
if attr == "iloc":
|
|
96
|
+
return TimeseriesIndexer(self, self.ts.iloc)
|
|
97
|
+
|
|
88
98
|
error_message = f"'{self.__class__.__name__}' object has no attribute '{attr}'"
|
|
89
99
|
|
|
90
100
|
if hasattr(self.ts, attr):
|
|
@@ -97,6 +107,7 @@ class BaseTimeseries(pyd.BaseModel):
|
|
|
97
107
|
# If the result is a Series, return a new Timeseries; otherwise, return the result
|
|
98
108
|
if isinstance(result, pd.Series):
|
|
99
109
|
return self.model_copy(update={"ts": result}, deep=True)
|
|
110
|
+
|
|
100
111
|
return result
|
|
101
112
|
|
|
102
113
|
return wrapper
|
|
@@ -256,19 +267,29 @@ class BaseTimeseries(pyd.BaseModel):
|
|
|
256
267
|
`to_sql` method. Additionally, metadata about the timeseries is stored in the
|
|
257
268
|
'timeseries_metadata' table.
|
|
258
269
|
|
|
259
|
-
|
|
270
|
+
Parameters:
|
|
260
271
|
db (DatabaseConnection): The database connection object.
|
|
261
272
|
|
|
262
273
|
Returns:
|
|
263
274
|
str: A message indicating the number of rows inserted into the database.
|
|
264
275
|
"""
|
|
265
|
-
|
|
276
|
+
|
|
277
|
+
def separate_metadata() -> tuple:
|
|
278
|
+
_core_metadata_fields = {"location", "variable", "unit", "start", "end"}
|
|
279
|
+
|
|
280
|
+
core_metadata = self.model_dump(include=_core_metadata_fields)
|
|
281
|
+
core_metadata.update({
|
|
282
|
+
"cls": f"{self.__module__}.{self.__class__.__name__}"
|
|
283
|
+
})
|
|
284
|
+
|
|
285
|
+
extra_metadata = self.model_dump(exclude=_core_metadata_fields)
|
|
286
|
+
|
|
287
|
+
return core_metadata, extra_metadata
|
|
288
|
+
|
|
266
289
|
timestamp_start_fmt = self.start.strftime("%Y%m%d%H%M%S")
|
|
290
|
+
timestamp_end_fmt = self.end.strftime("%Y%m%d%H%M%S")
|
|
267
291
|
|
|
268
|
-
|
|
269
|
-
schema_name = (
|
|
270
|
-
f"{self.location}_{self.variable}_{self.unit}_{timestamp_start_fmt}".lower()
|
|
271
|
-
)
|
|
292
|
+
schema_name = f"{self.location}_{self.variable}_{self.unit}".lower()
|
|
272
293
|
|
|
273
294
|
# Ensure the index is a pandas DatetimeIndex
|
|
274
295
|
if isinstance(self.ts.index, pd.DatetimeIndex):
|
|
@@ -281,66 +302,71 @@ class BaseTimeseries(pyd.BaseModel):
|
|
|
281
302
|
message = "The index is not a DatetimeIndex and cannot be converted to UTC."
|
|
282
303
|
raise TypeError(message)
|
|
283
304
|
|
|
284
|
-
# Prepare the timeseries data as records for insertion
|
|
285
305
|
series_as_records = list(
|
|
286
306
|
zip(utc_index.strftime("%Y-%m-%dT%H:%M:%S%z"), self.ts, strict=False)
|
|
287
307
|
)
|
|
288
308
|
|
|
309
|
+
core_metadata, extra_metadata = separate_metadata()
|
|
310
|
+
|
|
311
|
+
metadata_entry = {
|
|
312
|
+
**core_metadata,
|
|
313
|
+
"extra": extra_metadata,
|
|
314
|
+
"table_name": schema_name,
|
|
315
|
+
}
|
|
316
|
+
|
|
289
317
|
with db as con:
|
|
290
|
-
# Create the timeseries table if it doesn't exist
|
|
291
318
|
schema = db.create_table(schema_name, self.variable)
|
|
292
|
-
|
|
293
|
-
# Ensure that the timeseries_metadata table exists
|
|
294
319
|
metadata_schema = db.metadata.tables["__timeseries_metadata__"]
|
|
295
320
|
|
|
296
321
|
if isinstance(schema, Table):
|
|
297
|
-
# Insert the timeseries data
|
|
298
322
|
stmt = sqlite_insert(schema).values(series_as_records)
|
|
299
323
|
stmt = stmt.on_conflict_do_nothing(index_elements=["timestamp"])
|
|
300
324
|
con.execute(stmt)
|
|
301
|
-
con.commit()
|
|
302
|
-
|
|
303
|
-
metadata_stmt = sqlite_insert(metadata_schema).values(
|
|
304
|
-
table_name=schema_name,
|
|
305
|
-
location=self.location,
|
|
306
|
-
variable=self.variable,
|
|
307
|
-
unit=self.unit,
|
|
308
|
-
timestamp_start=timestamp_start_fmt,
|
|
309
|
-
timestamp_end=self.end.strftime("%Y%m%d%H%M%S"),
|
|
310
|
-
)
|
|
311
325
|
|
|
326
|
+
metadata_stmt = sqlite_insert(metadata_schema).values(metadata_entry)
|
|
312
327
|
metadata_stmt = metadata_stmt.on_conflict_do_update(
|
|
313
328
|
index_elements=["table_name"],
|
|
314
329
|
set_={
|
|
315
|
-
"
|
|
316
|
-
"
|
|
330
|
+
"start": timestamp_start_fmt,
|
|
331
|
+
"end": timestamp_end_fmt,
|
|
317
332
|
},
|
|
318
333
|
)
|
|
319
|
-
|
|
320
334
|
con.execute(metadata_stmt)
|
|
321
|
-
|
|
335
|
+
|
|
336
|
+
# Commit all changes at once
|
|
337
|
+
con.commit()
|
|
322
338
|
|
|
323
339
|
return f"{schema_name} table and metadata updated."
|
|
324
340
|
|
|
325
341
|
def plot(
|
|
326
|
-
self: T,
|
|
327
|
-
|
|
342
|
+
self: T,
|
|
343
|
+
include_outliers: bool = False,
|
|
344
|
+
ax: Axes | None = None,
|
|
345
|
+
plot_kwargs: dict[str, Any] | None = None,
|
|
346
|
+
legend_kwargs: dict[str, Any] | None = None,
|
|
347
|
+
) -> tuple[Figure, Axes]:
|
|
328
348
|
"""Plots the timeseries data.
|
|
329
349
|
|
|
330
|
-
|
|
350
|
+
Parameters:
|
|
331
351
|
include_outliers (bool): Whether to include outliers in the plot.
|
|
332
352
|
ax (matplotlib.axes.Axes, optional): Matplotlib axes object to plot on.
|
|
333
353
|
If None, a new figure and axes are created.
|
|
334
|
-
|
|
354
|
+
plot_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.plot() method to customize the plot.
|
|
355
|
+
legend_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.legend() to customize the legend.
|
|
335
356
|
|
|
336
357
|
Returns:
|
|
337
358
|
(fig, ax): Matplotlib figure and axes to allow further customization.
|
|
338
359
|
"""
|
|
339
|
-
|
|
360
|
+
|
|
361
|
+
plot_kwargs = plot_kwargs or {}
|
|
362
|
+
legend_kwargs = legend_kwargs or {}
|
|
363
|
+
|
|
340
364
|
if ax is None:
|
|
341
365
|
fig, ax = plt.subplots(figsize=(10, 5))
|
|
342
366
|
else:
|
|
343
|
-
|
|
367
|
+
# mypy complained that the get_figure() can return None, but there is no
|
|
368
|
+
# situation here in which this could be the case.
|
|
369
|
+
fig = ax.get_figure() # type: ignore [assignment]
|
|
344
370
|
|
|
345
371
|
ax.plot(
|
|
346
372
|
self.ts.index,
|
|
@@ -353,11 +379,13 @@ class BaseTimeseries(pyd.BaseModel):
|
|
|
353
379
|
ax.scatter(
|
|
354
380
|
self.outliers.index, self.outliers, color="red", label="Outliers"
|
|
355
381
|
)
|
|
356
|
-
|
|
382
|
+
for label in ax.get_xticklabels():
|
|
383
|
+
label.set_rotation(45)
|
|
384
|
+
|
|
357
385
|
ax.set_xlabel("Time")
|
|
358
386
|
ax.set_ylabel(f"{self.variable} ({self.unit})")
|
|
359
387
|
ax.set_title(f"{self.variable.capitalize()} at {self.location}")
|
|
360
388
|
|
|
361
|
-
ax.legend()
|
|
389
|
+
ax.legend(**legend_kwargs)
|
|
362
390
|
|
|
363
391
|
return fig, ax
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Any, Generic
|
|
5
|
+
|
|
6
|
+
import pydantic as pyd
|
|
7
|
+
from matplotlib import pyplot as plt
|
|
8
|
+
from matplotlib.axes import Axes
|
|
9
|
+
from matplotlib.figure import Figure
|
|
10
|
+
|
|
11
|
+
from gensor.core.base import BaseTimeseries, T
|
|
12
|
+
from gensor.db import DatabaseConnection
|
|
13
|
+
from gensor.exceptions import IndexOutOfRangeError
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Dataset(pyd.BaseModel, Generic[T]):
|
|
17
|
+
"""Store and operate on a collection of Timeseries.
|
|
18
|
+
|
|
19
|
+
Attributes:
|
|
20
|
+
timeseries (list[Timeseries]): A list of Timeseries objects.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
timeseries: list[T | None] = pyd.Field(default_factory=list)
|
|
24
|
+
|
|
25
|
+
def __iter__(self) -> Any:
|
|
26
|
+
"""Allows to iterate directly over the dataset."""
|
|
27
|
+
return iter(self.timeseries)
|
|
28
|
+
|
|
29
|
+
def __len__(self) -> int:
|
|
30
|
+
"""Gives the number of timeseries in the Dataset."""
|
|
31
|
+
return len(self.timeseries)
|
|
32
|
+
|
|
33
|
+
def __repr__(self) -> str:
|
|
34
|
+
return f"Dataset({len(self)})"
|
|
35
|
+
|
|
36
|
+
def __getitem__(self, index: int) -> T | None:
|
|
37
|
+
"""Retrieve a Timeseries object by its index in the dataset.
|
|
38
|
+
|
|
39
|
+
Parameters:
|
|
40
|
+
index (int): The index of the Timeseries to retrieve.
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
Timeseries: The Timeseries object at the specified index.
|
|
44
|
+
|
|
45
|
+
Raises:
|
|
46
|
+
IndexError: If the index is out of range.
|
|
47
|
+
"""
|
|
48
|
+
try:
|
|
49
|
+
return self.timeseries[index]
|
|
50
|
+
except IndexError:
|
|
51
|
+
raise IndexOutOfRangeError(index, len(self)) from None
|
|
52
|
+
|
|
53
|
+
def get_locations(self) -> list:
|
|
54
|
+
"""List the location of every timeseries in the dataset (duplicates are not removed)."""
|
|
55
|
+
return [ts.location for ts in self.timeseries if ts is not None]
|
|
56
|
+
|
|
57
|
+
def add(self, other: T | list[T] | Dataset) -> Dataset:
|
|
58
|
+
"""Appends new Timeseries to the Dataset.
|
|
59
|
+
|
|
60
|
+
If an equal Timeseries already exists, merge the new data into the existing
|
|
61
|
+
Timeseries, dropping duplicate timestamps.
|
|
62
|
+
|
|
63
|
+
Parameters:
|
|
64
|
+
other (Timeseries | list[Timeseries] | Dataset): The Timeseries object(s) to add.
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
# I need to check for BaseTimeseries instance in the add() method, but also
|
|
68
|
+
# type hint TypeVar T.
|
|
69
|
+
if isinstance(other, list | Dataset):
|
|
70
|
+
for ts in other:
|
|
71
|
+
if isinstance(ts, BaseTimeseries):
|
|
72
|
+
self._add_single_timeseries(ts) # type: ignore[arg-type]
|
|
73
|
+
|
|
74
|
+
elif isinstance(other, BaseTimeseries):
|
|
75
|
+
self._add_single_timeseries(other)
|
|
76
|
+
|
|
77
|
+
return self
|
|
78
|
+
|
|
79
|
+
def _add_single_timeseries(self, ts: T) -> None:
|
|
80
|
+
"""Adds a single Timeseries to the Dataset or merges if an equal one exists."""
|
|
81
|
+
for i, existing_ts in enumerate(self.timeseries):
|
|
82
|
+
if existing_ts == ts:
|
|
83
|
+
self.timeseries[i] = existing_ts.concatenate(ts)
|
|
84
|
+
return
|
|
85
|
+
|
|
86
|
+
self.timeseries.append(ts)
|
|
87
|
+
|
|
88
|
+
return
|
|
89
|
+
|
|
90
|
+
def filter(
|
|
91
|
+
self,
|
|
92
|
+
location: str | list | None = None,
|
|
93
|
+
variable: str | list | None = None,
|
|
94
|
+
unit: str | list | None = None,
|
|
95
|
+
**kwargs: dict[str, str | list],
|
|
96
|
+
) -> T | Dataset:
|
|
97
|
+
"""Return a Timeseries or a new Dataset filtered by station, sensor,
|
|
98
|
+
and/or variable.
|
|
99
|
+
|
|
100
|
+
Parameters:
|
|
101
|
+
location (Optional[str]): The location name.
|
|
102
|
+
variable (Optional[str]): The variable being measured.
|
|
103
|
+
unit (Optional[str]): Unit of the measurement.
|
|
104
|
+
**kwargs (dict): Attributes of subclassed timeseries used for filtering
|
|
105
|
+
(e.g., sensor, method).
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
Timeseries | Dataset: A single Timeseries if exactly one match is found,
|
|
109
|
+
or a new Dataset if no matches (empty) or multiple matches are found.
|
|
110
|
+
"""
|
|
111
|
+
|
|
112
|
+
def matches(ts: T, attr: str, value: dict[str, str | list]) -> bool | None:
|
|
113
|
+
"""Check if the Timeseries object has the attribute and if it matches the value."""
|
|
114
|
+
if not hasattr(ts, attr):
|
|
115
|
+
message = f"'{ts.__class__.__name__}' object has no attribute '{attr}'"
|
|
116
|
+
raise AttributeError(message)
|
|
117
|
+
return getattr(ts, attr) in value
|
|
118
|
+
|
|
119
|
+
if isinstance(location, str):
|
|
120
|
+
location = [location]
|
|
121
|
+
if isinstance(variable, str):
|
|
122
|
+
variable = [variable]
|
|
123
|
+
if isinstance(unit, str):
|
|
124
|
+
unit = [unit]
|
|
125
|
+
for key, value in kwargs.items():
|
|
126
|
+
if isinstance(value, str):
|
|
127
|
+
kwargs[key] = [value]
|
|
128
|
+
|
|
129
|
+
matching_timeseries = [
|
|
130
|
+
ts
|
|
131
|
+
for ts in self.timeseries
|
|
132
|
+
if ts is not None
|
|
133
|
+
and (location is None or ts.location in location)
|
|
134
|
+
and (variable is None or ts.variable in variable)
|
|
135
|
+
and (unit is None or ts.unit in unit)
|
|
136
|
+
and all(matches(ts, attr, value) for attr, value in kwargs.items())
|
|
137
|
+
]
|
|
138
|
+
|
|
139
|
+
if not matching_timeseries:
|
|
140
|
+
return Dataset()
|
|
141
|
+
|
|
142
|
+
if len(matching_timeseries) == 1:
|
|
143
|
+
return matching_timeseries[0]
|
|
144
|
+
|
|
145
|
+
return self.model_copy(update={"timeseries": matching_timeseries})
|
|
146
|
+
|
|
147
|
+
def to_sql(self, db: DatabaseConnection) -> None:
|
|
148
|
+
"""Save every timeseries in the dataset to a SQLite database.
|
|
149
|
+
|
|
150
|
+
Parameters:
|
|
151
|
+
db (DatabaseConnection): SQLite database connection object.
|
|
152
|
+
"""
|
|
153
|
+
for ts in self.timeseries:
|
|
154
|
+
if ts:
|
|
155
|
+
ts.to_sql(db)
|
|
156
|
+
return
|
|
157
|
+
|
|
158
|
+
def plot(
|
|
159
|
+
self,
|
|
160
|
+
include_outliers: bool = False,
|
|
161
|
+
plot_kwargs: dict[str, Any] | None = None,
|
|
162
|
+
legend_kwargs: dict[str, Any] | None = None,
|
|
163
|
+
) -> tuple[Figure, Axes]:
|
|
164
|
+
"""Plots the timeseries data, grouping by variable type.
|
|
165
|
+
|
|
166
|
+
Parameters:
|
|
167
|
+
include_outliers (bool): Whether to include outliers in the plot.
|
|
168
|
+
plot_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.plot() method to customize the plot.
|
|
169
|
+
legend_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.legend() to customize the legend.
|
|
170
|
+
|
|
171
|
+
Returns:
|
|
172
|
+
(fig, ax): Matplotlib figure and axes to allow further customization.
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
grouped_ts = defaultdict(list)
|
|
176
|
+
|
|
177
|
+
for ts in self.timeseries:
|
|
178
|
+
if ts:
|
|
179
|
+
grouped_ts[ts.variable].append(ts)
|
|
180
|
+
|
|
181
|
+
num_variables = len(grouped_ts)
|
|
182
|
+
|
|
183
|
+
fig, axes = plt.subplots(
|
|
184
|
+
num_variables, 1, figsize=(10, 5 * num_variables), sharex=True
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
if num_variables == 1:
|
|
188
|
+
axes = [axes]
|
|
189
|
+
|
|
190
|
+
for ax, (variable, ts_list) in zip(axes, grouped_ts.items(), strict=False):
|
|
191
|
+
for ts in ts_list:
|
|
192
|
+
ts.plot(
|
|
193
|
+
include_outliers=include_outliers,
|
|
194
|
+
ax=ax,
|
|
195
|
+
plot_kwargs=plot_kwargs,
|
|
196
|
+
legend_kwargs=legend_kwargs,
|
|
197
|
+
)
|
|
198
|
+
|
|
199
|
+
ax.set_title(f"Timeseries for {variable.capitalize()}")
|
|
200
|
+
ax.set_xlabel("Time")
|
|
201
|
+
|
|
202
|
+
fig.tight_layout()
|
|
203
|
+
return fig, axes
|
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from typing import Any
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
|
|
7
8
|
|
|
@@ -23,5 +24,9 @@ class TimeseriesIndexer:
|
|
|
23
24
|
|
|
24
25
|
if isinstance(result, pd.Series):
|
|
25
26
|
return self.parent.model_copy(update={"ts": result}, deep=True)
|
|
27
|
+
|
|
28
|
+
if isinstance(result, (int | float | str | pd.Timestamp | np.float64)):
|
|
29
|
+
return result
|
|
30
|
+
|
|
26
31
|
message = f"Expected pd.Series, but got {type(result)} instead."
|
|
27
32
|
raise TypeError(message)
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import pandera as pa
|
|
7
|
+
import pydantic as pyd
|
|
8
|
+
from matplotlib.axes import Axes
|
|
9
|
+
from matplotlib.figure import Figure
|
|
10
|
+
|
|
11
|
+
from gensor.core.base import BaseTimeseries
|
|
12
|
+
|
|
13
|
+
ts_schema = pa.SeriesSchema(
|
|
14
|
+
float,
|
|
15
|
+
index=pa.Index(pd.DatetimeTZDtype(tz="UTC"), coerce=False),
|
|
16
|
+
coerce=True,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Timeseries(BaseTimeseries):
|
|
21
|
+
"""Timeseries of groundwater sensor data.
|
|
22
|
+
|
|
23
|
+
Attributes:
|
|
24
|
+
ts (pd.Series): The timeseries data.
|
|
25
|
+
variable (Literal['temperature', 'pressure', 'conductivity', 'flux', 'head', 'depth']):
|
|
26
|
+
The type of the measurement.
|
|
27
|
+
unit (Literal['degc', 'cmh2o', 'ms/cm', 'm/s', 'm asl', 'm']): The unit of
|
|
28
|
+
the measurement.
|
|
29
|
+
sensor (str): The serial number of the sensor.
|
|
30
|
+
sensor_alt (float): Altitude of the sensor (necessary to compute groundwater levels).
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
model_config = pyd.ConfigDict(
|
|
34
|
+
arbitrary_types_allowed=True, validate_assignment=True
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
sensor: str | None = None
|
|
38
|
+
sensor_alt: float | None = None
|
|
39
|
+
|
|
40
|
+
def __eq__(self, other: object) -> bool:
|
|
41
|
+
"""Check equality based on location, sensor, variable, unit and sensor_alt."""
|
|
42
|
+
if not isinstance(other, Timeseries):
|
|
43
|
+
return NotImplemented
|
|
44
|
+
|
|
45
|
+
if not super().__eq__(other):
|
|
46
|
+
return False
|
|
47
|
+
|
|
48
|
+
return self.sensor == other.sensor and self.sensor_alt == other.sensor_alt
|
|
49
|
+
|
|
50
|
+
def plot(
|
|
51
|
+
self,
|
|
52
|
+
include_outliers: bool = False,
|
|
53
|
+
ax: Axes | None = None,
|
|
54
|
+
plot_kwargs: dict[str, Any] | None = None,
|
|
55
|
+
legend_kwargs: dict[str, Any] | None = None,
|
|
56
|
+
) -> tuple[Figure, Axes]:
|
|
57
|
+
"""Plots the timeseries data.
|
|
58
|
+
|
|
59
|
+
Parameters:
|
|
60
|
+
include_outliers (bool): Whether to include outliers in the plot.
|
|
61
|
+
ax (matplotlib.axes.Axes, optional): Matplotlib axes object to plot on.
|
|
62
|
+
If None, a new figure and axes are created.
|
|
63
|
+
plot_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.plot() method to customize the plot.
|
|
64
|
+
legend_kwargs (dict[str, Any] | None): kwargs passed to matplotlib.axes.Axes.legend() to customize the legend.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
(fig, ax): Matplotlib figure and axes to allow further customization.
|
|
68
|
+
"""
|
|
69
|
+
fig, ax = super().plot(
|
|
70
|
+
include_outliers=include_outliers,
|
|
71
|
+
ax=ax,
|
|
72
|
+
plot_kwargs=plot_kwargs,
|
|
73
|
+
legend_kwargs=legend_kwargs,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
ax.set_title(f"{self.variable.capitalize()} at {self.location} ({self.sensor})")
|
|
77
|
+
|
|
78
|
+
return fig, ax
|
|
@@ -9,6 +9,7 @@ from typing import Any
|
|
|
9
9
|
|
|
10
10
|
import pydantic as pyd
|
|
11
11
|
from sqlalchemy import (
|
|
12
|
+
JSON,
|
|
12
13
|
Column,
|
|
13
14
|
Connection,
|
|
14
15
|
Engine,
|
|
@@ -100,13 +101,12 @@ class DatabaseConnection(pyd.BaseModel):
|
|
|
100
101
|
Column("id", Integer, primary_key=True),
|
|
101
102
|
Column("table_name", String, unique=True),
|
|
102
103
|
Column("location", String),
|
|
103
|
-
Column("sensor", String),
|
|
104
104
|
Column("variable", String),
|
|
105
105
|
Column("unit", String),
|
|
106
|
-
Column("
|
|
107
|
-
Column("
|
|
108
|
-
Column("
|
|
109
|
-
Column("
|
|
106
|
+
Column("start", String, nullable=True),
|
|
107
|
+
Column("end", String, nullable=True),
|
|
108
|
+
Column("extra", JSON, nullable=True),
|
|
109
|
+
Column("cls", String, nullable=False),
|
|
110
110
|
)
|
|
111
111
|
|
|
112
112
|
if self.engine:
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
TODO: Fix up the read_from_sql() function to actually work properly.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
+
import logging
|
|
7
|
+
from importlib import import_module
|
|
6
8
|
from pathlib import Path
|
|
7
9
|
from typing import Any, Literal
|
|
8
10
|
|
|
@@ -12,13 +14,14 @@ from sqlalchemy import select
|
|
|
12
14
|
from ..core.dataset import Dataset
|
|
13
15
|
from ..core.timeseries import Timeseries
|
|
14
16
|
from ..db.connection import DatabaseConnection
|
|
15
|
-
from ..exceptions import NoFilesToLoad
|
|
16
17
|
from ..parse import parse_plain, parse_vanessen_csv
|
|
17
18
|
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
18
21
|
|
|
19
22
|
def read_from_csv(
|
|
20
23
|
path: Path, file_format: Literal["vanessen", "plain"] = "vanessen", **kwargs: Any
|
|
21
|
-
) -> Dataset:
|
|
24
|
+
) -> Dataset | Timeseries:
|
|
22
25
|
"""Loads the data from csv files with given file_format and returns a Dataset, or a single Timeseries when exactly one was loaded.
|
|
23
26
|
|
|
24
27
|
Parameters:
|
|
@@ -44,7 +47,8 @@ def read_from_csv(
|
|
|
44
47
|
if path.is_dir() and not any(
|
|
45
48
|
file.is_file() and file.suffix.lower() == ".csv" for file in path.iterdir()
|
|
46
49
|
):
|
|
47
|
-
|
|
50
|
+
logger.info("No CSV files found. Operation skipped.")
|
|
51
|
+
return Dataset()
|
|
48
52
|
|
|
49
53
|
files = (
|
|
50
54
|
[
|
|
@@ -58,24 +62,33 @@ def read_from_csv(
|
|
|
58
62
|
else []
|
|
59
63
|
)
|
|
60
64
|
|
|
65
|
+
if not files:
|
|
66
|
+
logger.info("No CSV files found. Operation skipped.")
|
|
67
|
+
return Dataset()
|
|
68
|
+
|
|
61
69
|
parser = parsers[file_format]
|
|
62
|
-
|
|
70
|
+
|
|
71
|
+
ds: Dataset = Dataset()
|
|
72
|
+
|
|
63
73
|
for f in files:
|
|
64
|
-
|
|
74
|
+
logger.info(f"Loading file: {f}")
|
|
65
75
|
ts_in_file = parser(f, **kwargs)
|
|
66
76
|
ds.add(ts_in_file)
|
|
67
77
|
|
|
68
|
-
|
|
78
|
+
# If there is only one Timeseries in Dataset (as in the condition), ds[0] will always
|
|
79
|
+
# be a Timeseries; so the line below does not introduce potential None in the return
|
|
80
|
+
return ds[0] if len(ds) == 1 else ds # type: ignore[return-value]
|
|
69
81
|
|
|
70
82
|
|
|
71
83
|
def read_from_sql(
|
|
72
84
|
db: DatabaseConnection,
|
|
73
85
|
load_all: bool,
|
|
74
86
|
location: str | None = None,
|
|
75
|
-
sensor: str | None = None,
|
|
76
87
|
variable: str | None = None,
|
|
77
88
|
unit: str | None = None,
|
|
78
89
|
timestamp_start: pd.Timestamp | None = None,
|
|
90
|
+
timestamp_stop: pd.Timestamp | None = None,
|
|
91
|
+
**kwargs: dict,
|
|
79
92
|
) -> Timeseries | Dataset:
|
|
80
93
|
"""Returns the timeseries or a dataset from a SQL database.
|
|
81
94
|
|
|
@@ -83,7 +96,6 @@ def read_from_sql(
|
|
|
83
96
|
db (DatabaseConnection): The database connection object.
|
|
84
97
|
load_all (bool): Whether to load all timeseries from the database.
|
|
85
98
|
location (str): The station name.
|
|
86
|
-
sensor (str): The sensor name.
|
|
87
99
|
variable (str): The measurement type.
|
|
88
100
|
unit (str): The unit of the measurement.
|
|
89
101
|
|
|
@@ -95,7 +107,7 @@ def read_from_sql(
|
|
|
95
107
|
TypeError: If the retrieved data is not a DataFrame or is of incorrect type.
|
|
96
108
|
"""
|
|
97
109
|
|
|
98
|
-
def _read_from_sql(schema_name: str) ->
|
|
110
|
+
def _read_from_sql(schema_name: str) -> Any:
|
|
99
111
|
with db as con:
|
|
100
112
|
schema = db.metadata.tables[schema_name]
|
|
101
113
|
metadata_table = db.metadata.tables["__timeseries_metadata__"]
|
|
@@ -122,21 +134,23 @@ def read_from_sql(
|
|
|
122
134
|
message = f"No metadata found for table {schema_name}"
|
|
123
135
|
raise ValueError(message)
|
|
124
136
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
137
|
+
# Core metadata extraction
|
|
138
|
+
core_metadata = {
|
|
139
|
+
"location": metadata_result[2],
|
|
140
|
+
"variable": metadata_result[3],
|
|
141
|
+
"unit": metadata_result[4],
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
extra_metadata = metadata_result[7] or {}
|
|
145
|
+
cls = metadata_result[8]
|
|
146
|
+
|
|
147
|
+
metadata = {**core_metadata, **extra_metadata}
|
|
148
|
+
|
|
149
|
+
module_name, class_name = cls.rsplit(".", 1)
|
|
150
|
+
module = import_module(module_name)
|
|
151
|
+
|
|
152
|
+
TimeseriesClass = getattr(module, class_name)
|
|
153
|
+
ts_object = TimeseriesClass(ts=ts, **metadata)
|
|
140
154
|
|
|
141
155
|
return ts_object
|
|
142
156
|
|
|
@@ -151,12 +165,12 @@ def read_from_sql(
|
|
|
151
165
|
else:
|
|
152
166
|
return Dataset()
|
|
153
167
|
else:
|
|
154
|
-
|
|
155
|
-
timestamp_start_fmt = timestamp_start.strftime("%Y%m%d%H%M%S")
|
|
168
|
+
|
|
156
169
|
schema_name = (
|
|
157
|
-
f"{location}_{
|
|
170
|
+
f"{location}_{variable}_{unit}".lower()
|
|
158
171
|
)
|
|
159
|
-
|
|
172
|
+
# This will always return Timeseries or Dataset.
|
|
173
|
+
return _read_from_sql(schema_name) # type: ignore[no-any-return]
|
|
160
174
|
|
|
161
175
|
|
|
162
176
|
# fmt: on
|
|
@@ -10,6 +10,18 @@ from pandas import DataFrame, read_csv, to_datetime
|
|
|
10
10
|
def get_data(
|
|
11
11
|
text: str, data_start: str, data_end: str, column_names: list
|
|
12
12
|
) -> DataFrame:
|
|
13
|
+
"""Search for data in the file.
|
|
14
|
+
|
|
15
|
+
Parameters:
|
|
16
|
+
text (str): string obtained from the CSV file.
|
|
17
|
+
data_start (str): string at the first row of the data.
|
|
18
|
+
data_end (str): string at the last row of the data.
|
|
19
|
+
column_names (list): list of expected column names.
|
|
20
|
+
|
|
21
|
+
Returns:
|
|
22
|
+
pd.DataFrame
|
|
23
|
+
"""
|
|
24
|
+
|
|
13
25
|
data_io = StringIO(text[text.index(data_start) : text.index(data_end)])
|
|
14
26
|
|
|
15
27
|
df = read_csv(
|
|
@@ -20,7 +32,15 @@ def get_data(
|
|
|
20
32
|
|
|
21
33
|
|
|
22
34
|
def get_metadata(text: str, patterns: dict) -> dict:
|
|
23
|
-
"""Search for metadata in the file header with given regex patterns.
|
|
35
|
+
"""Search for metadata in the file header with given regex patterns.
|
|
36
|
+
|
|
37
|
+
Parameters:
|
|
38
|
+
text (str): string obtained from the CSV file.
|
|
39
|
+
patterns (dict): regex patterns matching the location and sensor information.
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
dict: metadata of the timeseries.
|
|
43
|
+
"""
|
|
24
44
|
metadata = {}
|
|
25
45
|
|
|
26
46
|
for k, v in patterns.items():
|
|
@@ -36,7 +56,7 @@ def get_metadata(text: str, patterns: dict) -> dict:
|
|
|
36
56
|
def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
|
|
37
57
|
"""Detect the encoding of a file using chardet.
|
|
38
58
|
|
|
39
|
-
|
|
59
|
+
Parameters:
|
|
40
60
|
path (Path): The path to the file.
|
|
41
61
|
num_bytes (int): Number of bytes to read for encoding detection (default is 1024).
|
|
42
62
|
|
|
@@ -52,7 +72,7 @@ def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
|
|
|
52
72
|
def handle_timestamps(df: DataFrame, tz_string: str) -> DataFrame:
|
|
53
73
|
"""Converts timestamps in the dataframe to the specified timezone (e.g., 'UTC+1').
|
|
54
74
|
|
|
55
|
-
|
|
75
|
+
Parameters:
|
|
56
76
|
df (pd.DataFrame): The dataframe with timestamps.
|
|
57
77
|
tz_string (str): A timezone string like 'UTC+1' or 'UTC-5'.
|
|
58
78
|
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Logic parsing CSV files from van Essen Instruments Divers."""
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from pathlib import Path
|
|
4
5
|
from typing import Any
|
|
5
6
|
|
|
@@ -7,6 +8,8 @@ from ..config import VARIABLE_TYPES_AND_UNITS
|
|
|
7
8
|
from ..core.timeseries import Timeseries
|
|
8
9
|
from .utils import detect_encoding, get_data, get_metadata, handle_timestamps
|
|
9
10
|
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
10
13
|
|
|
11
14
|
def parse_vanessen_csv(path: Path, **kwargs: Any) -> list[Timeseries]:
|
|
12
15
|
"""Parses a van Essen csv file and returns a list of Timeseries objects. At this point it
|
|
@@ -51,7 +54,7 @@ def parse_vanessen_csv(path: Path, **kwargs: Any) -> list[Timeseries]:
|
|
|
51
54
|
metadata = get_metadata(text, patterns)
|
|
52
55
|
|
|
53
56
|
if not metadata:
|
|
54
|
-
|
|
57
|
+
logger.info(f"Skipping file {path} due to missing metadata.")
|
|
55
58
|
return []
|
|
56
59
|
|
|
57
60
|
data_start = "Date/time"
|
gensor-0.1.6/gensor/__init__.py
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from .analysis.outliers import OutlierDetection
|
|
2
|
-
from .core.dataset import Dataset
|
|
3
|
-
from .core.timeseries import Timeseries
|
|
4
|
-
from .io.read import read_from_csv, read_from_sql
|
|
5
|
-
from .processing.compensation import Compensator, compensate
|
|
6
|
-
from .processing.transform import Transformation
|
|
7
|
-
|
|
8
|
-
__all__ = [
|
|
9
|
-
# basic data types
|
|
10
|
-
"Dataset",
|
|
11
|
-
"Timeseries",
|
|
12
|
-
# data transformation
|
|
13
|
-
"OutlierDetection",
|
|
14
|
-
"Transformation",
|
|
15
|
-
"Compensator",
|
|
16
|
-
"compensate",
|
|
17
|
-
# getters
|
|
18
|
-
"read_from_csv",
|
|
19
|
-
"read_from_sql",
|
|
20
|
-
]
|
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from collections import defaultdict
|
|
4
|
-
from typing import Any, Self
|
|
5
|
-
|
|
6
|
-
import pydantic as pyd
|
|
7
|
-
from matplotlib import pyplot as plt
|
|
8
|
-
|
|
9
|
-
from gensor.core.timeseries import Timeseries
|
|
10
|
-
from gensor.db import DatabaseConnection
|
|
11
|
-
from gensor.exceptions import IndexOutOfRangeError, TimeseriesNotFound
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class Dataset(pyd.BaseModel):
|
|
15
|
-
"""Class to store a collection of timeseries.
|
|
16
|
-
|
|
17
|
-
The Dataset class is used to store a collection of Timeseries objects. It
|
|
18
|
-
is meant to be created when the van Essen CSV file is parsed.
|
|
19
|
-
|
|
20
|
-
Attributes:
|
|
21
|
-
timeseries (list[Timeseries]): A list of Timeseries objects.
|
|
22
|
-
|
|
23
|
-
Methods:
|
|
24
|
-
__iter__: Returns timeseries when iterated over.
|
|
25
|
-
__len__: Gives the number of timeseries in the Dataset.
|
|
26
|
-
get_stations: List all unique locations in the dataset.
|
|
27
|
-
add: Appends a new series to the Dataset or merges series if
|
|
28
|
-
an equal one exists.
|
|
29
|
-
align: Aligns the timeseries to a common time axis.
|
|
30
|
-
plot: Plots the timeseries data.
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
timeseries: list[Timeseries | None] = pyd.Field(default_factory=list)
|
|
34
|
-
|
|
35
|
-
def __iter__(self) -> Any:
|
|
36
|
-
"""Allows to iterate directly over the dataset."""
|
|
37
|
-
return iter(self.timeseries)
|
|
38
|
-
|
|
39
|
-
def __len__(self) -> int:
|
|
40
|
-
"""Gives the number of timeseries in the Dataset."""
|
|
41
|
-
return len(self.timeseries)
|
|
42
|
-
|
|
43
|
-
def __repr__(self) -> str:
|
|
44
|
-
return f"Dataset({len(self)})"
|
|
45
|
-
|
|
46
|
-
def __getitem__(self, index: int) -> Timeseries | None:
|
|
47
|
-
"""Retrieve a Timeseries object by its index in the dataset.
|
|
48
|
-
|
|
49
|
-
Parameters:
|
|
50
|
-
index (int): The index of the Timeseries to retrieve.
|
|
51
|
-
|
|
52
|
-
Returns:
|
|
53
|
-
Timeseries: The Timeseries object at the specified index.
|
|
54
|
-
|
|
55
|
-
Raises:
|
|
56
|
-
IndexError: If the index is out of range.
|
|
57
|
-
"""
|
|
58
|
-
try:
|
|
59
|
-
return self.timeseries[index]
|
|
60
|
-
except IndexError:
|
|
61
|
-
raise IndexOutOfRangeError(index, len(self)) from None
|
|
62
|
-
|
|
63
|
-
def get_stations(self) -> list:
|
|
64
|
-
"""List all unique locations in the dataset."""
|
|
65
|
-
return [ts.location for ts in self.timeseries if ts is not None]
|
|
66
|
-
|
|
67
|
-
def add(self, other: Timeseries | list[Timeseries] | Self) -> None:
|
|
68
|
-
"""Appends a new series to the Dataset or merges series if an equal
|
|
69
|
-
one exists.
|
|
70
|
-
|
|
71
|
-
If a Timeseries with the same location, sensor, and variable already
|
|
72
|
-
exists, merge the new data into the existing Timeseries, dropping
|
|
73
|
-
duplicate timestamps.
|
|
74
|
-
|
|
75
|
-
Parameters:
|
|
76
|
-
other (Timeseries): The Timeseries object to add.
|
|
77
|
-
"""
|
|
78
|
-
if isinstance(other, list):
|
|
79
|
-
for ts in other:
|
|
80
|
-
if isinstance(ts, Timeseries):
|
|
81
|
-
self._add_single_timeseries(ts)
|
|
82
|
-
elif isinstance(other, Dataset):
|
|
83
|
-
for ts in other.timeseries: # type: ignore[assignment]
|
|
84
|
-
if isinstance(ts, Timeseries):
|
|
85
|
-
self._add_single_timeseries(ts)
|
|
86
|
-
elif isinstance(other, Timeseries):
|
|
87
|
-
self._add_single_timeseries(other)
|
|
88
|
-
|
|
89
|
-
return
|
|
90
|
-
|
|
91
|
-
def _add_single_timeseries(self, ts: Timeseries) -> None:
|
|
92
|
-
"""Adds a single Timeseries to the Dataset or merges if an equal one exists."""
|
|
93
|
-
for i, existing_ts in enumerate(self.timeseries):
|
|
94
|
-
if existing_ts == ts:
|
|
95
|
-
self.timeseries[i] = existing_ts.concatenate(ts)
|
|
96
|
-
return
|
|
97
|
-
|
|
98
|
-
self.timeseries.append(ts)
|
|
99
|
-
|
|
100
|
-
return
|
|
101
|
-
|
|
102
|
-
def filter(
|
|
103
|
-
self,
|
|
104
|
-
stations: str | list | None = None,
|
|
105
|
-
sensors: str | list | None = None,
|
|
106
|
-
variables: str | list | None = None,
|
|
107
|
-
) -> Timeseries | Dataset:
|
|
108
|
-
"""Return a Timeseries or a new Dataset filtered by station, sensor,
|
|
109
|
-
and/or variable.
|
|
110
|
-
|
|
111
|
-
Parameters:
|
|
112
|
-
stations (Optional[str]): The location of the station.
|
|
113
|
-
sensors (Optional[str]): The sensor identifier.
|
|
114
|
-
variables (Optional[str]): The variable being measured.
|
|
115
|
-
|
|
116
|
-
Returns:
|
|
117
|
-
Timeseries or Dataset: A single Timeseries if exactly one match is found,
|
|
118
|
-
or a new Dataset if multiple matches are found.
|
|
119
|
-
"""
|
|
120
|
-
|
|
121
|
-
if isinstance(stations, str):
|
|
122
|
-
stations = [stations]
|
|
123
|
-
|
|
124
|
-
if isinstance(sensors, str):
|
|
125
|
-
sensors = [sensors]
|
|
126
|
-
|
|
127
|
-
if isinstance(variables, str):
|
|
128
|
-
variables = [variables]
|
|
129
|
-
|
|
130
|
-
matching_timeseries = [
|
|
131
|
-
ts
|
|
132
|
-
for ts in self.timeseries
|
|
133
|
-
if ts is not None
|
|
134
|
-
if (stations is None or ts.location in stations)
|
|
135
|
-
and (sensors is None or ts.sensor in sensors)
|
|
136
|
-
and (variables is None or ts.variable in variables)
|
|
137
|
-
]
|
|
138
|
-
|
|
139
|
-
if not matching_timeseries:
|
|
140
|
-
raise TimeseriesNotFound()
|
|
141
|
-
|
|
142
|
-
if len(matching_timeseries) == 1:
|
|
143
|
-
return matching_timeseries[0]
|
|
144
|
-
|
|
145
|
-
return self.model_copy(update={"timeseries": matching_timeseries})
|
|
146
|
-
|
|
147
|
-
def to_sql(self, db: DatabaseConnection) -> None:
|
|
148
|
-
for ts in self.timeseries:
|
|
149
|
-
if ts:
|
|
150
|
-
ts.to_sql(db)
|
|
151
|
-
return
|
|
152
|
-
|
|
153
|
-
def plot(self, include_outliers: bool = False) -> None:
|
|
154
|
-
"""Plots the timeseries data, grouping by variable type.
|
|
155
|
-
|
|
156
|
-
Args:
|
|
157
|
-
include_outliers (bool): Whether to include outliers in the plot.
|
|
158
|
-
"""
|
|
159
|
-
# Group timeseries by variable
|
|
160
|
-
grouped_ts = defaultdict(list)
|
|
161
|
-
for ts in self.timeseries:
|
|
162
|
-
if ts:
|
|
163
|
-
grouped_ts[ts.variable].append(ts)
|
|
164
|
-
|
|
165
|
-
# Create a plot for each group of timeseries with the same variable
|
|
166
|
-
for variable, ts_list in grouped_ts.items():
|
|
167
|
-
fig, ax = plt.subplots(figsize=(10, 5))
|
|
168
|
-
for ts in ts_list:
|
|
169
|
-
ts.plot(include_outliers=include_outliers, ax=ax)
|
|
170
|
-
|
|
171
|
-
ax.set_title(f"Timeseries for {variable.capitalize()}")
|
|
172
|
-
plt.show()
|
|
173
|
-
|
|
174
|
-
return
|
|
@@ -1,169 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
import pandas as pd
|
|
6
|
-
import pandera as pa
|
|
7
|
-
import pydantic as pyd
|
|
8
|
-
from matplotlib import pyplot as plt
|
|
9
|
-
from sqlalchemy import Table
|
|
10
|
-
from sqlalchemy.dialects.sqlite import insert as sqlite_insert
|
|
11
|
-
|
|
12
|
-
from gensor.core.base import BaseTimeseries
|
|
13
|
-
from gensor.db import DatabaseConnection
|
|
14
|
-
|
|
15
|
-
ts_schema = pa.SeriesSchema(
|
|
16
|
-
float,
|
|
17
|
-
index=pa.Index(pd.DatetimeTZDtype(tz="UTC"), coerce=False),
|
|
18
|
-
coerce=True,
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class Timeseries(BaseTimeseries):
|
|
23
|
-
"""Timeseries for groundwater sensor data
|
|
24
|
-
|
|
25
|
-
Attributes:
|
|
26
|
-
ts (pd.Series): The timeseries data.
|
|
27
|
-
variable (Literal['temperature', 'pressure', 'conductivity', 'flux']):
|
|
28
|
-
The type of the measurement.
|
|
29
|
-
unit (Literal['degC', 'mmH2O', 'mS/cm', 'm/s']): The unit of
|
|
30
|
-
the measurement.
|
|
31
|
-
sensor (SensorInfo): The serial number of the sensor.
|
|
32
|
-
|
|
33
|
-
Methods:
|
|
34
|
-
validate_ts: if the pd.Series is not exactly what is required, coerce.
|
|
35
|
-
"""
|
|
36
|
-
|
|
37
|
-
model_config = pyd.ConfigDict(
|
|
38
|
-
arbitrary_types_allowed=True, validate_assignment=True
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
sensor: str | None = None
|
|
42
|
-
sensor_alt: float | None = None
|
|
43
|
-
|
|
44
|
-
def __eq__(self, other: object) -> bool:
|
|
45
|
-
"""Check equality based on location, sensor, variable, unit and sensor_alt."""
|
|
46
|
-
if not isinstance(other, Timeseries):
|
|
47
|
-
return NotImplemented
|
|
48
|
-
|
|
49
|
-
return (
|
|
50
|
-
self.variable == other.variable
|
|
51
|
-
and self.unit == other.unit
|
|
52
|
-
and self.location == other.location
|
|
53
|
-
and self.sensor == other.sensor
|
|
54
|
-
and self.sensor_alt == other.sensor_alt
|
|
55
|
-
)
|
|
56
|
-
|
|
57
|
-
def to_sql(self, db: DatabaseConnection) -> str:
|
|
58
|
-
"""Converts the timeseries to a list of dictionaries and uploads it to the database.
|
|
59
|
-
|
|
60
|
-
The Timeseries data is uploaded to the SQL database by using the pandas
|
|
61
|
-
`to_sql` method. Additionally, metadata about the timeseries is stored in the
|
|
62
|
-
'timeseries_metadata' table.
|
|
63
|
-
|
|
64
|
-
Args:
|
|
65
|
-
db (DatabaseConnection): The database connection object.
|
|
66
|
-
|
|
67
|
-
Returns:
|
|
68
|
-
str: A message indicating the number of rows inserted into the database.
|
|
69
|
-
"""
|
|
70
|
-
# Format the start timestamp as 'YYYYMMDDHHMMSS'
|
|
71
|
-
timestamp_start_fmt = self.start.strftime("%Y%m%d%H%M%S")
|
|
72
|
-
|
|
73
|
-
# Construct the schema name using the location, sensor, variable, unit, and timestamp
|
|
74
|
-
schema_name = f"{self.location}_{self.sensor}_{self.variable}_{self.unit}_{timestamp_start_fmt}".lower()
|
|
75
|
-
|
|
76
|
-
# Ensure the index is a pandas DatetimeIndex
|
|
77
|
-
if isinstance(self.ts.index, pd.DatetimeIndex):
|
|
78
|
-
utc_index = (
|
|
79
|
-
self.ts.index.tz_convert("UTC")
|
|
80
|
-
if self.ts.index.tz is not None
|
|
81
|
-
else self.ts.index
|
|
82
|
-
)
|
|
83
|
-
else:
|
|
84
|
-
message = "The index is not a DatetimeIndex and cannot be converted to UTC."
|
|
85
|
-
raise TypeError(message)
|
|
86
|
-
|
|
87
|
-
# Prepare the timeseries data as records for insertion
|
|
88
|
-
series_as_records = list(
|
|
89
|
-
zip(utc_index.strftime("%Y-%m-%dT%H:%M:%S%z"), self.ts, strict=False)
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
with db as con:
|
|
93
|
-
# Create the timeseries table if it doesn't exist
|
|
94
|
-
schema = db.create_table(schema_name, self.variable)
|
|
95
|
-
|
|
96
|
-
# Ensure that the timeseries_metadata table exists
|
|
97
|
-
metadata_schema = db.metadata.tables["__timeseries_metadata__"]
|
|
98
|
-
|
|
99
|
-
if isinstance(schema, Table):
|
|
100
|
-
# Insert the timeseries data
|
|
101
|
-
stmt = sqlite_insert(schema).values(series_as_records)
|
|
102
|
-
stmt = stmt.on_conflict_do_nothing(index_elements=["timestamp"])
|
|
103
|
-
con.execute(stmt)
|
|
104
|
-
con.commit()
|
|
105
|
-
|
|
106
|
-
metadata_stmt = sqlite_insert(metadata_schema).values(
|
|
107
|
-
table_name=schema_name,
|
|
108
|
-
location=self.location,
|
|
109
|
-
sensor=self.sensor,
|
|
110
|
-
variable=self.variable,
|
|
111
|
-
unit=self.unit,
|
|
112
|
-
logger_alt=self.sensor_alt,
|
|
113
|
-
location_alt=self.sensor_alt,
|
|
114
|
-
timestamp_start=timestamp_start_fmt,
|
|
115
|
-
timestamp_end=self.end.strftime("%Y%m%d%H%M%S"),
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
metadata_stmt = metadata_stmt.on_conflict_do_update(
|
|
119
|
-
index_elements=["table_name"],
|
|
120
|
-
set_={
|
|
121
|
-
"timestamp_start": timestamp_start_fmt,
|
|
122
|
-
"timestamp_end": self.end.strftime("%Y%m%d%H%M%S"),
|
|
123
|
-
},
|
|
124
|
-
)
|
|
125
|
-
|
|
126
|
-
con.execute(metadata_stmt)
|
|
127
|
-
con.commit()
|
|
128
|
-
|
|
129
|
-
return f"{schema_name} table and metadata updated."
|
|
130
|
-
|
|
131
|
-
def plot(
|
|
132
|
-
self, include_outliers: bool = False, ax: Any = None, **plot_kwargs: Any
|
|
133
|
-
) -> tuple:
|
|
134
|
-
"""Plots the timeseries data.
|
|
135
|
-
|
|
136
|
-
Args:
|
|
137
|
-
include_outliers (bool): Whether to include outliers in the plot.
|
|
138
|
-
ax (matplotlib.axes.Axes, optional): Matplotlib axes object to plot on.
|
|
139
|
-
If None, a new figure and axes are created.
|
|
140
|
-
**plot_kwargs: Additional keyword arguments passed to plt.plot.
|
|
141
|
-
|
|
142
|
-
Returns:
|
|
143
|
-
(fig, ax): Matplotlib figure and axes to allow further customization.
|
|
144
|
-
"""
|
|
145
|
-
# Create new figure and axes if not provided
|
|
146
|
-
if ax is None:
|
|
147
|
-
fig, ax = plt.subplots(figsize=(10, 5))
|
|
148
|
-
else:
|
|
149
|
-
fig = ax.get_figure()
|
|
150
|
-
|
|
151
|
-
ax.plot(
|
|
152
|
-
self.ts.index,
|
|
153
|
-
self.ts,
|
|
154
|
-
label=f"{self.location} ({self.sensor})",
|
|
155
|
-
**plot_kwargs,
|
|
156
|
-
)
|
|
157
|
-
|
|
158
|
-
if include_outliers and self.outliers is not None:
|
|
159
|
-
ax.scatter(
|
|
160
|
-
self.outliers.index, self.outliers, color="red", label="Outliers"
|
|
161
|
-
)
|
|
162
|
-
plt.xticks(rotation=45)
|
|
163
|
-
ax.set_xlabel("Time")
|
|
164
|
-
ax.set_ylabel(f"{self.variable} ({self.unit})")
|
|
165
|
-
ax.set_title(f"{self.variable.capitalize()} at {self.location}")
|
|
166
|
-
|
|
167
|
-
ax.legend()
|
|
168
|
-
|
|
169
|
-
return fig, ax
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|