gensor 0.0.3__tar.gz → 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gensor-0.0.3 → gensor-0.0.5}/PKG-INFO +2 -2
- {gensor-0.0.3 → gensor-0.0.5}/gensor/__init__.py +6 -2
- gensor-0.0.5/gensor/db/connection.py +106 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/dtypes.py +87 -43
- gensor-0.0.5/gensor/getters.py +131 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/parse/vanessen.py +11 -16
- {gensor-0.0.3 → gensor-0.0.5}/pyproject.toml +2 -2
- gensor-0.0.3/gensor/db/connection.py +0 -53
- gensor-0.0.3/gensor/getters.py +0 -95
- {gensor-0.0.3 → gensor-0.0.5}/LICENSE +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/README.md +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/compensation.py +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/db/__init__.py +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/exceptions.py +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/parse/__init__.py +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/preprocessing.py +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/smoothing.py +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/gensor/trend.py +0 -0
- {gensor-0.0.3 → gensor-0.0.5}/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: gensor
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.5
|
|
4
4
|
Summary: Library for handling groundwater sensor data.
|
|
5
5
|
Home-page: https://github.com/zawadzkim/gensor
|
|
6
6
|
Author: Mateusz Zawadzki
|
|
@@ -15,7 +15,7 @@ Requires-Dist: numpy (>=2.1.0,<3.0.0)
|
|
|
15
15
|
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
16
16
|
Requires-Dist: pandera (>=0.20.3,<0.21.0)
|
|
17
17
|
Requires-Dist: pydantic (>=2.8.2,<3.0.0)
|
|
18
|
-
Requires-Dist:
|
|
18
|
+
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
19
19
|
Requires-Dist: scikit-learn (>=1.5.1,<2.0.0)
|
|
20
20
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
|
21
21
|
Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
|
|
@@ -1,14 +1,18 @@
|
|
|
1
1
|
from .compensation import Compensator, compensate
|
|
2
2
|
from .dtypes import Dataset, Timeseries
|
|
3
|
-
from .getters import read_from_csv
|
|
3
|
+
from .getters import read_from_csv, read_from_sql
|
|
4
4
|
from .preprocessing import OutlierDetection, Transform
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
|
+
# basic data types
|
|
7
8
|
"Dataset",
|
|
8
9
|
"Timeseries",
|
|
9
|
-
|
|
10
|
+
# data transformation
|
|
10
11
|
"OutlierDetection",
|
|
11
12
|
"Transform",
|
|
12
13
|
"Compensator",
|
|
13
14
|
"compensate",
|
|
15
|
+
# getters
|
|
16
|
+
"read_from_csv",
|
|
17
|
+
"read_from_sql",
|
|
14
18
|
]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Module defining database connection object.
|
|
2
|
+
|
|
3
|
+
Classes:
|
|
4
|
+
DatabaseConnection: Database connection object
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import pydantic as pyd
|
|
11
|
+
from sqlalchemy import (
|
|
12
|
+
Column,
|
|
13
|
+
Connection,
|
|
14
|
+
Engine,
|
|
15
|
+
Float,
|
|
16
|
+
MetaData,
|
|
17
|
+
String,
|
|
18
|
+
Table,
|
|
19
|
+
create_engine,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
from ..exceptions import DatabaseNotFound
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DatabaseConnection(pyd.BaseModel):
    """Connection manager for the SQLite database used by gensor.

    The database file is located at ``db_directory / db_name``. The directory
    must already exist (otherwise `DatabaseNotFound` is raised); the database
    file itself is created on first connection if it is missing.

    The object is a context manager. Entering it opens a connection and
    reflects the existing tables into `metadata`; leaving it closes that
    connection, clears `metadata` and disposes of the engine.

    Attributes:
        metadata (MetaData): Table definitions reflected from the database.
        db_directory (Path): Directory holding the database file. Defaults to
            the working directory at instantiation time.
        db_name (str): File name of the database.
        engine (Engine | None): SQLAlchemy engine, created lazily by `connect()`.
    """

    model_config = pyd.ConfigDict(
        arbitrary_types_allowed=True, validate_assignment=True
    )

    # Factories build the defaults per instance instead of once at import time.
    metadata: MetaData = pyd.Field(default_factory=MetaData)
    db_directory: Path = pyd.Field(default_factory=Path.cwd)
    db_name: str = "gensor.db"
    engine: Engine | None = None

    # Connections handed out by __enter__, innermost last, so that __exit__
    # can close exactly the connection it opened (also for nested `with`).
    _connections: list[Connection] = pyd.PrivateAttr(default_factory=list)

    def _verify_path(self) -> str:
        """Return the SQLAlchemy URL of the database file.

        Raises:
            DatabaseNotFound: If `db_directory` does not exist.
        """
        if not self.db_directory.exists():
            raise DatabaseNotFound()
        return f"sqlite:///{self.db_directory}/{self.db_name}"

    def connect(self) -> Connection:
        """Create the engine if needed and return a new connection.

        The caller owns the returned connection and is responsible for
        closing it; prefer using the object in a `with` block.
        """
        if self.engine is None:
            sqlite_path = self._verify_path()
            self.engine = create_engine(sqlite_path)
        return self.engine.connect()

    def dispose(self) -> None:
        """Clear the reflected metadata and dispose of the engine."""
        self.metadata.clear()
        if self.engine is not None:
            self.engine.dispose()

    def __enter__(self) -> Connection:
        """Open a connection, reflect the database tables and return the connection."""
        con = self.connect()
        try:
            if self.engine is not None:
                self.metadata.reflect(bind=self.engine)
        except Exception:
            # Do not leak the connection when reflection fails.
            con.close()
            raise
        self._connections.append(con)
        return con

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Close the connection opened by `__enter__` and release the engine."""
        if self._connections:
            # Engine.dispose() does not close checked-out connections, so the
            # connection has to be closed explicitly.
            self._connections.pop().close()
        if not self._connections:
            # Only the outermost `with` block tears down metadata and engine.
            self.dispose()

    def get_tables(self) -> list | None:
        """Return the names of all tables in the database.

        Returns:
            list | None: Table names, or None (after printing a notice) when
                the database has no tables.
        """
        with self:
            # Copy the names while the metadata is populated: leaving the
            # block clears `metadata.tables` in place.
            table_names = list(self.metadata.tables)

        if not table_names:
            print("This database has no tables.")
            return None
        return table_names

    def create_table(self, schema_name: str, column_name: str) -> Table | str:
        """Create a table in the database, or return it if it already exists.

        Schema name is a string representing the location, sensor, variable measured and
        unit of measurement. This is a way of preserving the metadata of the Timeseries.
        The index is always `timestamp` and the column name is dynamically created from
        the measured variable.

        Args:
            schema_name (str): Name of the table.
            column_name (str): Name of the value column.

        Returns:
            Table | str: The table object, or the string
                "Engine does not exist." when no engine has been created yet.
        """
        if schema_name in self.metadata.tables:
            return self.metadata.tables[schema_name]

        ts_table = Table(
            schema_name,
            self.metadata,
            Column("timestamp", String, primary_key=True),
            Column(column_name, Float),
        )
        if self.engine is None:
            return "Engine does not exist."

        ts_table.create(self.engine, checkfirst=True)
        self.metadata.reflect(bind=self.engine)
        return ts_table
|
|
@@ -1,12 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
!!! warning
|
|
3
|
+
|
|
4
|
+
Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
|
|
5
|
+
'vanessen'), the timestamps are localized and converted to UTC. Therefore, if the
|
|
6
|
+
user creates their own timeseries outside of read_from_csv, they should ensure that
|
|
7
|
+
the timestamps are in UTC format.
|
|
8
|
+
"""
|
|
9
|
+
|
|
1
10
|
from __future__ import annotations
|
|
2
11
|
|
|
12
|
+
from collections import defaultdict
|
|
3
13
|
from collections.abc import Callable
|
|
4
|
-
from typing import Any, Literal
|
|
14
|
+
from typing import Any, Literal, Self
|
|
5
15
|
|
|
6
16
|
import pandas as pd
|
|
7
17
|
import pandera as pa
|
|
8
18
|
import pydantic as pyd
|
|
9
19
|
from matplotlib import pyplot as plt
|
|
20
|
+
from sqlalchemy import Table
|
|
21
|
+
from sqlalchemy.dialects.sqlite import insert as sqlite_insert
|
|
10
22
|
|
|
11
23
|
from .db import DatabaseConnection
|
|
12
24
|
from .exceptions import IndexOutOfRangeError, TimeseriesNotFound, TimeseriesUnequal
|
|
@@ -14,14 +26,14 @@ from .preprocessing import OutlierDetection, Transform
|
|
|
14
26
|
|
|
15
27
|
ts_schema = pa.SeriesSchema(
|
|
16
28
|
float,
|
|
17
|
-
index=pa.Index(
|
|
29
|
+
index=pa.Index(pd.DatetimeTZDtype(tz="UTC"), coerce=False),
|
|
18
30
|
coerce=True,
|
|
19
31
|
)
|
|
20
32
|
|
|
21
33
|
VARIABLE_TYPES_AND_UNITS = {
|
|
22
|
-
"temperature": ["
|
|
23
|
-
"pressure": ["
|
|
24
|
-
"conductivity": ["
|
|
34
|
+
"temperature": ["degc"],
|
|
35
|
+
"pressure": ["cmh2o", "mmh2o"],
|
|
36
|
+
"conductivity": ["ms/cm"],
|
|
25
37
|
"flux": ["m/s"],
|
|
26
38
|
"head": ["m asl"],
|
|
27
39
|
"depth": ["m"],
|
|
@@ -63,7 +75,7 @@ class Timeseries(pyd.BaseModel):
|
|
|
63
75
|
variable: Literal[
|
|
64
76
|
"temperature", "pressure", "conductivity", "flux", "head", "depth"
|
|
65
77
|
]
|
|
66
|
-
unit: Literal["
|
|
78
|
+
unit: Literal["degc", "cmh2o", "ms/cm", "m/s", "m asl", "m"]
|
|
67
79
|
location: str | None = None
|
|
68
80
|
sensor: str | None = None
|
|
69
81
|
sensor_alt: float | None = None
|
|
@@ -213,25 +225,41 @@ class Timeseries(pyd.BaseModel):
|
|
|
213
225
|
def to_sql(self, db: DatabaseConnection) -> str:
    """Upload the timeseries to the SQLite database.

    The table is named ``{location}_{sensor}_{variable}_{unit}`` (lower-cased)
    and is created if it does not exist. Timestamps are stored as UTC strings
    formatted as ``%Y-%m-%dT%H:%M:%S%z``. Rows whose timestamp already exists
    in the table are skipped.

    Args:
        db (DatabaseConnection): The database connection object.

    Returns:
        str: A message naming the table that was updated.

    Raises:
        TypeError: If the index of the series is not a DatetimeIndex.
    """
    schema_name = (
        f"{self.location}_{self.sensor}_{self.variable}_{self.unit}".lower()
    )

    if not isinstance(self.ts.index, pd.DatetimeIndex):
        message = "The index is not a DatetimeIndex and cannot be converted to UTC."
        raise TypeError(message)

    # A naive index is taken to be UTC already; localizing it makes sure the
    # stored string carries an offset and can be parsed back with `%z`.
    utc_index = (
        self.ts.index.tz_convert("UTC")
        if self.ts.index.tz is not None  # tzinfo becomes tz for DatetimeIndex
        else self.ts.index.tz_localize("UTC")
    )

    records = [
        {"timestamp": stamp, self.variable: value}
        for stamp, value in zip(
            utc_index.strftime("%Y-%m-%dT%H:%M:%S%z"),
            self.ts.tolist(),
            strict=True,
        )
    ]

    with db as con:
        schema = db.create_table(schema_name, self.variable)
        if isinstance(schema, Table) and records:
            stmt = sqlite_insert(schema).on_conflict_do_nothing(
                index_elements=["timestamp"]
            )
            # Passing the rows as a parameter list (executemany) avoids one
            # huge multi-row VALUES clause, which runs into SQLite's limit
            # on bound parameters for long series.
            con.execute(stmt, records)
            con.commit()

    return f"{schema_name} table updated."
|
|
237
265
|
|
|
@@ -258,7 +286,7 @@ class Timeseries(pyd.BaseModel):
|
|
|
258
286
|
ax.plot(
|
|
259
287
|
self.ts.index,
|
|
260
288
|
self.ts,
|
|
261
|
-
label=f"{self.
|
|
289
|
+
label=f"{self.location} ({self.sensor})",
|
|
262
290
|
**plot_kwargs,
|
|
263
291
|
)
|
|
264
292
|
|
|
@@ -329,7 +357,7 @@ class Dataset(pyd.BaseModel):
|
|
|
329
357
|
"""List all unique locations in the dataset."""
|
|
330
358
|
return [ts.location for ts in self.timeseries if ts is not None]
|
|
331
359
|
|
|
332
|
-
def add(self, other: Timeseries | list[Timeseries]) -> None:
|
|
360
|
+
def add(self, other: Timeseries | list[Timeseries] | Self) -> None:
|
|
333
361
|
"""Appends a new series to the Dataset or merges series if an equal
|
|
334
362
|
one exists.
|
|
335
363
|
|
|
@@ -342,8 +370,13 @@ class Dataset(pyd.BaseModel):
|
|
|
342
370
|
"""
|
|
343
371
|
if isinstance(other, list):
|
|
344
372
|
for ts in other:
|
|
345
|
-
|
|
346
|
-
|
|
373
|
+
if isinstance(ts, Timeseries):
|
|
374
|
+
self._add_single_timeseries(ts)
|
|
375
|
+
elif isinstance(other, Dataset):
|
|
376
|
+
for ts in other.timeseries: # type: ignore[assignment]
|
|
377
|
+
if isinstance(ts, Timeseries):
|
|
378
|
+
self._add_single_timeseries(ts)
|
|
379
|
+
elif isinstance(other, Timeseries):
|
|
347
380
|
self._add_single_timeseries(other)
|
|
348
381
|
|
|
349
382
|
return
|
|
@@ -395,9 +428,38 @@ class Dataset(pyd.BaseModel):
|
|
|
395
428
|
|
|
396
429
|
return self.model_copy(update={"timeseries": matching_timeseries})
|
|
397
430
|
|
|
431
|
+
def to_sql(self, db: DatabaseConnection) -> None:
    """Upload every timeseries of the dataset to the database.

    Falsy entries (e.g. None placeholders) are skipped.

    Args:
        db (DatabaseConnection): The database connection object.
    """
    for series in filter(None, self.timeseries):
        series.to_sql(db)
|
|
436
|
+
|
|
437
|
+
def plot(self, include_outliers: bool = False) -> None:
    """Draw one figure per measured variable with all matching timeseries.

    Args:
        include_outliers (bool): Whether to include outliers in the plot.
    """
    # Bucket the (non-empty) timeseries by the variable they measure.
    by_variable = defaultdict(list)
    for series in filter(None, self.timeseries):
        by_variable[series.variable].append(series)

    # One figure per variable; every series of that variable shares the axes.
    for variable_name, members in by_variable.items():
        _, axes = plt.subplots(figsize=(10, 5))
        for member in members:
            member.plot(include_outliers=include_outliers, ax=axes)

        axes.set_title(f"Timeseries for {variable_name.capitalize()}")
        plt.show()
|
|
459
|
+
|
|
398
460
|
# def align(self,
|
|
399
|
-
#
|
|
400
|
-
#
|
|
461
|
+
# freq: str = 'h',
|
|
462
|
+
# inplace: bool = True):
|
|
401
463
|
# """Aligns the timeseries to a common time axis.
|
|
402
464
|
|
|
403
465
|
# Args:
|
|
@@ -406,7 +468,7 @@ class Dataset(pyd.BaseModel):
|
|
|
406
468
|
# """
|
|
407
469
|
|
|
408
470
|
# index_sets = [set(serie._resample(freq).index)
|
|
409
|
-
#
|
|
471
|
+
# for serie in self.timeseries]
|
|
410
472
|
|
|
411
473
|
# # Find the intersection of all index sets to get the common dates
|
|
412
474
|
# common_dates = set.intersection(*index_sets)
|
|
@@ -430,21 +492,3 @@ class Dataset(pyd.BaseModel):
|
|
|
430
492
|
# aligned_series = Dataset(aligned_series)
|
|
431
493
|
|
|
432
494
|
# return aligned_series
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
# def plot(self, stations: list[str] | None = None):
|
|
436
|
-
# """Plots the timeseries data.
|
|
437
|
-
|
|
438
|
-
# Args:
|
|
439
|
-
# ts (Timeseries): The timeseries to plot.
|
|
440
|
-
# """
|
|
441
|
-
# plt.figure(figsize=(10, 5))
|
|
442
|
-
|
|
443
|
-
# for ts in self.timeseries:
|
|
444
|
-
# plt.plot(ts.timeseries.index, ts.timeseries,
|
|
445
|
-
# label=f'{ts.measurement_type} at {ts.station}')
|
|
446
|
-
# plt.xlabel('Time')
|
|
447
|
-
# plt.ylabel('Value')
|
|
448
|
-
# plt.title('Timeseries data')
|
|
449
|
-
# plt.legend()
|
|
450
|
-
# plt.show()
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Fetching the data from various sources.
|
|
2
|
+
|
|
3
|
+
TODO: Fix up the read_from_sql() function to actually work properly.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Literal
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from sqlalchemy import select
|
|
11
|
+
|
|
12
|
+
from .db.connection import DatabaseConnection
|
|
13
|
+
from .dtypes import Dataset, Timeseries
|
|
14
|
+
from .exceptions import NoFilesToLoad
|
|
15
|
+
from .parse import parse_vanessen_csv
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def read_from_csv(
    path: Path, file_format: Literal["vanessen"] = "vanessen", **kwargs: Any
) -> Dataset:
    """Load the data from CSV file(s) and return them as a Dataset.

    Args:
        path (Path): The path to the file or directory containing the files.
        file_format (Literal["vanessen"]): Name of the parser to use.
        **kwargs (dict): Optional keyword arguments passed to `parse_vanessen_csv()` to specify the regex patterns for the serial number and station.
            serial_number_pattern (str): The regex pattern to extract the serial number from the file.
            location_pattern (str): The regex pattern to extract the station from the file.
            col_names (list): The column names for the dataframe.

    Returns:
        Dataset: All timeseries found in the file(s).

    Raises:
        TypeError: If `path` is not a Path object.
        NoFilesToLoad: If `path` is a directory that contains no files.
        KeyError: If `file_format` is not a known parser name.
    """
    if not isinstance(path, Path):
        message = "The path argument must be a Path object."
        raise TypeError(message)

    parsers = {
        "vanessen": parse_vanessen_csv,
    }

    if path.is_dir():
        # Sub-directories are not loadable, so a directory that holds nothing
        # but sub-directories counts as empty. Sorting makes the load order
        # independent of the file system.
        files = sorted(entry for entry in path.iterdir() if entry.is_file())
        if not files:
            raise NoFilesToLoad()
    else:
        files = [path]

    parser = parsers[file_format]
    ds = Dataset()
    for f in files:
        print(f"Loading file: {f}")
        ds.add(parser(f, **kwargs))

    return ds
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def read_from_sql(
    db: DatabaseConnection,
    load_all: bool,
    location: str | None = None,
    sensor: str | None = None,
    variable: str | None = None,
    unit: str | None = None,
) -> Timeseries | Dataset:
    """Return a timeseries or a whole dataset from a SQL database.

    Tables are expected to be named ``{location}_{sensor}_{variable}_{unit}``
    (lower-cased), which is how `Timeseries.to_sql()` names them.

    Parameters:
        db (DatabaseConnection): The database connection object.
        load_all (bool): Whether to load all timeseries from the database.
        location (str): The station name. Required unless `load_all` is True.
        sensor (str): The sensor name. Required unless `load_all` is True.
        variable (str): The measurement type. Required unless `load_all` is True.
        unit (str): The unit of the measurement. Required unless `load_all` is True.

    Returns:
        Timeseries | Dataset: A Dataset with every table when `load_all` is
            True, otherwise the single requested Timeseries.

    Raises:
        ValueError: If an identifier is missing while `load_all` is False, or
            if the requested table holds no data.
        KeyError: If the requested table does not exist in the database.
    """

    def _read_from_sql(
        location: str, sensor: str, variable: str, unit: str
    ) -> Timeseries:
        schema_name = f"{location}_{sensor}_{variable}_{unit}".lower()

        with db as con:
            schema = db.metadata.tables[schema_name]
            # Squeeze only the column axis: a bare squeeze() would turn a
            # one-row table into a scalar instead of a one-element Series.
            ts = pd.read_sql(
                select(schema),
                con=con,
                parse_dates={"timestamp": "%Y-%m-%dT%H:%M:%S%z"},
                index_col="timestamp",
            ).squeeze(axis="columns")

        if ts.empty:
            message = f"No data found in table {schema_name}"
            raise ValueError(message)

        # Variable and type validation are handled by pydantic model
        return Timeseries(
            ts=ts,
            variable=variable,  # type: ignore[arg-type]
            location=location,
            sensor=sensor,
            unit=unit,  # type: ignore[arg-type]
        )

    if not load_all:
        identifiers = {
            "location": location,
            "sensor": sensor,
            "variable": variable,
            "unit": unit,
        }
        missing = [name for name, value in identifiers.items() if value is None]
        if missing:
            message = (
                "When load_all is False the following arguments are required: "
                + ", ".join(missing)
            )
            raise ValueError(message)
        return _read_from_sql(location, sensor, variable, unit)  # type: ignore[arg-type]

    with db:
        # Copy the names inside the block; the metadata is cleared on exit.
        table_names = list(db.metadata.tables)

    timeseries = []
    for table_name in table_names:
        # Split from the right so that underscores in the location survive.
        parts = table_name.rsplit("_", 3)
        if len(parts) != 4:
            # Not a table following the gensor naming scheme; skip it.
            continue
        timeseries.append(_read_from_sql(*parts))

    return Dataset(timeseries=timeseries) if timeseries else Dataset()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def read_from_api() -> Dataset:
    """Fetch data from the API.

    Raises:
        NotImplementedError: Always; fetching from an API is not implemented yet.
    """
    # `NotImplemented` is the sentinel for binary-operator dispatch, not a
    # Dataset; raising makes the missing feature explicit to the caller.
    message = "read_from_api() is not implemented yet."
    raise NotImplementedError(message)
|
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
|
6
6
|
from typing import Any
|
|
7
7
|
|
|
8
8
|
import chardet
|
|
9
|
-
import
|
|
9
|
+
from dateutil import tz
|
|
10
10
|
from pandas import DataFrame, read_csv, to_datetime
|
|
11
11
|
|
|
12
12
|
from ..dtypes import VARIABLE_TYPES_AND_UNITS, Timeseries
|
|
@@ -28,25 +28,20 @@ def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
|
|
|
28
28
|
return result["encoding"] or "utf-8"
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
def handle_timestamps(df: DataFrame,
|
|
32
|
-
"""Converts
|
|
33
|
-
|
|
34
|
-
The timezone is obtained from the file metadata. If the timezone is UTC, the offset is extracted
|
|
35
|
-
and the timestamps are converted to the corresponding timezone. If the timezone is not UTC, the
|
|
36
|
-
timestamps are converted to UTC and then to the specified timezone.
|
|
31
|
+
def handle_timestamps(df: DataFrame, tz_string: str) -> DataFrame:
    """Localize the dataframe's index to the given timezone and convert it to UTC.

    The index of `df` is replaced in place; the same dataframe is returned.

    Args:
        df (pd.DataFrame): The dataframe with timestamps.
        tz_string (str): A timezone string like 'UTC+1' or 'UTC-5'.

    Returns:
        pd.DataFrame: The dataframe with timestamps converted to UTC.

    Raises:
        ValueError: If `tz_string` cannot be resolved to a timezone.
    """
    timezone = tz.gettz(tz_string)
    if timezone is None:
        # gettz() signals an unparseable string with None; localizing with
        # None would leave the index naive and fail later with an obscure error.
        message = f"Could not resolve timezone from {tz_string!r}."
        raise ValueError(message)

    df.index = to_datetime(df.index).tz_localize(timezone).tz_convert("UTC")

    return df
|
|
52
47
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "gensor"
|
|
3
|
-
version = "0.0.3"
|
|
3
|
+
version = "0.0.5"
|
|
4
4
|
description = "Library for handling groundwater sensor data."
|
|
5
5
|
authors = ["Mateusz Zawadzki <zawadzkimat@outlook.com>"]
|
|
6
6
|
repository = "https://github.com/zawadzkim/gensor"
|
|
@@ -22,7 +22,7 @@ numpy = "^2.1.0"
|
|
|
22
22
|
scikit-learn = "^1.5.1"
|
|
23
23
|
sqlalchemy = "^2.0.32"
|
|
24
24
|
pandas = "^2.2.2"
|
|
25
|
-
|
|
25
|
+
python-dateutil = "^2.9.0.post0"
|
|
26
26
|
|
|
27
27
|
[tool.poetry.group.dev.dependencies]
|
|
28
28
|
pytest = "^7.2.0"
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
"""Module defining database connection object.
|
|
2
|
-
|
|
3
|
-
Classes:
|
|
4
|
-
DatabaseConnection: Database connection object
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
|
|
9
|
-
import pydantic as pyd
|
|
10
|
-
from sqlalchemy import Engine, create_engine
|
|
11
|
-
from sqlalchemy.orm import Session, sessionmaker
|
|
12
|
-
|
|
13
|
-
from ..exceptions import DatabaseNotFound
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class DatabaseConnection(pyd.BaseModel):
|
|
17
|
-
"""Database connection object.
|
|
18
|
-
If no database exists at the specified path, it will be created.
|
|
19
|
-
If no database is specified, an in-memory database will be used.
|
|
20
|
-
|
|
21
|
-
The user should specify the database directory and name separately. If directory is not specified,
|
|
22
|
-
current directory and a default name are used. ."""
|
|
23
|
-
|
|
24
|
-
model_config = pyd.ConfigDict(
|
|
25
|
-
arbitrary_types_allowed=True, validate_assignment=True
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
in_memory: bool = False
|
|
29
|
-
db_directory: Path = Path.cwd()
|
|
30
|
-
db_name: str = "gensor.db"
|
|
31
|
-
engine: Engine | None = None
|
|
32
|
-
session: Session | None = None
|
|
33
|
-
|
|
34
|
-
def __post_init__(self) -> None:
|
|
35
|
-
self.connect()
|
|
36
|
-
|
|
37
|
-
def _verify_path(self) -> str:
|
|
38
|
-
if self.in_memory:
|
|
39
|
-
return "sqlite:///:memory:"
|
|
40
|
-
else:
|
|
41
|
-
if not self.db_directory.exists():
|
|
42
|
-
raise DatabaseNotFound()
|
|
43
|
-
else:
|
|
44
|
-
return f"sqlite:///{self.db_directory}/{self.db_name}"
|
|
45
|
-
|
|
46
|
-
def connect(self) -> Session:
|
|
47
|
-
sqlite_path = self._verify_path()
|
|
48
|
-
|
|
49
|
-
self.engine = create_engine(sqlite_path)
|
|
50
|
-
session = sessionmaker(bind=self.engine)
|
|
51
|
-
self.session = session()
|
|
52
|
-
|
|
53
|
-
return session()
|
gensor-0.0.3/gensor/getters.py
DELETED
|
@@ -1,95 +0,0 @@
|
|
|
1
|
-
"""Fetching the data from various sources.
|
|
2
|
-
|
|
3
|
-
TODO: Fix up the read_from_sql() function to actually work properly.
|
|
4
|
-
"""
|
|
5
|
-
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Any, Literal
|
|
8
|
-
|
|
9
|
-
from pandas import Series, read_sql
|
|
10
|
-
from sqlalchemy import MetaData, Table, select
|
|
11
|
-
|
|
12
|
-
from .db.connection import DatabaseConnection
|
|
13
|
-
from .dtypes import Dataset, Timeseries
|
|
14
|
-
from .exceptions import NoFilesToLoad
|
|
15
|
-
from .parse import parse_vanessen_csv
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def read_from_csv(
|
|
19
|
-
path: Path, file_format: Literal["vanessen"] = "vanessen", **kwargs: Any
|
|
20
|
-
) -> Dataset:
|
|
21
|
-
"""Loads the data from the Van Essen CSV file(s) and returns a list of Timeseries objects.
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
path (Path): The path to the file or directory containing the files.
|
|
25
|
-
**kwargs (dict): Optional keyword arguments passed to `parse_vanessen_csv()` to specify the regex patterns for the serial number and station.
|
|
26
|
-
serial_number_pattern (str): The regex pattern to extract the serial number from the file.
|
|
27
|
-
location_pattern (str): The regex pattern to extract the station from the file.
|
|
28
|
-
col_names (list): The column names for the dataframe.
|
|
29
|
-
"""
|
|
30
|
-
|
|
31
|
-
parsers = {
|
|
32
|
-
"vanessen": parse_vanessen_csv,
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
if not isinstance(path, Path):
|
|
36
|
-
message = "The path argument must be a Path object."
|
|
37
|
-
raise TypeError(message)
|
|
38
|
-
|
|
39
|
-
if path.is_dir() and not any(path.iterdir()):
|
|
40
|
-
raise NoFilesToLoad()
|
|
41
|
-
|
|
42
|
-
files = (
|
|
43
|
-
[file for file in path.iterdir() if file.is_file()] if path.is_dir() else [path]
|
|
44
|
-
)
|
|
45
|
-
|
|
46
|
-
parser = parsers[file_format]
|
|
47
|
-
ds = Dataset()
|
|
48
|
-
for f in files:
|
|
49
|
-
print(f"Loading file: {f}")
|
|
50
|
-
ts_in_file = parser(f, **kwargs)
|
|
51
|
-
ds.add(ts_in_file)
|
|
52
|
-
|
|
53
|
-
return ds
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
def read_from_sql(
|
|
57
|
-
db: DatabaseConnection, location: str, sensor: str, variable: str, unit: str
|
|
58
|
-
) -> list[Timeseries]:
|
|
59
|
-
"""Returns the timeseries from a sql database.
|
|
60
|
-
|
|
61
|
-
Parameters:
|
|
62
|
-
db (DatabaseConnection): The database connection object
|
|
63
|
-
location (str): The station name
|
|
64
|
-
sensor (str): Sensor name
|
|
65
|
-
variable (str): The measurement type
|
|
66
|
-
unit (str): Unit of the measurement
|
|
67
|
-
|
|
68
|
-
"""
|
|
69
|
-
metadata = MetaData()
|
|
70
|
-
schema = Table(f"{location}_{sensor}_{variable}", metadata)
|
|
71
|
-
|
|
72
|
-
query = select(schema)
|
|
73
|
-
if db.engine:
|
|
74
|
-
with db.engine.connect() as con:
|
|
75
|
-
df = read_sql(query, con=con, index_col="timestamp")
|
|
76
|
-
|
|
77
|
-
if not isinstance(df, Series):
|
|
78
|
-
raise TypeError
|
|
79
|
-
|
|
80
|
-
ts_object = Timeseries(
|
|
81
|
-
ts=df,
|
|
82
|
-
# Validation done in Pydantic
|
|
83
|
-
variable=variable,
|
|
84
|
-
location=location,
|
|
85
|
-
sensor=sensor,
|
|
86
|
-
# Validation done in Pydantic
|
|
87
|
-
unit=unit,
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
return ts_object
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def read_from_api() -> Dataset:
|
|
94
|
-
"""Fetch data from the API."""
|
|
95
|
-
return NotImplemented
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|