gensor 0.0.4__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: gensor
3
- Version: 0.0.4
3
+ Version: 0.0.5
4
4
  Summary: Library for handling groundwater sensor data.
5
5
  Home-page: https://github.com/zawadzkim/gensor
6
6
  Author: Mateusz Zawadzki
@@ -15,7 +15,7 @@ Requires-Dist: numpy (>=2.1.0,<3.0.0)
15
15
  Requires-Dist: pandas (>=2.2.2,<3.0.0)
16
16
  Requires-Dist: pandera (>=0.20.3,<0.21.0)
17
17
  Requires-Dist: pydantic (>=2.8.2,<3.0.0)
18
- Requires-Dist: pytz (>=2024.1,<2025.0)
18
+ Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
19
19
  Requires-Dist: scikit-learn (>=1.5.1,<2.0.0)
20
20
  Requires-Dist: scipy (>=1.14.1,<2.0.0)
21
21
  Requires-Dist: sqlalchemy (>=2.0.32,<3.0.0)
@@ -1,14 +1,18 @@
1
1
  from .compensation import Compensator, compensate
2
2
  from .dtypes import Dataset, Timeseries
3
- from .getters import read_from_csv
3
+ from .getters import read_from_csv, read_from_sql
4
4
  from .preprocessing import OutlierDetection, Transform
5
5
 
6
6
  __all__ = [
7
+ # basic data types
7
8
  "Dataset",
8
9
  "Timeseries",
9
- "read_from_csv",
10
+ # data transformation
10
11
  "OutlierDetection",
11
12
  "Transform",
12
13
  "Compensator",
13
14
  "compensate",
15
+ # getters
16
+ "read_from_csv",
17
+ "read_from_sql",
14
18
  ]
@@ -0,0 +1,106 @@
1
+ """Module defining database connection object.
2
+
3
+ Classes:
4
+ DatabaseConnection: Database connection object
5
+ """
6
+
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ import pydantic as pyd
11
+ from sqlalchemy import (
12
+ Column,
13
+ Connection,
14
+ Engine,
15
+ Float,
16
+ MetaData,
17
+ String,
18
+ Table,
19
+ create_engine,
20
+ )
21
+
22
+ from ..exceptions import DatabaseNotFound
23
+
24
+
25
class DatabaseConnection(pyd.BaseModel):
    """SQLite database connection object.

    The database is addressed as `db_directory/db_name`. The directory must
    exist; if no database exists at the specified path, it will be created.
    With the defaults, `gensor.db` in the current working directory is used.

    The object can be used in a `with` block: entering it opens a connection
    and reflects the existing tables into `metadata`; leaving it closes that
    connection, clears `metadata` and disposes of the engine.
    """

    model_config = pyd.ConfigDict(
        arbitrary_types_allowed=True, validate_assignment=True
    )

    # Factories give every instance its own MetaData and resolve the working
    # directory when the object is created instead of once at import time.
    metadata: MetaData = pyd.Field(default_factory=MetaData)
    db_directory: Path = pyd.Field(default_factory=Path.cwd)
    db_name: str = "gensor.db"
    engine: Engine | None = None

    # Connections handed out by `__enter__` (most recent last) so that
    # `__exit__` can close the one it opened.
    _connections: list[Connection] = pyd.PrivateAttr(default_factory=list)

    def _verify_path(self) -> str:
        """Return the SQLAlchemy URL of the database file.

        Raises:
            DatabaseNotFound: If `db_directory` does not exist.
        """
        if not self.db_directory.exists():
            raise DatabaseNotFound()
        return f"sqlite:///{self.db_directory}/{self.db_name}"

    def connect(self) -> Connection:
        """Open a connection, creating the engine on first use.

        Returns:
            Connection: A new connection obtained from the engine.

        Raises:
            DatabaseNotFound: If the engine has to be created and
                `db_directory` does not exist.
        """
        if self.engine is None:
            self.engine = create_engine(self._verify_path())
        return self.engine.connect()

    def dispose(self) -> None:
        """Clear the reflected metadata and dispose of the engine."""
        self.metadata.clear()
        if self.engine is not None:
            self.engine.dispose()

    def __enter__(self) -> Connection:
        """Open a connection, reflect the database tables and return the connection."""
        con = self.connect()
        try:
            if self.engine is not None:
                self.metadata.reflect(bind=self.engine)
        except Exception:
            # `__exit__` is not called when `__enter__` fails, so release the
            # connection here before propagating the error.
            con.close()
            raise
        self._connections.append(con)
        return con

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Close the connection opened by `__enter__` and dispose of the engine."""
        # Disposing the engine does not close checked-out connections, so the
        # connection handed out by `__enter__` is closed explicitly first.
        if self._connections:
            self._connections.pop().close()
        self.dispose()

    def get_tables(self) -> list | None:
        """List the names of the tables present in the database.

        Returns:
            list | None: The table names, or None (after printing a notice)
                if the database has no tables.
        """
        with self:
            # Snapshot the names inside the block: leaving it clears the
            # metadata, which would empty a live reference to the mapping.
            tables = list(self.metadata.tables)

        if not tables:
            print("This database has no tables.")
            return None
        return tables

    def create_table(self, schema_name: str, column_name: str) -> Table | str:
        """Create a table in the database.

        Schema name is a string representing the location, sensor, variable
        measured and unit of measurement. This is a way of preserving the
        metadata of the Timeseries. The primary key is always `timestamp` and
        the value column name is dynamically created from the measured
        variable.

        Args:
            schema_name (str): Name of the table.
            column_name (str): Name of the float column holding the values.

        Returns:
            Table | str: The existing or newly created table, or the string
                "Engine does not exist." if no engine has been initialized.
        """
        if schema_name in self.metadata.tables:
            return self.metadata.tables[schema_name]

        # Check the engine before building the Table so that a table that
        # cannot be created is not left registered in the metadata.
        if self.engine is None:
            return "Engine does not exist."

        ts_table = Table(
            schema_name,
            self.metadata,
            Column("timestamp", String, primary_key=True),
            Column(column_name, Float),
        )
        ts_table.create(self.engine, checkfirst=True)
        self.metadata.reflect(bind=self.engine)
        return ts_table
@@ -1,12 +1,24 @@
1
+ """
2
+ !!! warning
3
+
4
+ Whenever Timeseries objects are created via read_from_csv and use a parser (e.g.,
5
+ 'vanessen'), the timestamps are localized and converted to UTC. Therefore, if the
6
+ user creates their own timeseries outside of read_from_csv, they should ensure that
7
+ the timestamps are in UTC format.
8
+ """
9
+
1
10
  from __future__ import annotations
2
11
 
12
+ from collections import defaultdict
3
13
  from collections.abc import Callable
4
- from typing import Any, Literal
14
+ from typing import Any, Literal, Self
5
15
 
6
16
  import pandas as pd
7
17
  import pandera as pa
8
18
  import pydantic as pyd
9
19
  from matplotlib import pyplot as plt
20
+ from sqlalchemy import Table
21
+ from sqlalchemy.dialects.sqlite import insert as sqlite_insert
10
22
 
11
23
  from .db import DatabaseConnection
12
24
  from .exceptions import IndexOutOfRangeError, TimeseriesNotFound, TimeseriesUnequal
@@ -14,14 +26,14 @@ from .preprocessing import OutlierDetection, Transform
14
26
 
15
27
  ts_schema = pa.SeriesSchema(
16
28
  float,
17
- index=pa.Index(pa.DateTime, coerce=True),
29
+ index=pa.Index(pd.DatetimeTZDtype(tz="UTC"), coerce=False),
18
30
  coerce=True,
19
31
  )
20
32
 
21
33
  VARIABLE_TYPES_AND_UNITS = {
22
- "temperature": ["degC"],
23
- "pressure": ["cmH2O", "mmH2O"],
24
- "conductivity": ["mS/cm"],
34
+ "temperature": ["degc"],
35
+ "pressure": ["cmh2o", "mmh2o"],
36
+ "conductivity": ["ms/cm"],
25
37
  "flux": ["m/s"],
26
38
  "head": ["m asl"],
27
39
  "depth": ["m"],
@@ -63,7 +75,7 @@ class Timeseries(pyd.BaseModel):
63
75
  variable: Literal[
64
76
  "temperature", "pressure", "conductivity", "flux", "head", "depth"
65
77
  ]
66
- unit: Literal["degC", "cmH2O", "mS/cm", "m/s", "m asl", "m"]
78
+ unit: Literal["degc", "cmh2o", "ms/cm", "m/s", "m asl", "m"]
67
79
  location: str | None = None
68
80
  sensor: str | None = None
69
81
  sensor_alt: float | None = None
@@ -213,25 +225,41 @@ class Timeseries(pyd.BaseModel):
213
225
  def to_sql(self, db: DatabaseConnection) -> str:
214
226
  """Converts the timeseries to a list of dictionaries and uploads it to the database.
215
227
 
216
- Normally the upload of the data with SQLAlchemy ORM would require creation of LoggerRecords instances,
217
- but since the on_conflict_do_nothing clause is is used to avoid inserting duplicate rows, the
218
- data has to be uploaded as a list of dictionaries.
228
+ The Timeseries data is uploaded to the SQL database by using the pandas
229
+ `to_sql` method.
219
230
 
220
231
  Args:
221
- db (DatabaseConnection): The database connection object (see gwlogger.db.connection).
232
+ db (DatabaseConnection): The database connection object.
222
233
 
223
234
  Returns:
224
235
  str: A message indicating the number of rows inserted into the database.
225
236
  """
226
- schema_name = f"{self.location}_{self.sensor}_{self.variable}_{self.unit}"
227
- if db.engine is not None:
228
- with db.engine.connect() as con:
229
- self.ts.to_sql(
230
- name=schema_name, con=con, if_exists="append", index=False
231
- )
237
+ schema_name = (
238
+ f"{self.location}_{self.sensor}_{self.variable}_{self.unit}".lower()
239
+ )
240
+
241
+ if isinstance(self.ts.index, pd.DatetimeIndex):
242
+ utc_index = (
243
+ self.ts.index.tz_convert("UTC")
244
+ if self.ts.index.tz is not None # tzinfo becomes tz for DatetimeIndex
245
+ else self.ts.index
246
+ )
232
247
  else:
233
- message = "Database engine is not initialized."
234
- raise ValueError(message)
248
+ message = "The index is not a DatetimeIndex and cannot be converted to UTC."
249
+ raise TypeError(message)
250
+
251
+ series_as_records = list(
252
+ zip(utc_index.strftime("%Y-%m-%dT%H:%M:%S%z"), self.ts, strict=False)
253
+ )
254
+
255
+ with db as con:
256
+ schema = db.create_table(schema_name, self.variable)
257
+ if isinstance(schema, Table):
258
+ stmt = sqlite_insert(schema).values(series_as_records)
259
+ stmt = stmt.on_conflict_do_nothing(index_elements=["timestamp"])
260
+
261
+ con.execute(stmt)
262
+ con.commit()
235
263
 
236
264
  return f"{schema_name} table updated."
237
265
 
@@ -258,7 +286,7 @@ class Timeseries(pyd.BaseModel):
258
286
  ax.plot(
259
287
  self.ts.index,
260
288
  self.ts,
261
- label=f"{self.variable} ({self.unit})",
289
+ label=f"{self.location} ({self.sensor})",
262
290
  **plot_kwargs,
263
291
  )
264
292
 
@@ -329,7 +357,7 @@ class Dataset(pyd.BaseModel):
329
357
  """List all unique locations in the dataset."""
330
358
  return [ts.location for ts in self.timeseries if ts is not None]
331
359
 
332
- def add(self, other: Timeseries | list[Timeseries]) -> None:
360
+ def add(self, other: Timeseries | list[Timeseries] | Self) -> None:
333
361
  """Appends a new series to the Dataset or merges series if an equal
334
362
  one exists.
335
363
 
@@ -342,8 +370,13 @@ class Dataset(pyd.BaseModel):
342
370
  """
343
371
  if isinstance(other, list):
344
372
  for ts in other:
345
- self._add_single_timeseries(ts)
346
- else:
373
+ if isinstance(ts, Timeseries):
374
+ self._add_single_timeseries(ts)
375
+ elif isinstance(other, Dataset):
376
+ for ts in other.timeseries: # type: ignore[assignment]
377
+ if isinstance(ts, Timeseries):
378
+ self._add_single_timeseries(ts)
379
+ elif isinstance(other, Timeseries):
347
380
  self._add_single_timeseries(other)
348
381
 
349
382
  return
@@ -395,9 +428,38 @@ class Dataset(pyd.BaseModel):
395
428
 
396
429
  return self.model_copy(update={"timeseries": matching_timeseries})
397
430
 
431
+ def to_sql(self, db: DatabaseConnection) -> None:
432
+ for ts in self.timeseries:
433
+ if ts:
434
+ ts.to_sql(db)
435
+ return
436
+
437
+ def plot(self, include_outliers: bool = False) -> None:
438
+ """Plots the timeseries data, grouping by variable type.
439
+
440
+ Args:
441
+ include_outliers (bool): Whether to include outliers in the plot.
442
+ """
443
+ # Group timeseries by variable
444
+ grouped_ts = defaultdict(list)
445
+ for ts in self.timeseries:
446
+ if ts:
447
+ grouped_ts[ts.variable].append(ts)
448
+
449
+ # Create a plot for each group of timeseries with the same variable
450
+ for variable, ts_list in grouped_ts.items():
451
+ fig, ax = plt.subplots(figsize=(10, 5))
452
+ for ts in ts_list:
453
+ ts.plot(include_outliers=include_outliers, ax=ax)
454
+
455
+ ax.set_title(f"Timeseries for {variable.capitalize()}")
456
+ plt.show()
457
+
458
+ return
459
+
398
460
  # def align(self,
399
- # freq: str = 'h',
400
- # inplace: bool = True):
461
+ # freq: str = 'h',
462
+ # inplace: bool = True):
401
463
  # """Aligns the timeseries to a common time axis.
402
464
 
403
465
  # Args:
@@ -406,7 +468,7 @@ class Dataset(pyd.BaseModel):
406
468
  # """
407
469
 
408
470
  # index_sets = [set(serie._resample(freq).index)
409
- # for serie in self.timeseries]
471
+ # for serie in self.timeseries]
410
472
 
411
473
  # # Find the intersection of all index sets to get the common dates
412
474
  # common_dates = set.intersection(*index_sets)
@@ -430,21 +492,3 @@ class Dataset(pyd.BaseModel):
430
492
  # aligned_series = Dataset(aligned_series)
431
493
 
432
494
  # return aligned_series
433
-
434
-
435
- # def plot(self, stations: list[str] | None = None):
436
- # """Plots the timeseries data.
437
-
438
- # Args:
439
- # ts (Timeseries): The timeseries to plot.
440
- # """
441
- # plt.figure(figsize=(10, 5))
442
-
443
- # for ts in self.timeseries:
444
- # plt.plot(ts.timeseries.index, ts.timeseries,
445
- # label=f'{ts.measurement_type} at {ts.station}')
446
- # plt.xlabel('Time')
447
- # plt.ylabel('Value')
448
- # plt.title('Timeseries data')
449
- # plt.legend()
450
- # plt.show()
@@ -0,0 +1,131 @@
1
+ """Fetching the data from various sources.
2
+
3
+ TODO: Fix up the read_from_sql() function to actually work properly.
4
+ """
5
+
6
+ from pathlib import Path
7
+ from typing import Any, Literal
8
+
9
+ import pandas as pd
10
+ from sqlalchemy import select
11
+
12
+ from .db.connection import DatabaseConnection
13
+ from .dtypes import Dataset, Timeseries
14
+ from .exceptions import NoFilesToLoad
15
+ from .parse import parse_vanessen_csv
16
+
17
+
18
def read_from_csv(
    path: Path, file_format: Literal["vanessen"] = "vanessen", **kwargs: Any
) -> Dataset:
    """Loads the data from the CSV file(s) and returns them as a Dataset.

    Args:
        path (Path): The path to the file or directory containing the files.
        file_format (str): Name of the parser to use. Only "vanessen" is available.
        **kwargs (dict): Optional keyword arguments passed to `parse_vanessen_csv()` to specify the regex patterns for the serial number and station.
            serial_number_pattern (str): The regex pattern to extract the serial number from the file.
            location_pattern (str): The regex pattern to extract the station from the file.
            col_names (list): The column names for the dataframe.

    Returns:
        Dataset: The dataset with everything the parser returned for each file.

    Raises:
        TypeError: If `path` is not a Path object.
        NoFilesToLoad: If `path` is a directory that contains no files.
    """

    parsers = {
        "vanessen": parse_vanessen_csv,
    }

    if not isinstance(path, Path):
        message = "The path argument must be a Path object."
        raise TypeError(message)

    if path.is_dir():
        # Sorted so the files are loaded in a reproducible order; iterdir()
        # yields entries in arbitrary order.
        files = sorted(file for file in path.iterdir() if file.is_file())
        # Also covers a directory that holds only sub-directories.
        if not files:
            raise NoFilesToLoad()
    else:
        files = [path]

    parser = parsers[file_format]
    ds = Dataset()
    for f in files:
        print(f"Loading file: {f}")
        ds.add(parser(f, **kwargs))

    return ds
54
+
55
+
56
def read_from_sql(
    db: DatabaseConnection,
    load_all: bool,
    location: str | None = None,
    sensor: str | None = None,
    variable: str | None = None,
    unit: str | None = None,
) -> Timeseries | Dataset:
    """Returns the timeseries or a dataset from a SQL database.

    Tables are named `{location}_{sensor}_{variable}_{unit}` (lower-cased).
    With `load_all=True` every table of the database is loaded and the four
    identifiers are recovered from the table names; otherwise the single
    table described by the four arguments is loaded.

    Parameters:
        db (DatabaseConnection): The database connection object.
        load_all (bool): Whether to load all timeseries from the database.
        location (str): The station name.
        sensor (str): The sensor name.
        variable (str): The measurement type.
        unit (str): The unit of the measurement.

    Returns:
        Timeseries | Dataset: A Dataset if `load_all` is True (empty when the
            database has no tables), otherwise a single Timeseries.

    Raises:
        ValueError: If a table holds no data, if a table name cannot be split
            into the four identifiers, or if `load_all` is False and any of
            the identifiers is missing.
        KeyError: If the requested table does not exist in the database.
    """

    def _read_from_sql(
        location: str, sensor: str, variable: str, unit: str
    ) -> Timeseries:
        schema_name = f"{location}_{sensor}_{variable}_{unit}".lower()

        with db as con:
            schema = db.metadata.tables[schema_name]
            query = select(schema)
            # Squeeze along the columns only, so that a table holding a
            # single row still yields a Series rather than a scalar.
            ts = pd.read_sql(
                query,
                con=con,
                parse_dates={"timestamp": "%Y-%m-%dT%H:%M:%S%z"},
                index_col="timestamp",
            ).squeeze(axis="columns")

        if ts.empty:
            message = f"No data found in table {schema_name}"
            raise ValueError(message)

        # Variable and type validation are handled by pydantic model
        return Timeseries(
            ts=ts,
            variable=variable,  # type: ignore[arg-type]
            location=location,
            sensor=sensor,
            unit=unit,  # type: ignore[arg-type]
        )

    # fmt: off
    if load_all:
        schemas = db.get_tables()
        if not schemas:
            return Dataset()

        timeseries = []
        for ts_name in schemas:
            # Split from the right so that underscores inside the location
            # do not break the recovery of sensor, variable and unit.
            parts = ts_name.rsplit("_", 3)
            if len(parts) != 4:
                message = (
                    f"Table name {ts_name} does not follow the "
                    "location_sensor_variable_unit pattern."
                )
                raise ValueError(message)
            timeseries.append(_read_from_sql(*parts))

        return Dataset(timeseries=timeseries)

    identifiers = {
        "location": location,
        "sensor": sensor,
        "variable": variable,
        "unit": unit,
    }
    missing = [name for name, value in identifiers.items() if value is None]
    if missing:
        message = (
            "The following arguments are required when load_all is False: "
            + ", ".join(missing)
        )
        raise ValueError(message)

    return _read_from_sql(location, sensor, variable, unit)  # type: ignore[arg-type]
124
+
125
+
126
+ # fmt: on
127
+
128
+
129
def read_from_api() -> Dataset:
    """Fetch data from the API.

    Placeholder: fetching from an API is not implemented yet.

    Raises:
        NotImplementedError: Always. The `NotImplemented` sentinel is meant
            for binary operator methods and is not a Dataset, so raising is
            used instead of returning it.
    """
    message = "Reading data from an API is not implemented yet."
    raise NotImplementedError(message)
@@ -6,7 +6,7 @@ from pathlib import Path
6
6
  from typing import Any
7
7
 
8
8
  import chardet
9
- import pytz
9
+ from dateutil import tz
10
10
  from pandas import DataFrame, read_csv, to_datetime
11
11
 
12
12
  from ..dtypes import VARIABLE_TYPES_AND_UNITS, Timeseries
@@ -28,25 +28,20 @@ def detect_encoding(path: Path, num_bytes: int = 1024) -> str:
28
28
  return result["encoding"] or "utf-8"
29
29
 
30
30
 
31
def handle_timestamps(df: DataFrame, tz_string: str) -> DataFrame:
    """Localizes the timestamps in the dataframe index and converts them to UTC.

    The index is parsed to datetimes, localized to the timezone described by
    `tz_string` and then converted to UTC. The index of `df` is replaced in
    place and the same dataframe is returned.

    Args:
        df (pd.DataFrame): The dataframe with timestamps.
        tz_string (str): A timezone string like 'UTC+1' or 'UTC-5'.

    Returns:
        pd.DataFrame: The dataframe with timestamps converted to UTC.

    Raises:
        ValueError: If `tz_string` cannot be resolved to a timezone.
    """
    timezone = tz.gettz(tz_string)
    # gettz() returns None for strings it cannot resolve; localizing with
    # None would leave the index naive and fail later in tz_convert().
    # NOTE(review): an empty string appears to resolve to the machine's local
    # timezone in dateutil - confirm whether that should be rejected as well.
    if timezone is None:
        message = f"Could not resolve timezone: {tz_string!r}"
        raise ValueError(message)

    df.index = to_datetime(df.index).tz_localize(timezone).tz_convert("UTC")

    return df
52
47
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "gensor"
3
- version = "0.0.4"
3
+ version = "0.0.5"
4
4
  description = "Library for handling groundwater sensor data."
5
5
  authors = ["Mateusz Zawadzki <zawadzkimat@outlook.com>"]
6
6
  repository = "https://github.com/zawadzkim/gensor"
@@ -22,7 +22,7 @@ numpy = "^2.1.0"
22
22
  scikit-learn = "^1.5.1"
23
23
  sqlalchemy = "^2.0.32"
24
24
  pandas = "^2.2.2"
25
- pytz = "^2024.1"
25
+ python-dateutil = "^2.9.0.post0"
26
26
 
27
27
  [tool.poetry.group.dev.dependencies]
28
28
  pytest = "^7.2.0"
@@ -1,53 +0,0 @@
1
- """Module defining database connection object.
2
-
3
- Classes:
4
- DatabaseConnection: Database connection object
5
- """
6
-
7
- from pathlib import Path
8
-
9
- import pydantic as pyd
10
- from sqlalchemy import Engine, create_engine
11
- from sqlalchemy.orm import Session, sessionmaker
12
-
13
- from ..exceptions import DatabaseNotFound
14
-
15
-
16
- class DatabaseConnection(pyd.BaseModel):
17
- """Database connection object.
18
- If no database exists at the specified path, it will be created.
19
- If no database is specified, an in-memory database will be used.
20
-
21
- The user should specify the database directory and name separately. If directory is not specified,
22
- current directory and a default name are used. ."""
23
-
24
- model_config = pyd.ConfigDict(
25
- arbitrary_types_allowed=True, validate_assignment=True
26
- )
27
-
28
- in_memory: bool = False
29
- db_directory: Path = Path.cwd()
30
- db_name: str = "gensor.db"
31
- engine: Engine | None = None
32
- session: Session | None = None
33
-
34
- def __post_init__(self) -> None:
35
- self.connect()
36
-
37
- def _verify_path(self) -> str:
38
- if self.in_memory:
39
- return "sqlite:///:memory:"
40
- else:
41
- if not self.db_directory.exists():
42
- raise DatabaseNotFound()
43
- else:
44
- return f"sqlite:///{self.db_directory}/{self.db_name}"
45
-
46
- def connect(self) -> Session:
47
- sqlite_path = self._verify_path()
48
-
49
- self.engine = create_engine(sqlite_path)
50
- session = sessionmaker(bind=self.engine)
51
- self.session = session()
52
-
53
- return session()
@@ -1,95 +0,0 @@
1
- """Fetching the data from various sources.
2
-
3
- TODO: Fix up the read_from_sql() function to actually work properly.
4
- """
5
-
6
- from pathlib import Path
7
- from typing import Any, Literal
8
-
9
- from pandas import Series, read_sql
10
- from sqlalchemy import MetaData, Table, select
11
-
12
- from .db.connection import DatabaseConnection
13
- from .dtypes import Dataset, Timeseries
14
- from .exceptions import NoFilesToLoad
15
- from .parse import parse_vanessen_csv
16
-
17
-
18
- def read_from_csv(
19
- path: Path, file_format: Literal["vanessen"] = "vanessen", **kwargs: Any
20
- ) -> Dataset:
21
- """Loads the data from the Van Essen CSV file(s) and returns a list of Timeseries objects.
22
-
23
- Args:
24
- path (Path): The path to the file or directory containing the files.
25
- **kwargs (dict): Optional keyword arguments passed to `parse_vanessen_csv()` to specify the regex patterns for the serial number and station.
26
- serial_number_pattern (str): The regex pattern to extract the serial number from the file.
27
- location_pattern (str): The regex pattern to extract the station from the file.
28
- col_names (list): The column names for the dataframe.
29
- """
30
-
31
- parsers = {
32
- "vanessen": parse_vanessen_csv,
33
- }
34
-
35
- if not isinstance(path, Path):
36
- message = "The path argument must be a Path object."
37
- raise TypeError(message)
38
-
39
- if path.is_dir() and not any(path.iterdir()):
40
- raise NoFilesToLoad()
41
-
42
- files = (
43
- [file for file in path.iterdir() if file.is_file()] if path.is_dir() else [path]
44
- )
45
-
46
- parser = parsers[file_format]
47
- ds = Dataset()
48
- for f in files:
49
- print(f"Loading file: {f}")
50
- ts_in_file = parser(f, **kwargs)
51
- ds.add(ts_in_file)
52
-
53
- return ds
54
-
55
-
56
- def read_from_sql(
57
- db: DatabaseConnection, location: str, sensor: str, variable: str, unit: str
58
- ) -> list[Timeseries]:
59
- """Returns the timeseries from a sql database.
60
-
61
- Parameters:
62
- db (DatabaseConnection): The database connection object
63
- location (str): The station name
64
- sensor (str): Sensor name
65
- variable (str): The measurement type
66
- unit (str): Unit of the measurement
67
-
68
- """
69
- metadata = MetaData()
70
- schema = Table(f"{location}_{sensor}_{variable}", metadata)
71
-
72
- query = select(schema)
73
- if db.engine:
74
- with db.engine.connect() as con:
75
- df = read_sql(query, con=con, index_col="timestamp")
76
-
77
- if not isinstance(df, Series):
78
- raise TypeError
79
-
80
- ts_object = Timeseries(
81
- ts=df,
82
- # Validation done in Pydantic
83
- variable=variable,
84
- location=location,
85
- sensor=sensor,
86
- # Validation done in Pydantic
87
- unit=unit,
88
- )
89
-
90
- return ts_object
91
-
92
-
93
- def read_from_api() -> Dataset:
94
- """Fetch data from the API."""
95
- return NotImplemented
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes