hydroserverpy 0.3.0-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of hydroserverpy has been flagged as possibly problematic.
- hydroserverpy/__init__.py +1 -1
- hydroserverpy/core/endpoints/base.py +44 -31
- hydroserverpy/core/endpoints/data_loaders.py +6 -5
- hydroserverpy/core/endpoints/data_sources.py +6 -5
- hydroserverpy/core/endpoints/datastreams.py +89 -52
- hydroserverpy/core/endpoints/observed_properties.py +36 -18
- hydroserverpy/core/endpoints/processing_levels.py +36 -18
- hydroserverpy/core/endpoints/result_qualifiers.py +37 -19
- hydroserverpy/core/endpoints/sensors.py +37 -19
- hydroserverpy/core/endpoints/things.py +58 -37
- hydroserverpy/core/endpoints/units.py +37 -19
- hydroserverpy/core/schemas/base.py +13 -6
- hydroserverpy/core/schemas/data_loaders.py +6 -4
- hydroserverpy/core/schemas/data_sources.py +73 -56
- hydroserverpy/core/schemas/datastreams.py +101 -70
- hydroserverpy/core/schemas/observed_properties.py +18 -10
- hydroserverpy/core/schemas/processing_levels.py +10 -6
- hydroserverpy/core/schemas/result_qualifiers.py +7 -4
- hydroserverpy/core/schemas/sensors.py +33 -18
- hydroserverpy/core/schemas/things.py +97 -60
- hydroserverpy/core/schemas/units.py +7 -8
- hydroserverpy/core/service.py +31 -17
- hydroserverpy/etl/__init__.py +21 -0
- hydroserverpy/etl/extractors/__init__.py +0 -0
- hydroserverpy/etl/extractors/base.py +13 -0
- hydroserverpy/etl/extractors/ftp_extractor.py +50 -0
- hydroserverpy/etl/extractors/http_extractor.py +84 -0
- hydroserverpy/etl/extractors/local_file_extractor.py +25 -0
- hydroserverpy/etl/hydroserver_etl.py +40 -0
- hydroserverpy/etl/loaders/__init__.py +0 -0
- hydroserverpy/etl/loaders/base.py +13 -0
- hydroserverpy/etl/loaders/hydroserver_loader.py +68 -0
- hydroserverpy/etl/transformers/__init__.py +0 -0
- hydroserverpy/etl/transformers/base.py +52 -0
- hydroserverpy/etl/transformers/csv_transformer.py +88 -0
- hydroserverpy/etl/transformers/json_transformer.py +62 -0
- hydroserverpy/etl/types.py +7 -0
- hydroserverpy/etl_csv/__init__.py +0 -0
- hydroserverpy/{etl/service.py → etl_csv/hydroserver_etl_csv.py} +92 -54
- hydroserverpy/quality/service.py +84 -70
- hydroserverpy-0.4.0.dist-info/METADATA +18 -0
- hydroserverpy-0.4.0.dist-info/RECORD +51 -0
- {hydroserverpy-0.3.0.dist-info → hydroserverpy-0.4.0.dist-info}/WHEEL +1 -1
- hydroserverpy-0.3.0.dist-info/METADATA +0 -18
- hydroserverpy-0.3.0.dist-info/RECORD +0 -36
- /hydroserverpy/{etl → etl_csv}/exceptions.py +0 -0
- {hydroserverpy-0.3.0.dist-info → hydroserverpy-0.4.0.dist-info}/LICENSE +0 -0
- {hydroserverpy-0.3.0.dist-info → hydroserverpy-0.4.0.dist-info}/top_level.txt +0 -0
- {hydroserverpy-0.3.0.dist-info → hydroserverpy-0.4.0.dist-info}/zip-safe +0 -0
hydroserverpy/etl/hydroserver_etl.py

@@ -0,0 +1,40 @@
+import logging
+import pandas as pd
+
+
+class HydroServerETL:
+    def __init__(self, extractor, transformer, loader, source_target_map):
+        self.extractor = extractor
+        self.transformer = transformer
+        self.loader = loader
+        self.source_target_map = source_target_map
+
+    def run(self):
+        """
+        Extracts, transforms, and loads data as defined by the class parameters.
+        """
+
+        # Step 1: Get Target System data requirements from the Loader & prepare parameters for the Extractor
+        data_requirements = self.loader.get_data_requirements(self.source_target_map)
+        self.extractor.prepare_params(data_requirements)
+
+        # Step 2: Extract
+        data = self.extractor.extract()
+        if data is None or (isinstance(data, pd.DataFrame) and data.empty):
+            logging.warning(f"No data was returned from the extractor. Ending ETL run.")
+            return
+        else:
+            logging.info(f"Successfully extracted data.")
+
+        # Step 3: Transform
+        if self.transformer:
+            data = self.transformer.transform(data)
+            if data is None or (isinstance(data, pd.DataFrame) and data.empty):
+                logging.warning(f"No data returned from the transformer. Ending run.")
+                return
+            else:
+                logging.info(f"Successfully transformed data. {data}")
+
+        # Step 4: Load
+        self.loader.load(data, self.source_target_map)
+        logging.info("Successfully loaded data.")
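For orientation, the new orchestrator only calls four methods on its collaborators: loader.get_data_requirements(), extractor.prepare_params(), extractor.extract(), and loader.load(). Below is a minimal sketch of wiring it together using a hypothetical in-memory extractor; the bundled HTTP/FTP/local-file extractors are added in this release but not shown in this section, and the import paths, host, and IDs here are assumptions, not taken from the diff.

import io

# Import paths follow the new module layout shown in the file list above (assumed, not verified).
from hydroserverpy.etl.hydroserver_etl import HydroServerETL
from hydroserverpy.etl.loaders.hydroserver_loader import HydroServerLoader
from hydroserverpy.etl.transformers.csv_transformer import CSVTransformer


class InMemoryExtractor:
    """Hypothetical extractor: serves a fixed CSV payload instead of fetching one."""

    def prepare_params(self, data_requirements):
        # A real extractor could turn the requested time ranges into query parameters.
        self.data_requirements = data_requirements

    def extract(self):
        return io.StringIO("timestamp,temp\n2024-01-01T00:00:00Z,20.1\n")


# Maps source column names to target datastream IDs (example values).
source_target_map = {"temp": "11111111-2222-3333-4444-555555555555"}

etl = HydroServerETL(
    extractor=InMemoryExtractor(),
    transformer=CSVTransformer(
        header_row=1,
        data_start_row=2,
        timestamp_column="timestamp",
        datastream_ids=source_target_map,
    ),
    loader=HydroServerLoader(host="https://example.com", apikey="..."),
    source_target_map=source_target_map,
)
etl.run()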
hydroserverpy/etl/loaders/base.py

@@ -0,0 +1,13 @@
+from abc import ABC, abstractmethod
+from typing import Dict
+import pandas as pd
+
+
+class Loader(ABC):
+    @abstractmethod
+    def load(self, *args, **kwargs) -> None:
+        pass
+
+    @abstractmethod
+    def get_data_requirements(self, df: pd.DataFrame) -> Dict[str, pd.Timestamp]:
+        pass
hydroserverpy/etl/loaders/hydroserver_loader.py

@@ -0,0 +1,68 @@
+from hydroserverpy.core.service import HydroServer
+from typing import Dict, Optional
+from .base import Loader
+import logging
+import pandas as pd
+
+
+class HydroServerLoader(HydroServer, Loader):
+    """
+    A class that extends the HydroServer client with ETL-specific functionalities.
+    """
+
+    def __init__(
+        self,
+        host: str,
+        username: Optional[str] = None,
+        password: Optional[str] = None,
+        apikey: Optional[str] = None,
+        api_route: str = "api",
+    ):
+        super().__init__(host, username, password, apikey, api_route)
+
+    def load(self, data: pd.DataFrame, source_target_map) -> None:
+        """
+        Load observations from a DataFrame to the HydroServer.
+
+        :param data: A Pandas DataFrame where each column corresponds to a datastream.
+        """
+        data_requirements = self.get_data_requirements(source_target_map)
+        for ds_id in data.columns:
+            if ds_id == "timestamp":
+                continue
+
+            df = data[["timestamp", ds_id]].copy()
+            df.rename(columns={ds_id: "value"}, inplace=True)
+            df.dropna(subset=["value"], inplace=True)
+
+            phenomenon_end_time = data_requirements[ds_id]["start_time"]
+            if phenomenon_end_time:
+                df = df[df["timestamp"] > phenomenon_end_time]
+            if df.empty:
+                logging.warning(
+                    f"No new data to upload for datastream {ds_id}. Skipping."
+                )
+                continue
+            self.datastreams.load_observations(uid=ds_id, observations=df)
+
+    def get_data_requirements(
+        self, source_target_map
+    ) -> Dict[str, Dict[str, pd.Timestamp]]:
+        """
+        Each target system needs to be able to answer the question: 'What data do you need?'
+        and return a time range for each target time series. Usually the answer will be
+        'anything newer than my most recent observation'.
+        """
+        data_requirements = {}
+        for ds_id in source_target_map.values():
+            datastream = self.datastreams.get(uid=ds_id)
+            if not datastream:
+                message = "Couldn't fetch target datastream. ETL process aborted."
+                logging.error(message)
+                raise message
+            start_time = pd.Timestamp(
+                datastream.phenomenon_end_time or "1970-01-01T00:00:00Z"
+            )
+            end_time = pd.Timestamp.now()
+            data_requirements[ds_id] = {"start_time": start_time, "end_time": end_time}
+        return data_requirements
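As the code above shows, load() expects a DataFrame with a "timestamp" column plus one column per target datastream, where each data column is named after the datastream ID; rows at or before a datastream's phenomenon_end_time are dropped before upload. A sketch of calling the loader directly, with placeholder host, credentials, and IDs:

import pandas as pd

from hydroserverpy.etl.loaders.hydroserver_loader import HydroServerLoader  # assumed import path

# Example values: each non-"timestamp" column is named after the target datastream ID.
ds_id = "11111111-2222-3333-4444-555555555555"
df = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(["2024-01-01T00:00:00Z", "2024-01-01T00:15:00Z"]),
        ds_id: [20.1, 20.4],
    }
)

loader = HydroServerLoader(host="https://example.com", username="user", password="pass")
# Observations at or before the datastream's current phenomenon_end_time are filtered out before posting.
loader.load(df, source_target_map={"temp": ds_id})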
hydroserverpy/etl/transformers/base.py

@@ -0,0 +1,52 @@
+from abc import ABC, abstractmethod
+import logging
+import pandas as pd
+
+
+class Transformer(ABC):
+    @abstractmethod
+    def transform(self, *args, **kwargs) -> None:
+        pass
+
+    @property
+    def needs_datastreams(self) -> bool:
+        return False
+
+    @staticmethod
+    def standardize_dataframe(
+        df,
+        datastream_ids,
+        timestamp_column: str = "timestamp",
+        timestamp_format: str = "ISO8601",
+    ):
+        df.rename(
+            columns={timestamp_column: "timestamp", **datastream_ids},
+            inplace=True,
+        )
+
+        # Verify timestamp column is present in the DataFrame
+        if "timestamp" not in df.columns:
+            message = f"Timestamp column '{timestamp_column}' not found in data."
+            logging.error(message)
+            raise ValueError(message)
+
+        # Verify that all datastream_ids are present in the DataFrame
+        expected_columns = set(datastream_ids.values())
+        actual_columns = set(df.columns)
+        missing_datastream_ids = expected_columns - actual_columns
+
+        if missing_datastream_ids:
+            raise ValueError(
+                "The following datastream IDs are specified in the config file but their related keys could not be "
+                f"found in the source system's extracted data: {missing_datastream_ids}"
+            )
+
+        # Keep only 'timestamp' and datastream_id columns
+        columns_to_keep = ["timestamp"] + list(expected_columns)
+        df = df[columns_to_keep]
+
+        # Convert timestamp column to datetime if not already
+        if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
+            df["timestamp"] = pd.to_datetime(df["timestamp"], format=timestamp_format)
+
+        return df
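Transformer is the extension point for new source formats: a subclass only needs to produce a DataFrame and hand it to standardize_dataframe(), which renames the timestamp and source columns to datastream IDs, validates that all mapped columns are present, and parses the timestamps. A minimal sketch of a custom subclass (hypothetical class, not part of the release; example IDs assumed):

import pandas as pd

from hydroserverpy.etl.transformers.base import Transformer  # assumed import path


class ListTransformer(Transformer):
    """Hypothetical transformer for data already held as a list of dicts."""

    def __init__(self, datastream_ids):
        # Maps source keys to target datastream IDs, e.g. {"temp": "<datastream uuid>"}.
        self.datastream_ids = datastream_ids

    def transform(self, records):
        # Expects each record to carry a 'timestamp' key plus one key per source column.
        df = pd.DataFrame(records)
        return self.standardize_dataframe(df, self.datastream_ids)


records = [{"timestamp": "2024-01-01T00:00:00Z", "temp": 20.1}]
df = ListTransformer({"temp": "11111111-2222-3333-4444-555555555555"}).transform(records)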
hydroserverpy/etl/transformers/csv_transformer.py

@@ -0,0 +1,88 @@
+import logging
+import pandas as pd
+from typing import Dict, Optional, Union
+from .base import Transformer
+
+
+class CSVTransformer(Transformer):
+    def __init__(
+        self,
+        header_row: Optional[int],
+        data_start_row: int,
+        timestamp_column: Union[str, int],
+        datastream_ids: Dict[Union[str, int], str],
+        delimiter: Optional[str] = ",",
+        timestamp_format: Optional[str] = "ISO8601",
+    ):
+        # Pandas is zero-based while CSV is one-based so convert
+        self.header_row = None if header_row is None else header_row - 1
+        self.data_start_row = data_start_row - 1
+        self.timestamp_column = self.convert_to_zero_based(timestamp_column)
+        self.datastream_ids = datastream_ids
+        self.timestamp_format = timestamp_format
+        self.delimiter = delimiter
+
+    def transform(self, data_file) -> Union[pd.DataFrame, None]:
+        """
+        Transforms a CSV file-like object into a Pandas DataFrame where the column
+        names are replaced with their target datastream ids.
+
+        Parameters:
+            data_file: File-like object containing CSV data.
+        Returns:
+            observations_map (dict): Dict mapping datastream IDs to pandas DataFrames.
+        """
+
+        try:
+            df = pd.read_csv(
+                data_file,
+                delimiter=self.delimiter,
+                header=self.header_row,
+                parse_dates=[self.timestamp_column],
+                date_format=self.timestamp_format,
+                skiprows=self.calculate_skiprows(),
+                usecols=[self.timestamp_column] + list(self.datastream_ids.keys()),
+            )
+        except Exception as e:
+            logging.error(f"Error reading CSV data: {e}")
+            return None
+
+        if self.header_row is None:
+            df.columns = list(range(1, len(df.columns) + 1))
+
+        return self.standardize_dataframe(
+            df, self.datastream_ids, self.timestamp_column, self.timestamp_format
+        )
+
+    def calculate_skiprows(self):
+        """
+        Calculates the skiprows parameter for pd.read_csv.
+
+        Returns:
+            skiprows (list or None): List of row indices to skip, or None if no rows need to be skipped.
+        Raises:
+            ValueError: If header_row is not compatible with data_start_row.
+        """
+        if self.data_start_row == 0:
+            if self.header_row is not None:
+                # Cannot have a header row if data starts at the first row
+                raise ValueError(
+                    "header_row must be None when data_start_row is 1 (first row)"
+                )
+            return None  # No rows to skip
+
+        skiprows = list(range(self.data_start_row))
+
+        if self.header_row is not None:
+            if self.header_row >= self.data_start_row:
+                raise ValueError("header_row must be less than data_start_row")
+            if self.header_row in skiprows:
+                # Do not skip the header row
+                skiprows.remove(self.header_row)
+        return skiprows
+
+    @staticmethod
+    def convert_to_zero_based(index: Union[str, int]) -> Union[str, int]:
+        if isinstance(index, int):
+            return index - 1
+        return index
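Note that header_row, data_start_row, and integer column references are one-based (spreadsheet-style) and converted to zero-based internally. A usage sketch with an assumed file name and datastream ID:

from hydroserverpy.etl.transformers.csv_transformer import CSVTransformer  # assumed import path

# Example: a CSV with the header on row 1 and data starting on row 2 (1-based, as in a spreadsheet).
transformer = CSVTransformer(
    header_row=1,
    data_start_row=2,
    timestamp_column="timestamp",
    datastream_ids={"water_temp_c": "11111111-2222-3333-4444-555555555555"},  # source column -> datastream ID
    delimiter=",",
)

with open("site_export.csv") as f:  # hypothetical file
    df = transformer.transform(f)  # DataFrame with columns: timestamp, <datastream ID>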
hydroserverpy/etl/transformers/json_transformer.py

@@ -0,0 +1,62 @@
+import logging
+import pandas as pd
+from typing import Dict, Optional, Any, List
+from .base import Transformer
+import json
+import jmespath
+
+
+class JSONTransformer(Transformer):
+    def __init__(
+        self,
+        query_string: str,
+        datastream_ids: Dict[str, str],
+        timestamp_format: Optional[str] = "ISO8601",
+    ):
+        """
+        Initializes the JSONTransformer.
+
+        Parameters:
+            query_string (str): JMESPath to the data array containing time series data.
+                Since JMESPath can natively rename column names, the assumption is the timestamp column
+                is always named 'timestamp' or converted to 'timestamp' in the JMESPath query.
+            datastream_ids (dict): Mapping from JSON field names to datastream IDs.
+            timestamp_format (str, optional): The format of the timestamp, if it needs special parsing.
+        """
+        self.query_string = query_string
+        self.datastream_ids = datastream_ids
+        self.timestamp_format = timestamp_format
+
+    def transform(self, data_file):
+        """
+        Transforms a JSON file-like object into the standard Pandas dataframe format.
+        Since JMESPath can natively rename column names, the assumption is the timestamp column
+        is always named 'timestamp' for JSON data or converted to 'timestamp' in the JMESPath query.
+
+        Parameters:
+            data_file: File-like object containing JSON data.
+
+        Returns:
+            pd.DataFrame: pandas DataFrames in the format pd.Timestamp, datastream_id_1, datastream_id_2, ...
+        """
+        json_data = json.load(data_file)
+        data_points = self.extract_data_points(json_data)
+        if not data_points:
+            logging.warning("No data points found in the JSON data.")
+            return None
+
+        df = pd.DataFrame(data_points)
+
+        return self.standardize_dataframe(
+            df,
+            self.datastream_ids,
+            timestamp_format=self.timestamp_format,
+        )
+
+    def extract_data_points(self, json_data: Any) -> Optional[List[dict]]:
+        """Extracts data points from the JSON data using the data_path."""
+        data_points = jmespath.search(self.query_string, json_data)
+
+        if isinstance(data_points, dict):
+            data_points = [data_points]
+        return data_points
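Because JMESPath can reshape and rename fields, the query is expected to yield objects whose time field is already called "timestamp". A usage sketch with an assumed payload, query, and datastream ID:

import io

from hydroserverpy.etl.transformers.json_transformer import JSONTransformer  # assumed import path

# Hypothetical API response: the JMESPath query renames the source's "time" field to "timestamp".
payload = io.StringIO(
    '{"results": [{"time": "2024-01-01T00:00:00Z", "stage_m": 1.2},'
    ' {"time": "2024-01-01T00:15:00Z", "stage_m": 1.3}]}'
)

transformer = JSONTransformer(
    query_string="results[].{timestamp: time, stage_m: stage_m}",
    datastream_ids={"stage_m": "11111111-2222-3333-4444-555555555555"},
)
df = transformer.transform(payload)  # DataFrame with columns: timestamp, <datastream ID>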
hydroserverpy/{etl/service.py → etl_csv/hydroserver_etl_csv.py}

@@ -7,22 +7,28 @@ from requests import HTTPError
 from datetime import datetime, timezone, timedelta
 from dateutil.parser import isoparse
 from .exceptions import HeaderParsingError, TimestampParsingError
+import warnings

 if TYPE_CHECKING:
     from ..core.schemas import DataSource

-logger = logging.getLogger(
+logger = logging.getLogger("hydroserver_etl")
 logger.addHandler(logging.NullHandler())


-class
+class HydroServerETLCSV:

     def __init__(
-
-
-
-
+        self,
+        service,
+        data_file: IO[str],
+        data_source: "DataSource",
     ):
+        warnings.warn(
+            "HydroServerETLCSV is deprecated and will be removed in a future version. "
+            "Please use the new HydroServerETL class.",
+            DeprecationWarning,
+        )
         self._service = service
         self._data_file = data_file
         self._data_source = data_source

@@ -66,12 +72,12 @@ class HydroServerETL:
             self._failed_datastreams.extend(self._post_observations())

         except HeaderParsingError as e:
-            self._message = f
+            self._message = f"Failed to parse header for {self._data_source.name} with error: {str(e)}"
             logger.error(self._message)
             self._file_header_error = True

         except TimestampParsingError as e:
-            self._message = f
+            self._message = f"Failed to parse one or more timestamps for {self._data_source.name} with error: {str(e)}"
             logger.error(self._message)
             self._file_timestamp_error = True

@@ -79,7 +85,7 @@ class HydroServerETL:
             self._failed_datastreams.extend(self._post_observations())

         if not self._message and len(self._failed_datastreams) > 0:
-            self._message = f
+            self._message = f"One or more datastreams failed to sync with HydroServer for {self._data_source.name}."

         self._update_data_source()

@@ -99,7 +105,8 @@ class HydroServerETL:
         """

         if index == self._data_source.header_row or (
-
+            index == self._data_source.data_start_row
+            and self._timestamp_column_index is None
         ):
             self._parse_file_header(row)

@@ -110,18 +117,29 @@ class HydroServerETL:

         for datastream in self._datastreams.values():
             if str(datastream.uid) not in self._datastream_start_row_indexes.keys():
-                if
+                if (
+                    not datastream.phenomenon_end_time
+                    or timestamp > datastream.phenomenon_end_time
+                ):
                     self._datastream_start_row_indexes[str(datastream.uid)] = index

-            if
-
+            if (
+                str(datastream.uid) in self._datastream_start_row_indexes.keys()
+                and self._datastream_start_row_indexes[str(datastream.uid)] <= index
+            ):
                 if str(datastream.uid) not in self._observations.keys():
                     self._observations[str(datastream.uid)] = []

-                self._observations[str(datastream.uid)].append(
-
-
-
+                self._observations[str(datastream.uid)].append(
+                    {
+                        "phenomenon_time": timestamp,
+                        "result": row[
+                            self._datastream_column_indexes[
+                                datastream.data_source_column
+                            ]
+                        ],
+                    }
+                )

     def _parse_file_header(self, row: List[str]) -> None:
         """

@@ -136,22 +154,29 @@ class HydroServerETL:
         """

         try:
-            self._timestamp_column_index =
-
+            self._timestamp_column_index = (
+                row.index(self._data_source.timestamp_column)
+                if isinstance(self._data_source.timestamp_column, str)
                 else int(self._data_source.timestamp_column) - 1
+            )
             if self._timestamp_column_index > len(row):
                 raise ValueError
             self._datastream_column_indexes = {
-                datastream.data_source_column:
-
-
+                datastream.data_source_column: (
+                    row.index(datastream.data_source_column)
+                    if not datastream.data_source_column.isdigit()
+                    else int(datastream.data_source_column) - 1
+                )
                 for datastream in self._datastreams.values()
             }
-            if len(self._datastream_column_indexes.values()) > 0 and
-
+            if len(self._datastream_column_indexes.values()) > 0 and max(
+                self._datastream_column_indexes.values()
+            ) > len(row):
                 raise ValueError
         except ValueError as e:
-            logger.error(
+            logger.error(
+                f'Failed to load data from data source: "{self._data_source.name}"'
+            )
             raise HeaderParsingError(str(e)) from e

     def _parse_row_timestamp(self, row: List[str]) -> datetime:

@@ -164,32 +189,36 @@ class HydroServerETL:
         """

         try:
-            if
-
-
-
+            if (
+                self._data_source.timestamp_format == "iso"
+                or self._data_source.timestamp_format is None
+            ):
+                timestamp = isoparse(row[self._timestamp_column_index])
             else:
                 timestamp = datetime.strptime(
                     row[self._timestamp_column_index],
-                    self._data_source.timestamp_format
+                    self._data_source.timestamp_format,
                 )
         except ValueError as e:
             raise TimestampParsingError(str(e)) from e

         if timestamp.tzinfo is None:
             if not self._data_source.timestamp_offset:
-                timestamp = timestamp.replace(
-                    tzinfo=timezone.utc
-                )
+                timestamp = timestamp.replace(tzinfo=timezone.utc)
             else:
                 try:
                     timestamp = timestamp.replace(
                         tzinfo=datetime.strptime(
-                            self._data_source.timestamp_offset[:-2]
+                            self._data_source.timestamp_offset[:-2]
+                            + ":"
+                            + self._data_source.timestamp_offset[3:],
+                            "%z",
                         ).tzinfo
                     )
                 except ValueError as e:
-                    logger.error(
+                    logger.error(
+                        f'Failed to load data from data source: "{self._data_source.name}"'
+                    )
                     raise TimestampParsingError(str(e)) from e

         return timestamp

@@ -213,15 +242,18 @@ class HydroServerETL:
             if datastream_id not in self._failed_datastreams and len(observations) > 0:

                 logger.info(
-                    f
-                    f'{observations[0]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} to '
-                    f'{observations[-1]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} for datastream: '
-                    f'{str(datastream_id)} in data source "{self._data_source.name}".'
+                    f"Loading observations from "
+                    + f'{observations[0]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} to '
+                    + f'{observations[-1]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} for datastream: '
+                    + f'{str(datastream_id)} in data source "{self._data_source.name}".'
                 )

                 observations_df = pd.DataFrame(
-                    [
-
+                    [
+                        [observation["phenomenon_time"], observation["result"]]
+                        for observation in observations
+                    ],
+                    columns=["timestamp", "value"],
                 )

                 try:

@@ -233,17 +265,18 @@ class HydroServerETL:
                     failed_datastreams.append(datastream_id)

                 if not self._last_loaded_timestamp or (
-
-
+                    observations[-1]["phenomenon_time"]
+                    and observations[-1]["phenomenon_time"]
+                    > self._last_loaded_timestamp
                 ):
-                    self._last_loaded_timestamp = observations[-1][
+                    self._last_loaded_timestamp = observations[-1]["phenomenon_time"]
             elif datastream_id in self._failed_datastreams:
                 logger.info(
-                    f
-                    f'{observations[0]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} to '
-                    f'{observations[-1]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} for datastream: '
-                    f'{str(datastream_id)} in data source "{self._data_source.name}",'
-                    f
+                    f"Skipping observations POST request from "
+                    + f'{observations[0]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} to '
+                    + f'{observations[-1]["phenomenon_time"].strftime("%Y-%m-%dT%H:%M:%S%z")} for datastream: '
+                    + f'{str(datastream_id)} in data source "{self._data_source.name}",'
+                    + f"due to previous failed POST request."
                 )

         self._observations = {}

@@ -260,10 +293,12 @@ class HydroServerETL:

         if self._data_source.crontab is not None:
             next_sync = croniter.croniter(
-                self._data_source.crontab,
-                datetime.now()
+                self._data_source.crontab, datetime.now()
             ).get_next(datetime)
-        elif
+        elif (
+            self._data_source.interval is not None
+            and self._data_source.interval_units is not None
+        ):
             next_sync = datetime.now() + timedelta(
                 **{self._data_source.interval_units: self._data_source.interval}
             )

@@ -272,8 +307,11 @@ class HydroServerETL:

         self._data_source.data_source_thru = self._last_loaded_timestamp
         self._data_source.last_sync_successful = (
-            True
-
+            True
+            if not self._file_timestamp_error
+            and not self._file_header_error
+            and len(self._failed_datastreams) == 0
+            else False
         )
         self._data_source.last_sync_message = self._message
         self._data_source.last_synced = datetime.now(timezone.utc)