hydroserverpy 0.2.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hydroserverpy might be problematic.
Files changed (77)
  1. hydroserverpy/__init__.py +6 -15
  2. hydroserverpy/core/endpoints/__init__.py +9 -0
  3. hydroserverpy/core/endpoints/base.py +146 -0
  4. hydroserverpy/core/endpoints/data_loaders.py +93 -0
  5. hydroserverpy/core/endpoints/data_sources.py +93 -0
  6. hydroserverpy/core/endpoints/datastreams.py +225 -0
  7. hydroserverpy/core/endpoints/observed_properties.py +111 -0
  8. hydroserverpy/core/endpoints/processing_levels.py +111 -0
  9. hydroserverpy/core/endpoints/result_qualifiers.py +111 -0
  10. hydroserverpy/core/endpoints/sensors.py +111 -0
  11. hydroserverpy/core/endpoints/things.py +261 -0
  12. hydroserverpy/core/endpoints/units.py +111 -0
  13. hydroserverpy/{components → core/schemas}/__init__.py +1 -2
  14. hydroserverpy/core/schemas/base.py +124 -0
  15. hydroserverpy/core/schemas/data_loaders.py +73 -0
  16. hydroserverpy/core/schemas/data_sources.py +223 -0
  17. hydroserverpy/core/schemas/datastreams.py +330 -0
  18. hydroserverpy/core/schemas/observed_properties.py +43 -0
  19. hydroserverpy/core/schemas/processing_levels.py +31 -0
  20. hydroserverpy/core/schemas/result_qualifiers.py +26 -0
  21. hydroserverpy/core/schemas/sensors.py +68 -0
  22. hydroserverpy/core/schemas/things.py +346 -0
  23. hydroserverpy/core/schemas/units.py +29 -0
  24. hydroserverpy/core/service.py +200 -0
  25. hydroserverpy/etl/__init__.py +21 -0
  26. hydroserverpy/etl/extractors/__init__.py +0 -0
  27. hydroserverpy/etl/extractors/base.py +13 -0
  28. hydroserverpy/etl/extractors/ftp_extractor.py +50 -0
  29. hydroserverpy/etl/extractors/http_extractor.py +84 -0
  30. hydroserverpy/etl/extractors/local_file_extractor.py +25 -0
  31. hydroserverpy/etl/hydroserver_etl.py +40 -0
  32. hydroserverpy/etl/loaders/__init__.py +0 -0
  33. hydroserverpy/etl/loaders/base.py +13 -0
  34. hydroserverpy/etl/loaders/hydroserver_loader.py +68 -0
  35. hydroserverpy/etl/transformers/__init__.py +0 -0
  36. hydroserverpy/etl/transformers/base.py +52 -0
  37. hydroserverpy/etl/transformers/csv_transformer.py +88 -0
  38. hydroserverpy/etl/transformers/json_transformer.py +62 -0
  39. hydroserverpy/etl/types.py +7 -0
  40. hydroserverpy/etl_csv/__init__.py +0 -0
  41. hydroserverpy/{etl.py → etl_csv/hydroserver_etl_csv.py} +118 -95
  42. hydroserverpy/quality/__init__.py +1 -0
  43. hydroserverpy/quality/service.py +405 -0
  44. hydroserverpy-0.4.0.dist-info/METADATA +18 -0
  45. hydroserverpy-0.4.0.dist-info/RECORD +51 -0
  46. {hydroserverpy-0.2.5.dist-info → hydroserverpy-0.4.0.dist-info}/WHEEL +1 -1
  47. hydroserverpy/components/data_loaders.py +0 -67
  48. hydroserverpy/components/data_sources.py +0 -98
  49. hydroserverpy/components/datastreams.py +0 -47
  50. hydroserverpy/components/observed_properties.py +0 -48
  51. hydroserverpy/components/processing_levels.py +0 -48
  52. hydroserverpy/components/result_qualifiers.py +0 -48
  53. hydroserverpy/components/sensors.py +0 -48
  54. hydroserverpy/components/things.py +0 -48
  55. hydroserverpy/components/units.py +0 -48
  56. hydroserverpy/components/users.py +0 -28
  57. hydroserverpy/main.py +0 -62
  58. hydroserverpy/models.py +0 -218
  59. hydroserverpy/schemas/data_loaders.py +0 -27
  60. hydroserverpy/schemas/data_sources.py +0 -58
  61. hydroserverpy/schemas/datastreams.py +0 -56
  62. hydroserverpy/schemas/observed_properties.py +0 -33
  63. hydroserverpy/schemas/processing_levels.py +0 -33
  64. hydroserverpy/schemas/result_qualifiers.py +0 -32
  65. hydroserverpy/schemas/sensors.py +0 -39
  66. hydroserverpy/schemas/things.py +0 -107
  67. hydroserverpy/schemas/units.py +0 -32
  68. hydroserverpy/schemas/users.py +0 -28
  69. hydroserverpy/service.py +0 -170
  70. hydroserverpy/utils.py +0 -37
  71. hydroserverpy-0.2.5.dist-info/METADATA +0 -15
  72. hydroserverpy-0.2.5.dist-info/RECORD +0 -35
  73. /hydroserverpy/{schemas → core}/__init__.py +0 -0
  74. /hydroserverpy/{exceptions.py → etl_csv/exceptions.py} +0 -0
  75. {hydroserverpy-0.2.5.dist-info → hydroserverpy-0.4.0.dist-info}/LICENSE +0 -0
  76. {hydroserverpy-0.2.5.dist-info → hydroserverpy-0.4.0.dist-info}/top_level.txt +0 -0
  77. {hydroserverpy-0.2.5.dist-info → hydroserverpy-0.4.0.dist-info}/zip-safe +0 -0
hydroserverpy/etl/extractors/http_extractor.py
@@ -0,0 +1,84 @@
+ import logging
+ from hydroserverpy.etl.types import TimeRange
+ import requests
+ from io import BytesIO
+ from typing import Dict
+ from .base import Extractor
+
+
+ class HTTPExtractor(Extractor):
+     def __init__(
+         self,
+         url: str,
+         url_variables: dict = None,
+         params: dict = None,
+         headers: dict = None,
+         auth: tuple = None,
+     ):
+         self.url = self.format_url(url, url_variables or {})
+         self.params = params or {}  # default to an empty dict so prepare_params can pop keys safely
+         self.headers = headers
+         self.auth = auth
+         self.start_date = None
+
+     def prepare_params(self, data_requirements: Dict[str, TimeRange]):
+         start_times = [
+             req["start_time"] for req in data_requirements.values() if req["start_time"]
+         ]
+
+         if start_times:
+             oldest_start_time = min(start_times).isoformat()
+             start_time_key = self.params.pop("start_time_key", None)
+             if start_time_key:
+                 self.params[start_time_key] = oldest_start_time
+                 logging.info(
+                     f"Set start_time to {oldest_start_time} and removed 'start_time_key'"
+                 )
+             else:
+                 logging.warning("'start_time_key' not found in params.")
+
+         end_times = [
+             req["end_time"] for req in data_requirements.values() if req["end_time"]
+         ]
+
+         if end_times:
+             newest_end_time = max(end_times).isoformat()
+             end_time_key = self.params.pop("end_time_key", None)
+             if end_time_key:
+                 self.params[end_time_key] = newest_end_time
+                 logging.info(
+                     f"Set end_time to {newest_end_time} and removed 'end_time_key'"
+                 )
+             else:
+                 logging.warning("'end_time_key' not found in params.")
+
+     def extract(self):
+         """
+         Downloads the file from the HTTP/HTTPS server and returns a file-like object.
+         """
+         response = requests.get(
+             url=self.url,
+             params=self.params,
+             headers=self.headers,
+             auth=self.auth,
+             stream=True,
+         )
+         response.raise_for_status()
+         logging.info(f"Successfully downloaded file from {response.url}")
+
+         data = BytesIO()
+         for chunk in response.iter_content(chunk_size=8192):
+             if chunk:
+                 data.write(chunk)
+         data.seek(0)
+         return data
+
+     @staticmethod
+     def format_url(url_template, url_variables):
+         try:
+             url = url_template.format(**url_variables)
+         except KeyError as e:
+             missing_key = e.args[0]
+             raise KeyError(f"Missing configuration url_variable: {missing_key}")
+
+         return url
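For context, a minimal sketch of how this extractor might be configured (the endpoint, url_variables, and credentials below are hypothetical; start_time_key and end_time_key name the query parameters that prepare_params rewrites into concrete ISO timestamps):

    extractor = HTTPExtractor(
        url="https://example.com/api/{site_code}/observations",  # hypothetical endpoint
        url_variables={"site_code": "RB_1300E"},                 # substituted into the URL template
        params={"start_time_key": "startDT", "end_time_key": "endDT"},
        auth=("user", "pass"),
    )
    # After prepare_params(data_requirements), params becomes e.g.
    # {"startDT": "2024-05-01T12:00:00", "endDT": "..."} and extract()
    # streams the response body into a BytesIO buffer.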
hydroserverpy/etl/extractors/local_file_extractor.py
@@ -0,0 +1,25 @@
+ import logging
+ from typing import Dict
+
+ from .base import Extractor
+ from ..types import TimeRange
+
+
+ class LocalFileExtractor(Extractor):
+     def __init__(self, filepath: str):
+         self.filepath = filepath
+
+     def prepare_params(self, data_requirements: Dict[str, TimeRange]):
+         pass
+
+     def extract(self):
+         """
+         Opens the file and returns a file-like object.
+         """
+         try:
+             file_handle = open(self.filepath, "r")
+             logging.info(f"Successfully opened file '{self.filepath}'.")
+             return file_handle
+         except Exception as e:
+             logging.error(f"Error opening file '{self.filepath}': {e}")
+             return None
hydroserverpy/etl/hydroserver_etl.py
@@ -0,0 +1,40 @@
+ import logging
+ import pandas as pd
+
+
+ class HydroServerETL:
+     def __init__(self, extractor, transformer, loader, source_target_map):
+         self.extractor = extractor
+         self.transformer = transformer
+         self.loader = loader
+         self.source_target_map = source_target_map
+
+     def run(self):
+         """
+         Extracts, transforms, and loads data as defined by the class parameters.
+         """
+
+         # Step 1: Get target system data requirements from the loader and
+         # prepare parameters for the extractor.
+         data_requirements = self.loader.get_data_requirements(self.source_target_map)
+         self.extractor.prepare_params(data_requirements)
+
+         # Step 2: Extract
+         data = self.extractor.extract()
+         if data is None or (isinstance(data, pd.DataFrame) and data.empty):
+             logging.warning("No data was returned from the extractor. Ending ETL run.")
+             return
+         else:
+             logging.info("Successfully extracted data.")
+
+         # Step 3: Transform
+         if self.transformer:
+             data = self.transformer.transform(data)
+             if data is None or (isinstance(data, pd.DataFrame) and data.empty):
+                 logging.warning("No data returned from the transformer. Ending run.")
+                 return
+             else:
+                 logging.info(f"Successfully transformed data. {data}")
+
+         # Step 4: Load
+         self.loader.load(data, self.source_target_map)
+         logging.info("Successfully loaded data.")
hydroserverpy/etl/loaders/__init__.py
File without changes
hydroserverpy/etl/loaders/base.py
@@ -0,0 +1,13 @@
+ from abc import ABC, abstractmethod
+ from typing import Dict
+ import pandas as pd
+
+
+ class Loader(ABC):
+     @abstractmethod
+     def load(self, *args, **kwargs) -> None:
+         pass
+
+     @abstractmethod
+     def get_data_requirements(self, df: pd.DataFrame) -> Dict[str, pd.Timestamp]:
+         pass
hydroserverpy/etl/loaders/hydroserver_loader.py
@@ -0,0 +1,68 @@
+ from hydroserverpy.core.service import HydroServer
+ from typing import Dict, Optional
+ from .base import Loader
+ import logging
+ import pandas as pd
+
+
+ class HydroServerLoader(HydroServer, Loader):
+     """
+     A class that extends the HydroServer client with ETL-specific functionality.
+     """
+
+     def __init__(
+         self,
+         host: str,
+         username: Optional[str] = None,
+         password: Optional[str] = None,
+         apikey: Optional[str] = None,
+         api_route: str = "api",
+     ):
+         super().__init__(host, username, password, apikey, api_route)
+
+     def load(self, data: pd.DataFrame, source_target_map) -> None:
+         """
+         Load observations from a DataFrame to the HydroServer.
+
+         :param data: A Pandas DataFrame where each column corresponds to a datastream.
+         """
+         data_requirements = self.get_data_requirements(source_target_map)
+         for ds_id in data.columns:
+             if ds_id == "timestamp":
+                 continue
+
+             df = data[["timestamp", ds_id]].copy()
+             df.rename(columns={ds_id: "value"}, inplace=True)
+             df.dropna(subset=["value"], inplace=True)
+
+             # The requirement's start_time is the datastream's current phenomenon end time.
+             phenomenon_end_time = data_requirements[ds_id]["start_time"]
+             if phenomenon_end_time:
+                 df = df[df["timestamp"] > phenomenon_end_time]
+             if df.empty:
+                 logging.warning(
+                     f"No new data to upload for datastream {ds_id}. Skipping."
+                 )
+                 continue
+             self.datastreams.load_observations(uid=ds_id, observations=df)
+
+     def get_data_requirements(
+         self, source_target_map
+     ) -> Dict[str, Dict[str, pd.Timestamp]]:
+         """
+         Each target system needs to be able to answer the question 'What data do you need?'
+         and return a time range for each target time series. Usually the answer will be
+         'anything newer than my most recent observation'.
+         """
+         data_requirements = {}
+         for ds_id in source_target_map.values():
+             datastream = self.datastreams.get(uid=ds_id)
+             if not datastream:
+                 message = "Couldn't fetch target datastream. ETL process aborted."
+                 logging.error(message)
+                 raise ValueError(message)  # raising the bare string would itself be a TypeError
+             start_time = pd.Timestamp(
+                 datastream.phenomenon_end_time or "1970-01-01T00:00:00Z"
+             )
+             end_time = pd.Timestamp.now()
+             data_requirements[ds_id] = {"start_time": start_time, "end_time": end_time}
+         return data_requirements
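For reference, get_data_requirements returns one TimeRange (defined in etl/types.py below) per target datastream; a sketch of the returned shape, with a hypothetical UUID:

    # {
    #     "datastream-uuid-1": {
    #         "start_time": pd.Timestamp("2024-05-01 12:00:00+0000"),  # current phenomenon_end_time
    #         "end_time": pd.Timestamp.now(),
    #     },
    # }

Correspondingly, load() expects the DataFrame columns to be "timestamp" plus one column per datastream UUID.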
hydroserverpy/etl/transformers/__init__.py
File without changes
hydroserverpy/etl/transformers/base.py
@@ -0,0 +1,52 @@
+ from abc import ABC, abstractmethod
+ import logging
+ import pandas as pd
+
+
+ class Transformer(ABC):
+     @abstractmethod
+     def transform(self, *args, **kwargs) -> None:
+         pass
+
+     @property
+     def needs_datastreams(self) -> bool:
+         return False
+
+     @staticmethod
+     def standardize_dataframe(
+         df,
+         datastream_ids,
+         timestamp_column: str = "timestamp",
+         timestamp_format: str = "ISO8601",
+     ):
+         df.rename(
+             columns={timestamp_column: "timestamp", **datastream_ids},
+             inplace=True,
+         )
+
+         # Verify the timestamp column is present in the DataFrame
+         if "timestamp" not in df.columns:
+             message = f"Timestamp column '{timestamp_column}' not found in data."
+             logging.error(message)
+             raise ValueError(message)
+
+         # Verify that all datastream_ids are present in the DataFrame
+         expected_columns = set(datastream_ids.values())
+         actual_columns = set(df.columns)
+         missing_datastream_ids = expected_columns - actual_columns
+
+         if missing_datastream_ids:
+             raise ValueError(
+                 "The following datastream IDs are specified in the config file but their related keys could not be "
+                 f"found in the source system's extracted data: {missing_datastream_ids}"
+             )
+
+         # Keep only the 'timestamp' and datastream_id columns
+         columns_to_keep = ["timestamp"] + list(expected_columns)
+         df = df[columns_to_keep]
+
+         # Convert the timestamp column to datetime if it isn't already
+         if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
+             df["timestamp"] = pd.to_datetime(df["timestamp"], format=timestamp_format)
+
+         return df
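Both concrete transformers funnel their output through standardize_dataframe, so the rest of the pipeline can assume one canonical layout; a sketch with hypothetical names:

    df = Transformer.standardize_dataframe(
        raw_df,                                 # extracted frame (hypothetical)
        {"water_temp_c": "datastream-uuid-1"},  # source column -> datastream ID
        timestamp_column="datetime",
    )
    # Result: exactly the columns ["timestamp", "datastream-uuid-1"],
    # with "timestamp" parsed to datetime64.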
hydroserverpy/etl/transformers/csv_transformer.py
@@ -0,0 +1,88 @@
+ import logging
+ import pandas as pd
+ from typing import Dict, Optional, Union
+ from .base import Transformer
+
+
+ class CSVTransformer(Transformer):
+     def __init__(
+         self,
+         header_row: Optional[int],
+         data_start_row: int,
+         timestamp_column: Union[str, int],
+         datastream_ids: Dict[Union[str, int], str],
+         delimiter: Optional[str] = ",",
+         timestamp_format: Optional[str] = "ISO8601",
+     ):
+         # Pandas is zero-based while CSV is one-based, so convert
+         self.header_row = None if header_row is None else header_row - 1
+         self.data_start_row = data_start_row - 1
+         self.timestamp_column = self.convert_to_zero_based(timestamp_column)
+         self.datastream_ids = datastream_ids
+         self.timestamp_format = timestamp_format
+         self.delimiter = delimiter
+
+     def transform(self, data_file) -> Union[pd.DataFrame, None]:
+         """
+         Transforms a CSV file-like object into a Pandas DataFrame where the column
+         names are replaced with their target datastream IDs.
+
+         Parameters:
+             data_file: File-like object containing CSV data.
+         Returns:
+             pd.DataFrame: Standardized DataFrame, or None if the CSV could not be read.
+         """
+
+         try:
+             df = pd.read_csv(
+                 data_file,
+                 delimiter=self.delimiter,
+                 header=self.header_row,
+                 parse_dates=[self.timestamp_column],
+                 date_format=self.timestamp_format,
+                 skiprows=self.calculate_skiprows(),
+                 usecols=[self.timestamp_column] + list(self.datastream_ids.keys()),
+             )
+         except Exception as e:
+             logging.error(f"Error reading CSV data: {e}")
+             return None
+
+         if self.header_row is None:
+             df.columns = list(range(1, len(df.columns) + 1))
+
+         return self.standardize_dataframe(
+             df, self.datastream_ids, self.timestamp_column, self.timestamp_format
+         )
+
+     def calculate_skiprows(self):
+         """
+         Calculates the skiprows parameter for pd.read_csv.
+
+         Returns:
+             skiprows (list or None): List of row indices to skip, or None if no rows need to be skipped.
+         Raises:
+             ValueError: If header_row is not compatible with data_start_row.
+         """
+         if self.data_start_row == 0:
+             if self.header_row is not None:
+                 # Cannot have a header row if data starts at the first row
+                 raise ValueError(
+                     "header_row must be None when data_start_row is 1 (first row)"
+                 )
+             return None  # No rows to skip
+
+         skiprows = list(range(self.data_start_row))
+
+         if self.header_row is not None:
+             if self.header_row >= self.data_start_row:
+                 raise ValueError("header_row must be less than data_start_row")
+             if self.header_row in skiprows:
+                 # Do not skip the header row
+                 skiprows.remove(self.header_row)
+         return skiprows
+
+     @staticmethod
+     def convert_to_zero_based(index: Union[str, int]) -> Union[str, int]:
+         if isinstance(index, int):
+             return index - 1
+         return index
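A sketch of how the one-based configuration maps onto a concrete file (the layout and column names here are hypothetical):

    # observations.csv:
    #   row 1: datetime,water_temp_c       <- header_row=1
    #   row 2: 2024-05-01T12:00:00Z,21.3   <- data_start_row=2
    transformer = CSVTransformer(
        header_row=1,          # one-based; converted to pandas' zero-based indexing internally
        data_start_row=2,
        timestamp_column="datetime",
        datastream_ids={"water_temp_c": "datastream-uuid-1"},
    )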
hydroserverpy/etl/transformers/json_transformer.py
@@ -0,0 +1,62 @@
+ import logging
+ import pandas as pd
+ from typing import Dict, Optional, Any, List
+ from .base import Transformer
+ import json
+ import jmespath
+
+
+ class JSONTransformer(Transformer):
+     def __init__(
+         self,
+         query_string: str,
+         datastream_ids: Dict[str, str],
+         timestamp_format: Optional[str] = "ISO8601",
+     ):
+         """
+         Initializes the JSONTransformer.
+
+         Parameters:
+             query_string (str): JMESPath to the data array containing time series data.
+                 Since JMESPath can natively rename fields, the assumption is that the
+                 timestamp field is either already named 'timestamp' or renamed to
+                 'timestamp' in the JMESPath query.
+             datastream_ids (dict): Mapping from JSON field names to datastream IDs.
+             timestamp_format (str, optional): The format of the timestamp, if it needs special parsing.
+         """
+         self.query_string = query_string
+         self.datastream_ids = datastream_ids
+         self.timestamp_format = timestamp_format
+
+     def transform(self, data_file):
+         """
+         Transforms a JSON file-like object into the standard Pandas DataFrame format.
+
+         Parameters:
+             data_file: File-like object containing JSON data.
+
+         Returns:
+             pd.DataFrame: DataFrame with a 'timestamp' column followed by one column
+             per datastream ID, or None if no data points were found.
+         """
+         json_data = json.load(data_file)
+         data_points = self.extract_data_points(json_data)
+         if not data_points:
+             logging.warning("No data points found in the JSON data.")
+             return None
+
+         df = pd.DataFrame(data_points)
+
+         return self.standardize_dataframe(
+             df,
+             self.datastream_ids,
+             timestamp_format=self.timestamp_format,
+         )
+
+     def extract_data_points(self, json_data: Any) -> Optional[List[dict]]:
+         """Extracts data points from the JSON data using the JMESPath query string."""
+         data_points = jmespath.search(self.query_string, json_data)
+
+         if isinstance(data_points, dict):
+             data_points = [data_points]
+         return data_points
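A sketch of a query_string against a hypothetical payload, using a JMESPath multiselect to rename the time field to 'timestamp' as the transformer expects:

    # Hypothetical payload:
    # {"results": [{"time": "2024-05-01T12:00:00Z", "temp": 21.3}, ...]}
    transformer = JSONTransformer(
        query_string="results[*].{timestamp: time, temp: temp}",
        datastream_ids={"temp": "datastream-uuid-1"},
    )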
hydroserverpy/etl/types.py
@@ -0,0 +1,7 @@
+ from typing import TypedDict
+ import pandas as pd
+
+
+ class TimeRange(TypedDict):
+     start_time: pd.Timestamp
+     end_time: pd.Timestamp
hydroserverpy/etl_csv/__init__.py
File without changes